Merge branch 'aesni'
authorMartin Willi <martin@revosec.ch>
Wed, 15 Apr 2015 12:33:45 +0000 (14:33 +0200)
committerMartin Willi <martin@revosec.ch>
Wed, 15 Apr 2015 12:33:45 +0000 (14:33 +0200)
Add an aesni plugin providing CBC, CTR, XCBC, CMAC, CCM and GCM modes
for AES-128/192/256 based on AES-NI/PCLMULQDQ intrinsics.

32 files changed:
NEWS
configure.ac
scripts/crypt_burn.c
src/libstrongswan/Makefile.am
src/libstrongswan/crypto/crypto_factory.c
src/libstrongswan/crypto/crypto_factory.h
src/libstrongswan/crypto/crypto_tester.c
src/libstrongswan/plugins/aesni/Makefile.am [new file with mode: 0644]
src/libstrongswan/plugins/aesni/aesni_cbc.c [new file with mode: 0644]
src/libstrongswan/plugins/aesni/aesni_cbc.h [new file with mode: 0644]
src/libstrongswan/plugins/aesni/aesni_ccm.c [new file with mode: 0644]
src/libstrongswan/plugins/aesni/aesni_ccm.h [new file with mode: 0644]
src/libstrongswan/plugins/aesni/aesni_cmac.c [new file with mode: 0644]
src/libstrongswan/plugins/aesni/aesni_cmac.h [new file with mode: 0644]
src/libstrongswan/plugins/aesni/aesni_ctr.c [new file with mode: 0644]
src/libstrongswan/plugins/aesni/aesni_ctr.h [new file with mode: 0644]
src/libstrongswan/plugins/aesni/aesni_gcm.c [new file with mode: 0644]
src/libstrongswan/plugins/aesni/aesni_gcm.h [new file with mode: 0644]
src/libstrongswan/plugins/aesni/aesni_key.c [new file with mode: 0644]
src/libstrongswan/plugins/aesni/aesni_key.h [new file with mode: 0644]
src/libstrongswan/plugins/aesni/aesni_plugin.c [new file with mode: 0644]
src/libstrongswan/plugins/aesni/aesni_plugin.h [new file with mode: 0644]
src/libstrongswan/plugins/aesni/aesni_xcbc.c [new file with mode: 0644]
src/libstrongswan/plugins/aesni/aesni_xcbc.h [new file with mode: 0644]
src/libstrongswan/plugins/plugin_feature.c
src/libstrongswan/plugins/test_vectors/test_vectors.h
src/libstrongswan/plugins/test_vectors/test_vectors/aes_ccm.c
src/libstrongswan/plugins/test_vectors/test_vectors/aes_gcm.c
src/libstrongswan/tests/suites/test_utils.c
src/libstrongswan/tests/test_suite.h
src/libstrongswan/utils/utils.c
src/libstrongswan/utils/utils.h

diff --git a/NEWS b/NEWS
index 81a7fc5..77951dd 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,9 @@
+- The new aesni plugin provides CBC, CTR, XCBC, CMAC, CCM and GCM crypto
+  primitives for AES-128/192/256. The plugin requires AES-NI and PCLMULQDQ
+  instructions and works on both x86 and x64 architectures. It provides
+  superior crypto performance in userland without any external libraries.
+
+
 strongswan-5.3.0
 ----------------
 
index 1cd4f24..59e8675 100644 (file)
@@ -144,6 +144,7 @@ ARG_ENABL_SET([padlock],        [enables VIA Padlock crypto plugin.])
 ARG_DISBL_SET([random],         [disable RNG implementation on top of /dev/(u)random.])
 ARG_DISBL_SET([rc2],            [disable RC2 software implementation plugin.])
 ARG_ENABL_SET([rdrand],         [enable Intel RDRAND random generator plugin.])
+ARG_ENABL_SET([aesni],          [enable Intel AES-NI crypto plugin.])
 ARG_DISBL_SET([sha1],           [disable SHA1 software implementation plugin.])
 ARG_DISBL_SET([sha2],           [disable SHA256/SHA384/SHA512 software implementation plugin.])
 ARG_DISBL_SET([xcbc],           [disable xcbc crypto implementation plugin.])
@@ -1243,6 +1244,7 @@ ADD_PLUGIN([test-vectors],         [s charon scepclient pki])
 ADD_PLUGIN([unbound],              [s charon scripts])
 ADD_PLUGIN([ldap],                 [s charon scepclient scripts nm cmd])
 ADD_PLUGIN([pkcs11],               [s charon pki nm cmd])
+ADD_PLUGIN([aesni],                [s charon scepclient pki scripts medsrv attest nm cmd aikgen])
 ADD_PLUGIN([aes],                  [s charon scepclient pki scripts nm cmd])
 ADD_PLUGIN([des],                  [s charon scepclient pki scripts nm cmd])
 ADD_PLUGIN([blowfish],             [s charon scepclient pki scripts nm cmd])
@@ -1406,6 +1408,7 @@ AM_CONDITIONAL(USE_SHA2, test x$sha2 = xtrue)
 AM_CONDITIONAL(USE_FIPS_PRF, test x$fips_prf = xtrue)
 AM_CONDITIONAL(USE_GMP, test x$gmp = xtrue)
 AM_CONDITIONAL(USE_RDRAND, test x$rdrand = xtrue)
+AM_CONDITIONAL(USE_AESNI, test x$aesni = xtrue)
 AM_CONDITIONAL(USE_RANDOM, test x$random = xtrue)
 AM_CONDITIONAL(USE_NONCE, test x$nonce = xtrue)
 AM_CONDITIONAL(USE_X509, test x$x509 = xtrue)
@@ -1649,6 +1652,7 @@ AC_CONFIG_FILES([
        src/libstrongswan/plugins/fips_prf/Makefile
        src/libstrongswan/plugins/gmp/Makefile
        src/libstrongswan/plugins/rdrand/Makefile
+       src/libstrongswan/plugins/aesni/Makefile
        src/libstrongswan/plugins/random/Makefile
        src/libstrongswan/plugins/nonce/Makefile
        src/libstrongswan/plugins/hmac/Makefile
index 1768d76..c0143d0 100644 (file)
 #include <stdio.h>
 #include <library.h>
 
-int main(int argc, char *argv[])
+static int burn_crypter(const proposal_token_t *token, u_int limit, u_int len)
 {
-       const proposal_token_t *token;
-       aead_t *aead;
+       chunk_t iv, key, data;
        crypter_t *crypter;
-       char buffer[1024], assoc[8], iv[32];
-       size_t bs;
-       int i = 0, limit = 0;
+       int i = 0;
+       bool ok;
+
+       crypter = lib->crypto->create_crypter(lib->crypto, token->algorithm,
+                                                                                 token->keysize / 8);
+       if (!crypter)
+       {
+               fprintf(stderr, "%N-%zu not supported\n",
+                               encryption_algorithm_names, token->algorithm, token->keysize);
+               return FALSE;
+       }
+
+       iv = chunk_alloc(crypter->get_iv_size(crypter));
+       memset(iv.ptr, 0xFF, iv.len);
+       data = chunk_alloc(round_up(len, crypter->get_block_size(crypter)));
+       memset(data.ptr, 0xDD, data.len);
+       key = chunk_alloc(crypter->get_key_size(crypter));
+       memset(key.ptr, 0xAA, key.len);
+
+       ok = crypter->set_key(crypter, key);
+       while (ok)
+       {
+               if (!crypter->encrypt(crypter, data, iv, NULL))
+               {
+                       fprintf(stderr, "encryption failed!\n");
+                       ok = FALSE;
+                       break;
+               }
+               if (!crypter->decrypt(crypter, data, iv, NULL))
+               {
+                       fprintf(stderr, "decryption failed!\n");
+                       ok = FALSE;
+                       break;
+               }
+               if (limit && ++i == limit)
+               {
+                       break;
+               }
+       }
+       crypter->destroy(crypter);
+
+       free(iv.ptr);
+       free(data.ptr);
+       free(key.ptr);
+
+       return ok;
+}
+
+static bool burn_aead(const proposal_token_t *token, u_int limit, u_int len)
+{
+       chunk_t iv, key, data, dataicv, assoc;
+       aead_t *aead;
+       int i = 0;
+       bool ok;
+
+       aead = lib->crypto->create_aead(lib->crypto, token->algorithm,
+                                                                       token->keysize / 8, 0);
+       if (!aead)
+       {
+               fprintf(stderr, "%N-%zu not supported\n",
+                               encryption_algorithm_names, token->algorithm, token->keysize);
+               return FALSE;
+       }
+
+       iv = chunk_alloc(aead->get_iv_size(aead));
+       memset(iv.ptr, 0xFF, iv.len);
+       dataicv = chunk_alloc(round_up(len, aead->get_block_size(aead)) +
+                                                 aead->get_icv_size(aead));
+       data = chunk_create(dataicv.ptr, dataicv.len - aead->get_icv_size(aead));
+       memset(data.ptr, 0xDD, data.len);
+       assoc = chunk_alloc(13);
+       memset(assoc.ptr, 0xCC, assoc.len);
+       key = chunk_alloc(aead->get_key_size(aead));
+       memset(key.ptr, 0xAA, key.len);
+
+       ok = aead->set_key(aead, key);
+       while (ok)
+       {
+               if (!aead->encrypt(aead, data, assoc, iv, NULL))
+               {
+                       fprintf(stderr, "aead encryption failed!\n");
+                       ok = FALSE;
+                       break;
+               }
+               if (!aead->decrypt(aead, dataicv, assoc, iv, NULL))
+               {
+                       fprintf(stderr, "aead integrity check failed!\n");
+                       ok = FALSE;
+                       break;
+               }
+               if (limit && ++i == limit)
+               {
+                       break;
+               }
+       }
+       aead->destroy(aead);
+
+       free(iv.ptr);
+       free(data.ptr);
+       free(key.ptr);
+
+       return ok;
+}
+
+static int burn_signer(const proposal_token_t *token, u_int limit, u_int len)
+{
+       chunk_t  key, data, sig;
+       signer_t *signer;
+       int i = 0;
+       bool ok;
+
+       signer = lib->crypto->create_signer(lib->crypto, token->algorithm);
+       if (!signer)
+       {
+               fprintf(stderr, "%N not supported\n",
+                               integrity_algorithm_names, token->algorithm);
+               return FALSE;
+       }
+
+       data = chunk_alloc(len);
+       memset(data.ptr, 0xDD, data.len);
+       key = chunk_alloc(signer->get_key_size(signer));
+       memset(key.ptr, 0xAA, key.len);
+       sig = chunk_alloc(signer->get_block_size(signer));
+
+       ok = signer->set_key(signer, key);
+       while (ok)
+       {
+               if (!signer->get_signature(signer, data, sig.ptr))
+               {
+                       fprintf(stderr, "creating signature failed!\n");
+                       ok = FALSE;
+                       break;
+               }
+               if (!signer->verify_signature(signer, data, sig))
+               {
+                       fprintf(stderr, "verifying signature failed!\n");
+                       ok = FALSE;
+                       break;
+               }
+               if (limit && ++i == limit)
+               {
+                       break;
+               }
+       }
+       signer->destroy(signer);
 
+       free(data.ptr);
+       free(key.ptr);
+       free(sig.ptr);
+
+       return ok;
+}
+
+int main(int argc, char *argv[])
+{
+       const proposal_token_t *token;
+       u_int limit = 0, len = 1024;
+       bool ok;
 
        library_init(NULL, "crypt_burn");
-       lib->plugins->load(lib->plugins, PLUGINS);
+       lib->plugins->load(lib->plugins, getenv("PLUGINS") ?: PLUGINS);
        atexit(library_deinit);
 
-       printf("loaded: %s\n", PLUGINS);
-
-       memset(buffer, 0x12, sizeof(buffer));
-       memset(assoc, 0x34, sizeof(assoc));
-       memset(iv, 0x56, sizeof(iv));
+       fprintf(stderr, "loaded: %s\n", lib->plugins->loaded_plugins(lib->plugins));
 
        if (argc < 2)
        {
-               fprintf(stderr, "usage: %s <algorithm>!\n", argv[0]);
+               fprintf(stderr, "usage: %s <algorithm> [buflen=%u] [rounds=%u]\n",
+                               argv[0], len, limit);
                return 1;
        }
        if (argc > 2)
        {
-               limit = atoi(argv[2]);
+               len = atoi(argv[2]);
+       }
+       if (argc > 3)
+       {
+               limit = atoi(argv[3]);
        }
 
        token = lib->proposal->get_token(lib->proposal, argv[1]);
@@ -52,76 +207,26 @@ int main(int argc, char *argv[])
                fprintf(stderr, "algorithm '%s' unknown!\n", argv[1]);
                return 1;
        }
-       if (token->type != ENCRYPTION_ALGORITHM)
-       {
-               fprintf(stderr, "'%s' is not an encryption/aead algorithm!\n", argv[1]);
-               return 1;
-       }
 
-       if (encryption_algorithm_is_aead(token->algorithm))
+       switch (token->type)
        {
-               aead = lib->crypto->create_aead(lib->crypto,
-                                                                       token->algorithm, token->keysize / 8, 0);
-               if (!aead)
-               {
-                       fprintf(stderr, "aead '%s' not supported!\n", argv[1]);
-                       return 1;
-               }
-               while (TRUE)
-               {
-                       if (!aead->encrypt(aead,
-                               chunk_create(buffer, sizeof(buffer) - aead->get_icv_size(aead)),
-                               chunk_from_thing(assoc),
-                               chunk_create(iv, aead->get_iv_size(aead)), NULL))
+               case ENCRYPTION_ALGORITHM:
+                       if (encryption_algorithm_is_aead(token->algorithm))
                        {
-                               fprintf(stderr, "aead encryption failed!\n");
-                               return 1;
+                               ok = burn_aead(token, limit, len);
                        }
-                       if (!aead->decrypt(aead, chunk_create(buffer, sizeof(buffer)),
-                               chunk_from_thing(assoc),
-                               chunk_create(iv, aead->get_iv_size(aead)), NULL))
+                       else
                        {
-                               fprintf(stderr, "aead integrity check failed!\n");
-                               return 1;
+                               ok = burn_crypter(token, limit, len);
                        }
-                       if (limit && ++i == limit)
-                       {
-                               break;
-                       }
-               }
-               aead->destroy(aead);
-       }
-       else
-       {
-               crypter = lib->crypto->create_crypter(lib->crypto,
-                                                                               token->algorithm, token->keysize / 8);
-               if (!crypter)
-               {
-                       fprintf(stderr, "crypter '%s' not supported!\n", argv[1]);
-                       return 1;
-               }
-               bs = crypter->get_block_size(crypter);
-
-               while (TRUE)
-               {
-                       if (!crypter->encrypt(crypter,
-                                       chunk_create(buffer, sizeof(buffer) / bs * bs),
-                                       chunk_create(iv, crypter->get_iv_size(crypter)), NULL))
-                       {
-                               continue;
-                       }
-                       if (!crypter->decrypt(crypter,
-                                       chunk_create(buffer, sizeof(buffer) / bs * bs),
-                                       chunk_create(iv, crypter->get_iv_size(crypter)), NULL))
-                       {
-                               continue;
-                       }
-                       if (limit && ++i == limit)
-                       {
-                               break;
-                       }
-               }
-               crypter->destroy(crypter);
+                       break;
+               case INTEGRITY_ALGORITHM:
+                       ok = burn_signer(token, limit, len);
+                       break;
+               default:
+                       fprintf(stderr, "'%s' is not a crypter/aead algorithm!\n", argv[1]);
+                       ok = FALSE;
+                       break;
        }
-       return 0;
+       return !ok;
 }
index f738072..462c34b 100644 (file)
@@ -296,6 +296,13 @@ if MONOLITHIC
 endif
 endif
 
+if USE_AESNI
+  SUBDIRS += plugins/aesni
+if MONOLITHIC
+  libstrongswan_la_LIBADD += plugins/aesni/libstrongswan-aesni.la
+endif
+endif
+
 if USE_RANDOM
   SUBDIRS += plugins/random
 if MONOLITHIC
index 96fbc0d..1bc1ac7 100644 (file)
@@ -439,14 +439,14 @@ static void add_entry(private_crypto_factory_t *this, linked_list_t *list,
 }
 
 METHOD(crypto_factory_t, add_crypter, bool,
-       private_crypto_factory_t *this, encryption_algorithm_t algo,
+       private_crypto_factory_t *this, encryption_algorithm_t algo, size_t key_size,
        const char *plugin_name, crypter_constructor_t create)
 {
        u_int speed = 0;
 
        if (!this->test_on_add ||
-               this->tester->test_crypter(this->tester, algo, 0, create,
-                                                                  this->bench ? &speed : NULL, plugin_name))
+               this->tester->test_crypter(this->tester, algo, key_size, create,
+                                                                  this->bench ? &speed : NULL, plugin_name))
        {
                add_entry(this, this->crypters, algo, plugin_name, speed, create);
                return TRUE;
@@ -476,13 +476,13 @@ METHOD(crypto_factory_t, remove_crypter, void,
 }
 
 METHOD(crypto_factory_t, add_aead, bool,
-       private_crypto_factory_t *this, encryption_algorithm_t algo,
+       private_crypto_factory_t *this, encryption_algorithm_t algo, size_t key_size,
        const char *plugin_name, aead_constructor_t create)
 {
        u_int speed = 0;
 
        if (!this->test_on_add ||
-               this->tester->test_aead(this->tester, algo, 0, 0, create,
+               this->tester->test_aead(this->tester, algo, key_size, 0, create,
                                                                this->bench ? &speed : NULL, plugin_name))
        {
                add_entry(this, this->aeads, algo, plugin_name, speed, create);
index 7865bcb..b1e18df 100644 (file)
@@ -162,12 +162,14 @@ struct crypto_factory_t {
         * Register a crypter constructor.
         *
         * @param algo                  algorithm to constructor
+        * @param key_size              key size to perform benchmarking for
         * @param plugin_name   plugin that registered this algorithm
         * @param create                constructor function for that algorithm
         * @return                              TRUE if registered, FALSE if test vector failed
         */
        bool (*add_crypter)(crypto_factory_t *this, encryption_algorithm_t algo,
-                                               const char *plugin_name, crypter_constructor_t create);
+                                               size_t key_size, const char *plugin_name,
+                                               crypter_constructor_t create);
 
        /**
         * Unregister a crypter constructor.
@@ -187,12 +189,14 @@ struct crypto_factory_t {
         * Register a aead constructor.
         *
         * @param algo                  algorithm to constructor
+        * @param key_size              key size to perform benchmarking for
         * @param plugin_name   plugin that registered this algorithm
         * @param create                constructor function for that algorithm
         * @return                              TRUE if registered, FALSE if test vector failed
         */
        bool (*add_aead)(crypto_factory_t *this, encryption_algorithm_t algo,
-                                        const char *plugin_name, aead_constructor_t create);
+                                        size_t key_size, const char *plugin_name,
+                                        aead_constructor_t create);
 
        /**
         * Register a signer constructor.
index 15ed173..20f64c3 100644 (file)
@@ -138,11 +138,11 @@ static u_int end_timing(struct timespec *start)
  * Benchmark a crypter
  */
 static u_int bench_crypter(private_crypto_tester_t *this,
-       encryption_algorithm_t alg, crypter_constructor_t create)
+       encryption_algorithm_t alg, crypter_constructor_t create, size_t key_size)
 {
        crypter_t *crypter;
 
-       crypter = create(alg, 0);
+       crypter = create(alg, key_size);
        if (crypter)
        {
                char iv[crypter->get_iv_size(crypter)];
@@ -280,8 +280,8 @@ failure:
        {
                if (failed)
                {
-                       DBG1(DBG_LIB,"disable %N[%s]: no key size supported",
-                                encryption_algorithm_names, alg, plugin_name);
+                       DBG1(DBG_LIB,"disable %N[%s]: %zd byte key size not supported",
+                                encryption_algorithm_names, alg, plugin_name, key_size);
                        return FALSE;
                }
                else
@@ -296,9 +296,10 @@ failure:
        {
                if (speed)
                {
-                       *speed = bench_crypter(this, alg, create);
-                       DBG1(DBG_LIB, "enabled  %N[%s]: passed %u test vectors, %d points",
-                                encryption_algorithm_names, alg, plugin_name, tested, *speed);
+                       *speed = bench_crypter(this, alg, create, key_size);
+                       DBG1(DBG_LIB, "enabled  %N[%s]: passed %u test vectors, %d points "
+                                "(%zd bit key)", encryption_algorithm_names, alg,
+                                plugin_name, tested, *speed, key_size * 8);
                }
                else
                {
@@ -313,11 +314,11 @@ failure:
  * Benchmark an aead transform
  */
 static u_int bench_aead(private_crypto_tester_t *this,
-       encryption_algorithm_t alg, aead_constructor_t create)
+       encryption_algorithm_t alg, aead_constructor_t create, size_t key_size)
 {
        aead_t *aead;
 
-       aead = create(alg, 0, 0);
+       aead = create(alg, key_size, 0);
        if (aead)
        {
                char iv[aead->get_iv_size(aead)];
@@ -474,8 +475,8 @@ failure:
        {
                if (failed)
                {
-                       DBG1(DBG_LIB,"disable %N[%s]: no key size supported",
-                                encryption_algorithm_names, alg, plugin_name);
+                       DBG1(DBG_LIB,"disable %N[%s]: %zd byte key size not supported",
+                                encryption_algorithm_names, alg, plugin_name, key_size);
                        return FALSE;
                }
                else
@@ -490,9 +491,10 @@ failure:
        {
                if (speed)
                {
-                       *speed = bench_aead(this, alg, create);
-                       DBG1(DBG_LIB, "enabled  %N[%s]: passed %u test vectors, %d points",
-                                encryption_algorithm_names, alg, plugin_name, tested, *speed);
+                       *speed = bench_aead(this, alg, create, key_size);
+                       DBG1(DBG_LIB, "enabled  %N[%s]: passed %u test vectors, %d points "
+                                "(%zd bit key)", encryption_algorithm_names, alg,
+                                plugin_name, tested, *speed, key_size * 8);
                }
                else
                {
diff --git a/src/libstrongswan/plugins/aesni/Makefile.am b/src/libstrongswan/plugins/aesni/Makefile.am
new file mode 100644 (file)
index 0000000..2fe85c6
--- /dev/null
@@ -0,0 +1,26 @@
+AM_CPPFLAGS = \
+       -I$(top_srcdir)/src/libstrongswan
+
+AM_CFLAGS = \
+       -maes \
+       -mpclmul \
+       -mssse3 \
+       $(PLUGIN_CFLAGS)
+
+if MONOLITHIC
+noinst_LTLIBRARIES = libstrongswan-aesni.la
+else
+plugin_LTLIBRARIES = libstrongswan-aesni.la
+endif
+
+libstrongswan_aesni_la_SOURCES = \
+       aesni_key.h aesni_key.c \
+       aesni_cbc.h aesni_cbc.c \
+       aesni_ctr.h aesni_ctr.c \
+       aesni_ccm.h aesni_ccm.c \
+       aesni_gcm.h aesni_gcm.c \
+       aesni_xcbc.h aesni_xcbc.c \
+       aesni_cmac.h aesni_cmac.c \
+       aesni_plugin.h aesni_plugin.c
+
+libstrongswan_aesni_la_LDFLAGS = -module -avoid-version
diff --git a/src/libstrongswan/plugins/aesni/aesni_cbc.c b/src/libstrongswan/plugins/aesni/aesni_cbc.c
new file mode 100644 (file)
index 0000000..78ada76
--- /dev/null
@@ -0,0 +1,671 @@
+/*
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include "aesni_cbc.h"
+#include "aesni_key.h"
+
+/**
+ * Pipeline parallelism we use for CBC decryption
+ */
+#define CBC_DECRYPT_PARALLELISM 4
+
+typedef struct private_aesni_cbc_t private_aesni_cbc_t;
+
+/**
+ * CBC en/decryption method type
+ */
+typedef void (*aesni_cbc_fn_t)(aesni_key_t*, u_int, u_char*, u_char*, u_char*);
+
+/**
+ * Private data of an aesni_cbc_t object.
+ */
+struct private_aesni_cbc_t {
+
+       /**
+        * Public aesni_cbc_t interface.
+        */
+       aesni_cbc_t public;
+
+       /**
+        * Key size
+        */
+       u_int key_size;
+
+       /**
+        * Encryption key schedule
+        */
+       aesni_key_t *ekey;
+
+       /**
+        * Decryption key schedule
+        */
+       aesni_key_t *dkey;
+
+       /**
+        * Encryption method
+        */
+       aesni_cbc_fn_t encrypt;
+
+       /**
+        * Decryption method
+        */
+       aesni_cbc_fn_t decrypt;
+};
+
+/**
+ * AES-128 CBC encryption
+ */
+static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
+                                                  u_char *iv, u_char *out)
+{
+       __m128i *ks, t, fb, *bi, *bo;
+       int i;
+
+       ks = key->schedule;
+       bi = (__m128i*)in;
+       bo = (__m128i*)out;
+
+       fb = _mm_loadu_si128((__m128i*)iv);
+       for (i = 0; i < blocks; i++)
+       {
+               t = _mm_loadu_si128(bi + i);
+               fb = _mm_xor_si128(t, fb);
+               fb = _mm_xor_si128(fb, ks[0]);
+
+               fb = _mm_aesenc_si128(fb, ks[1]);
+               fb = _mm_aesenc_si128(fb, ks[2]);
+               fb = _mm_aesenc_si128(fb, ks[3]);
+               fb = _mm_aesenc_si128(fb, ks[4]);
+               fb = _mm_aesenc_si128(fb, ks[5]);
+               fb = _mm_aesenc_si128(fb, ks[6]);
+               fb = _mm_aesenc_si128(fb, ks[7]);
+               fb = _mm_aesenc_si128(fb, ks[8]);
+               fb = _mm_aesenc_si128(fb, ks[9]);
+
+               fb = _mm_aesenclast_si128(fb, ks[10]);
+               _mm_storeu_si128(bo + i, fb);
+       }
+}
+
+/**
+ * AES-128 CBC decryption
+ */
+static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
+                                                  u_char *iv, u_char *out)
+{
+       __m128i *ks, last, *bi, *bo;
+       __m128i t1, t2, t3, t4;
+       __m128i f1, f2, f3, f4;
+       u_int i, pblocks;
+
+       ks = key->schedule;
+       bi = (__m128i*)in;
+       bo = (__m128i*)out;
+       pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
+
+       f1 = _mm_loadu_si128((__m128i*)iv);
+
+       for (i = 0; i < pblocks; i += CBC_DECRYPT_PARALLELISM)
+       {
+               t1 = _mm_loadu_si128(bi + i + 0);
+               t2 = _mm_loadu_si128(bi + i + 1);
+               t3 = _mm_loadu_si128(bi + i + 2);
+               t4 = _mm_loadu_si128(bi + i + 3);
+
+               f2 = t1;
+               f3 = t2;
+               f4 = t3;
+               last = t4;
+
+               t1 = _mm_xor_si128(t1, ks[0]);
+               t2 = _mm_xor_si128(t2, ks[0]);
+               t3 = _mm_xor_si128(t3, ks[0]);
+               t4 = _mm_xor_si128(t4, ks[0]);
+
+               t1 = _mm_aesdec_si128(t1, ks[1]);
+               t2 = _mm_aesdec_si128(t2, ks[1]);
+               t3 = _mm_aesdec_si128(t3, ks[1]);
+               t4 = _mm_aesdec_si128(t4, ks[1]);
+               t1 = _mm_aesdec_si128(t1, ks[2]);
+               t2 = _mm_aesdec_si128(t2, ks[2]);
+               t3 = _mm_aesdec_si128(t3, ks[2]);
+               t4 = _mm_aesdec_si128(t4, ks[2]);
+               t1 = _mm_aesdec_si128(t1, ks[3]);
+               t2 = _mm_aesdec_si128(t2, ks[3]);
+               t3 = _mm_aesdec_si128(t3, ks[3]);
+               t4 = _mm_aesdec_si128(t4, ks[3]);
+               t1 = _mm_aesdec_si128(t1, ks[4]);
+               t2 = _mm_aesdec_si128(t2, ks[4]);
+               t3 = _mm_aesdec_si128(t3, ks[4]);
+               t4 = _mm_aesdec_si128(t4, ks[4]);
+               t1 = _mm_aesdec_si128(t1, ks[5]);
+               t2 = _mm_aesdec_si128(t2, ks[5]);
+               t3 = _mm_aesdec_si128(t3, ks[5]);
+               t4 = _mm_aesdec_si128(t4, ks[5]);
+               t1 = _mm_aesdec_si128(t1, ks[6]);
+               t2 = _mm_aesdec_si128(t2, ks[6]);
+               t3 = _mm_aesdec_si128(t3, ks[6]);
+               t4 = _mm_aesdec_si128(t4, ks[6]);
+               t1 = _mm_aesdec_si128(t1, ks[7]);
+               t2 = _mm_aesdec_si128(t2, ks[7]);
+               t3 = _mm_aesdec_si128(t3, ks[7]);
+               t4 = _mm_aesdec_si128(t4, ks[7]);
+               t1 = _mm_aesdec_si128(t1, ks[8]);
+               t2 = _mm_aesdec_si128(t2, ks[8]);
+               t3 = _mm_aesdec_si128(t3, ks[8]);
+               t4 = _mm_aesdec_si128(t4, ks[8]);
+               t1 = _mm_aesdec_si128(t1, ks[9]);
+               t2 = _mm_aesdec_si128(t2, ks[9]);
+               t3 = _mm_aesdec_si128(t3, ks[9]);
+               t4 = _mm_aesdec_si128(t4, ks[9]);
+
+               t1 = _mm_aesdeclast_si128(t1, ks[10]);
+               t2 = _mm_aesdeclast_si128(t2, ks[10]);
+               t3 = _mm_aesdeclast_si128(t3, ks[10]);
+               t4 = _mm_aesdeclast_si128(t4, ks[10]);
+               t1 = _mm_xor_si128(t1, f1);
+               t2 = _mm_xor_si128(t2, f2);
+               t3 = _mm_xor_si128(t3, f3);
+               t4 = _mm_xor_si128(t4, f4);
+               _mm_storeu_si128(bo + i + 0, t1);
+               _mm_storeu_si128(bo + i + 1, t2);
+               _mm_storeu_si128(bo + i + 2, t3);
+               _mm_storeu_si128(bo + i + 3, t4);
+               f1 = last;
+       }
+
+       for (i = pblocks; i < blocks; i++)
+       {
+               last = _mm_loadu_si128(bi + i);
+               t1 = _mm_xor_si128(last, ks[0]);
+
+               t1 = _mm_aesdec_si128(t1, ks[1]);
+               t1 = _mm_aesdec_si128(t1, ks[2]);
+               t1 = _mm_aesdec_si128(t1, ks[3]);
+               t1 = _mm_aesdec_si128(t1, ks[4]);
+               t1 = _mm_aesdec_si128(t1, ks[5]);
+               t1 = _mm_aesdec_si128(t1, ks[6]);
+               t1 = _mm_aesdec_si128(t1, ks[7]);
+               t1 = _mm_aesdec_si128(t1, ks[8]);
+               t1 = _mm_aesdec_si128(t1, ks[9]);
+
+               t1 = _mm_aesdeclast_si128(t1, ks[10]);
+               t1 = _mm_xor_si128(t1, f1);
+               _mm_storeu_si128(bo + i, t1);
+               f1 = last;
+       }
+}
+
+/**
+ * AES-192 CBC encryption
+ */
+static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
+                                                  u_char *iv, u_char *out)
+{
+       __m128i *ks, t, fb, *bi, *bo;
+       int i;
+
+       ks = key->schedule;
+       bi = (__m128i*)in;
+       bo = (__m128i*)out;
+
+       fb = _mm_loadu_si128((__m128i*)iv);
+       for (i = 0; i < blocks; i++)
+       {
+               t = _mm_loadu_si128(bi + i);
+               fb = _mm_xor_si128(t, fb);
+               fb = _mm_xor_si128(fb, ks[0]);
+
+               fb = _mm_aesenc_si128(fb, ks[1]);
+               fb = _mm_aesenc_si128(fb, ks[2]);
+               fb = _mm_aesenc_si128(fb, ks[3]);
+               fb = _mm_aesenc_si128(fb, ks[4]);
+               fb = _mm_aesenc_si128(fb, ks[5]);
+               fb = _mm_aesenc_si128(fb, ks[6]);
+               fb = _mm_aesenc_si128(fb, ks[7]);
+               fb = _mm_aesenc_si128(fb, ks[8]);
+               fb = _mm_aesenc_si128(fb, ks[9]);
+               fb = _mm_aesenc_si128(fb, ks[10]);
+               fb = _mm_aesenc_si128(fb, ks[11]);
+
+               fb = _mm_aesenclast_si128(fb, ks[12]);
+               _mm_storeu_si128(bo + i, fb);
+       }
+}
+
+/**
+ * AES-192 CBC decryption
+ *
+ * Unlike CBC encryption, decryption of each block is independent; only the
+ * final XOR uses the previous ciphertext block. This allows processing
+ * CBC_DECRYPT_PARALLELISM (4) blocks at a time to keep the AES-NI pipeline
+ * busy. AES-192 uses 12 rounds, i.e. round keys ks[0..12] (inverse schedule).
+ *
+ * @param key          expanded decryption key schedule
+ * @param blocks       number of full AES blocks to process
+ * @param in           input data, blocks * AES_BLOCK_SIZE bytes
+ * @param iv           initialization vector, AES_BLOCK_SIZE bytes
+ * @param out          output buffer, may alias in (in-place operation)
+ */
+static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
+                                                  u_char *iv, u_char *out)
+{
+       __m128i *ks, last, *bi, *bo;
+       __m128i t1, t2, t3, t4;
+       __m128i f1, f2, f3, f4;
+       u_int i, pblocks;
+
+       ks = key->schedule;
+       bi = (__m128i*)in;
+       bo = (__m128i*)out;
+       /* number of blocks handled by the 4-way parallel loop */
+       pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
+
+       /* f1..f4 are the CBC feedback values (IV resp. preceding ciphertext) */
+       f1 = _mm_loadu_si128((__m128i*)iv);
+
+       for (i = 0; i < pblocks; i += CBC_DECRYPT_PARALLELISM)
+       {
+               t1 = _mm_loadu_si128(bi + i + 0);
+               t2 = _mm_loadu_si128(bi + i + 1);
+               t3 = _mm_loadu_si128(bi + i + 2);
+               t4 = _mm_loadu_si128(bi + i + 3);
+
+               /* keep ciphertext copies before they are overwritten: each is the
+                * feedback for the following block, t4 carries over to next round */
+               f2 = t1;
+               f3 = t2;
+               f4 = t3;
+               last = t4;
+
+               t1 = _mm_xor_si128(t1, ks[0]);
+               t2 = _mm_xor_si128(t2, ks[0]);
+               t3 = _mm_xor_si128(t3, ks[0]);
+               t4 = _mm_xor_si128(t4, ks[0]);
+
+               /* interleaved rounds so the four independent AESDEC chains can
+                * overlap in the pipeline */
+               t1 = _mm_aesdec_si128(t1, ks[1]);
+               t2 = _mm_aesdec_si128(t2, ks[1]);
+               t3 = _mm_aesdec_si128(t3, ks[1]);
+               t4 = _mm_aesdec_si128(t4, ks[1]);
+               t1 = _mm_aesdec_si128(t1, ks[2]);
+               t2 = _mm_aesdec_si128(t2, ks[2]);
+               t3 = _mm_aesdec_si128(t3, ks[2]);
+               t4 = _mm_aesdec_si128(t4, ks[2]);
+               t1 = _mm_aesdec_si128(t1, ks[3]);
+               t2 = _mm_aesdec_si128(t2, ks[3]);
+               t3 = _mm_aesdec_si128(t3, ks[3]);
+               t4 = _mm_aesdec_si128(t4, ks[3]);
+               t1 = _mm_aesdec_si128(t1, ks[4]);
+               t2 = _mm_aesdec_si128(t2, ks[4]);
+               t3 = _mm_aesdec_si128(t3, ks[4]);
+               t4 = _mm_aesdec_si128(t4, ks[4]);
+               t1 = _mm_aesdec_si128(t1, ks[5]);
+               t2 = _mm_aesdec_si128(t2, ks[5]);
+               t3 = _mm_aesdec_si128(t3, ks[5]);
+               t4 = _mm_aesdec_si128(t4, ks[5]);
+               t1 = _mm_aesdec_si128(t1, ks[6]);
+               t2 = _mm_aesdec_si128(t2, ks[6]);
+               t3 = _mm_aesdec_si128(t3, ks[6]);
+               t4 = _mm_aesdec_si128(t4, ks[6]);
+               t1 = _mm_aesdec_si128(t1, ks[7]);
+               t2 = _mm_aesdec_si128(t2, ks[7]);
+               t3 = _mm_aesdec_si128(t3, ks[7]);
+               t4 = _mm_aesdec_si128(t4, ks[7]);
+               t1 = _mm_aesdec_si128(t1, ks[8]);
+               t2 = _mm_aesdec_si128(t2, ks[8]);
+               t3 = _mm_aesdec_si128(t3, ks[8]);
+               t4 = _mm_aesdec_si128(t4, ks[8]);
+               t1 = _mm_aesdec_si128(t1, ks[9]);
+               t2 = _mm_aesdec_si128(t2, ks[9]);
+               t3 = _mm_aesdec_si128(t3, ks[9]);
+               t4 = _mm_aesdec_si128(t4, ks[9]);
+               t1 = _mm_aesdec_si128(t1, ks[10]);
+               t2 = _mm_aesdec_si128(t2, ks[10]);
+               t3 = _mm_aesdec_si128(t3, ks[10]);
+               t4 = _mm_aesdec_si128(t4, ks[10]);
+               t1 = _mm_aesdec_si128(t1, ks[11]);
+               t2 = _mm_aesdec_si128(t2, ks[11]);
+               t3 = _mm_aesdec_si128(t3, ks[11]);
+               t4 = _mm_aesdec_si128(t4, ks[11]);
+
+               t1 = _mm_aesdeclast_si128(t1, ks[12]);
+               t2 = _mm_aesdeclast_si128(t2, ks[12]);
+               t3 = _mm_aesdeclast_si128(t3, ks[12]);
+               t4 = _mm_aesdeclast_si128(t4, ks[12]);
+               /* CBC: XOR decrypted block with previous ciphertext (or IV) */
+               t1 = _mm_xor_si128(t1, f1);
+               t2 = _mm_xor_si128(t2, f2);
+               t3 = _mm_xor_si128(t3, f3);
+               t4 = _mm_xor_si128(t4, f4);
+               _mm_storeu_si128(bo + i + 0, t1);
+               _mm_storeu_si128(bo + i + 1, t2);
+               _mm_storeu_si128(bo + i + 2, t3);
+               _mm_storeu_si128(bo + i + 3, t4);
+               f1 = last;
+       }
+
+       /* handle the remaining (< 4) blocks serially */
+       for (i = pblocks; i < blocks; i++)
+       {
+               last = _mm_loadu_si128(bi + i);
+               t1 = _mm_xor_si128(last, ks[0]);
+
+               t1 = _mm_aesdec_si128(t1, ks[1]);
+               t1 = _mm_aesdec_si128(t1, ks[2]);
+               t1 = _mm_aesdec_si128(t1, ks[3]);
+               t1 = _mm_aesdec_si128(t1, ks[4]);
+               t1 = _mm_aesdec_si128(t1, ks[5]);
+               t1 = _mm_aesdec_si128(t1, ks[6]);
+               t1 = _mm_aesdec_si128(t1, ks[7]);
+               t1 = _mm_aesdec_si128(t1, ks[8]);
+               t1 = _mm_aesdec_si128(t1, ks[9]);
+               t1 = _mm_aesdec_si128(t1, ks[10]);
+               t1 = _mm_aesdec_si128(t1, ks[11]);
+
+               t1 = _mm_aesdeclast_si128(t1, ks[12]);
+               t1 = _mm_xor_si128(t1, f1);
+               _mm_storeu_si128(bo + i, t1);
+               f1 = last;
+       }
+}
+
+/**
+ * AES-256 CBC encryption
+ *
+ * Serial CBC chaining as in encrypt_cbc192 above, but with the 14 rounds of
+ * AES-256, i.e. round keys ks[0..14].
+ *
+ * @param key          expanded encryption key schedule
+ * @param blocks       number of full AES blocks to process
+ * @param in           input data, blocks * AES_BLOCK_SIZE bytes
+ * @param iv           initialization vector, AES_BLOCK_SIZE bytes
+ * @param out          output buffer, may alias in (in-place operation)
+ */
+static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
+                                                  u_char *iv, u_char *out)
+{
+       __m128i *ks, t, fb, *bi, *bo;
+       int i;
+
+       ks = key->schedule;
+       bi = (__m128i*)in;
+       bo = (__m128i*)out;
+
+       /* feedback register starts as the IV */
+       fb = _mm_loadu_si128((__m128i*)iv);
+       for (i = 0; i < blocks; i++)
+       {
+               t = _mm_loadu_si128(bi + i);
+               /* CBC: XOR plaintext with previous ciphertext, then AddRoundKey */
+               fb = _mm_xor_si128(t, fb);
+               fb = _mm_xor_si128(fb, ks[0]);
+
+               fb = _mm_aesenc_si128(fb, ks[1]);
+               fb = _mm_aesenc_si128(fb, ks[2]);
+               fb = _mm_aesenc_si128(fb, ks[3]);
+               fb = _mm_aesenc_si128(fb, ks[4]);
+               fb = _mm_aesenc_si128(fb, ks[5]);
+               fb = _mm_aesenc_si128(fb, ks[6]);
+               fb = _mm_aesenc_si128(fb, ks[7]);
+               fb = _mm_aesenc_si128(fb, ks[8]);
+               fb = _mm_aesenc_si128(fb, ks[9]);
+               fb = _mm_aesenc_si128(fb, ks[10]);
+               fb = _mm_aesenc_si128(fb, ks[11]);
+               fb = _mm_aesenc_si128(fb, ks[12]);
+               fb = _mm_aesenc_si128(fb, ks[13]);
+
+               fb = _mm_aesenclast_si128(fb, ks[14]);
+               /* ciphertext doubles as feedback for the next block */
+               _mm_storeu_si128(bo + i, fb);
+       }
+}
+
+/**
+ * AES-256 CBC decryption
+ *
+ * Same 4-way parallel structure as decrypt_cbc192 above, but with the
+ * 14 rounds of AES-256, i.e. round keys ks[0..14] (inverse schedule).
+ *
+ * @param key          expanded decryption key schedule
+ * @param blocks       number of full AES blocks to process
+ * @param in           input data, blocks * AES_BLOCK_SIZE bytes
+ * @param iv           initialization vector, AES_BLOCK_SIZE bytes
+ * @param out          output buffer, may alias in (in-place operation)
+ */
+static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
+                                                  u_char *iv, u_char *out)
+{
+       __m128i *ks, last, *bi, *bo;
+       __m128i t1, t2, t3, t4;
+       __m128i f1, f2, f3, f4;
+       u_int i, pblocks;
+
+       ks = key->schedule;
+       bi = (__m128i*)in;
+       bo = (__m128i*)out;
+       /* number of blocks handled by the 4-way parallel loop */
+       pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
+
+       /* f1..f4 are the CBC feedback values (IV resp. preceding ciphertext) */
+       f1 = _mm_loadu_si128((__m128i*)iv);
+
+       for (i = 0; i < pblocks; i += CBC_DECRYPT_PARALLELISM)
+       {
+               t1 = _mm_loadu_si128(bi + i + 0);
+               t2 = _mm_loadu_si128(bi + i + 1);
+               t3 = _mm_loadu_si128(bi + i + 2);
+               t4 = _mm_loadu_si128(bi + i + 3);
+
+               /* keep ciphertext copies before they are overwritten: each is the
+                * feedback for the following block, t4 carries over to next round */
+               f2 = t1;
+               f3 = t2;
+               f4 = t3;
+               last = t4;
+
+               t1 = _mm_xor_si128(t1, ks[0]);
+               t2 = _mm_xor_si128(t2, ks[0]);
+               t3 = _mm_xor_si128(t3, ks[0]);
+               t4 = _mm_xor_si128(t4, ks[0]);
+
+               /* interleaved rounds so the four independent AESDEC chains can
+                * overlap in the pipeline */
+               t1 = _mm_aesdec_si128(t1, ks[1]);
+               t2 = _mm_aesdec_si128(t2, ks[1]);
+               t3 = _mm_aesdec_si128(t3, ks[1]);
+               t4 = _mm_aesdec_si128(t4, ks[1]);
+               t1 = _mm_aesdec_si128(t1, ks[2]);
+               t2 = _mm_aesdec_si128(t2, ks[2]);
+               t3 = _mm_aesdec_si128(t3, ks[2]);
+               t4 = _mm_aesdec_si128(t4, ks[2]);
+               t1 = _mm_aesdec_si128(t1, ks[3]);
+               t2 = _mm_aesdec_si128(t2, ks[3]);
+               t3 = _mm_aesdec_si128(t3, ks[3]);
+               t4 = _mm_aesdec_si128(t4, ks[3]);
+               t1 = _mm_aesdec_si128(t1, ks[4]);
+               t2 = _mm_aesdec_si128(t2, ks[4]);
+               t3 = _mm_aesdec_si128(t3, ks[4]);
+               t4 = _mm_aesdec_si128(t4, ks[4]);
+               t1 = _mm_aesdec_si128(t1, ks[5]);
+               t2 = _mm_aesdec_si128(t2, ks[5]);
+               t3 = _mm_aesdec_si128(t3, ks[5]);
+               t4 = _mm_aesdec_si128(t4, ks[5]);
+               t1 = _mm_aesdec_si128(t1, ks[6]);
+               t2 = _mm_aesdec_si128(t2, ks[6]);
+               t3 = _mm_aesdec_si128(t3, ks[6]);
+               t4 = _mm_aesdec_si128(t4, ks[6]);
+               t1 = _mm_aesdec_si128(t1, ks[7]);
+               t2 = _mm_aesdec_si128(t2, ks[7]);
+               t3 = _mm_aesdec_si128(t3, ks[7]);
+               t4 = _mm_aesdec_si128(t4, ks[7]);
+               t1 = _mm_aesdec_si128(t1, ks[8]);
+               t2 = _mm_aesdec_si128(t2, ks[8]);
+               t3 = _mm_aesdec_si128(t3, ks[8]);
+               t4 = _mm_aesdec_si128(t4, ks[8]);
+               t1 = _mm_aesdec_si128(t1, ks[9]);
+               t2 = _mm_aesdec_si128(t2, ks[9]);
+               t3 = _mm_aesdec_si128(t3, ks[9]);
+               t4 = _mm_aesdec_si128(t4, ks[9]);
+               t1 = _mm_aesdec_si128(t1, ks[10]);
+               t2 = _mm_aesdec_si128(t2, ks[10]);
+               t3 = _mm_aesdec_si128(t3, ks[10]);
+               t4 = _mm_aesdec_si128(t4, ks[10]);
+               t1 = _mm_aesdec_si128(t1, ks[11]);
+               t2 = _mm_aesdec_si128(t2, ks[11]);
+               t3 = _mm_aesdec_si128(t3, ks[11]);
+               t4 = _mm_aesdec_si128(t4, ks[11]);
+               t1 = _mm_aesdec_si128(t1, ks[12]);
+               t2 = _mm_aesdec_si128(t2, ks[12]);
+               t3 = _mm_aesdec_si128(t3, ks[12]);
+               t4 = _mm_aesdec_si128(t4, ks[12]);
+               t1 = _mm_aesdec_si128(t1, ks[13]);
+               t2 = _mm_aesdec_si128(t2, ks[13]);
+               t3 = _mm_aesdec_si128(t3, ks[13]);
+               t4 = _mm_aesdec_si128(t4, ks[13]);
+
+               t1 = _mm_aesdeclast_si128(t1, ks[14]);
+               t2 = _mm_aesdeclast_si128(t2, ks[14]);
+               t3 = _mm_aesdeclast_si128(t3, ks[14]);
+               t4 = _mm_aesdeclast_si128(t4, ks[14]);
+               /* CBC: XOR decrypted block with previous ciphertext (or IV) */
+               t1 = _mm_xor_si128(t1, f1);
+               t2 = _mm_xor_si128(t2, f2);
+               t3 = _mm_xor_si128(t3, f3);
+               t4 = _mm_xor_si128(t4, f4);
+               _mm_storeu_si128(bo + i + 0, t1);
+               _mm_storeu_si128(bo + i + 1, t2);
+               _mm_storeu_si128(bo + i + 2, t3);
+               _mm_storeu_si128(bo + i + 3, t4);
+               f1 = last;
+       }
+
+       /* handle the remaining (< 4) blocks serially */
+       for (i = pblocks; i < blocks; i++)
+       {
+               last = _mm_loadu_si128(bi + i);
+               t1 = _mm_xor_si128(last, ks[0]);
+
+               t1 = _mm_aesdec_si128(t1, ks[1]);
+               t1 = _mm_aesdec_si128(t1, ks[2]);
+               t1 = _mm_aesdec_si128(t1, ks[3]);
+               t1 = _mm_aesdec_si128(t1, ks[4]);
+               t1 = _mm_aesdec_si128(t1, ks[5]);
+               t1 = _mm_aesdec_si128(t1, ks[6]);
+               t1 = _mm_aesdec_si128(t1, ks[7]);
+               t1 = _mm_aesdec_si128(t1, ks[8]);
+               t1 = _mm_aesdec_si128(t1, ks[9]);
+               t1 = _mm_aesdec_si128(t1, ks[10]);
+               t1 = _mm_aesdec_si128(t1, ks[11]);
+               t1 = _mm_aesdec_si128(t1, ks[12]);
+               t1 = _mm_aesdec_si128(t1, ks[13]);
+
+               t1 = _mm_aesdeclast_si128(t1, ks[14]);
+               t1 = _mm_xor_si128(t1, f1);
+               _mm_storeu_si128(bo + i, t1);
+               f1 = last;
+       }
+}
+
+/**
+ * Do inline or allocated de/encryption using key schedule
+ *
+ * Validates arguments, then dispatches to one of the key-size specific
+ * en/decryption functions above.
+ *
+ * @param fn           en/decryption function matching the key size
+ * @param key          key schedule, NULL if no key set yet
+ * @param data         data to process, length must be a block multiple
+ * @param iv           IV, must be exactly AES_BLOCK_SIZE bytes
+ * @param out          if non-NULL, result is written to a newly allocated
+ *                     chunk; otherwise data is processed in-place
+ * @return             FALSE if key missing or argument lengths invalid
+ */
+static bool crypt(aesni_cbc_fn_t fn, aesni_key_t *key,
+                                 chunk_t data, chunk_t iv, chunk_t *out)
+{
+       u_char *buf;
+
+       if (!key || iv.len != AES_BLOCK_SIZE || data.len % AES_BLOCK_SIZE)
+       {
+               return FALSE;
+       }
+       if (out)
+       {
+               *out = chunk_alloc(data.len);
+               buf = out->ptr;
+       }
+       else
+       {
+               /* in-place operation: write back into the input buffer */
+               buf = data.ptr;
+       }
+       fn(key, data.len / AES_BLOCK_SIZE, data.ptr, iv.ptr, buf);
+       return TRUE;
+}
+
+/* crypter_t.encrypt: CBC-encrypt with the encryption schedule */
+METHOD(crypter_t, encrypt, bool,
+       private_aesni_cbc_t *this, chunk_t data, chunk_t iv, chunk_t *encrypted)
+{
+       return crypt(this->encrypt, this->ekey, data, iv, encrypted);
+}
+
+/* crypter_t.decrypt: CBC-decrypt with the decryption schedule */
+METHOD(crypter_t, decrypt, bool,
+       private_aesni_cbc_t *this, chunk_t data, chunk_t iv, chunk_t *decrypted)
+{
+       return crypt(this->decrypt, this->dkey, data, iv, decrypted);
+}
+
+/* crypter_t.get_block_size: AES block size, 16 bytes for all key sizes */
+METHOD(crypter_t, get_block_size, size_t,
+       private_aesni_cbc_t *this)
+{
+       return AES_BLOCK_SIZE;
+}
+
+/* crypter_t.get_iv_size: CBC IV equals the AES block size */
+METHOD(crypter_t, get_iv_size, size_t,
+       private_aesni_cbc_t *this)
+{
+       return AES_BLOCK_SIZE;
+}
+
+/* crypter_t.get_key_size: key size in bytes as chosen at construction */
+METHOD(crypter_t, get_key_size, size_t,
+       private_aesni_cbc_t *this)
+{
+       return this->key_size;
+}
+
+/*
+ * crypter_t.set_key: expand the raw key into both an encryption and a
+ * decryption schedule; any previously set schedules are destroyed first.
+ */
+METHOD(crypter_t, set_key, bool,
+       private_aesni_cbc_t *this, chunk_t key)
+{
+       if (key.len != this->key_size)
+       {
+               return FALSE;
+       }
+
+       DESTROY_IF(this->ekey);
+       DESTROY_IF(this->dkey);
+
+       /* TRUE = encryption schedule, FALSE = (inverse) decryption schedule */
+       this->ekey = aesni_key_create(TRUE, key);
+       this->dkey = aesni_key_create(FALSE, key);
+
+       return this->ekey && this->dkey;
+}
+
+/*
+ * crypter_t.destroy: release key schedules and the aligned object itself
+ * (allocated with INIT_ALIGN, hence free_align instead of free)
+ */
+METHOD(crypter_t, destroy, void,
+       private_aesni_cbc_t *this)
+{
+       DESTROY_IF(this->ekey);
+       DESTROY_IF(this->dkey);
+       free_align(this);
+}
+
+/**
+ * See header
+ *
+ * Constructor: only ENCR_AES_CBC is supported; key_size 0 defaults to
+ * 16 bytes (AES-128). The object is 16-byte aligned (INIT_ALIGN) as the
+ * embedded key schedules are used with SSE loads.
+ */
+aesni_cbc_t *aesni_cbc_create(encryption_algorithm_t algo, size_t key_size)
+{
+       private_aesni_cbc_t *this;
+
+       if (algo != ENCR_AES_CBC)
+       {
+               return NULL;
+       }
+       switch (key_size)
+       {
+               case 0:
+                       /* default to AES-128 when no explicit key size requested */
+                       key_size = 16;
+                       break;
+               case 16:
+               case 24:
+               case 32:
+                       break;
+               default:
+                       return NULL;
+       }
+
+       INIT_ALIGN(this, sizeof(__m128i),
+               .public = {
+                       .crypter = {
+                               .encrypt = _encrypt,
+                               .decrypt = _decrypt,
+                               .get_block_size = _get_block_size,
+                               .get_iv_size = _get_iv_size,
+                               .get_key_size = _get_key_size,
+                               .set_key = _set_key,
+                               .destroy = _destroy,
+                       },
+               },
+               .key_size = key_size,
+       );
+
+       /* select round-count specific implementations for the key size */
+       switch (key_size)
+       {
+               case 16:
+                       this->encrypt = encrypt_cbc128;
+                       this->decrypt = decrypt_cbc128;
+                       break;
+               case 24:
+                       this->encrypt = encrypt_cbc192;
+                       this->decrypt = decrypt_cbc192;
+                       break;
+               case 32:
+                       this->encrypt = encrypt_cbc256;
+                       this->decrypt = decrypt_cbc256;
+                       break;
+       }
+
+       return &this->public;
+}
diff --git a/src/libstrongswan/plugins/aesni/aesni_cbc.h b/src/libstrongswan/plugins/aesni/aesni_cbc.h
new file mode 100644 (file)
index 0000000..c004ec6
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+/**
+ * @defgroup aesni_cbc aesni_cbc
+ * @{ @ingroup aesni
+ */
+
+#ifndef AESNI_CBC_H_
+#define AESNI_CBC_H_
+
+#include <library.h>
+
+typedef struct aesni_cbc_t aesni_cbc_t;
+
+/**
+ * CBC mode crypter using AES-NI
+ */
+struct aesni_cbc_t {
+
+       /**
+        * Implements crypter interface
+        */
+       crypter_t crypter;
+};
+
+/**
+ * Create a aesni_cbc instance.
+ *
+ * @param algo                 encryption algorithm, ENCR_AES_CBC
+ * @param key_size             AES key size, in bytes
+ * @return                             AES-CBC crypter, NULL if not supported
+ */
+aesni_cbc_t *aesni_cbc_create(encryption_algorithm_t algo, size_t key_size);
+
+#endif /** AESNI_CBC_H_ @}*/
diff --git a/src/libstrongswan/plugins/aesni/aesni_ccm.c b/src/libstrongswan/plugins/aesni/aesni_ccm.c
new file mode 100644 (file)
index 0000000..d523bc1
--- /dev/null
@@ -0,0 +1,914 @@
+/*
+ * Copyright (C) 2010-2015 Martin Willi
+ * Copyright (C) 2010-2015 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include "aesni_ccm.h"
+#include "aesni_key.h"
+
+#include <crypto/iv/iv_gen_seq.h>
+
+#include <tmmintrin.h>
+
+#define SALT_SIZE 3
+#define IV_SIZE 8
+#define NONCE_SIZE (SALT_SIZE + IV_SIZE) /* 11 */
+#define Q_SIZE (AES_BLOCK_SIZE - NONCE_SIZE - 1) /* 4 */
+
+typedef struct private_aesni_ccm_t private_aesni_ccm_t;
+
+/**
+ * CCM en/decryption method type
+ *
+ * Arguments: this, plain/ciphertext length, in, out, iv (IV_SIZE bytes),
+ * associated data length, associated data, ICV output buffer.
+ */
+typedef void (*aesni_ccm_fn_t)(private_aesni_ccm_t*, size_t, u_char*, u_char*,
+                                                          u_char*, size_t, u_char*, u_char*);
+
+/**
+ * Private data of an aesni_ccm_t object.
+ */
+struct private_aesni_ccm_t {
+
+       /**
+        * Public aesni_ccm_t interface.
+        */
+       aesni_ccm_t public;
+
+       /**
+        * Encryption key schedule
+        */
+       aesni_key_t *key;
+
+       /**
+        * IV generator.
+        */
+       iv_gen_t *iv_gen;
+
+       /**
+        * Length of the integrity check value
+        */
+       size_t icv_size;
+
+       /**
+        * Length of the key in bytes
+        */
+       size_t key_size;
+
+       /**
+        * CCM encryption function
+        */
+       aesni_ccm_fn_t encrypt;
+
+       /**
+        * CCM decryption function
+        */
+       aesni_ccm_fn_t decrypt;
+
+       /**
+        * salt to add to nonce
+        */
+       u_char salt[SALT_SIZE];
+};
+
+/**
+ * First block with control information
+ *
+ * Layout of CCM block B0 (flags byte, 11 byte nonce, 4 byte message
+ * length q), see RFC 3610 / NIST SP 800-38C for the field encoding.
+ */
+typedef struct __attribute__((packed)) {
+       BITFIELD4(u_int8_t,
+               /* size of p length field q, as q-1 */
+               q_len: 3,
+               /* size of our ICV t, as (t-2)/2 */
+               t_len: 3,
+               /* do we have associated data */
+               assoc: 1,
+               reserved: 1,
+       ) flags;
+       /* nonce value */
+       struct __attribute__((packed)) {
+               u_char salt[SALT_SIZE];
+               u_char iv[IV_SIZE];
+       } nonce;
+       /* length of plain text, q */
+       u_char q[Q_SIZE];
+} b0_t;
+
+/**
+ * Counter block
+ *
+ * Layout of the CCM counter blocks A_i (flags byte, 11 byte nonce,
+ * 4 byte counter i), see RFC 3610 / NIST SP 800-38C.
+ */
+typedef struct __attribute__((packed)) {
+       BITFIELD3(u_int8_t,
+               /* size of p length field q, as q-1 */
+               q_len: 3,
+               zero: 3,
+               reserved: 2,
+       ) flags;
+       /* nonce value */
+       struct __attribute__((packed)) {
+               u_char salt[SALT_SIZE];
+               u_char iv[IV_SIZE];
+       } nonce;
+       /* counter value */
+       u_char i[Q_SIZE];
+} ctr_t;
+
+/**
+ * Build the first block B0
+ *
+ * @param this         CCM context providing salt and ICV size
+ * @param len          plaintext length, encoded big-endian in q
+ * @param alen         associated data length, only sets the assoc flag
+ * @param iv           IV, IV_SIZE bytes
+ * @param out          output buffer, at least sizeof(b0_t) = 16 bytes
+ */
+static void build_b0(private_aesni_ccm_t *this, size_t len, size_t alen,
+                                        u_char *iv, void *out)
+{
+       b0_t *block = out;
+
+       block->flags.reserved = 0;
+       block->flags.assoc = alen ? 1 : 0;
+       /* ICV length t encoded as (t-2)/2, per the CCM flags format */
+       block->flags.t_len = (this->icv_size - 2) / 2;
+       block->flags.q_len = Q_SIZE - 1;
+       memcpy(block->nonce.salt, this->salt, SALT_SIZE);
+       memcpy(block->nonce.iv, iv, IV_SIZE);
+       htoun32(block->q, len);
+}
+
+/**
+ * Build a counter block for counter i
+ *
+ * @param this         CCM context providing the salt
+ * @param i            counter value, encoded big-endian (i = 0 is used
+ *                     for ICV encryption, i >= 1 for payload blocks)
+ * @param iv           IV, IV_SIZE bytes
+ * @param out          output buffer, at least sizeof(ctr_t) = 16 bytes
+ */
+static void build_ctr(private_aesni_ccm_t *this, u_int32_t i, u_char *iv,
+                                         void *out)
+{
+       ctr_t *ctr = out;
+
+       ctr->flags.reserved = 0;
+       ctr->flags.zero = 0;
+       ctr->flags.q_len = Q_SIZE - 1;
+       memcpy(ctr->nonce.salt, this->salt, SALT_SIZE);
+       memcpy(ctr->nonce.iv, iv, IV_SIZE);
+       htoun32(ctr->i, i);
+}
+
+/**
+ * Calculate the ICV for the b0 and associated data
+ *
+ * Runs CBC-MAC (IV zero, implicit in starting from E(B0)) over B0 followed
+ * by the associated data, which is prefixed with its 16-bit big-endian
+ * length and zero-padded to a block multiple.
+ *
+ * NOTE(review): alen is u_int16_t while callers hold a size_t — associated
+ * data beyond 65535 bytes would be truncated here; confirm callers never
+ * exceed that (the 2-byte length encoding itself also requires it).
+ *
+ * @param this         CCM context
+ * @param len          plaintext length, goes into B0
+ * @param iv           IV, IV_SIZE bytes
+ * @param alen         associated data length
+ * @param assoc                associated data
+ * @return             running CBC-MAC state after B0 and associated data
+ */
+static __m128i icv_header(private_aesni_ccm_t *this, size_t len, u_char *iv,
+                                                 u_int16_t alen, u_char *assoc)
+{
+       __m128i *ks, b, t, c;
+       u_int i, round, blocks, rem;
+
+       ks = this->key->schedule;
+       /* c = E(B0), the first CBC-MAC step */
+       build_b0(this, len, alen, iv, &b);
+       c = _mm_loadu_si128(&b);
+       c = _mm_xor_si128(c, ks[0]);
+       for (round = 1; round < this->key->rounds; round++)
+       {
+               c = _mm_aesenc_si128(c, ks[round]);
+       }
+       c = _mm_aesenclast_si128(c, ks[this->key->rounds]);
+
+       if (alen)
+       {
+               /* MAC input is the 2-byte length followed by the associated data,
+                * zero-padded to a block boundary */
+               blocks = (alen + sizeof(alen)) / AES_BLOCK_SIZE;
+               rem = (alen + sizeof(alen)) % AES_BLOCK_SIZE;
+               if (rem)
+               {
+                       blocks++;
+               }
+               for (i = 0; i < blocks; i++)
+               {
+                       if (i == 0)
+                       {       /* first block */
+                               memset(&b, 0, sizeof(b));
+                               htoun16(&b, alen);
+                               memcpy(((u_char*)&b) + sizeof(alen), assoc,
+                                          min(alen, sizeof(b) - sizeof(alen)));
+                               t = _mm_loadu_si128(&b);
+                       }
+                       else if (i == blocks - 1 && rem)
+                       {       /* last block with padding */
+                               memset(&b, 0, sizeof(b));
+                               /* (assoc - 2) + i*16 addresses the data as if the 2-byte
+                                * length prefix were part of the buffer */
+                               memcpy(&b, ((__m128i*)(assoc - sizeof(alen))) + i, rem);
+                               t = _mm_loadu_si128(&b);
+                       }
+                       else
+                       {       /* full block */
+                               t = _mm_loadu_si128(((__m128i*)(assoc - sizeof(alen))) + i);
+                       }
+                       /* CBC-MAC step: c = E(c XOR block) */
+                       c = _mm_xor_si128(t, c);
+                       c = _mm_xor_si128(c, ks[0]);
+                       for (round = 1; round < this->key->rounds; round++)
+                       {
+                               c = _mm_aesenc_si128(c, ks[round]);
+                       }
+                       c = _mm_aesenclast_si128(c, ks[this->key->rounds]);
+               }
+       }
+       return c;
+}
+
+/**
+ * En-/Decrypt the ICV, trim and store it
+ *
+ * Encrypts the counter-0 block (A0) and XORs it with the final CBC-MAC
+ * state c, then writes the first icv_size bytes to icv.
+ *
+ * @param this         CCM context
+ * @param iv           IV, IV_SIZE bytes
+ * @param c            final CBC-MAC state
+ * @param icv          output buffer, icv_size bytes
+ */
+static void crypt_icv(private_aesni_ccm_t *this, u_char *iv,
+                                         __m128i c, u_char *icv)
+{
+       __m128i *ks, b, t;
+       u_int round;
+
+       ks = this->key->schedule;
+       /* counter value 0 is reserved for masking the ICV */
+       build_ctr(this, 0, iv, &b);
+
+       t = _mm_loadu_si128(&b);
+       t = _mm_xor_si128(t, ks[0]);
+       for (round = 1; round < this->key->rounds; round++)
+       {
+               t = _mm_aesenc_si128(t, ks[round]);
+       }
+       t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
+
+       t = _mm_xor_si128(t, c);
+
+       /* truncate to the configured ICV length */
+       _mm_storeu_si128(&b, t);
+       memcpy(icv, &b, this->icv_size);
+}
+
+/**
+ * Do big-endian increment on x
+ *
+ * Byte-reverses the 128-bit value, adds 1 to the low 64-bit lane, and
+ * reverses back. A carry out of the low 64 bits is not propagated, which
+ * is fine here: the CCM counter field is only Q_SIZE (4) bytes and per-iv
+ * message lengths stay far below 2^64 blocks.
+ */
+static inline __m128i increment_be(__m128i x)
+{
+       __m128i swap;
+
+       swap = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+       x = _mm_shuffle_epi8(x, swap);
+       x = _mm_add_epi64(x, _mm_set_epi32(0, 0, 0, 1));
+       x = _mm_shuffle_epi8(x, swap);
+
+       return x;
+}
+
+/**
+ * Encrypt a remaining incomplete block
+ *
+ * The partial plaintext is zero-padded to a full block: the CBC-MAC (c) is
+ * updated over the padded block, the keystream E(state) is XORed with it,
+ * and only the rem valid bytes are written to out.
+ *
+ * @param key          key schedule
+ * @param rem          remaining bytes, 1..15
+ * @param state                CTR counter block for this position
+ * @param in           partial input block
+ * @param out          partial output block, rem bytes written
+ * @param c            running CBC-MAC state
+ * @return             updated CBC-MAC state
+ */
+static __m128i encrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
+                                                          void *in, void *out, __m128i c)
+{
+       __m128i *ks, t, b, d;
+       u_int round;
+
+       ks = key->schedule;
+       /* zero-pad the partial plaintext block */
+       memset(&b, 0, sizeof(b));
+       memcpy(&b, in, rem);
+       d = _mm_loadu_si128(&b);
+
+       /* run MAC (c) and keystream (t) encryption interleaved */
+       c = _mm_xor_si128(d, c);
+       c = _mm_xor_si128(c, ks[0]);
+       t = _mm_xor_si128(state, ks[0]);
+       for (round = 1; round < key->rounds; round++)
+       {
+               c = _mm_aesenc_si128(c, ks[round]);
+               t = _mm_aesenc_si128(t, ks[round]);
+       }
+       c = _mm_aesenclast_si128(c, ks[key->rounds]);
+       t = _mm_aesenclast_si128(t, ks[key->rounds]);
+
+       t = _mm_xor_si128(t, d);
+       _mm_storeu_si128(&b, t);
+
+       memcpy(out, &b, rem);
+
+       return c;
+}
+
+/**
+ * Decrypt a remaining incomplete block
+ *
+ * Decryption must recover the plaintext first, then zero the bytes beyond
+ * rem before folding the block into the CBC-MAC — the MAC is computed over
+ * the zero-padded plaintext, matching encrypt_ccm_rem.
+ *
+ * @param key          key schedule
+ * @param rem          remaining bytes, 1..15
+ * @param state                CTR counter block for this position
+ * @param in           partial input block
+ * @param out          partial output block, rem bytes written
+ * @param c            running CBC-MAC state
+ * @return             updated CBC-MAC state
+ */
+static __m128i decrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
+                                                          void *in, void *out, __m128i c)
+{
+       __m128i *ks, t, b, d;
+       u_int round;
+
+       ks = key->schedule;
+       /* zero-pad the partial ciphertext block */
+       memset(&b, 0, sizeof(b));
+       memcpy(&b, in, rem);
+       d = _mm_loadu_si128(&b);
+
+       /* t = keystream E(state), XORed with ciphertext to get plaintext */
+       t = _mm_xor_si128(state, ks[0]);
+       for (round = 1; round < key->rounds; round++)
+       {
+               t = _mm_aesenc_si128(t, ks[round]);
+       }
+       t = _mm_aesenclast_si128(t, ks[key->rounds]);
+       t = _mm_xor_si128(t, d);
+       _mm_storeu_si128(&b, t);
+
+       /* clear keystream bytes beyond rem so the MAC sees zero padding */
+       memset((u_char*)&b + rem, 0, sizeof(b) - rem);
+       t = _mm_loadu_si128(&b);
+       c = _mm_xor_si128(t, c);
+       c = _mm_xor_si128(c, ks[0]);
+       for (round = 1; round < key->rounds; round++)
+       {
+               c = _mm_aesenc_si128(c, ks[round]);
+       }
+       c = _mm_aesenclast_si128(c, ks[key->rounds]);
+
+       memcpy(out, &b, rem);
+
+       return c;
+}
+
+/**
+ * AES-128 CCM encryption/ICV generation
+ *
+ * For each full block, the CBC-MAC update (c, over the plaintext) and the
+ * CTR keystream encryption (t) are interleaved round by round so two
+ * independent AESENC chains overlap in the pipeline. AES-128 uses 10
+ * rounds, i.e. round keys ks[0..10].
+ *
+ * @param this         CCM context
+ * @param len          plaintext length
+ * @param in           plaintext
+ * @param out          ciphertext output, len bytes
+ * @param iv           IV, IV_SIZE bytes
+ * @param alen         associated data length
+ * @param assoc                associated data
+ * @param icv          ICV output, icv_size bytes
+ */
+static void encrypt_ccm128(private_aesni_ccm_t *this,
+                                                  size_t len, u_char *in, u_char *out, u_char *iv,
+                                                  size_t alen, u_char *assoc, u_char *icv)
+{
+       __m128i *ks, d, t, c, b, state, *bi, *bo;
+       u_int blocks, rem, i;
+
+       /* c = CBC-MAC over B0 and associated data */
+       c = icv_header(this, len, iv, alen, assoc);
+       /* payload counter blocks start at counter value 1 */
+       build_ctr(this, 1, iv, &b);
+       state = _mm_load_si128(&b);
+       blocks = len / AES_BLOCK_SIZE;
+       rem = len % AES_BLOCK_SIZE;
+       bi = (__m128i*)in;
+       bo = (__m128i*)out;
+
+       ks = this->key->schedule;
+
+       for (i = 0; i < blocks; i++)
+       {
+               d = _mm_loadu_si128(bi + i);
+
+               /* MAC over plaintext (c) and CTR keystream (t), interleaved */
+               c = _mm_xor_si128(d, c);
+               c = _mm_xor_si128(c, ks[0]);
+               t = _mm_xor_si128(state, ks[0]);
+
+               c = _mm_aesenc_si128(c, ks[1]);
+               t = _mm_aesenc_si128(t, ks[1]);
+               c = _mm_aesenc_si128(c, ks[2]);
+               t = _mm_aesenc_si128(t, ks[2]);
+               c = _mm_aesenc_si128(c, ks[3]);
+               t = _mm_aesenc_si128(t, ks[3]);
+               c = _mm_aesenc_si128(c, ks[4]);
+               t = _mm_aesenc_si128(t, ks[4]);
+               c = _mm_aesenc_si128(c, ks[5]);
+               t = _mm_aesenc_si128(t, ks[5]);
+               c = _mm_aesenc_si128(c, ks[6]);
+               t = _mm_aesenc_si128(t, ks[6]);
+               c = _mm_aesenc_si128(c, ks[7]);
+               t = _mm_aesenc_si128(t, ks[7]);
+               c = _mm_aesenc_si128(c, ks[8]);
+               t = _mm_aesenc_si128(t, ks[8]);
+               c = _mm_aesenc_si128(c, ks[9]);
+               t = _mm_aesenc_si128(t, ks[9]);
+
+               c = _mm_aesenclast_si128(c, ks[10]);
+               t = _mm_aesenclast_si128(t, ks[10]);
+
+               /* ciphertext = plaintext XOR keystream */
+               t = _mm_xor_si128(t, d);
+               _mm_storeu_si128(bo + i, t);
+
+               state = increment_be(state);
+       }
+
+       if (rem)
+       {
+               c = encrypt_ccm_rem(this->key, rem, state, bi + blocks, bo + blocks, c);
+       }
+       crypt_icv(this, iv, c, icv);
+}
+
+/**
+ * AES-128 CCM decryption/ICV generation
+ *
+ * The CBC-MAC is computed over the plaintext, so each block must first be
+ * decrypted (CTR keystream t) before it can be folded into the MAC (c) —
+ * the two AES operations are serial per block, unlike in encrypt_ccm128.
+ * The caller compares the returned ICV against the received one.
+ *
+ * @param this         CCM context
+ * @param len          ciphertext length
+ * @param in           ciphertext
+ * @param out          plaintext output, len bytes
+ * @param iv           IV, IV_SIZE bytes
+ * @param alen         associated data length
+ * @param assoc                associated data
+ * @param icv          computed ICV output, icv_size bytes
+ */
+static void decrypt_ccm128(private_aesni_ccm_t *this,
+                                                  size_t len, u_char *in, u_char *out, u_char *iv,
+                                                  size_t alen, u_char *assoc, u_char *icv)
+{
+       __m128i *ks, d, t, c, b, state, *bi, *bo;
+       u_int blocks, rem, i;
+
+       /* c = CBC-MAC over B0 and associated data */
+       c = icv_header(this, len, iv, alen, assoc);
+       /* payload counter blocks start at counter value 1 */
+       build_ctr(this, 1, iv, &b);
+       state = _mm_load_si128(&b);
+       blocks = len / AES_BLOCK_SIZE;
+       rem = len % AES_BLOCK_SIZE;
+       bi = (__m128i*)in;
+       bo = (__m128i*)out;
+
+       ks = this->key->schedule;
+
+       for (i = 0; i < blocks; i++)
+       {
+               d = _mm_loadu_si128(bi + i);
+
+               /* first recover the plaintext: t = E(state) XOR ciphertext */
+               t = _mm_xor_si128(state, ks[0]);
+
+               t = _mm_aesenc_si128(t, ks[1]);
+               t = _mm_aesenc_si128(t, ks[2]);
+               t = _mm_aesenc_si128(t, ks[3]);
+               t = _mm_aesenc_si128(t, ks[4]);
+               t = _mm_aesenc_si128(t, ks[5]);
+               t = _mm_aesenc_si128(t, ks[6]);
+               t = _mm_aesenc_si128(t, ks[7]);
+               t = _mm_aesenc_si128(t, ks[8]);
+               t = _mm_aesenc_si128(t, ks[9]);
+
+               t = _mm_aesenclast_si128(t, ks[10]);
+               t = _mm_xor_si128(t, d);
+               _mm_storeu_si128(bo + i, t);
+
+               /* then fold the plaintext block into the CBC-MAC */
+               c = _mm_xor_si128(t, c);
+               c = _mm_xor_si128(c, ks[0]);
+
+               c = _mm_aesenc_si128(c, ks[1]);
+               c = _mm_aesenc_si128(c, ks[2]);
+               c = _mm_aesenc_si128(c, ks[3]);
+               c = _mm_aesenc_si128(c, ks[4]);
+               c = _mm_aesenc_si128(c, ks[5]);
+               c = _mm_aesenc_si128(c, ks[6]);
+               c = _mm_aesenc_si128(c, ks[7]);
+               c = _mm_aesenc_si128(c, ks[8]);
+               c = _mm_aesenc_si128(c, ks[9]);
+
+               c = _mm_aesenclast_si128(c, ks[10]);
+
+               state = increment_be(state);
+       }
+
+       if (rem)
+       {
+               c = decrypt_ccm_rem(this->key, rem, state, bi + blocks, bo + blocks, c);
+       }
+       crypt_icv(this, iv, c, icv);
+}
+
+/**
+ * AES-192 CCM encryption/ICV generation
+ *
+ * Same interleaved MAC/keystream structure as encrypt_ccm128, but with the
+ * 12 rounds of AES-192, i.e. round keys ks[0..12].
+ *
+ * @param this         CCM context
+ * @param len          plaintext length
+ * @param in           plaintext
+ * @param out          ciphertext output, len bytes
+ * @param iv           IV, IV_SIZE bytes
+ * @param alen         associated data length
+ * @param assoc                associated data
+ * @param icv          ICV output, icv_size bytes
+ */
+static void encrypt_ccm192(private_aesni_ccm_t *this,
+                                                  size_t len, u_char *in, u_char *out, u_char *iv,
+                                                  size_t alen, u_char *assoc, u_char *icv)
+{
+       __m128i *ks, d, t, c, b, state, *bi, *bo;
+       u_int blocks, rem, i;
+
+       /* c = CBC-MAC over B0 and associated data */
+       c = icv_header(this, len, iv, alen, assoc);
+       /* payload counter blocks start at counter value 1 */
+       build_ctr(this, 1, iv, &b);
+       state = _mm_load_si128(&b);
+       blocks = len / AES_BLOCK_SIZE;
+       rem = len % AES_BLOCK_SIZE;
+       bi = (__m128i*)in;
+       bo = (__m128i*)out;
+
+       ks = this->key->schedule;
+
+       for (i = 0; i < blocks; i++)
+       {
+               d = _mm_loadu_si128(bi + i);
+
+               /* MAC over plaintext (c) and CTR keystream (t), interleaved */
+               c = _mm_xor_si128(d, c);
+               c = _mm_xor_si128(c, ks[0]);
+               t = _mm_xor_si128(state, ks[0]);
+
+               c = _mm_aesenc_si128(c, ks[1]);
+               t = _mm_aesenc_si128(t, ks[1]);
+               c = _mm_aesenc_si128(c, ks[2]);
+               t = _mm_aesenc_si128(t, ks[2]);
+               c = _mm_aesenc_si128(c, ks[3]);
+               t = _mm_aesenc_si128(t, ks[3]);
+               c = _mm_aesenc_si128(c, ks[4]);
+               t = _mm_aesenc_si128(t, ks[4]);
+               c = _mm_aesenc_si128(c, ks[5]);
+               t = _mm_aesenc_si128(t, ks[5]);
+               c = _mm_aesenc_si128(c, ks[6]);
+               t = _mm_aesenc_si128(t, ks[6]);
+               c = _mm_aesenc_si128(c, ks[7]);
+               t = _mm_aesenc_si128(t, ks[7]);
+               c = _mm_aesenc_si128(c, ks[8]);
+               t = _mm_aesenc_si128(t, ks[8]);
+               c = _mm_aesenc_si128(c, ks[9]);
+               t = _mm_aesenc_si128(t, ks[9]);
+               c = _mm_aesenc_si128(c, ks[10]);
+               t = _mm_aesenc_si128(t, ks[10]);
+               c = _mm_aesenc_si128(c, ks[11]);
+               t = _mm_aesenc_si128(t, ks[11]);
+
+               c = _mm_aesenclast_si128(c, ks[12]);
+               t = _mm_aesenclast_si128(t, ks[12]);
+
+               /* ciphertext = plaintext XOR keystream */
+               t = _mm_xor_si128(t, d);
+               _mm_storeu_si128(bo + i, t);
+
+               state = increment_be(state);
+       }
+
+       if (rem)
+       {
+               c = encrypt_ccm_rem(this->key, rem, state, bi + blocks, bo + blocks, c);
+       }
+       crypt_icv(this, iv, c, icv);
+}
+
+/**
+ * AES-192 CCM decryption/ICV generation
+ */
+static void decrypt_ccm192(private_aesni_ccm_t *this,
+                                                  size_t len, u_char *in, u_char *out, u_char *iv,
+                                                  size_t alen, u_char *assoc, u_char *icv)
+{
+       __m128i *ks, d, t, c, b, state, *bi, *bo;
+       u_int blocks, rem, i;
+
+       c = icv_header(this, len, iv, alen, assoc);
+       build_ctr(this, 1, iv, &b);
+       state = _mm_load_si128(&b);
+       blocks = len / AES_BLOCK_SIZE;
+       rem = len % AES_BLOCK_SIZE;
+       bi = (__m128i*)in;
+       bo = (__m128i*)out;
+
+       ks = this->key->schedule;
+
+       for (i = 0; i < blocks; i++)
+       {
+               d = _mm_loadu_si128(bi + i);
+
+               t = _mm_xor_si128(state, ks[0]);
+
+               t = _mm_aesenc_si128(t, ks[1]);
+               t = _mm_aesenc_si128(t, ks[2]);
+               t = _mm_aesenc_si128(t, ks[3]);
+               t = _mm_aesenc_si128(t, ks[4]);
+               t = _mm_aesenc_si128(t, ks[5]);
+               t = _mm_aesenc_si128(t, ks[6]);
+               t = _mm_aesenc_si128(t, ks[7]);
+               t = _mm_aesenc_si128(t, ks[8]);
+               t = _mm_aesenc_si128(t, ks[9]);
+               t = _mm_aesenc_si128(t, ks[10]);
+               t = _mm_aesenc_si128(t, ks[11]);
+
+               t = _mm_aesenclast_si128(t, ks[12]);
+               t = _mm_xor_si128(t, d);
+               _mm_storeu_si128(bo + i, t);
+
+               c = _mm_xor_si128(t, c);
+               c = _mm_xor_si128(c, ks[0]);
+
+               c = _mm_aesenc_si128(c, ks[1]);
+               c = _mm_aesenc_si128(c, ks[2]);
+               c = _mm_aesenc_si128(c, ks[3]);
+               c = _mm_aesenc_si128(c, ks[4]);
+               c = _mm_aesenc_si128(c, ks[5]);
+               c = _mm_aesenc_si128(c, ks[6]);
+               c = _mm_aesenc_si128(c, ks[7]);
+               c = _mm_aesenc_si128(c, ks[8]);
+               c = _mm_aesenc_si128(c, ks[9]);
+               c = _mm_aesenc_si128(c, ks[10]);
+               c = _mm_aesenc_si128(c, ks[11]);
+
+               c = _mm_aesenclast_si128(c, ks[12]);
+
+               state = increment_be(state);
+       }
+
+       if (rem)
+       {
+               c = decrypt_ccm_rem(this->key, rem, state, bi + blocks, bo + blocks, c);
+       }
+       crypt_icv(this, iv, c, icv);
+}
+
+/**
+ * AES-256 CCM encryption/ICV generation
+ */
+static void encrypt_ccm256(private_aesni_ccm_t *this,
+                                                  size_t len, u_char *in, u_char *out, u_char *iv,
+                                                  size_t alen, u_char *assoc, u_char *icv)
+{
+       __m128i *ks, d, t, c, b, state, *bi, *bo;
+       u_int blocks, rem, i;
+
+       c = icv_header(this, len, iv, alen, assoc);
+       build_ctr(this, 1, iv, &b);
+       state = _mm_load_si128(&b);
+       blocks = len / AES_BLOCK_SIZE;
+       rem = len % AES_BLOCK_SIZE;
+       bi = (__m128i*)in;
+       bo = (__m128i*)out;
+
+       ks = this->key->schedule;
+
+       for (i = 0; i < blocks; i++)
+       {
+               d = _mm_loadu_si128(bi + i);
+
+               c = _mm_xor_si128(d, c);
+               c = _mm_xor_si128(c, ks[0]);
+               t = _mm_xor_si128(state, ks[0]);
+
+               c = _mm_aesenc_si128(c, ks[1]);
+               t = _mm_aesenc_si128(t, ks[1]);
+               c = _mm_aesenc_si128(c, ks[2]);
+               t = _mm_aesenc_si128(t, ks[2]);
+               c = _mm_aesenc_si128(c, ks[3]);
+               t = _mm_aesenc_si128(t, ks[3]);
+               c = _mm_aesenc_si128(c, ks[4]);
+               t = _mm_aesenc_si128(t, ks[4]);
+               c = _mm_aesenc_si128(c, ks[5]);
+               t = _mm_aesenc_si128(t, ks[5]);
+               c = _mm_aesenc_si128(c, ks[6]);
+               t = _mm_aesenc_si128(t, ks[6]);
+               c = _mm_aesenc_si128(c, ks[7]);
+               t = _mm_aesenc_si128(t, ks[7]);
+               c = _mm_aesenc_si128(c, ks[8]);
+               t = _mm_aesenc_si128(t, ks[8]);
+               c = _mm_aesenc_si128(c, ks[9]);
+               t = _mm_aesenc_si128(t, ks[9]);
+               c = _mm_aesenc_si128(c, ks[10]);
+               t = _mm_aesenc_si128(t, ks[10]);
+               c = _mm_aesenc_si128(c, ks[11]);
+               t = _mm_aesenc_si128(t, ks[11]);
+               c = _mm_aesenc_si128(c, ks[12]);
+               t = _mm_aesenc_si128(t, ks[12]);
+               c = _mm_aesenc_si128(c, ks[13]);
+               t = _mm_aesenc_si128(t, ks[13]);
+
+               c = _mm_aesenclast_si128(c, ks[14]);
+               t = _mm_aesenclast_si128(t, ks[14]);
+
+               t = _mm_xor_si128(t, d);
+               _mm_storeu_si128(bo + i, t);
+
+               state = increment_be(state);
+       }
+
+       if (rem)
+       {
+               c = encrypt_ccm_rem(this->key, rem, state, bi + blocks, bo + blocks, c);
+       }
+       crypt_icv(this, iv, c, icv);
+}
+
+/**
+ * AES-256 CCM decryption/ICV generation
+ */
+static void decrypt_ccm256(private_aesni_ccm_t *this,
+                                                  size_t len, u_char *in, u_char *out, u_char *iv,
+                                                  size_t alen, u_char *assoc, u_char *icv)
+{
+       __m128i *ks, d, t, c, b, state, *bi, *bo;
+       u_int blocks, rem, i;
+
+       c = icv_header(this, len, iv, alen, assoc);
+       build_ctr(this, 1, iv, &b);
+       state = _mm_load_si128(&b);
+       blocks = len / AES_BLOCK_SIZE;
+       rem = len % AES_BLOCK_SIZE;
+       bi = (__m128i*)in;
+       bo = (__m128i*)out;
+
+       ks = this->key->schedule;
+
+       for (i = 0; i < blocks; i++)
+       {
+               d = _mm_loadu_si128(bi + i);
+
+               t = _mm_xor_si128(state, ks[0]);
+
+               t = _mm_aesenc_si128(t, ks[1]);
+               t = _mm_aesenc_si128(t, ks[2]);
+               t = _mm_aesenc_si128(t, ks[3]);
+               t = _mm_aesenc_si128(t, ks[4]);
+               t = _mm_aesenc_si128(t, ks[5]);
+               t = _mm_aesenc_si128(t, ks[6]);
+               t = _mm_aesenc_si128(t, ks[7]);
+               t = _mm_aesenc_si128(t, ks[8]);
+               t = _mm_aesenc_si128(t, ks[9]);
+               t = _mm_aesenc_si128(t, ks[10]);
+               t = _mm_aesenc_si128(t, ks[11]);
+               t = _mm_aesenc_si128(t, ks[12]);
+               t = _mm_aesenc_si128(t, ks[13]);
+
+               t = _mm_aesenclast_si128(t, ks[14]);
+               t = _mm_xor_si128(t, d);
+               _mm_storeu_si128(bo + i, t);
+
+               c = _mm_xor_si128(t, c);
+               c = _mm_xor_si128(c, ks[0]);
+
+               c = _mm_aesenc_si128(c, ks[1]);
+               c = _mm_aesenc_si128(c, ks[2]);
+               c = _mm_aesenc_si128(c, ks[3]);
+               c = _mm_aesenc_si128(c, ks[4]);
+               c = _mm_aesenc_si128(c, ks[5]);
+               c = _mm_aesenc_si128(c, ks[6]);
+               c = _mm_aesenc_si128(c, ks[7]);
+               c = _mm_aesenc_si128(c, ks[8]);
+               c = _mm_aesenc_si128(c, ks[9]);
+               c = _mm_aesenc_si128(c, ks[10]);
+               c = _mm_aesenc_si128(c, ks[11]);
+               c = _mm_aesenc_si128(c, ks[12]);
+               c = _mm_aesenc_si128(c, ks[13]);
+
+               c = _mm_aesenclast_si128(c, ks[14]);
+
+               state = increment_be(state);
+       }
+
+       if (rem)
+       {
+               c = decrypt_ccm_rem(this->key, rem, state, bi + blocks, bo + blocks, c);
+       }
+       crypt_icv(this, iv, c, icv);
+}
+
+METHOD(aead_t, encrypt, bool,
+       private_aesni_ccm_t *this, chunk_t plain, chunk_t assoc, chunk_t iv,
+       chunk_t *encr)
+{
+       u_char *out;
+
+       if (!this->key || iv.len != IV_SIZE)
+       {
+               return FALSE;
+       }
+       out = plain.ptr;
+       if (encr)
+       {
+               *encr = chunk_alloc(plain.len + this->icv_size);
+               out = encr->ptr;
+       }
+       this->encrypt(this, plain.len, plain.ptr, out, iv.ptr,
+                                 assoc.len, assoc.ptr, out + plain.len);
+       return TRUE;
+}
+
+METHOD(aead_t, decrypt, bool,
+       private_aesni_ccm_t *this, chunk_t encr, chunk_t assoc, chunk_t iv,
+       chunk_t *plain)
+{
+       u_char *out, icv[this->icv_size];
+
+       if (!this->key || iv.len != IV_SIZE || encr.len < this->icv_size)
+       {
+               return FALSE;
+       }
+       encr.len -= this->icv_size;
+       out = encr.ptr;
+       if (plain)
+       {
+               *plain = chunk_alloc(encr.len);
+               out = plain->ptr;
+       }
+
+       this->decrypt(this, encr.len, encr.ptr, out, iv.ptr,
+                                 assoc.len, assoc.ptr, icv);
+       return memeq_const(icv, encr.ptr + encr.len, this->icv_size);
+}
+
+METHOD(aead_t, get_block_size, size_t,
+       private_aesni_ccm_t *this)
+{
+       return 1;
+}
+
+METHOD(aead_t, get_icv_size, size_t,
+       private_aesni_ccm_t *this)
+{
+       return this->icv_size;
+}
+
+METHOD(aead_t, get_iv_size, size_t,
+       private_aesni_ccm_t *this)
+{
+       return IV_SIZE;
+}
+
+METHOD(aead_t, get_iv_gen, iv_gen_t*,
+       private_aesni_ccm_t *this)
+{
+       return this->iv_gen;
+}
+
+METHOD(aead_t, get_key_size, size_t,
+       private_aesni_ccm_t *this)
+{
+       return this->key_size + SALT_SIZE;
+}
+
+METHOD(aead_t, set_key, bool,
+       private_aesni_ccm_t *this, chunk_t key)
+{
+       if (key.len != this->key_size + SALT_SIZE)
+       {
+               return FALSE;
+       }
+
+       memcpy(this->salt, key.ptr + key.len - SALT_SIZE, SALT_SIZE);
+       key.len -= SALT_SIZE;
+
+       DESTROY_IF(this->key);
+       this->key = aesni_key_create(TRUE, key);
+       return TRUE;
+}
+
+METHOD(aead_t, destroy, void,
+       private_aesni_ccm_t *this)
+{
+       DESTROY_IF(this->key);
+       this->iv_gen->destroy(this->iv_gen);
+       free_align(this);
+}
+
+/**
+ * See header
+ */
+aesni_ccm_t *aesni_ccm_create(encryption_algorithm_t algo,
+                                                         size_t key_size, size_t salt_size)
+{
+       private_aesni_ccm_t *this;
+       size_t icv_size;
+
+       switch (key_size)
+       {
+               case 0:
+                       key_size = 16;
+                       break;
+               case 16:
+               case 24:
+               case 32:
+                       break;
+               default:
+                       return NULL;
+       }
+       if (salt_size && salt_size != SALT_SIZE)
+       {
+               /* currently not supported */
+               return NULL;
+       }
+       switch (algo)
+       {
+               case ENCR_AES_CCM_ICV8:
+                       algo = ENCR_AES_CBC;
+                       icv_size = 8;
+                       break;
+               case ENCR_AES_CCM_ICV12:
+                       algo = ENCR_AES_CBC;
+                       icv_size = 12;
+                       break;
+               case ENCR_AES_CCM_ICV16:
+                       algo = ENCR_AES_CBC;
+                       icv_size = 16;
+                       break;
+               default:
+                       return NULL;
+       }
+
+       INIT_ALIGN(this, sizeof(__m128i),
+               .public = {
+                       .aead = {
+                               .encrypt = _encrypt,
+                               .decrypt = _decrypt,
+                               .get_block_size = _get_block_size,
+                               .get_icv_size = _get_icv_size,
+                               .get_iv_size = _get_iv_size,
+                               .get_iv_gen = _get_iv_gen,
+                               .get_key_size = _get_key_size,
+                               .set_key = _set_key,
+                               .destroy = _destroy,
+                       },
+               },
+               .key_size = key_size,
+               .iv_gen = iv_gen_seq_create(),
+               .icv_size = icv_size,
+       );
+
+       switch (key_size)
+       {
+               case 16:
+                       this->encrypt = encrypt_ccm128;
+                       this->decrypt = decrypt_ccm128;
+                       break;
+               case 24:
+                       this->encrypt = encrypt_ccm192;
+                       this->decrypt = decrypt_ccm192;
+                       break;
+               case 32:
+                       this->encrypt = encrypt_ccm256;
+                       this->decrypt = decrypt_ccm256;
+                       break;
+       }
+
+       return &this->public;
+}
diff --git a/src/libstrongswan/plugins/aesni/aesni_ccm.h b/src/libstrongswan/plugins/aesni/aesni_ccm.h
new file mode 100644 (file)
index 0000000..69612b5
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+/**
+ * @defgroup aesni_ccm aesni_ccm
+ * @{ @ingroup aesni
+ */
+
+#ifndef AESNI_CCM_H_
+#define AESNI_CCM_H_
+
+#include <library.h>
+
+typedef struct aesni_ccm_t aesni_ccm_t;
+
+/**
+ * CCM mode AEAD using AES-NI
+ */
+struct aesni_ccm_t {
+
+       /**
+        * Implements aead_t interface
+        */
+       aead_t aead;
+};
+
+/**
+ * Create an aesni_ccm instance.
+ *
+ * @param algo                 encryption algorithm, ENCR_AES_CCM*
+ * @param key_size             AES key size, in bytes
+ * @param salt_size            size of salt value
+ * @return                             AES-CCM AEAD, NULL if not supported
+ */
+aesni_ccm_t *aesni_ccm_create(encryption_algorithm_t algo,
+                                                         size_t key_size, size_t salt_size);
+
+#endif /** AESNI_CCM_H_ @}*/
diff --git a/src/libstrongswan/plugins/aesni/aesni_cmac.c b/src/libstrongswan/plugins/aesni/aesni_cmac.c
new file mode 100644 (file)
index 0000000..d6a87e6
--- /dev/null
@@ -0,0 +1,371 @@
+/*
+ * Copyright (C) 2012 Tobias Brunner
+ * Hochschule fuer Technik Rapperswil
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include "aesni_cmac.h"
+#include "aesni_key.h"
+
+#include <crypto/prfs/mac_prf.h>
+#include <crypto/signers/mac_signer.h>
+
+typedef struct private_mac_t private_mac_t;
+
+/**
+ * Private data of a mac_t object.
+ */
+struct private_mac_t {
+
+       /**
+        * Public interface.
+        */
+       mac_t public;
+
+       /**
+        * Key schedule for key K
+        */
+       aesni_key_t *k;
+
+       /**
+        * K1
+        */
+       __m128i k1;
+
+       /**
+        * K2
+        */
+       __m128i k2;
+
+       /**
+        * T
+        */
+       __m128i t;
+
+       /**
+        * remaining, unprocessed bytes in append mode
+        */
+       u_char rem[AES_BLOCK_SIZE];
+
+       /**
+        * number of bytes in remaining
+        */
+       int rem_size;
+};
+
+METHOD(mac_t, get_mac, bool,
+       private_mac_t *this, chunk_t data, u_int8_t *out)
+{
+       __m128i *ks, t, l, *bi;
+       u_int blocks, rem, i;
+
+       if (!this->k)
+       {
+               return FALSE;
+       }
+
+       ks = this->k->schedule;
+       t = this->t;
+
+       if (this->rem_size + data.len > AES_BLOCK_SIZE)
+       {
+               /* T := 0x00000000000000000000000000000000 (initially)
+                * for each block M_i (except the last)
+                *   X := T XOR M_i;
+                *   T := AES-128(K, X);
+                */
+
+               /* append data to remaining bytes, process block M_1 */
+               memcpy(this->rem + this->rem_size, data.ptr,
+                          AES_BLOCK_SIZE - this->rem_size);
+               data = chunk_skip(data, AES_BLOCK_SIZE - this->rem_size);
+
+               t = _mm_xor_si128(t, _mm_loadu_si128((__m128i*)this->rem));
+
+               t = _mm_xor_si128(t, ks[0]);
+               t = _mm_aesenc_si128(t, ks[1]);
+               t = _mm_aesenc_si128(t, ks[2]);
+               t = _mm_aesenc_si128(t, ks[3]);
+               t = _mm_aesenc_si128(t, ks[4]);
+               t = _mm_aesenc_si128(t, ks[5]);
+               t = _mm_aesenc_si128(t, ks[6]);
+               t = _mm_aesenc_si128(t, ks[7]);
+               t = _mm_aesenc_si128(t, ks[8]);
+               t = _mm_aesenc_si128(t, ks[9]);
+               t = _mm_aesenclast_si128(t, ks[10]);
+
+               /* process blocks M_2 ... M_n-1 */
+               bi = (__m128i*)data.ptr;
+               rem = data.len % AES_BLOCK_SIZE;
+               blocks = data.len / AES_BLOCK_SIZE;
+               if (!rem && blocks)
+               {       /* don't do last block */
+                       rem = AES_BLOCK_SIZE;
+                       blocks--;
+               }
+
+               /* process blocks M[2] ... M[n-1] */
+               for (i = 0; i < blocks; i++)
+               {
+                       t = _mm_xor_si128(t, _mm_loadu_si128(bi + i));
+
+                       t = _mm_xor_si128(t, ks[0]);
+                       t = _mm_aesenc_si128(t, ks[1]);
+                       t = _mm_aesenc_si128(t, ks[2]);
+                       t = _mm_aesenc_si128(t, ks[3]);
+                       t = _mm_aesenc_si128(t, ks[4]);
+                       t = _mm_aesenc_si128(t, ks[5]);
+                       t = _mm_aesenc_si128(t, ks[6]);
+                       t = _mm_aesenc_si128(t, ks[7]);
+                       t = _mm_aesenc_si128(t, ks[8]);
+                       t = _mm_aesenc_si128(t, ks[9]);
+                       t = _mm_aesenclast_si128(t, ks[10]);
+               }
+
+               /* store remaining bytes of block M_n */
+               memcpy(this->rem, data.ptr + data.len - rem, rem);
+               this->rem_size = rem;
+       }
+       else
+       {
+               /* no complete block (or last block), just copy into remaining */
+               memcpy(this->rem + this->rem_size, data.ptr, data.len);
+               this->rem_size += data.len;
+       }
+       if (out)
+       {
+               /* if last block is complete
+                *   M_last := M_n XOR K1;
+                * else
+                *   M_last := padding(M_n) XOR K2;
+                */
+               if (this->rem_size == AES_BLOCK_SIZE)
+               {
+                       l = _mm_loadu_si128((__m128i*)this->rem);
+                       l = _mm_xor_si128(l, this->k1);
+               }
+               else
+               {
+                       /* padding(x) = x || 10^i  where i is 128-8*r-1
+                        * That is, padding(x) is the concatenation of x and a single '1',
+                        * followed by the minimum number of '0's, so that the total length is
+                        * equal to 128 bits.
+                        */
+                       if (this->rem_size < AES_BLOCK_SIZE)
+                       {
+                               memset(this->rem + this->rem_size, 0,
+                                          AES_BLOCK_SIZE - this->rem_size);
+                               this->rem[this->rem_size] = 0x80;
+                       }
+                       l = _mm_loadu_si128((__m128i*)this->rem);
+                       l = _mm_xor_si128(l, this->k2);
+               }
+               /* T := M_last XOR T;
+                * T := AES-128(K,T);
+                */
+               t = _mm_xor_si128(l, t);
+
+               t = _mm_xor_si128(t, ks[0]);
+               t = _mm_aesenc_si128(t, ks[1]);
+               t = _mm_aesenc_si128(t, ks[2]);
+               t = _mm_aesenc_si128(t, ks[3]);
+               t = _mm_aesenc_si128(t, ks[4]);
+               t = _mm_aesenc_si128(t, ks[5]);
+               t = _mm_aesenc_si128(t, ks[6]);
+               t = _mm_aesenc_si128(t, ks[7]);
+               t = _mm_aesenc_si128(t, ks[8]);
+               t = _mm_aesenc_si128(t, ks[9]);
+               t = _mm_aesenclast_si128(t, ks[10]);
+
+               _mm_storeu_si128((__m128i*)out, t);
+
+               /* reset state */
+               t = _mm_setzero_si128();
+               this->rem_size = 0;
+       }
+       this->t = t;
+       return TRUE;
+}
+
+METHOD(mac_t, get_mac_size, size_t,
+       private_mac_t *this)
+{
+       return AES_BLOCK_SIZE;
+}
+
+/**
+ * Left-shift the given chunk by one bit.
+ */
+static void bit_shift(chunk_t chunk)
+{
+       size_t i;
+
+       for (i = 0; i < chunk.len; i++)
+       {
+               chunk.ptr[i] <<= 1;
+               if (i < chunk.len - 1 && chunk.ptr[i + 1] & 0x80)
+               {
+                       chunk.ptr[i] |= 0x01;
+               }
+       }
+}
+
+METHOD(mac_t, set_key, bool,
+       private_mac_t *this, chunk_t key)
+{
+       __m128i rb, msb, l, a;
+       u_int round;
+       chunk_t k;
+
+       this->t = _mm_setzero_si128();
+       this->rem_size = 0;
+
+       /* we support variable keys as defined in RFC 4615 */
+       if (key.len == AES_BLOCK_SIZE)
+       {
+               k = key;
+       }
+       else
+       {       /* use cmac recursively to resize longer or shorter keys */
+               k = chunk_alloca(AES_BLOCK_SIZE);
+               memset(k.ptr, 0, k.len);
+               if (!set_key(this, k) || !get_mac(this, key, k.ptr))
+               {
+                       return FALSE;
+               }
+       }
+
+       DESTROY_IF(this->k);
+       this->k = aesni_key_create(TRUE, k);
+       if (!this->k)
+       {
+               return FALSE;
+       }
+
+       /*
+        * Rb = 0x00000000000000000000000000000087
+        * L = 0x00000000000000000000000000000000 encrypted with K
+        * if MSB(L) == 0
+        *   K1 = L << 1
+        * else
+        *   K1 = (L << 1) XOR Rb
+        * if MSB(K1) == 0
+        *   K2 = K1 << 1
+        * else
+        *   K2 = (K1 << 1) XOR Rb
+        */
+
+       rb = _mm_set_epi32(0x87000000, 0, 0, 0);
+       msb = _mm_set_epi32(0, 0, 0, 0x80);
+
+       l = _mm_setzero_si128();
+
+       l = _mm_xor_si128(l, this->k->schedule[0]);
+       for (round = 1; round < this->k->rounds; round++)
+       {
+               l = _mm_aesenc_si128(l, this->k->schedule[round]);
+       }
+       l = _mm_aesenclast_si128(l, this->k->schedule[this->k->rounds]);
+
+       this->k1 = l;
+       bit_shift(chunk_from_thing(this->k1));
+       a = _mm_and_si128(l, msb);
+       if (memchr(&a, 0x80, 1))
+       {
+               this->k1 = _mm_xor_si128(this->k1, rb);
+       }
+       this->k2 = this->k1;
+       bit_shift(chunk_from_thing(this->k2));
+       a = _mm_and_si128(this->k1, msb);
+       if (memchr(&a, 0x80, 1))
+       {
+               this->k2 = _mm_xor_si128(this->k2, rb);
+       }
+
+       return TRUE;
+}
+
+METHOD(mac_t, destroy, void,
+       private_mac_t *this)
+{
+       DESTROY_IF(this->k);
+       memwipe(&this->k1, sizeof(this->k1));
+       memwipe(&this->k2, sizeof(this->k2));
+       free_align(this);
+}
+
+/*
+ * Described in header
+ */
+mac_t *aesni_cmac_create(encryption_algorithm_t algo, size_t key_size)
+{
+       private_mac_t *this;
+
+       INIT_ALIGN(this, sizeof(__m128i),
+               .public = {
+                       .get_mac = _get_mac,
+                       .get_mac_size = _get_mac_size,
+                       .set_key = _set_key,
+                       .destroy = _destroy,
+               },
+       );
+
+       return &this->public;
+}
+
+/*
+ * Described in header.
+ */
+prf_t *aesni_cmac_prf_create(pseudo_random_function_t algo)
+{
+       mac_t *cmac;
+
+       switch (algo)
+       {
+               case PRF_AES128_CMAC:
+                       cmac = aesni_cmac_create(ENCR_AES_CBC, 16);
+                       break;
+               default:
+                       return NULL;
+       }
+       if (cmac)
+       {
+               return mac_prf_create(cmac);
+       }
+       return NULL;
+}
+
+/*
+ * Described in header
+ */
+signer_t *aesni_cmac_signer_create(integrity_algorithm_t algo)
+{
+       size_t truncation;
+       mac_t *cmac;
+
+       switch (algo)
+       {
+               case AUTH_AES_CMAC_96:
+                       cmac = aesni_cmac_create(ENCR_AES_CBC, 16);
+                       truncation = 12;
+                       break;
+               default:
+                       return NULL;
+       }
+       if (cmac)
+       {
+               return mac_signer_create(cmac, truncation);
+       }
+       return NULL;
+}
diff --git a/src/libstrongswan/plugins/aesni/aesni_cmac.h b/src/libstrongswan/plugins/aesni/aesni_cmac.h
new file mode 100644 (file)
index 0000000..5f0af73
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+/**
+ * @defgroup aesni_cmac aesni_cmac
+ * @{ @ingroup aesni
+ */
+
+#ifndef CMAC_H_
+#define CMAC_H_
+
+#include <crypto/mac.h>
+#include <crypto/prfs/prf.h>
+#include <crypto/signers/signer.h>
+
+/**
+ * Create a generic mac_t object using AESNI CMAC.
+ *
+ * @param algo         underlying encryption algorithm
+ * @param key_size     size of encryption key, in bytes
+ */
+mac_t *aesni_cmac_create(encryption_algorithm_t algo, size_t key_size);
+
+/**
+ * Creates a new prf_t object based on AESNI CMAC.
+ *
+ * @param algo         algorithm to implement
+ * @return                     prf_t object, NULL if not supported
+ */
+prf_t *aesni_cmac_prf_create(pseudo_random_function_t algo);
+
+/**
+ * Creates a new signer_t object based on AESNI CMAC.
+ *
+ * @param algo         algorithm to implement
+ * @return                     signer_t, NULL if not supported
+ */
+signer_t *aesni_cmac_signer_create(integrity_algorithm_t algo);
+
+#endif /** CMAC_H_ @}*/
diff --git a/src/libstrongswan/plugins/aesni/aesni_ctr.c b/src/libstrongswan/plugins/aesni/aesni_ctr.c
new file mode 100644 (file)
index 0000000..9898138
--- /dev/null
@@ -0,0 +1,643 @@
+/*
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include "aesni_ctr.h"
+#include "aesni_key.h"
+
+#include <tmmintrin.h>
+
+/**
+ * Pipeline parallelism we use for CTR en/decryption
+ */
+#define CTR_CRYPT_PARALLELISM 4
+
+typedef struct private_aesni_ctr_t private_aesni_ctr_t;
+
+/**
+ * CTR en/decryption method type
+ */
+typedef void (*aesni_ctr_fn_t)(private_aesni_ctr_t*, size_t, u_char*, u_char*);
+
+/**
+ * Private data of an aesni_ctr_t object.
+ */
+struct private_aesni_ctr_t {
+
+       /**
+        * Public aesni_ctr_t interface.
+        */
+       aesni_ctr_t public;
+
+       /**
+        * Key size
+        */
+       u_int key_size;
+
+       /**
+        * Key schedule
+        */
+       aesni_key_t *key;
+
+       /**
+        * Encryption method
+        */
+       aesni_ctr_fn_t crypt;
+
+       /**
+        * Counter state
+        */
+       struct {
+               char nonce[4];
+               char iv[8];
+               u_int32_t counter;
+       } __attribute__((packed, aligned(sizeof(__m128i)))) state;
+};
+
+/**
+ * Do big-endian increment on x
+ */
+static inline __m128i increment_be(__m128i x)
+{
+       __m128i swap;
+
+       swap = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+
+       x = _mm_shuffle_epi8(x, swap);
+       x = _mm_add_epi64(x, _mm_set_epi32(0, 0, 0, 1));
+       x = _mm_shuffle_epi8(x, swap);
+
+       return x;
+}
+
+/**
+ * AES-128 CTR encryption
+ */
+static void encrypt_ctr128(private_aesni_ctr_t *this,
+                                                  size_t len, u_char *in, u_char *out)
+{
+       __m128i t1, t2, t3, t4;
+       __m128i d1, d2, d3, d4;
+       __m128i *ks, state, b, *bi, *bo;
+       u_int i, blocks, pblocks, rem;
+
+       state = _mm_load_si128((__m128i*)&this->state);
+       blocks = len / AES_BLOCK_SIZE;
+       pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
+       rem = len % AES_BLOCK_SIZE;
+       bi = (__m128i*)in;
+       bo = (__m128i*)out;
+
+       ks = this->key->schedule;
+
+       for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
+       {
+               d1 = _mm_loadu_si128(bi + i + 0);
+               d2 = _mm_loadu_si128(bi + i + 1);
+               d3 = _mm_loadu_si128(bi + i + 2);
+               d4 = _mm_loadu_si128(bi + i + 3);
+
+               t1 = _mm_xor_si128(state, ks[0]);
+               state = increment_be(state);
+               t2 = _mm_xor_si128(state, ks[0]);
+               state = increment_be(state);
+               t3 = _mm_xor_si128(state, ks[0]);
+               state = increment_be(state);
+               t4 = _mm_xor_si128(state, ks[0]);
+               state = increment_be(state);
+
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t2 = _mm_aesenc_si128(t2, ks[1]);
+               t3 = _mm_aesenc_si128(t3, ks[1]);
+               t4 = _mm_aesenc_si128(t4, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t2 = _mm_aesenc_si128(t2, ks[2]);
+               t3 = _mm_aesenc_si128(t3, ks[2]);
+               t4 = _mm_aesenc_si128(t4, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t2 = _mm_aesenc_si128(t2, ks[3]);
+               t3 = _mm_aesenc_si128(t3, ks[3]);
+               t4 = _mm_aesenc_si128(t4, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t2 = _mm_aesenc_si128(t2, ks[4]);
+               t3 = _mm_aesenc_si128(t3, ks[4]);
+               t4 = _mm_aesenc_si128(t4, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t2 = _mm_aesenc_si128(t2, ks[5]);
+               t3 = _mm_aesenc_si128(t3, ks[5]);
+               t4 = _mm_aesenc_si128(t4, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t2 = _mm_aesenc_si128(t2, ks[6]);
+               t3 = _mm_aesenc_si128(t3, ks[6]);
+               t4 = _mm_aesenc_si128(t4, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t2 = _mm_aesenc_si128(t2, ks[7]);
+               t3 = _mm_aesenc_si128(t3, ks[7]);
+               t4 = _mm_aesenc_si128(t4, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t2 = _mm_aesenc_si128(t2, ks[8]);
+               t3 = _mm_aesenc_si128(t3, ks[8]);
+               t4 = _mm_aesenc_si128(t4, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t2 = _mm_aesenc_si128(t2, ks[9]);
+               t3 = _mm_aesenc_si128(t3, ks[9]);
+               t4 = _mm_aesenc_si128(t4, ks[9]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[10]);
+               t2 = _mm_aesenclast_si128(t2, ks[10]);
+               t3 = _mm_aesenclast_si128(t3, ks[10]);
+               t4 = _mm_aesenclast_si128(t4, ks[10]);
+               t1 = _mm_xor_si128(t1, d1);
+               t2 = _mm_xor_si128(t2, d2);
+               t3 = _mm_xor_si128(t3, d3);
+               t4 = _mm_xor_si128(t4, d4);
+               _mm_storeu_si128(bo + i + 0, t1);
+               _mm_storeu_si128(bo + i + 1, t2);
+               _mm_storeu_si128(bo + i + 2, t3);
+               _mm_storeu_si128(bo + i + 3, t4);
+       }
+
+       for (i = pblocks; i < blocks; i++)
+       {
+               d1 = _mm_loadu_si128(bi + i);
+
+               t1 = _mm_xor_si128(state, ks[0]);
+               state = increment_be(state);
+
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[10]);
+               t1 = _mm_xor_si128(t1, d1);
+               _mm_storeu_si128(bo + i, t1);
+       }
+
+       if (rem)
+       {
+               memset(&b, 0, sizeof(b));
+               memcpy(&b, bi + blocks, rem);
+
+               d1 = _mm_loadu_si128(&b);
+               t1 = _mm_xor_si128(state, ks[0]);
+
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[10]);
+               t1 = _mm_xor_si128(t1, d1);
+               _mm_storeu_si128(&b, t1);
+
+               memcpy(bo + blocks, &b, rem);
+       }
+}
+
+/**
+ * AES-192 CTR encryption
+ */
+static void encrypt_ctr192(private_aesni_ctr_t *this,
+                                                  size_t len, u_char *in, u_char *out)
+{
+       __m128i t1, t2, t3, t4;
+       __m128i d1, d2, d3, d4;
+       __m128i *ks, state, b, *bi, *bo;
+       u_int i, blocks, pblocks, rem;
+
+       state = _mm_load_si128((__m128i*)&this->state);
+       blocks = len / AES_BLOCK_SIZE;
+       pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
+       rem = len % AES_BLOCK_SIZE;
+       bi = (__m128i*)in;
+       bo = (__m128i*)out;
+
+       ks = this->key->schedule;
+
+       for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
+       {
+               d1 = _mm_loadu_si128(bi + i + 0);
+               d2 = _mm_loadu_si128(bi + i + 1);
+               d3 = _mm_loadu_si128(bi + i + 2);
+               d4 = _mm_loadu_si128(bi + i + 3);
+
+               t1 = _mm_xor_si128(state, ks[0]);
+               state = increment_be(state);
+               t2 = _mm_xor_si128(state, ks[0]);
+               state = increment_be(state);
+               t3 = _mm_xor_si128(state, ks[0]);
+               state = increment_be(state);
+               t4 = _mm_xor_si128(state, ks[0]);
+               state = increment_be(state);
+
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t2 = _mm_aesenc_si128(t2, ks[1]);
+               t3 = _mm_aesenc_si128(t3, ks[1]);
+               t4 = _mm_aesenc_si128(t4, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t2 = _mm_aesenc_si128(t2, ks[2]);
+               t3 = _mm_aesenc_si128(t3, ks[2]);
+               t4 = _mm_aesenc_si128(t4, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t2 = _mm_aesenc_si128(t2, ks[3]);
+               t3 = _mm_aesenc_si128(t3, ks[3]);
+               t4 = _mm_aesenc_si128(t4, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t2 = _mm_aesenc_si128(t2, ks[4]);
+               t3 = _mm_aesenc_si128(t3, ks[4]);
+               t4 = _mm_aesenc_si128(t4, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t2 = _mm_aesenc_si128(t2, ks[5]);
+               t3 = _mm_aesenc_si128(t3, ks[5]);
+               t4 = _mm_aesenc_si128(t4, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t2 = _mm_aesenc_si128(t2, ks[6]);
+               t3 = _mm_aesenc_si128(t3, ks[6]);
+               t4 = _mm_aesenc_si128(t4, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t2 = _mm_aesenc_si128(t2, ks[7]);
+               t3 = _mm_aesenc_si128(t3, ks[7]);
+               t4 = _mm_aesenc_si128(t4, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t2 = _mm_aesenc_si128(t2, ks[8]);
+               t3 = _mm_aesenc_si128(t3, ks[8]);
+               t4 = _mm_aesenc_si128(t4, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t2 = _mm_aesenc_si128(t2, ks[9]);
+               t3 = _mm_aesenc_si128(t3, ks[9]);
+               t4 = _mm_aesenc_si128(t4, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t2 = _mm_aesenc_si128(t2, ks[10]);
+               t3 = _mm_aesenc_si128(t3, ks[10]);
+               t4 = _mm_aesenc_si128(t4, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t2 = _mm_aesenc_si128(t2, ks[11]);
+               t3 = _mm_aesenc_si128(t3, ks[11]);
+               t4 = _mm_aesenc_si128(t4, ks[11]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[12]);
+               t2 = _mm_aesenclast_si128(t2, ks[12]);
+               t3 = _mm_aesenclast_si128(t3, ks[12]);
+               t4 = _mm_aesenclast_si128(t4, ks[12]);
+               t1 = _mm_xor_si128(t1, d1);
+               t2 = _mm_xor_si128(t2, d2);
+               t3 = _mm_xor_si128(t3, d3);
+               t4 = _mm_xor_si128(t4, d4);
+               _mm_storeu_si128(bo + i + 0, t1);
+               _mm_storeu_si128(bo + i + 1, t2);
+               _mm_storeu_si128(bo + i + 2, t3);
+               _mm_storeu_si128(bo + i + 3, t4);
+       }
+
+       for (i = pblocks; i < blocks; i++)
+       {
+               d1 = _mm_loadu_si128(bi + i);
+
+               t1 = _mm_xor_si128(state, ks[0]);
+               state = increment_be(state);
+
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[12]);
+               t1 = _mm_xor_si128(t1, d1);
+               _mm_storeu_si128(bo + i, t1);
+       }
+
+       if (rem)
+       {
+               memset(&b, 0, sizeof(b));
+               memcpy(&b, bi + blocks, rem);
+
+               d1 = _mm_loadu_si128(&b);
+               t1 = _mm_xor_si128(state, ks[0]);
+
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[12]);
+               t1 = _mm_xor_si128(t1, d1);
+               _mm_storeu_si128(&b, t1);
+
+               memcpy(bo + blocks, &b, rem);
+       }
+}
+
+/**
+ * AES-256 CTR encryption
+ */
+static void encrypt_ctr256(private_aesni_ctr_t *this,
+                                                  size_t len, u_char *in, u_char *out)
+{
+       __m128i t1, t2, t3, t4;
+       __m128i d1, d2, d3, d4;
+       __m128i *ks, state, b, *bi, *bo;
+       u_int i, blocks, pblocks, rem;
+
+       state = _mm_load_si128((__m128i*)&this->state);
+       blocks = len / AES_BLOCK_SIZE;
+       pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
+       rem = len % AES_BLOCK_SIZE;
+       bi = (__m128i*)in;
+       bo = (__m128i*)out;
+
+       ks = this->key->schedule;
+
+       for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
+       {
+               d1 = _mm_loadu_si128(bi + i + 0);
+               d2 = _mm_loadu_si128(bi + i + 1);
+               d3 = _mm_loadu_si128(bi + i + 2);
+               d4 = _mm_loadu_si128(bi + i + 3);
+
+               t1 = _mm_xor_si128(state, ks[0]);
+               state = increment_be(state);
+               t2 = _mm_xor_si128(state, ks[0]);
+               state = increment_be(state);
+               t3 = _mm_xor_si128(state, ks[0]);
+               state = increment_be(state);
+               t4 = _mm_xor_si128(state, ks[0]);
+               state = increment_be(state);
+
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t2 = _mm_aesenc_si128(t2, ks[1]);
+               t3 = _mm_aesenc_si128(t3, ks[1]);
+               t4 = _mm_aesenc_si128(t4, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t2 = _mm_aesenc_si128(t2, ks[2]);
+               t3 = _mm_aesenc_si128(t3, ks[2]);
+               t4 = _mm_aesenc_si128(t4, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t2 = _mm_aesenc_si128(t2, ks[3]);
+               t3 = _mm_aesenc_si128(t3, ks[3]);
+               t4 = _mm_aesenc_si128(t4, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t2 = _mm_aesenc_si128(t2, ks[4]);
+               t3 = _mm_aesenc_si128(t3, ks[4]);
+               t4 = _mm_aesenc_si128(t4, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t2 = _mm_aesenc_si128(t2, ks[5]);
+               t3 = _mm_aesenc_si128(t3, ks[5]);
+               t4 = _mm_aesenc_si128(t4, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t2 = _mm_aesenc_si128(t2, ks[6]);
+               t3 = _mm_aesenc_si128(t3, ks[6]);
+               t4 = _mm_aesenc_si128(t4, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t2 = _mm_aesenc_si128(t2, ks[7]);
+               t3 = _mm_aesenc_si128(t3, ks[7]);
+               t4 = _mm_aesenc_si128(t4, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t2 = _mm_aesenc_si128(t2, ks[8]);
+               t3 = _mm_aesenc_si128(t3, ks[8]);
+               t4 = _mm_aesenc_si128(t4, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t2 = _mm_aesenc_si128(t2, ks[9]);
+               t3 = _mm_aesenc_si128(t3, ks[9]);
+               t4 = _mm_aesenc_si128(t4, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t2 = _mm_aesenc_si128(t2, ks[10]);
+               t3 = _mm_aesenc_si128(t3, ks[10]);
+               t4 = _mm_aesenc_si128(t4, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t2 = _mm_aesenc_si128(t2, ks[11]);
+               t3 = _mm_aesenc_si128(t3, ks[11]);
+               t4 = _mm_aesenc_si128(t4, ks[11]);
+               t1 = _mm_aesenc_si128(t1, ks[12]);
+               t2 = _mm_aesenc_si128(t2, ks[12]);
+               t3 = _mm_aesenc_si128(t3, ks[12]);
+               t4 = _mm_aesenc_si128(t4, ks[12]);
+               t1 = _mm_aesenc_si128(t1, ks[13]);
+               t2 = _mm_aesenc_si128(t2, ks[13]);
+               t3 = _mm_aesenc_si128(t3, ks[13]);
+               t4 = _mm_aesenc_si128(t4, ks[13]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[14]);
+               t2 = _mm_aesenclast_si128(t2, ks[14]);
+               t3 = _mm_aesenclast_si128(t3, ks[14]);
+               t4 = _mm_aesenclast_si128(t4, ks[14]);
+               t1 = _mm_xor_si128(t1, d1);
+               t2 = _mm_xor_si128(t2, d2);
+               t3 = _mm_xor_si128(t3, d3);
+               t4 = _mm_xor_si128(t4, d4);
+               _mm_storeu_si128(bo + i + 0, t1);
+               _mm_storeu_si128(bo + i + 1, t2);
+               _mm_storeu_si128(bo + i + 2, t3);
+               _mm_storeu_si128(bo + i + 3, t4);
+       }
+
+       for (i = pblocks; i < blocks; i++)
+       {
+               d1 = _mm_loadu_si128(bi + i);
+
+               t1 = _mm_xor_si128(state, ks[0]);
+               state = increment_be(state);
+
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t1 = _mm_aesenc_si128(t1, ks[12]);
+               t1 = _mm_aesenc_si128(t1, ks[13]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[14]);
+               t1 = _mm_xor_si128(t1, d1);
+               _mm_storeu_si128(bo + i, t1);
+       }
+
+       if (rem)
+       {
+               memset(&b, 0, sizeof(b));
+               memcpy(&b, bi + blocks, rem);
+
+               d1 = _mm_loadu_si128(&b);
+               t1 = _mm_xor_si128(state, ks[0]);
+
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t1 = _mm_aesenc_si128(t1, ks[12]);
+               t1 = _mm_aesenc_si128(t1, ks[13]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[14]);
+               t1 = _mm_xor_si128(t1, d1);
+               _mm_storeu_si128(&b, t1);
+
+               memcpy(bo + blocks, &b, rem);
+       }
+}
+
+METHOD(crypter_t, crypt, bool,
+       private_aesni_ctr_t *this, chunk_t in, chunk_t iv, chunk_t *out)
+{
+       u_char *buf;
+
+       if (!this->key || iv.len != sizeof(this->state.iv))
+       {
+               return FALSE;
+       }
+       memcpy(this->state.iv, iv.ptr, sizeof(this->state.iv));
+       this->state.counter = htonl(1);
+
+       buf = in.ptr;
+       if (out)
+       {
+               *out = chunk_alloc(in.len);
+               buf = out->ptr;
+       }
+       this->crypt(this, in.len, in.ptr, buf);
+       return TRUE;
+}
+
+METHOD(crypter_t, get_block_size, size_t,
+       private_aesni_ctr_t *this)
+{
+       return 1;
+}
+
+METHOD(crypter_t, get_iv_size, size_t,
+       private_aesni_ctr_t *this)
+{
+       return sizeof(this->state.iv);
+}
+
+METHOD(crypter_t, get_key_size, size_t,
+       private_aesni_ctr_t *this)
+{
+       return this->key_size + sizeof(this->state.nonce);
+}
+
+METHOD(crypter_t, set_key, bool,
+       private_aesni_ctr_t *this, chunk_t key)
+{
+       if (key.len != get_key_size(this))
+       {
+               return FALSE;
+       }
+
+       memcpy(this->state.nonce, key.ptr + key.len - sizeof(this->state.nonce),
+                  sizeof(this->state.nonce));
+       key.len -= sizeof(this->state.nonce);
+
+       DESTROY_IF(this->key);
+       this->key = aesni_key_create(TRUE, key);
+
+       return this->key;
+}
+
+METHOD(crypter_t, destroy, void,
+       private_aesni_ctr_t *this)
+{
+       DESTROY_IF(this->key);
+       free_align(this);
+}
+
+/**
+ * See header
+ */
+aesni_ctr_t *aesni_ctr_create(encryption_algorithm_t algo, size_t key_size)
+{
+       private_aesni_ctr_t *this;
+
+       if (algo != ENCR_AES_CTR)
+       {
+               return NULL;
+       }
+       switch (key_size)
+       {
+               case 0:
+                       key_size = 16;
+                       break;
+               case 16:
+               case 24:
+               case 32:
+                       break;
+               default:
+                       return NULL;
+       }
+
+       INIT_ALIGN(this, sizeof(__m128i),
+               .public = {
+                       .crypter = {
+                               .encrypt = _crypt,
+                               .decrypt = _crypt,
+                               .get_block_size = _get_block_size,
+                               .get_iv_size = _get_iv_size,
+                               .get_key_size = _get_key_size,
+                               .set_key = _set_key,
+                               .destroy = _destroy,
+                       },
+               },
+               .key_size = key_size,
+       );
+
+       switch (key_size)
+       {
+               case 16:
+                       this->crypt = encrypt_ctr128;
+                       break;
+               case 24:
+                       this->crypt = encrypt_ctr192;
+                       break;
+               case 32:
+                       this->crypt = encrypt_ctr256;
+                       break;
+       }
+
+       return &this->public;
+}
diff --git a/src/libstrongswan/plugins/aesni/aesni_ctr.h b/src/libstrongswan/plugins/aesni/aesni_ctr.h
new file mode 100644 (file)
index 0000000..6126a2c
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+/**
+ * @defgroup aesni_ctr aesni_ctr
+ * @{ @ingroup aesni
+ */
+
+#ifndef AESNI_CTR_H_
+#define AESNI_CTR_H_
+
+#include <library.h>
+
+typedef struct aesni_ctr_t aesni_ctr_t;
+
+/**
+ * CTR mode crypter using AES-NI
+ */
+struct aesni_ctr_t {
+
+       /**
+        * Implements crypter interface
+        */
+       crypter_t crypter;
+};
+
+/**
+ * Create an aesni_ctr instance.
+ *
+ * @param algo                 encryption algorithm, ENCR_AES_CTR
+ * @param key_size             AES key size, in bytes
+ * @return                             AES-CTR crypter, NULL if not supported
+ */
+aesni_ctr_t *aesni_ctr_create(encryption_algorithm_t algo, size_t key_size);
+
+#endif /** AESNI_CTR_H_ @}*/
diff --git a/src/libstrongswan/plugins/aesni/aesni_gcm.c b/src/libstrongswan/plugins/aesni/aesni_gcm.c
new file mode 100644 (file)
index 0000000..53c0b14
--- /dev/null
@@ -0,0 +1,1447 @@
+/*
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include "aesni_gcm.h"
+#include "aesni_key.h"
+
+#include <crypto/iv/iv_gen_seq.h>
+
+#include <tmmintrin.h>
+
+#define NONCE_SIZE 12
+#define IV_SIZE 8
+#define SALT_SIZE (NONCE_SIZE - IV_SIZE)
+
+/**
+ * Parallel pipelining
+ */
+#define GCM_CRYPT_PARALLELISM 4
+
+typedef struct private_aesni_gcm_t private_aesni_gcm_t;
+
+/**
+ * GCM en/decryption method type
+ */
+typedef void (*aesni_gcm_fn_t)(private_aesni_gcm_t*, size_t, u_char*, u_char*,
+                                                          u_char*, size_t, u_char*, u_char*);
+
+/**
+ * Private data of an aesni_gcm_t object.
+ */
+struct private_aesni_gcm_t {
+
+       /**
+        * Public aesni_gcm_t interface.
+        */
+       aesni_gcm_t public;
+
+       /**
+        * Encryption key schedule
+        */
+       aesni_key_t *key;
+
+       /**
+        * IV generator.
+        */
+       iv_gen_t *iv_gen;
+
+       /**
+        * Length of the integrity check value
+        */
+       size_t icv_size;
+
+       /**
+        * Length of the key in bytes
+        */
+       size_t key_size;
+
+       /**
+        * GCM encryption function
+        */
+       aesni_gcm_fn_t encrypt;
+
+       /**
+        * GCM decryption function
+        */
+       aesni_gcm_fn_t decrypt;
+
+       /**
+        * salt to add to nonce
+        */
+       u_char salt[SALT_SIZE];
+
+       /**
+        * GHASH subkey H, big-endian
+        */
+       __m128i h;
+
+       /**
+        * GHASH key H^2, big-endian
+        */
+       __m128i hh;
+
+       /**
+        * GHASH key H^3, big-endian
+        */
+       __m128i hhh;
+
+       /**
+        * GHASH key H^4, big-endian
+        */
+       __m128i hhhh;
+};
+
+/**
+ * Byte-swap a 128-bit integer
+ */
+static inline __m128i swap128(__m128i x)
+{
+       return _mm_shuffle_epi8(x,
+                       _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
+}
+
+/**
+ * Multiply two blocks in GF128
+ */
+static __m128i mult_block(__m128i h, __m128i y)
+{
+       __m128i t1, t2, t3, t4, t5, t6;
+
+       y = swap128(y);
+
+       t1 = _mm_clmulepi64_si128(h, y, 0x00);
+       t2 = _mm_clmulepi64_si128(h, y, 0x01);
+       t3 = _mm_clmulepi64_si128(h, y, 0x10);
+       t4 = _mm_clmulepi64_si128(h, y, 0x11);
+
+       t2 = _mm_xor_si128(t2, t3);
+       t3 = _mm_slli_si128(t2, 8);
+       t2 = _mm_srli_si128(t2, 8);
+       t1 = _mm_xor_si128(t1, t3);
+       t4 = _mm_xor_si128(t4, t2);
+
+       t5 = _mm_srli_epi32(t1, 31);
+       t1 = _mm_slli_epi32(t1, 1);
+       t6 = _mm_srli_epi32(t4, 31);
+       t4 = _mm_slli_epi32(t4, 1);
+
+       t3 = _mm_srli_si128(t5, 12);
+       t6 = _mm_slli_si128(t6, 4);
+       t5 = _mm_slli_si128(t5, 4);
+       t1 = _mm_or_si128(t1, t5);
+       t4 = _mm_or_si128(t4, t6);
+       t4 = _mm_or_si128(t4, t3);
+
+       t5 = _mm_slli_epi32(t1, 31);
+       t6 = _mm_slli_epi32(t1, 30);
+       t3 = _mm_slli_epi32(t1, 25);
+
+       t5 = _mm_xor_si128(t5, t6);
+       t5 = _mm_xor_si128(t5, t3);
+       t6 = _mm_srli_si128(t5, 4);
+       t4 = _mm_xor_si128(t4, t6);
+       t5 = _mm_slli_si128(t5, 12);
+       t1 = _mm_xor_si128(t1, t5);
+       t4 = _mm_xor_si128(t4, t1);
+
+       t5 = _mm_srli_epi32(t1, 1);
+       t2 = _mm_srli_epi32(t1, 2);
+       t3 = _mm_srli_epi32(t1, 7);
+       t4 = _mm_xor_si128(t4, t2);
+       t4 = _mm_xor_si128(t4, t3);
+       t4 = _mm_xor_si128(t4, t5);
+
+       return swap128(t4);
+}
+
+/**
+ * Multiply four consecutive blocks by their respective GHASH key, XOR
+ */
+static inline __m128i mult4xor(__m128i h1, __m128i h2, __m128i h3, __m128i h4,
+                                                          __m128i d1, __m128i d2, __m128i d3, __m128i d4)
+{
+       __m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
+
+       d1 = swap128(d1);
+       d2 = swap128(d2);
+       d3 = swap128(d3);
+       d4 = swap128(d4);
+
+       t0 = _mm_clmulepi64_si128(h1, d1, 0x00);
+       t1 = _mm_clmulepi64_si128(h2, d2, 0x00);
+       t2 = _mm_clmulepi64_si128(h3, d3, 0x00);
+       t3 = _mm_clmulepi64_si128(h4, d4, 0x00);
+       t8 = _mm_xor_si128(t0, t1);
+       t8 = _mm_xor_si128(t8, t2);
+       t8 = _mm_xor_si128(t8, t3);
+
+       t4 = _mm_clmulepi64_si128(h1, d1, 0x11);
+       t5 = _mm_clmulepi64_si128(h2, d2, 0x11);
+       t6 = _mm_clmulepi64_si128(h3, d3, 0x11);
+       t7 = _mm_clmulepi64_si128(h4, d4, 0x11);
+       t9 = _mm_xor_si128(t4, t5);
+       t9 = _mm_xor_si128(t9, t6);
+       t9 = _mm_xor_si128(t9, t7);
+
+       t0 = _mm_shuffle_epi32(h1, 78);
+       t4 = _mm_shuffle_epi32(d1, 78);
+       t0 = _mm_xor_si128(t0, h1);
+       t4 = _mm_xor_si128(t4, d1);
+       t1 = _mm_shuffle_epi32(h2, 78);
+       t5 = _mm_shuffle_epi32(d2, 78);
+       t1 = _mm_xor_si128(t1, h2);
+       t5 = _mm_xor_si128(t5, d2);
+       t2 = _mm_shuffle_epi32(h3, 78);
+       t6 = _mm_shuffle_epi32(d3, 78);
+       t2 = _mm_xor_si128(t2, h3);
+       t6 = _mm_xor_si128(t6, d3);
+       t3 = _mm_shuffle_epi32(h4, 78);
+       t7 = _mm_shuffle_epi32(d4, 78);
+       t3 = _mm_xor_si128(t3, h4);
+       t7 = _mm_xor_si128(t7, d4);
+
+       t0 = _mm_clmulepi64_si128(t0, t4, 0x00);
+       t1 = _mm_clmulepi64_si128(t1, t5, 0x00);
+       t2 = _mm_clmulepi64_si128(t2, t6, 0x00);
+       t3 = _mm_clmulepi64_si128(t3, t7, 0x00);
+       t0 = _mm_xor_si128(t0, t8);
+       t0 = _mm_xor_si128(t0, t9);
+       t0 = _mm_xor_si128(t1, t0);
+       t0 = _mm_xor_si128(t2, t0);
+
+       t0 = _mm_xor_si128(t3, t0);
+       t4 = _mm_slli_si128(t0, 8);
+       t0 = _mm_srli_si128(t0, 8);
+       t3 = _mm_xor_si128(t4, t8);
+       t6 = _mm_xor_si128(t0, t9);
+       t7 = _mm_srli_epi32(t3, 31);
+       t8 = _mm_srli_epi32(t6, 31);
+       t3 = _mm_slli_epi32(t3, 1);
+       t6 = _mm_slli_epi32(t6, 1);
+       t9 = _mm_srli_si128(t7, 12);
+       t8 = _mm_slli_si128(t8, 4);
+       t7 = _mm_slli_si128(t7, 4);
+       t3 = _mm_or_si128(t3, t7);
+       t6 = _mm_or_si128(t6, t8);
+       t6 = _mm_or_si128(t6, t9);
+       t7 = _mm_slli_epi32(t3, 31);
+       t8 = _mm_slli_epi32(t3, 30);
+       t9 = _mm_slli_epi32(t3, 25);
+       t7 = _mm_xor_si128(t7, t8);
+       t7 = _mm_xor_si128(t7, t9);
+       t8 = _mm_srli_si128(t7, 4);
+       t7 = _mm_slli_si128(t7, 12);
+       t3 = _mm_xor_si128(t3, t7);
+       t2 = _mm_srli_epi32(t3, 1);
+       t4 = _mm_srli_epi32(t3, 2);
+       t5 = _mm_srli_epi32(t3, 7);
+       t2 = _mm_xor_si128(t2, t4);
+       t2 = _mm_xor_si128(t2, t5);
+       t2 = _mm_xor_si128(t2, t8);
+       t3 = _mm_xor_si128(t3, t2);
+       t6 = _mm_xor_si128(t6, t3);
+
+       return swap128(t6);
+}
+
+/**
+ * GHASH update for one block: fold x into the digest y, then multiply by h
+ * in GF(2^128).
+ */
+static __m128i ghash(__m128i h, __m128i y, __m128i x)
+{
+	__m128i folded;
+
+	folded = _mm_xor_si128(y, x);
+	return mult_block(h, folded);
+}
+
+/**
+ * Start constructing the ICV for the associated data
+ *
+ * Hashes alen bytes at assoc into a fresh GHASH state and returns it as Y.
+ * Full blocks are consumed four at a time using the precomputed powers of
+ * the hash subkey (h, h^2, h^3, h^4), falling back to one block at a time
+ * for the leftover full blocks, and a zero-padded block for any partial
+ * tail.
+ */
+static __m128i icv_header(private_aesni_gcm_t *this, void *assoc, size_t alen)
+{
+	u_int blocks, pblocks, rem, i;
+	__m128i h1, h2, h3, h4, d1, d2, d3, d4;
+	__m128i y, last, *ab;
+
+	/* h1..h4 are h^4..h^1; the 4-way multiply needs descending powers */
+	h1 = this->hhhh;
+	h2 = this->hhh;
+	h3 = this->hh;
+	h4 = this->h;
+
+	y = _mm_setzero_si128();
+	ab = assoc;
+	blocks = alen / AES_BLOCK_SIZE;
+	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
+	rem = alen % AES_BLOCK_SIZE;
+	/* 4-way parallel GHASH over groups of four full blocks */
+	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
+	{
+		d1 = _mm_loadu_si128(ab + i + 0);
+		d2 = _mm_loadu_si128(ab + i + 1);
+		d3 = _mm_loadu_si128(ab + i + 2);
+		d4 = _mm_loadu_si128(ab + i + 3);
+		y = _mm_xor_si128(y, d1);
+		y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
+	}
+	/* remaining full blocks, one at a time */
+	for (i = pblocks; i < blocks; i++)
+	{
+		y = ghash(this->h, y, _mm_loadu_si128(ab + i));
+	}
+	/* partial tail block, zero-padded to a full block */
+	if (rem)
+	{
+		last = _mm_setzero_si128();
+		memcpy(&last, ab + blocks, rem);
+
+		y = ghash(this->h, y, last);
+	}
+
+	return y;
+}
+
+/**
+ * Complete the ICV by hashing the final length block
+ *
+ * The block holds len(assoc) and len(data), each as a 64-bit big-endian
+ * bit count, as required by the GCM specification.
+ */
+static __m128i icv_tailer(private_aesni_gcm_t *this, __m128i y,
+						  size_t alen, size_t dlen)
+{
+	__m128i lenblk;
+
+	/* lengths are written in bits, big-endian */
+	htoun64(&lenblk, alen * 8);
+	htoun64(((u_char*)&lenblk) + sizeof(u_int64_t), dlen * 8);
+
+	return ghash(this->h, y, lenblk);
+}
+
+/**
+ * En-/Decrypt the ICV, trim and store it
+ *
+ * Computes E(K, J0), XORs it with the GHASH result y and stores the first
+ * icv_size bytes of the tag at icv.
+ */
+static void icv_crypt(private_aesni_gcm_t *this, __m128i y, __m128i j,
+					  u_char *icv)
+{
+	__m128i *key, state, buf;
+	u_int i;
+
+	/* encrypt the pre-counter block J0 */
+	key = this->key->schedule;
+	state = _mm_xor_si128(j, key[0]);
+	for (i = 1; i < this->key->rounds; i++)
+	{
+		state = _mm_aesenc_si128(state, key[i]);
+	}
+	state = _mm_aesenclast_si128(state, key[this->key->rounds]);
+
+	/* tag = E(K, J0) XOR GHASH, truncated to the configured ICV size */
+	state = _mm_xor_si128(y, state);
+
+	_mm_storeu_si128(&buf, state);
+	memcpy(icv, &buf, this->icv_size);
+}
+
+/**
+ * Increment x as a big-endian counter
+ *
+ * Byte-swaps to native order, adds one to the low 64-bit lane and swaps
+ * back. As in the original, a carry does not propagate into the upper
+ * lane; GCM counters never run that far within a single message.
+ */
+static inline __m128i increment_be(__m128i x)
+{
+	return swap128(_mm_add_epi64(swap128(x), _mm_set_epi32(0, 0, 0, 1)));
+}
+
+/**
+ * Generate the pre-counter block J0 = salt || IV || 0^31 || 1
+ */
+static inline __m128i create_j(private_aesni_gcm_t *this, u_char *iv)
+{
+	u_char block[AES_BLOCK_SIZE];
+
+	memcpy(block, this->salt, SALT_SIZE);
+	memcpy(block + SALT_SIZE, iv, IV_SIZE);
+	/* 32-bit big-endian block counter, starting at 1 */
+	htoun32(block + SALT_SIZE + IV_SIZE, 1);
+
+	return _mm_loadu_si128((__m128i*)block);
+}
+
+/**
+ * Encrypt a remaining incomplete block, return updated Y
+ *
+ * Zero-pads the rem plaintext bytes, XORs them with the keystream for
+ * counter block cb, stores rem ciphertext bytes to out, then folds the
+ * ciphertext (zero-padded again, since the unused keystream bytes must
+ * not enter the hash) into the GHASH state.
+ */
+static __m128i encrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
+							   void *in, void *out, __m128i cb, __m128i y)
+{
+	__m128i *ks, t, b;
+	u_int round;
+
+	memset(&b, 0, sizeof(b));
+	memcpy(&b, in, rem);
+
+	/* keystream = E(K, cb) */
+	ks = this->key->schedule;
+	t = _mm_xor_si128(cb, ks[0]);
+	for (round = 1; round < this->key->rounds; round++)
+	{
+		t = _mm_aesenc_si128(t, ks[round]);
+	}
+	t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
+	b = _mm_xor_si128(t, b);
+
+	memcpy(out, &b, rem);
+
+	/* clear keystream bytes beyond rem before hashing the ciphertext */
+	memset((u_char*)&b + rem, 0, AES_BLOCK_SIZE - rem);
+	return ghash(this->h, y, b);
+}
+
+/**
+ * Decrypt a remaining incomplete block, return updated Y
+ *
+ * Folds the zero-padded ciphertext into the GHASH state first (GCM hashes
+ * ciphertext), then XORs it with the keystream for counter block cb and
+ * stores rem plaintext bytes to out.
+ */
+static __m128i decrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
+							   void *in, void *out, __m128i cb, __m128i y)
+{
+	__m128i *ks, t, b;
+	u_int round;
+
+	memset(&b, 0, sizeof(b));
+	memcpy(&b, in, rem);
+
+	/* hash the ciphertext before decrypting it */
+	y = ghash(this->h, y, b);
+
+	/* keystream = E(K, cb) */
+	ks = this->key->schedule;
+	t = _mm_xor_si128(cb, ks[0]);
+	for (round = 1; round < this->key->rounds; round++)
+	{
+		t = _mm_aesenc_si128(t, ks[round]);
+	}
+	t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
+	b = _mm_xor_si128(t, b);
+
+	memcpy(out, &b, rem);
+
+	return y;
+}
+
+/**
+ * AES-128 GCM encryption/ICV generation
+ *
+ * Encrypts len bytes from in to out in counter mode while hashing the
+ * produced ciphertext, hashes alen bytes of associated data, and writes
+ * the (truncated) tag to icv. Four counter blocks are processed per
+ * iteration with the 10 AES-128 rounds fully unrolled and interleaved to
+ * keep the AES-NI pipeline busy.
+ */
+static void encrypt_gcm128(private_aesni_gcm_t *this,
+						   size_t len, u_char *in, u_char *out, u_char *iv,
+						   size_t alen, u_char *assoc, u_char *icv)
+{
+	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
+	__m128i *ks, y, j, cb, *bi, *bo;
+	u_int blocks, pblocks, rem, i;
+
+	j = create_j(this, iv);
+	cb = increment_be(j);
+	y = icv_header(this, assoc, alen);
+	blocks = len / AES_BLOCK_SIZE;
+	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
+	rem = len % AES_BLOCK_SIZE;
+	bi = (__m128i*)in;
+	bo = (__m128i*)out;
+
+	ks = this->key->schedule;
+
+	/* encrypt four counter blocks in parallel, then fold the resulting
+	 * ciphertext blocks into the GHASH state */
+	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
+	{
+		d1 = _mm_loadu_si128(bi + i + 0);
+		d2 = _mm_loadu_si128(bi + i + 1);
+		d3 = _mm_loadu_si128(bi + i + 2);
+		d4 = _mm_loadu_si128(bi + i + 3);
+
+		t1 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+		t2 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+		t3 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+		t4 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+
+		t1 = _mm_aesenc_si128(t1, ks[1]);
+		t2 = _mm_aesenc_si128(t2, ks[1]);
+		t3 = _mm_aesenc_si128(t3, ks[1]);
+		t4 = _mm_aesenc_si128(t4, ks[1]);
+		t1 = _mm_aesenc_si128(t1, ks[2]);
+		t2 = _mm_aesenc_si128(t2, ks[2]);
+		t3 = _mm_aesenc_si128(t3, ks[2]);
+		t4 = _mm_aesenc_si128(t4, ks[2]);
+		t1 = _mm_aesenc_si128(t1, ks[3]);
+		t2 = _mm_aesenc_si128(t2, ks[3]);
+		t3 = _mm_aesenc_si128(t3, ks[3]);
+		t4 = _mm_aesenc_si128(t4, ks[3]);
+		t1 = _mm_aesenc_si128(t1, ks[4]);
+		t2 = _mm_aesenc_si128(t2, ks[4]);
+		t3 = _mm_aesenc_si128(t3, ks[4]);
+		t4 = _mm_aesenc_si128(t4, ks[4]);
+		t1 = _mm_aesenc_si128(t1, ks[5]);
+		t2 = _mm_aesenc_si128(t2, ks[5]);
+		t3 = _mm_aesenc_si128(t3, ks[5]);
+		t4 = _mm_aesenc_si128(t4, ks[5]);
+		t1 = _mm_aesenc_si128(t1, ks[6]);
+		t2 = _mm_aesenc_si128(t2, ks[6]);
+		t3 = _mm_aesenc_si128(t3, ks[6]);
+		t4 = _mm_aesenc_si128(t4, ks[6]);
+		t1 = _mm_aesenc_si128(t1, ks[7]);
+		t2 = _mm_aesenc_si128(t2, ks[7]);
+		t3 = _mm_aesenc_si128(t3, ks[7]);
+		t4 = _mm_aesenc_si128(t4, ks[7]);
+		t1 = _mm_aesenc_si128(t1, ks[8]);
+		t2 = _mm_aesenc_si128(t2, ks[8]);
+		t3 = _mm_aesenc_si128(t3, ks[8]);
+		t4 = _mm_aesenc_si128(t4, ks[8]);
+		t1 = _mm_aesenc_si128(t1, ks[9]);
+		t2 = _mm_aesenc_si128(t2, ks[9]);
+		t3 = _mm_aesenc_si128(t3, ks[9]);
+		t4 = _mm_aesenc_si128(t4, ks[9]);
+
+		t1 = _mm_aesenclast_si128(t1, ks[10]);
+		t2 = _mm_aesenclast_si128(t2, ks[10]);
+		t3 = _mm_aesenclast_si128(t3, ks[10]);
+		t4 = _mm_aesenclast_si128(t4, ks[10]);
+
+		/* ciphertext = plaintext XOR keystream */
+		t1 = _mm_xor_si128(t1, d1);
+		t2 = _mm_xor_si128(t2, d2);
+		t3 = _mm_xor_si128(t3, d3);
+		t4 = _mm_xor_si128(t4, d4);
+
+		y = _mm_xor_si128(y, t1);
+		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);
+
+		_mm_storeu_si128(bo + i + 0, t1);
+		_mm_storeu_si128(bo + i + 1, t2);
+		_mm_storeu_si128(bo + i + 2, t3);
+		_mm_storeu_si128(bo + i + 3, t4);
+	}
+
+	/* remaining full blocks, one at a time */
+	for (i = pblocks; i < blocks; i++)
+	{
+		d1 = _mm_loadu_si128(bi + i);
+
+		t1 = _mm_xor_si128(cb, ks[0]);
+		t1 = _mm_aesenc_si128(t1, ks[1]);
+		t1 = _mm_aesenc_si128(t1, ks[2]);
+		t1 = _mm_aesenc_si128(t1, ks[3]);
+		t1 = _mm_aesenc_si128(t1, ks[4]);
+		t1 = _mm_aesenc_si128(t1, ks[5]);
+		t1 = _mm_aesenc_si128(t1, ks[6]);
+		t1 = _mm_aesenc_si128(t1, ks[7]);
+		t1 = _mm_aesenc_si128(t1, ks[8]);
+		t1 = _mm_aesenc_si128(t1, ks[9]);
+		t1 = _mm_aesenclast_si128(t1, ks[10]);
+
+		t1 = _mm_xor_si128(t1, d1);
+		_mm_storeu_si128(bo + i, t1);
+
+		y = ghash(this->h, y, t1);
+
+		cb = increment_be(cb);
+	}
+
+	/* partial trailing block, then close the hash with the length block */
+	if (rem)
+	{
+		y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
+	}
+	y = icv_tailer(this, y, alen, len);
+	icv_crypt(this, y, j, icv);
+}
+
+/**
+ * AES-128 GCM decryption/ICV generation
+ *
+ * Decrypts len bytes from in to out in counter mode. The GHASH is computed
+ * over the ciphertext, so blocks are hashed before being decrypted; the
+ * resulting tag is written to icv for the caller to verify. Four blocks
+ * are processed per iteration with the 10 AES-128 rounds unrolled.
+ */
+static void decrypt_gcm128(private_aesni_gcm_t *this,
+						   size_t len, u_char *in, u_char *out, u_char *iv,
+						   size_t alen, u_char *assoc, u_char *icv)
+{
+	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
+	__m128i *ks, y, j, cb, *bi, *bo;
+	u_int blocks, pblocks, rem, i;
+
+	j = create_j(this, iv);
+	cb = increment_be(j);
+	y = icv_header(this, assoc, alen);
+	blocks = len / AES_BLOCK_SIZE;
+	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
+	rem = len % AES_BLOCK_SIZE;
+	bi = (__m128i*)in;
+	bo = (__m128i*)out;
+
+	ks = this->key->schedule;
+
+	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
+	{
+		d1 = _mm_loadu_si128(bi + i + 0);
+		d2 = _mm_loadu_si128(bi + i + 1);
+		d3 = _mm_loadu_si128(bi + i + 2);
+		d4 = _mm_loadu_si128(bi + i + 3);
+
+		/* hash the ciphertext blocks before decrypting them */
+		y = _mm_xor_si128(y, d1);
+		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);
+
+		t1 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+		t2 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+		t3 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+		t4 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+
+		t1 = _mm_aesenc_si128(t1, ks[1]);
+		t2 = _mm_aesenc_si128(t2, ks[1]);
+		t3 = _mm_aesenc_si128(t3, ks[1]);
+		t4 = _mm_aesenc_si128(t4, ks[1]);
+		t1 = _mm_aesenc_si128(t1, ks[2]);
+		t2 = _mm_aesenc_si128(t2, ks[2]);
+		t3 = _mm_aesenc_si128(t3, ks[2]);
+		t4 = _mm_aesenc_si128(t4, ks[2]);
+		t1 = _mm_aesenc_si128(t1, ks[3]);
+		t2 = _mm_aesenc_si128(t2, ks[3]);
+		t3 = _mm_aesenc_si128(t3, ks[3]);
+		t4 = _mm_aesenc_si128(t4, ks[3]);
+		t1 = _mm_aesenc_si128(t1, ks[4]);
+		t2 = _mm_aesenc_si128(t2, ks[4]);
+		t3 = _mm_aesenc_si128(t3, ks[4]);
+		t4 = _mm_aesenc_si128(t4, ks[4]);
+		t1 = _mm_aesenc_si128(t1, ks[5]);
+		t2 = _mm_aesenc_si128(t2, ks[5]);
+		t3 = _mm_aesenc_si128(t3, ks[5]);
+		t4 = _mm_aesenc_si128(t4, ks[5]);
+		t1 = _mm_aesenc_si128(t1, ks[6]);
+		t2 = _mm_aesenc_si128(t2, ks[6]);
+		t3 = _mm_aesenc_si128(t3, ks[6]);
+		t4 = _mm_aesenc_si128(t4, ks[6]);
+		t1 = _mm_aesenc_si128(t1, ks[7]);
+		t2 = _mm_aesenc_si128(t2, ks[7]);
+		t3 = _mm_aesenc_si128(t3, ks[7]);
+		t4 = _mm_aesenc_si128(t4, ks[7]);
+		t1 = _mm_aesenc_si128(t1, ks[8]);
+		t2 = _mm_aesenc_si128(t2, ks[8]);
+		t3 = _mm_aesenc_si128(t3, ks[8]);
+		t4 = _mm_aesenc_si128(t4, ks[8]);
+		t1 = _mm_aesenc_si128(t1, ks[9]);
+		t2 = _mm_aesenc_si128(t2, ks[9]);
+		t3 = _mm_aesenc_si128(t3, ks[9]);
+		t4 = _mm_aesenc_si128(t4, ks[9]);
+
+		t1 = _mm_aesenclast_si128(t1, ks[10]);
+		t2 = _mm_aesenclast_si128(t2, ks[10]);
+		t3 = _mm_aesenclast_si128(t3, ks[10]);
+		t4 = _mm_aesenclast_si128(t4, ks[10]);
+
+		/* plaintext = ciphertext XOR keystream */
+		t1 = _mm_xor_si128(t1, d1);
+		t2 = _mm_xor_si128(t2, d2);
+		t3 = _mm_xor_si128(t3, d3);
+		t4 = _mm_xor_si128(t4, d4);
+
+		_mm_storeu_si128(bo + i + 0, t1);
+		_mm_storeu_si128(bo + i + 1, t2);
+		_mm_storeu_si128(bo + i + 2, t3);
+		_mm_storeu_si128(bo + i + 3, t4);
+	}
+
+	/* remaining full blocks, one at a time */
+	for (i = pblocks; i < blocks; i++)
+	{
+		d1 = _mm_loadu_si128(bi + i);
+
+		y = ghash(this->h, y, d1);
+
+		t1 = _mm_xor_si128(cb, ks[0]);
+		t1 = _mm_aesenc_si128(t1, ks[1]);
+		t1 = _mm_aesenc_si128(t1, ks[2]);
+		t1 = _mm_aesenc_si128(t1, ks[3]);
+		t1 = _mm_aesenc_si128(t1, ks[4]);
+		t1 = _mm_aesenc_si128(t1, ks[5]);
+		t1 = _mm_aesenc_si128(t1, ks[6]);
+		t1 = _mm_aesenc_si128(t1, ks[7]);
+		t1 = _mm_aesenc_si128(t1, ks[8]);
+		t1 = _mm_aesenc_si128(t1, ks[9]);
+		t1 = _mm_aesenclast_si128(t1, ks[10]);
+
+		t1 = _mm_xor_si128(t1, d1);
+		_mm_storeu_si128(bo + i, t1);
+
+		cb = increment_be(cb);
+	}
+
+	/* partial trailing block, then close the hash with the length block */
+	if (rem)
+	{
+		y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
+	}
+	y = icv_tailer(this, y, alen, len);
+	icv_crypt(this, y, j, icv);
+}
+
+/**
+ * AES-192 GCM encryption/ICV generation
+ *
+ * Identical structure to encrypt_gcm128(), but with the 12 rounds of
+ * AES-192 unrolled (key schedule entries ks[0..12]).
+ */
+static void encrypt_gcm192(private_aesni_gcm_t *this,
+						   size_t len, u_char *in, u_char *out, u_char *iv,
+						   size_t alen, u_char *assoc, u_char *icv)
+{
+	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
+	__m128i *ks, y, j, cb, *bi, *bo;
+	u_int blocks, pblocks, rem, i;
+
+	j = create_j(this, iv);
+	cb = increment_be(j);
+	y = icv_header(this, assoc, alen);
+	blocks = len / AES_BLOCK_SIZE;
+	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
+	rem = len % AES_BLOCK_SIZE;
+	bi = (__m128i*)in;
+	bo = (__m128i*)out;
+
+	ks = this->key->schedule;
+
+	/* encrypt four counter blocks in parallel, then fold the resulting
+	 * ciphertext blocks into the GHASH state */
+	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
+	{
+		d1 = _mm_loadu_si128(bi + i + 0);
+		d2 = _mm_loadu_si128(bi + i + 1);
+		d3 = _mm_loadu_si128(bi + i + 2);
+		d4 = _mm_loadu_si128(bi + i + 3);
+
+		t1 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+		t2 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+		t3 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+		t4 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+
+		t1 = _mm_aesenc_si128(t1, ks[1]);
+		t2 = _mm_aesenc_si128(t2, ks[1]);
+		t3 = _mm_aesenc_si128(t3, ks[1]);
+		t4 = _mm_aesenc_si128(t4, ks[1]);
+		t1 = _mm_aesenc_si128(t1, ks[2]);
+		t2 = _mm_aesenc_si128(t2, ks[2]);
+		t3 = _mm_aesenc_si128(t3, ks[2]);
+		t4 = _mm_aesenc_si128(t4, ks[2]);
+		t1 = _mm_aesenc_si128(t1, ks[3]);
+		t2 = _mm_aesenc_si128(t2, ks[3]);
+		t3 = _mm_aesenc_si128(t3, ks[3]);
+		t4 = _mm_aesenc_si128(t4, ks[3]);
+		t1 = _mm_aesenc_si128(t1, ks[4]);
+		t2 = _mm_aesenc_si128(t2, ks[4]);
+		t3 = _mm_aesenc_si128(t3, ks[4]);
+		t4 = _mm_aesenc_si128(t4, ks[4]);
+		t1 = _mm_aesenc_si128(t1, ks[5]);
+		t2 = _mm_aesenc_si128(t2, ks[5]);
+		t3 = _mm_aesenc_si128(t3, ks[5]);
+		t4 = _mm_aesenc_si128(t4, ks[5]);
+		t1 = _mm_aesenc_si128(t1, ks[6]);
+		t2 = _mm_aesenc_si128(t2, ks[6]);
+		t3 = _mm_aesenc_si128(t3, ks[6]);
+		t4 = _mm_aesenc_si128(t4, ks[6]);
+		t1 = _mm_aesenc_si128(t1, ks[7]);
+		t2 = _mm_aesenc_si128(t2, ks[7]);
+		t3 = _mm_aesenc_si128(t3, ks[7]);
+		t4 = _mm_aesenc_si128(t4, ks[7]);
+		t1 = _mm_aesenc_si128(t1, ks[8]);
+		t2 = _mm_aesenc_si128(t2, ks[8]);
+		t3 = _mm_aesenc_si128(t3, ks[8]);
+		t4 = _mm_aesenc_si128(t4, ks[8]);
+		t1 = _mm_aesenc_si128(t1, ks[9]);
+		t2 = _mm_aesenc_si128(t2, ks[9]);
+		t3 = _mm_aesenc_si128(t3, ks[9]);
+		t4 = _mm_aesenc_si128(t4, ks[9]);
+		t1 = _mm_aesenc_si128(t1, ks[10]);
+		t2 = _mm_aesenc_si128(t2, ks[10]);
+		t3 = _mm_aesenc_si128(t3, ks[10]);
+		t4 = _mm_aesenc_si128(t4, ks[10]);
+		t1 = _mm_aesenc_si128(t1, ks[11]);
+		t2 = _mm_aesenc_si128(t2, ks[11]);
+		t3 = _mm_aesenc_si128(t3, ks[11]);
+		t4 = _mm_aesenc_si128(t4, ks[11]);
+
+		t1 = _mm_aesenclast_si128(t1, ks[12]);
+		t2 = _mm_aesenclast_si128(t2, ks[12]);
+		t3 = _mm_aesenclast_si128(t3, ks[12]);
+		t4 = _mm_aesenclast_si128(t4, ks[12]);
+
+		/* ciphertext = plaintext XOR keystream */
+		t1 = _mm_xor_si128(t1, d1);
+		t2 = _mm_xor_si128(t2, d2);
+		t3 = _mm_xor_si128(t3, d3);
+		t4 = _mm_xor_si128(t4, d4);
+
+		y = _mm_xor_si128(y, t1);
+		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);
+
+		_mm_storeu_si128(bo + i + 0, t1);
+		_mm_storeu_si128(bo + i + 1, t2);
+		_mm_storeu_si128(bo + i + 2, t3);
+		_mm_storeu_si128(bo + i + 3, t4);
+	}
+
+	/* remaining full blocks, one at a time */
+	for (i = pblocks; i < blocks; i++)
+	{
+		d1 = _mm_loadu_si128(bi + i);
+
+		t1 = _mm_xor_si128(cb, ks[0]);
+		t1 = _mm_aesenc_si128(t1, ks[1]);
+		t1 = _mm_aesenc_si128(t1, ks[2]);
+		t1 = _mm_aesenc_si128(t1, ks[3]);
+		t1 = _mm_aesenc_si128(t1, ks[4]);
+		t1 = _mm_aesenc_si128(t1, ks[5]);
+		t1 = _mm_aesenc_si128(t1, ks[6]);
+		t1 = _mm_aesenc_si128(t1, ks[7]);
+		t1 = _mm_aesenc_si128(t1, ks[8]);
+		t1 = _mm_aesenc_si128(t1, ks[9]);
+		t1 = _mm_aesenc_si128(t1, ks[10]);
+		t1 = _mm_aesenc_si128(t1, ks[11]);
+		t1 = _mm_aesenclast_si128(t1, ks[12]);
+
+		t1 = _mm_xor_si128(t1, d1);
+		_mm_storeu_si128(bo + i, t1);
+
+		y = ghash(this->h, y, t1);
+
+		cb = increment_be(cb);
+	}
+
+	/* partial trailing block, then close the hash with the length block */
+	if (rem)
+	{
+		y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
+	}
+	y = icv_tailer(this, y, alen, len);
+	icv_crypt(this, y, j, icv);
+}
+
+/**
+ * AES-192 GCM decryption/ICV generation
+ *
+ * Identical structure to decrypt_gcm128(), but with the 12 rounds of
+ * AES-192 unrolled (key schedule entries ks[0..12]). Ciphertext blocks
+ * are hashed before being decrypted.
+ */
+static void decrypt_gcm192(private_aesni_gcm_t *this,
+						   size_t len, u_char *in, u_char *out, u_char *iv,
+						   size_t alen, u_char *assoc, u_char *icv)
+{
+	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
+	__m128i *ks, y, j, cb, *bi, *bo;
+	u_int blocks, pblocks, rem, i;
+
+	j = create_j(this, iv);
+	cb = increment_be(j);
+	y = icv_header(this, assoc, alen);
+	blocks = len / AES_BLOCK_SIZE;
+	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
+	rem = len % AES_BLOCK_SIZE;
+	bi = (__m128i*)in;
+	bo = (__m128i*)out;
+
+	ks = this->key->schedule;
+
+	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
+	{
+		d1 = _mm_loadu_si128(bi + i + 0);
+		d2 = _mm_loadu_si128(bi + i + 1);
+		d3 = _mm_loadu_si128(bi + i + 2);
+		d4 = _mm_loadu_si128(bi + i + 3);
+
+		/* hash the ciphertext blocks before decrypting them */
+		y = _mm_xor_si128(y, d1);
+		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);
+
+		t1 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+		t2 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+		t3 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+		t4 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+
+		t1 = _mm_aesenc_si128(t1, ks[1]);
+		t2 = _mm_aesenc_si128(t2, ks[1]);
+		t3 = _mm_aesenc_si128(t3, ks[1]);
+		t4 = _mm_aesenc_si128(t4, ks[1]);
+		t1 = _mm_aesenc_si128(t1, ks[2]);
+		t2 = _mm_aesenc_si128(t2, ks[2]);
+		t3 = _mm_aesenc_si128(t3, ks[2]);
+		t4 = _mm_aesenc_si128(t4, ks[2]);
+		t1 = _mm_aesenc_si128(t1, ks[3]);
+		t2 = _mm_aesenc_si128(t2, ks[3]);
+		t3 = _mm_aesenc_si128(t3, ks[3]);
+		t4 = _mm_aesenc_si128(t4, ks[3]);
+		t1 = _mm_aesenc_si128(t1, ks[4]);
+		t2 = _mm_aesenc_si128(t2, ks[4]);
+		t3 = _mm_aesenc_si128(t3, ks[4]);
+		t4 = _mm_aesenc_si128(t4, ks[4]);
+		t1 = _mm_aesenc_si128(t1, ks[5]);
+		t2 = _mm_aesenc_si128(t2, ks[5]);
+		t3 = _mm_aesenc_si128(t3, ks[5]);
+		t4 = _mm_aesenc_si128(t4, ks[5]);
+		t1 = _mm_aesenc_si128(t1, ks[6]);
+		t2 = _mm_aesenc_si128(t2, ks[6]);
+		t3 = _mm_aesenc_si128(t3, ks[6]);
+		t4 = _mm_aesenc_si128(t4, ks[6]);
+		t1 = _mm_aesenc_si128(t1, ks[7]);
+		t2 = _mm_aesenc_si128(t2, ks[7]);
+		t3 = _mm_aesenc_si128(t3, ks[7]);
+		t4 = _mm_aesenc_si128(t4, ks[7]);
+		t1 = _mm_aesenc_si128(t1, ks[8]);
+		t2 = _mm_aesenc_si128(t2, ks[8]);
+		t3 = _mm_aesenc_si128(t3, ks[8]);
+		t4 = _mm_aesenc_si128(t4, ks[8]);
+		t1 = _mm_aesenc_si128(t1, ks[9]);
+		t2 = _mm_aesenc_si128(t2, ks[9]);
+		t3 = _mm_aesenc_si128(t3, ks[9]);
+		t4 = _mm_aesenc_si128(t4, ks[9]);
+		t1 = _mm_aesenc_si128(t1, ks[10]);
+		t2 = _mm_aesenc_si128(t2, ks[10]);
+		t3 = _mm_aesenc_si128(t3, ks[10]);
+		t4 = _mm_aesenc_si128(t4, ks[10]);
+		t1 = _mm_aesenc_si128(t1, ks[11]);
+		t2 = _mm_aesenc_si128(t2, ks[11]);
+		t3 = _mm_aesenc_si128(t3, ks[11]);
+		t4 = _mm_aesenc_si128(t4, ks[11]);
+
+		t1 = _mm_aesenclast_si128(t1, ks[12]);
+		t2 = _mm_aesenclast_si128(t2, ks[12]);
+		t3 = _mm_aesenclast_si128(t3, ks[12]);
+		t4 = _mm_aesenclast_si128(t4, ks[12]);
+
+		/* plaintext = ciphertext XOR keystream */
+		t1 = _mm_xor_si128(t1, d1);
+		t2 = _mm_xor_si128(t2, d2);
+		t3 = _mm_xor_si128(t3, d3);
+		t4 = _mm_xor_si128(t4, d4);
+
+		_mm_storeu_si128(bo + i + 0, t1);
+		_mm_storeu_si128(bo + i + 1, t2);
+		_mm_storeu_si128(bo + i + 2, t3);
+		_mm_storeu_si128(bo + i + 3, t4);
+	}
+
+	/* remaining full blocks, one at a time */
+	for (i = pblocks; i < blocks; i++)
+	{
+		d1 = _mm_loadu_si128(bi + i);
+
+		y = ghash(this->h, y, d1);
+
+		t1 = _mm_xor_si128(cb, ks[0]);
+		t1 = _mm_aesenc_si128(t1, ks[1]);
+		t1 = _mm_aesenc_si128(t1, ks[2]);
+		t1 = _mm_aesenc_si128(t1, ks[3]);
+		t1 = _mm_aesenc_si128(t1, ks[4]);
+		t1 = _mm_aesenc_si128(t1, ks[5]);
+		t1 = _mm_aesenc_si128(t1, ks[6]);
+		t1 = _mm_aesenc_si128(t1, ks[7]);
+		t1 = _mm_aesenc_si128(t1, ks[8]);
+		t1 = _mm_aesenc_si128(t1, ks[9]);
+		t1 = _mm_aesenc_si128(t1, ks[10]);
+		t1 = _mm_aesenc_si128(t1, ks[11]);
+		t1 = _mm_aesenclast_si128(t1, ks[12]);
+
+		t1 = _mm_xor_si128(t1, d1);
+		_mm_storeu_si128(bo + i, t1);
+
+		cb = increment_be(cb);
+	}
+
+	/* partial trailing block, then close the hash with the length block */
+	if (rem)
+	{
+		y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
+	}
+	y = icv_tailer(this, y, alen, len);
+	icv_crypt(this, y, j, icv);
+}
+
+/**
+ * AES-256 GCM encryption/ICV generation
+ *
+ * Identical structure to encrypt_gcm128(), but with the 14 rounds of
+ * AES-256 unrolled (key schedule entries ks[0..14]).
+ */
+static void encrypt_gcm256(private_aesni_gcm_t *this,
+						   size_t len, u_char *in, u_char *out, u_char *iv,
+						   size_t alen, u_char *assoc, u_char *icv)
+{
+	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
+	__m128i *ks, y, j, cb, *bi, *bo;
+	u_int blocks, pblocks, rem, i;
+
+	j = create_j(this, iv);
+	cb = increment_be(j);
+	y = icv_header(this, assoc, alen);
+	blocks = len / AES_BLOCK_SIZE;
+	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
+	rem = len % AES_BLOCK_SIZE;
+	bi = (__m128i*)in;
+	bo = (__m128i*)out;
+
+	ks = this->key->schedule;
+
+	/* encrypt four counter blocks in parallel, then fold the resulting
+	 * ciphertext blocks into the GHASH state */
+	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
+	{
+		d1 = _mm_loadu_si128(bi + i + 0);
+		d2 = _mm_loadu_si128(bi + i + 1);
+		d3 = _mm_loadu_si128(bi + i + 2);
+		d4 = _mm_loadu_si128(bi + i + 3);
+
+		t1 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+		t2 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+		t3 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+		t4 = _mm_xor_si128(cb, ks[0]);
+		cb = increment_be(cb);
+
+		t1 = _mm_aesenc_si128(t1, ks[1]);
+		t2 = _mm_aesenc_si128(t2, ks[1]);
+		t3 = _mm_aesenc_si128(t3, ks[1]);
+		t4 = _mm_aesenc_si128(t4, ks[1]);
+		t1 = _mm_aesenc_si128(t1, ks[2]);
+		t2 = _mm_aesenc_si128(t2, ks[2]);
+		t3 = _mm_aesenc_si128(t3, ks[2]);
+		t4 = _mm_aesenc_si128(t4, ks[2]);
+		t1 = _mm_aesenc_si128(t1, ks[3]);
+		t2 = _mm_aesenc_si128(t2, ks[3]);
+		t3 = _mm_aesenc_si128(t3, ks[3]);
+		t4 = _mm_aesenc_si128(t4, ks[3]);
+		t1 = _mm_aesenc_si128(t1, ks[4]);
+		t2 = _mm_aesenc_si128(t2, ks[4]);
+		t3 = _mm_aesenc_si128(t3, ks[4]);
+		t4 = _mm_aesenc_si128(t4, ks[4]);
+		t1 = _mm_aesenc_si128(t1, ks[5]);
+		t2 = _mm_aesenc_si128(t2, ks[5]);
+		t3 = _mm_aesenc_si128(t3, ks[5]);
+		t4 = _mm_aesenc_si128(t4, ks[5]);
+		t1 = _mm_aesenc_si128(t1, ks[6]);
+		t2 = _mm_aesenc_si128(t2, ks[6]);
+		t3 = _mm_aesenc_si128(t3, ks[6]);
+		t4 = _mm_aesenc_si128(t4, ks[6]);
+		t1 = _mm_aesenc_si128(t1, ks[7]);
+		t2 = _mm_aesenc_si128(t2, ks[7]);
+		t3 = _mm_aesenc_si128(t3, ks[7]);
+		t4 = _mm_aesenc_si128(t4, ks[7]);
+		t1 = _mm_aesenc_si128(t1, ks[8]);
+		t2 = _mm_aesenc_si128(t2, ks[8]);
+		t3 = _mm_aesenc_si128(t3, ks[8]);
+		t4 = _mm_aesenc_si128(t4, ks[8]);
+		t1 = _mm_aesenc_si128(t1, ks[9]);
+		t2 = _mm_aesenc_si128(t2, ks[9]);
+		t3 = _mm_aesenc_si128(t3, ks[9]);
+		t4 = _mm_aesenc_si128(t4, ks[9]);
+		t1 = _mm_aesenc_si128(t1, ks[10]);
+		t2 = _mm_aesenc_si128(t2, ks[10]);
+		t3 = _mm_aesenc_si128(t3, ks[10]);
+		t4 = _mm_aesenc_si128(t4, ks[10]);
+		t1 = _mm_aesenc_si128(t1, ks[11]);
+		t2 = _mm_aesenc_si128(t2, ks[11]);
+		t3 = _mm_aesenc_si128(t3, ks[11]);
+		t4 = _mm_aesenc_si128(t4, ks[11]);
+		t1 = _mm_aesenc_si128(t1, ks[12]);
+		t2 = _mm_aesenc_si128(t2, ks[12]);
+		t3 = _mm_aesenc_si128(t3, ks[12]);
+		t4 = _mm_aesenc_si128(t4, ks[12]);
+		t1 = _mm_aesenc_si128(t1, ks[13]);
+		t2 = _mm_aesenc_si128(t2, ks[13]);
+		t3 = _mm_aesenc_si128(t3, ks[13]);
+		t4 = _mm_aesenc_si128(t4, ks[13]);
+
+		t1 = _mm_aesenclast_si128(t1, ks[14]);
+		t2 = _mm_aesenclast_si128(t2, ks[14]);
+		t3 = _mm_aesenclast_si128(t3, ks[14]);
+		t4 = _mm_aesenclast_si128(t4, ks[14]);
+
+		/* ciphertext = plaintext XOR keystream */
+		t1 = _mm_xor_si128(t1, d1);
+		t2 = _mm_xor_si128(t2, d2);
+		t3 = _mm_xor_si128(t3, d3);
+		t4 = _mm_xor_si128(t4, d4);
+
+		y = _mm_xor_si128(y, t1);
+		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);
+
+		_mm_storeu_si128(bo + i + 0, t1);
+		_mm_storeu_si128(bo + i + 1, t2);
+		_mm_storeu_si128(bo + i + 2, t3);
+		_mm_storeu_si128(bo + i + 3, t4);
+	}
+
+	/* remaining full blocks, one at a time */
+	for (i = pblocks; i < blocks; i++)
+	{
+		d1 = _mm_loadu_si128(bi + i);
+
+		t1 = _mm_xor_si128(cb, ks[0]);
+		t1 = _mm_aesenc_si128(t1, ks[1]);
+		t1 = _mm_aesenc_si128(t1, ks[2]);
+		t1 = _mm_aesenc_si128(t1, ks[3]);
+		t1 = _mm_aesenc_si128(t1, ks[4]);
+		t1 = _mm_aesenc_si128(t1, ks[5]);
+		t1 = _mm_aesenc_si128(t1, ks[6]);
+		t1 = _mm_aesenc_si128(t1, ks[7]);
+		t1 = _mm_aesenc_si128(t1, ks[8]);
+		t1 = _mm_aesenc_si128(t1, ks[9]);
+		t1 = _mm_aesenc_si128(t1, ks[10]);
+		t1 = _mm_aesenc_si128(t1, ks[11]);
+		t1 = _mm_aesenc_si128(t1, ks[12]);
+		t1 = _mm_aesenc_si128(t1, ks[13]);
+		t1 = _mm_aesenclast_si128(t1, ks[14]);
+
+		t1 = _mm_xor_si128(t1, d1);
+		_mm_storeu_si128(bo + i, t1);
+
+		y = ghash(this->h, y, t1);
+
+		cb = increment_be(cb);
+	}
+
+	/* partial trailing block, then close the hash with the length block */
+	if (rem)
+	{
+		y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
+	}
+	y = icv_tailer(this, y, alen, len);
+	icv_crypt(this, y, j, icv);
+}
+
+/**
+ * AES-256 GCM decryption/ICV generation
+ *
+ * Decrypts len bytes from in to out with the 14-round AES-256 schedule.
+ * Ciphertext blocks are folded into the GHASH state *before* decryption
+ * (decrypt order: authenticate, then XOR with the keystream); the ICV over
+ * the associated/data lengths is finally written to icv for the caller to
+ * compare against the received tag.
+ */
+static void decrypt_gcm256(private_aesni_gcm_t *this,
+                                                  size_t len, u_char *in, u_char *out, u_char *iv,
+                                                  size_t alen, u_char *assoc, u_char *icv)
+{
+       __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+       __m128i *ks, y, j, cb, *bi, *bo;
+       u_int blocks, pblocks, rem, i;
+
+       /* J0 from IV and salt; the first data counter block is J0 + 1 */
+       j = create_j(this, iv);
+       cb = increment_be(j);
+       /* seed the GHASH state with the associated data */
+       y = icv_header(this, assoc, alen);
+       blocks = len / AES_BLOCK_SIZE;
+       /* pblocks: portion handled by the 4-way unrolled loop below */
+       pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
+       rem = len % AES_BLOCK_SIZE;
+       bi = (__m128i*)in;
+       bo = (__m128i*)out;
+
+       ks = this->key->schedule;
+
+       for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
+       {
+               d1 = _mm_loadu_si128(bi + i + 0);
+               d2 = _mm_loadu_si128(bi + i + 1);
+               d3 = _mm_loadu_si128(bi + i + 2);
+               d4 = _mm_loadu_si128(bi + i + 3);
+
+               /* fold all four ciphertext blocks into the GHASH state at once */
+               y = _mm_xor_si128(y, d1);
+               y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);
+
+               t1 = _mm_xor_si128(cb, ks[0]);
+               cb = increment_be(cb);
+               t2 = _mm_xor_si128(cb, ks[0]);
+               cb = increment_be(cb);
+               t3 = _mm_xor_si128(cb, ks[0]);
+               cb = increment_be(cb);
+               t4 = _mm_xor_si128(cb, ks[0]);
+               cb = increment_be(cb);
+
+               /* run the four counter blocks through the rounds in lock-step,
+                * interleaved to keep the AES pipeline busy */
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t2 = _mm_aesenc_si128(t2, ks[1]);
+               t3 = _mm_aesenc_si128(t3, ks[1]);
+               t4 = _mm_aesenc_si128(t4, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t2 = _mm_aesenc_si128(t2, ks[2]);
+               t3 = _mm_aesenc_si128(t3, ks[2]);
+               t4 = _mm_aesenc_si128(t4, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t2 = _mm_aesenc_si128(t2, ks[3]);
+               t3 = _mm_aesenc_si128(t3, ks[3]);
+               t4 = _mm_aesenc_si128(t4, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t2 = _mm_aesenc_si128(t2, ks[4]);
+               t3 = _mm_aesenc_si128(t3, ks[4]);
+               t4 = _mm_aesenc_si128(t4, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t2 = _mm_aesenc_si128(t2, ks[5]);
+               t3 = _mm_aesenc_si128(t3, ks[5]);
+               t4 = _mm_aesenc_si128(t4, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t2 = _mm_aesenc_si128(t2, ks[6]);
+               t3 = _mm_aesenc_si128(t3, ks[6]);
+               t4 = _mm_aesenc_si128(t4, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t2 = _mm_aesenc_si128(t2, ks[7]);
+               t3 = _mm_aesenc_si128(t3, ks[7]);
+               t4 = _mm_aesenc_si128(t4, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t2 = _mm_aesenc_si128(t2, ks[8]);
+               t3 = _mm_aesenc_si128(t3, ks[8]);
+               t4 = _mm_aesenc_si128(t4, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t2 = _mm_aesenc_si128(t2, ks[9]);
+               t3 = _mm_aesenc_si128(t3, ks[9]);
+               t4 = _mm_aesenc_si128(t4, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t2 = _mm_aesenc_si128(t2, ks[10]);
+               t3 = _mm_aesenc_si128(t3, ks[10]);
+               t4 = _mm_aesenc_si128(t4, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t2 = _mm_aesenc_si128(t2, ks[11]);
+               t3 = _mm_aesenc_si128(t3, ks[11]);
+               t4 = _mm_aesenc_si128(t4, ks[11]);
+               t1 = _mm_aesenc_si128(t1, ks[12]);
+               t2 = _mm_aesenc_si128(t2, ks[12]);
+               t3 = _mm_aesenc_si128(t3, ks[12]);
+               t4 = _mm_aesenc_si128(t4, ks[12]);
+               t1 = _mm_aesenc_si128(t1, ks[13]);
+               t2 = _mm_aesenc_si128(t2, ks[13]);
+               t3 = _mm_aesenc_si128(t3, ks[13]);
+               t4 = _mm_aesenc_si128(t4, ks[13]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[14]);
+               t2 = _mm_aesenclast_si128(t2, ks[14]);
+               t3 = _mm_aesenclast_si128(t3, ks[14]);
+               t4 = _mm_aesenclast_si128(t4, ks[14]);
+
+               /* XOR keystream with ciphertext to recover the plaintext */
+               t1 = _mm_xor_si128(t1, d1);
+               t2 = _mm_xor_si128(t2, d2);
+               t3 = _mm_xor_si128(t3, d3);
+               t4 = _mm_xor_si128(t4, d4);
+
+               _mm_storeu_si128(bo + i + 0, t1);
+               _mm_storeu_si128(bo + i + 1, t2);
+               _mm_storeu_si128(bo + i + 2, t3);
+               _mm_storeu_si128(bo + i + 3, t4);
+       }
+
+       /* remaining full blocks, one at a time */
+       for (i = pblocks; i < blocks; i++)
+       {
+               d1 = _mm_loadu_si128(bi + i);
+
+               /* authenticate the ciphertext block before decrypting it */
+               y = ghash(this->h, y, d1);
+
+               t1 = _mm_xor_si128(cb, ks[0]);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t1 = _mm_aesenc_si128(t1, ks[12]);
+               t1 = _mm_aesenc_si128(t1, ks[13]);
+               t1 = _mm_aesenclast_si128(t1, ks[14]);
+
+               t1 = _mm_xor_si128(t1, d1);
+               _mm_storeu_si128(bo + i, t1);
+
+               cb = increment_be(cb);
+       }
+
+       /* trailing partial block, if any */
+       if (rem)
+       {
+               y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
+       }
+       /* close GHASH with the length block, then encrypt with J0 to get the ICV */
+       y = icv_tailer(this, y, alen, len);
+       icv_crypt(this, y, j, icv);
+}
+
+METHOD(aead_t, encrypt, bool,
+       private_aesni_gcm_t *this, chunk_t plain, chunk_t assoc, chunk_t iv,
+       chunk_t *encr)
+{
+       u_char *out;
+
+       /* require a configured key and an exactly IV_SIZE byte IV */
+       if (!this->key || iv.len != IV_SIZE)
+       {
+               return FALSE;
+       }
+       /* without an output chunk, encrypt in-place over the plaintext;
+        * NOTE(review): the ICV is then written at plain.ptr + plain.len, so
+        * the caller's buffer must have icv_size spare bytes — per aead_t
+        * contract, presumably; confirm with callers */
+       out = plain.ptr;
+       if (encr)
+       {
+               /* allocate room for the ciphertext plus the trailing ICV */
+               *encr = chunk_alloc(plain.len + this->icv_size);
+               out = encr->ptr;
+       }
+       this->encrypt(this, plain.len, plain.ptr, out, iv.ptr,
+                                 assoc.len, assoc.ptr, out + plain.len);
+       return TRUE;
+}
+
+METHOD(aead_t, decrypt, bool,
+       private_aesni_gcm_t *this, chunk_t encr, chunk_t assoc, chunk_t iv,
+       chunk_t *plain)
+{
+       u_char *out, icv[this->icv_size];
+
+       /* need a key, a full-size IV and at least the ICV in the input */
+       if (!this->key || iv.len != IV_SIZE || encr.len < this->icv_size)
+       {
+               return FALSE;
+       }
+       /* strip the trailing ICV from the ciphertext length */
+       encr.len -= this->icv_size;
+       /* without an output chunk, decrypt in-place */
+       out = encr.ptr;
+       if (plain)
+       {
+               *plain = chunk_alloc(encr.len);
+               out = plain->ptr;
+       }
+       /* recompute the ICV locally ... */
+       this->decrypt(this, encr.len, encr.ptr, out, iv.ptr,
+                                 assoc.len, assoc.ptr, icv);
+       /* ... and compare it to the received one in constant time */
+       return memeq_const(icv, encr.ptr + encr.len, this->icv_size);
+}
+
+METHOD(aead_t, get_block_size, size_t,
+       private_aesni_gcm_t *this)
+{
+       /* counter-based mode: no padding required, any length is processable */
+       return 1;
+}
+
+METHOD(aead_t, get_icv_size, size_t,
+       private_aesni_gcm_t *this)
+{
+       return this->icv_size;
+}
+
+METHOD(aead_t, get_iv_size, size_t,
+       private_aesni_gcm_t *this)
+{
+       /* per-message (explicit) IV only; the salt is part of the key material */
+       return IV_SIZE;
+}
+
+METHOD(aead_t, get_iv_gen, iv_gen_t*,
+       private_aesni_gcm_t *this)
+{
+       return this->iv_gen;
+}
+
+METHOD(aead_t, get_key_size, size_t,
+       private_aesni_gcm_t *this)
+{
+       /* key material includes the implicit nonce salt */
+       return this->key_size + SALT_SIZE;
+}
+
+METHOD(aead_t, set_key, bool,
+       private_aesni_gcm_t *this, chunk_t key)
+{
+       u_int round;
+       __m128i *ks, h;
+
+       if (key.len != this->key_size + SALT_SIZE)
+       {
+               return FALSE;
+       }
+
+       /* the trailing bytes of the key material are the implicit nonce salt */
+       memcpy(this->salt, key.ptr + key.len - SALT_SIZE, SALT_SIZE);
+       key.len -= SALT_SIZE;
+
+       DESTROY_IF(this->key);
+       /* key.len was validated above, so aesni_key_create() can't fail here */
+       this->key = aesni_key_create(TRUE, key);
+
+       /* derive the GHASH subkey H = AES-K(0^128) */
+       ks = this->key->schedule;
+       h = _mm_xor_si128(_mm_setzero_si128(), ks[0]);
+       for (round = 1; round < this->key->rounds; round++)
+       {
+               h = _mm_aesenc_si128(h, ks[round]);
+       }
+       h = _mm_aesenclast_si128(h, ks[this->key->rounds]);
+
+       /* precompute H^2, H^3 and H^4 for 4-way parallel GHASH, then swap
+        * all subkeys into the representation used by ghash()/mult4xor() */
+       this->h = h;
+       h = swap128(h);
+       this->hh = mult_block(h, this->h);
+       this->hhh = mult_block(h, this->hh);
+       this->hhhh = mult_block(h, this->hhh);
+       this->h = swap128(this->h);
+       this->hh = swap128(this->hh);
+       this->hhh = swap128(this->hhh);
+       this->hhhh = swap128(this->hhhh);
+
+       return TRUE;
+}
+
+METHOD(aead_t, destroy, void,
+       private_aesni_gcm_t *this)
+{
+       DESTROY_IF(this->key);
+       /* the GHASH subkeys are key-derived material, wipe them explicitly */
+       memwipe(&this->h, sizeof(this->h));
+       memwipe(&this->hh, sizeof(this->hh));
+       memwipe(&this->hhh, sizeof(this->hhh));
+       memwipe(&this->hhhh, sizeof(this->hhhh));
+       this->iv_gen->destroy(this->iv_gen);
+       free_align(this);
+}
+
+/**
+ * See header
+ *
+ * Note: the former "algo = ENCR_AES_CBC" assignments in the ICV switch were
+ * dead stores (algo is never read afterwards) and misleadingly relabeled the
+ * GCM variants as CBC; they have been dropped.
+ */
+aesni_gcm_t *aesni_gcm_create(encryption_algorithm_t algo,
+                                                         size_t key_size, size_t salt_size)
+{
+       private_aesni_gcm_t *this;
+       size_t icv_size;
+
+       switch (key_size)
+       {
+               case 0:
+                       /* default to AES-128 if no key size got requested */
+                       key_size = 16;
+                       break;
+               case 16:
+               case 24:
+               case 32:
+                       break;
+               default:
+                       return NULL;
+       }
+       if (salt_size && salt_size != SALT_SIZE)
+       {
+               /* currently not supported */
+               return NULL;
+       }
+       /* derive the ICV length from the requested GCM variant */
+       switch (algo)
+       {
+               case ENCR_AES_GCM_ICV8:
+                       icv_size = 8;
+                       break;
+               case ENCR_AES_GCM_ICV12:
+                       icv_size = 12;
+                       break;
+               case ENCR_AES_GCM_ICV16:
+                       icv_size = 16;
+                       break;
+               default:
+                       return NULL;
+       }
+
+       INIT_ALIGN(this, sizeof(__m128i),
+               .public = {
+                       .aead = {
+                               .encrypt = _encrypt,
+                               .decrypt = _decrypt,
+                               .get_block_size = _get_block_size,
+                               .get_icv_size = _get_icv_size,
+                               .get_iv_size = _get_iv_size,
+                               .get_iv_gen = _get_iv_gen,
+                               .get_key_size = _get_key_size,
+                               .set_key = _set_key,
+                               .destroy = _destroy,
+                       },
+               },
+               .key_size = key_size,
+               .iv_gen = iv_gen_seq_create(),
+               .icv_size = icv_size,
+       );
+
+       /* select crypt backends matching the AES key length */
+       switch (key_size)
+       {
+               case 16:
+                       this->encrypt = encrypt_gcm128;
+                       this->decrypt = decrypt_gcm128;
+                       break;
+               case 24:
+                       this->encrypt = encrypt_gcm192;
+                       this->decrypt = decrypt_gcm192;
+                       break;
+               case 32:
+                       this->encrypt = encrypt_gcm256;
+                       this->decrypt = decrypt_gcm256;
+                       break;
+       }
+
+       return &this->public;
+}
diff --git a/src/libstrongswan/plugins/aesni/aesni_gcm.h b/src/libstrongswan/plugins/aesni/aesni_gcm.h
new file mode 100644 (file)
index 0000000..5a256c8
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+/**
+ * @defgroup aesni_gcm aesni_gcm
+ * @{ @ingroup aesni
+ */
+
+#ifndef AESNI_GCM_H_
+#define AESNI_GCM_H_
+
+#include <library.h>
+
+typedef struct aesni_gcm_t aesni_gcm_t;
+
+/**
+ * GCM mode AEAD using AES-NI
+ */
+struct aesni_gcm_t {
+
+       /**
+        * Implements aead_t interface
+        */
+       aead_t aead;
+};
+
+/**
+ * Create an aesni_gcm instance.
+ *
+ * @param algo                 encryption algorithm, ENCR_AES_GCM*
+ * @param key_size             AES key size, in bytes
+ * @param salt_size            size of salt value
+ * @return                             AES-GCM AEAD, NULL if not supported
+ */
+aesni_gcm_t *aesni_gcm_create(encryption_algorithm_t algo,
+                                                         size_t key_size, size_t salt_size);
+
+#endif /** AESNI_GCM_H_ @}*/
diff --git a/src/libstrongswan/plugins/aesni/aesni_key.c b/src/libstrongswan/plugins/aesni/aesni_key.c
new file mode 100644 (file)
index 0000000..523266a
--- /dev/null
@@ -0,0 +1,301 @@
+/*
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include "aesni_key.h"
+
+/**
+ * Rounds used for each AES key size
+ */
+#define AES128_ROUNDS 10
+#define AES192_ROUNDS 12
+#define AES256_ROUNDS 14
+
+typedef struct private_aesni_key_t private_aesni_key_t;
+
+/**
+ * Private data of an aesni_key_t object.  The public part carries the
+ * flexible schedule[] array, so no extra private state is needed.
+ */
+struct private_aesni_key_t {
+
+       /**
+        * Public aesni_key_t interface.
+        */
+       aesni_key_t public;
+};
+
+/**
+ * Invert round encryption keys to get a decryption key schedule
+ *
+ * Produces the "Equivalent Inverse Cipher" schedule: round keys in reverse
+ * order, with the inner keys run through InvMixColumns (aesimc) as required
+ * by the AES-NI decrypt instructions.
+ */
+static void reverse_key(aesni_key_t *this)
+{
+       __m128i t[this->rounds + 1];
+       int i;
+
+       for (i = 0; i <= this->rounds; i++)
+       {
+               t[i] = this->schedule[i];
+       }
+       /* first and last round keys swap places untransformed */
+       this->schedule[this->rounds] = t[0];
+       for (i = 1; i < this->rounds; i++)
+       {
+               this->schedule[this->rounds - i] = _mm_aesimc_si128(t[i]);
+       }
+       this->schedule[0] = t[this->rounds];
+
+       /* the temporary copy holds key material */
+       memwipe(t, sizeof(t));
+}
+
+/**
+ * Assist in creating a 128-bit round key
+ *
+ * a is the previous round key, b the aeskeygenassist() output for it; the
+ * result is the next round key.
+ */
+static __m128i assist128(__m128i a, __m128i b)
+{
+       __m128i c;
+
+       /* broadcast the rotated/substituted word (with rcon) to all lanes */
+       b = _mm_shuffle_epi32(b ,0xff);
+       /* prefix-XOR of a's 32-bit words via three shift/XOR steps */
+       c = _mm_slli_si128(a, 0x04);
+       a = _mm_xor_si128(a, c);
+       c = _mm_slli_si128(c, 0x04);
+       a = _mm_xor_si128(a, c);
+       c = _mm_slli_si128(c, 0x04);
+       a = _mm_xor_si128(a, c);
+       a = _mm_xor_si128(a, b);
+
+       return a;
+}
+
+/**
+ * Expand a 128-bit key to encryption round keys
+ *
+ * One aeskeygenassist step per round, using the AES round constants
+ * 0x01..0x36 for the 10 rounds of AES-128.
+ */
+static void expand128(__m128i *key, __m128i *schedule)
+{
+       __m128i t;
+
+       schedule[0] = t = _mm_loadu_si128(key);
+       schedule[1] = t = assist128(t, _mm_aeskeygenassist_si128(t, 0x01));
+       schedule[2] = t = assist128(t, _mm_aeskeygenassist_si128(t, 0x02));
+       schedule[3] = t = assist128(t, _mm_aeskeygenassist_si128(t, 0x04));
+       schedule[4] = t = assist128(t, _mm_aeskeygenassist_si128(t, 0x08));
+       schedule[5] = t = assist128(t, _mm_aeskeygenassist_si128(t, 0x10));
+       schedule[6] = t = assist128(t, _mm_aeskeygenassist_si128(t, 0x20));
+       schedule[7] = t = assist128(t, _mm_aeskeygenassist_si128(t, 0x40));
+       schedule[8] = t = assist128(t, _mm_aeskeygenassist_si128(t, 0x80));
+       schedule[9] = t = assist128(t, _mm_aeskeygenassist_si128(t, 0x1b));
+       schedule[10]    = assist128(t, _mm_aeskeygenassist_si128(t, 0x36));
+}
+
+/**
+ * Assist in creating a 192-bit round key
+ *
+ * Advances the 192-bit (1.5 block) key state: a is updated in place with
+ * the next full 128-bit word group, the returned value carries the updated
+ * half block.  NOTE(review): this function mixes space and tab indentation;
+ * consider normalizing to tabs like the rest of the file.
+ */
+static __m128i assist192(__m128i b, __m128i c, __m128i *a)
+{
+       __m128i t;
+
+        b = _mm_shuffle_epi32(b, 0x55);
+        t = _mm_slli_si128(*a, 0x04);
+       *a = _mm_xor_si128(*a, t);
+        t = _mm_slli_si128(t, 0x04);
+       *a = _mm_xor_si128(*a, t);
+        t = _mm_slli_si128(t, 0x04);
+       *a = _mm_xor_si128(*a, t);
+       *a = _mm_xor_si128(*a, b);
+        b = _mm_shuffle_epi32(*a, 0xff);
+        t = _mm_slli_si128(c, 0x04);
+        t = _mm_xor_si128(c, t);
+        t = _mm_xor_si128(t, b);
+
+       return t;
+}
+
+/**
+ * return a[63:0] | b[63:0] << 64
+ *
+ * NOTE(review): the _mm_ prefix belongs to Intel's intrinsic namespace; a
+ * plain local helper name would avoid potential collisions with future
+ * compiler-provided intrinsics.
+ */
+static __m128i _mm_shuffle_i00(__m128i a, __m128i b)
+{
+       return (__m128i)_mm_shuffle_pd((__m128d)a, (__m128d)b, 0);
+}
+
+/**
+ * return a[127:64] >> 64 | b[63:0] << 64
+ */
+static __m128i _mm_shuffle_i01(__m128i a, __m128i b)
+{
+       return (__m128i)_mm_shuffle_pd((__m128d)a, (__m128d)b, 1);
+}
+
+/**
+ * Expand a 192-bit encryption key to round keys
+ *
+ * The 192-bit key state advances 1.5 schedule blocks per step, so full
+ * 128-bit round keys are stitched together from half blocks using the
+ * _mm_shuffle_i00/_mm_shuffle_i01 helpers.
+ */
+static void expand192(__m128i *key, __m128i *schedule)
+{
+       __m128i t1, t2, t3;
+
+       schedule[0] = t1 = _mm_loadu_si128(key);
+       /* second load overlaps: only the low 64 bit hold key material */
+       t2 = t3 = _mm_loadu_si128(key + 1);
+
+       t2 = assist192(_mm_aeskeygenassist_si128(t2, 0x1), t2, &t1);
+       schedule[1] = _mm_shuffle_i00(t3, t1);
+       schedule[2] = _mm_shuffle_i01(t1, t2);
+       t2 = t3 = assist192(_mm_aeskeygenassist_si128(t2, 0x2), t2, &t1);
+       schedule[3] = t1;
+
+       t2 = assist192(_mm_aeskeygenassist_si128(t2, 0x4), t2, &t1);
+       schedule[4] = _mm_shuffle_i00(t3, t1);
+       schedule[5] = _mm_shuffle_i01(t1, t2);
+       t2 = t3 = assist192(_mm_aeskeygenassist_si128(t2, 0x8), t2, &t1);
+       schedule[6] = t1;
+
+       t2 = assist192(_mm_aeskeygenassist_si128 (t2,0x10), t2, &t1);
+       schedule[7] = _mm_shuffle_i00(t3, t1);
+       schedule[8] = _mm_shuffle_i01(t1, t2);
+       t2 = t3 = assist192(_mm_aeskeygenassist_si128 (t2,0x20), t2, &t1);
+       schedule[9] = t1;
+
+       t2 = assist192(_mm_aeskeygenassist_si128(t2, 0x40), t2, &t1);
+       schedule[10] = _mm_shuffle_i00(t3, t1);
+       schedule[11] = _mm_shuffle_i01(t1, t2);
+       assist192(_mm_aeskeygenassist_si128(t2, 0x80), t2, &t1);
+       schedule[12] = t1;
+}
+
+/**
+ * Assist in creating a 256-bit round key
+ *
+ * Derives an even-numbered round key from the previous even key a and the
+ * aeskeygenassist() output b of the preceding odd key.
+ */
+static __m128i assist256_1(__m128i a, __m128i b)
+{
+       __m128i x, y;
+
+       /* broadcast the rot/sub/rcon word, then prefix-XOR a's words */
+       b = _mm_shuffle_epi32(b, 0xff);
+       y = _mm_slli_si128(a, 0x04);
+       x = _mm_xor_si128(a, y);
+       y = _mm_slli_si128(y, 0x04);
+       x = _mm_xor_si128 (x, y);
+       y = _mm_slli_si128(y, 0x04);
+       x = _mm_xor_si128(x, y);
+       x = _mm_xor_si128(x, b);
+
+       return x;
+}
+
+/**
+ * Assist in creating a 256-bit round key
+ *
+ * Derives an odd-numbered round key: SubWord (no rotation, rcon 0) of the
+ * just-computed even key a, XORed into the prefix-XOR of the previous odd
+ * key b.
+ */
+static __m128i assist256_2(__m128i a, __m128i b)
+{
+       __m128i x, y, z;
+
+       y = _mm_aeskeygenassist_si128(a, 0x00);
+       /* pick the SubWord()-only lane */
+       z = _mm_shuffle_epi32(y, 0xaa);
+       y = _mm_slli_si128(b, 0x04);
+       x = _mm_xor_si128(b, y);
+       y = _mm_slli_si128(y, 0x04);
+       x = _mm_xor_si128(x, y);
+       y = _mm_slli_si128(y, 0x04);
+       x = _mm_xor_si128(x, y);
+       x = _mm_xor_si128(x, z);
+
+       return x;
+}
+
+/**
+ * Expand a 256-bit encryption key to round keys
+ *
+ * The two key halves seed rounds 0/1; each further pair of round keys is
+ * derived by an assist256_1/assist256_2 step.
+ */
+static void expand256(__m128i *key, __m128i *schedule)
+{
+       __m128i t1, t2;
+
+       schedule[0] = t1 = _mm_loadu_si128(key);
+       schedule[1] = t2 = _mm_loadu_si128(key + 1);
+
+       schedule[2] = t1 = assist256_1(t1, _mm_aeskeygenassist_si128(t2, 0x01));
+       schedule[3] = t2 = assist256_2(t1, t2);
+
+       schedule[4] = t1 = assist256_1(t1, _mm_aeskeygenassist_si128(t2, 0x02));
+       schedule[5] = t2 = assist256_2(t1, t2);
+
+       schedule[6] = t1 = assist256_1(t1, _mm_aeskeygenassist_si128(t2, 0x04));
+       schedule[7] = t2 = assist256_2(t1, t2);
+
+       schedule[8] = t1 = assist256_1(t1, _mm_aeskeygenassist_si128(t2, 0x08));
+       schedule[9] = t2 = assist256_2(t1, t2);
+
+       schedule[10] = t1 = assist256_1(t1, _mm_aeskeygenassist_si128(t2, 0x10));
+       schedule[11] = t2 = assist256_2(t1, t2);
+
+       schedule[12] = t1 = assist256_1(t1, _mm_aeskeygenassist_si128(t2, 0x20));
+       schedule[13] = t2 = assist256_2(t1, t2);
+
+       schedule[14] = assist256_1(t1, _mm_aeskeygenassist_si128(t2, 0x40));
+}
+
+METHOD(aesni_key_t, destroy, void,
+       private_aesni_key_t *this)
+{
+       /* wipe the struct including the trailing flexible schedule[] array */
+       memwipe(this, sizeof(*this) + (this->public.rounds + 1) * AES_BLOCK_SIZE);
+       free_align(this);
+}
+
+/**
+ * See header
+ */
+aesni_key_t *aesni_key_create(bool encrypt, chunk_t key)
+{
+       private_aesni_key_t *this;
+       int rounds;
+
+       switch (key.len)
+       {
+               case 16:
+                       rounds = AES128_ROUNDS;
+                       break;
+               case 24:
+                       rounds = AES192_ROUNDS;
+                       break;
+               case 32:
+                       rounds = AES256_ROUNDS;
+                       break;
+               default:
+                       return NULL;
+       }
+
+       /* allocate extra space for rounds + 1 aligned round keys */
+       INIT_EXTRA_ALIGN(this, (rounds + 1) * AES_BLOCK_SIZE, sizeof(__m128i),
+               .public = {
+                       .destroy = _destroy,
+                       .rounds = rounds,
+               },
+       );
+
+       switch (key.len)
+       {
+               case 16:
+                       expand128((__m128i*)key.ptr, this->public.schedule);
+                       break;
+               case 24:
+                       expand192((__m128i*)key.ptr, this->public.schedule);
+                       break;
+               case 32:
+                       expand256((__m128i*)key.ptr, this->public.schedule);
+                       break;
+               default:
+                       /* unreachable, key.len was validated above */
+                       break;
+       }
+
+       /* decryption schedules are derived from the encryption expansion */
+       if (!encrypt)
+       {
+               reverse_key(&this->public);
+       }
+
+       return &this->public;
+}
diff --git a/src/libstrongswan/plugins/aesni/aesni_key.h b/src/libstrongswan/plugins/aesni/aesni_key.h
new file mode 100644 (file)
index 0000000..12dcd22
--- /dev/null
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+/**
+ * @defgroup aesni_key aesni_key
+ * @{ @ingroup aesni
+ */
+
+#ifndef AESNI_KEY_H_
+#define AESNI_KEY_H_
+
+#include <library.h>
+
+#include <wmmintrin.h>
+
+/**
+ * AES block size, in bytes
+ */
+#define AES_BLOCK_SIZE 16
+
+typedef struct aesni_key_t aesni_key_t;
+
+/**
+ * Key schedule for encryption/decryption using AES-NI.
+ */
+struct aesni_key_t {
+
+       /**
+        * Destroy an aesni_key_t.
+        */
+       void (*destroy)(aesni_key_t *this);
+
+       /**
+        * Number of AES rounds (10, 12, 14)
+        */
+       int rounds;
+
+       /**
+        * Key schedule, for each round + the round 0 (whitening)
+        */
+       __attribute__((aligned(sizeof(__m128i)))) __m128i schedule[];
+};
+
+/**
+ * Create an AES-NI key schedule instance.
+ *
+ * @param encrypt              TRUE for encryption schedule, FALSE for decryption
+ * @param key                  non-expanded crypto key, 16, 24 or 32 bytes
+ * @return                             key schedule, NULL on invalid key size
+ */
+aesni_key_t *aesni_key_create(bool encrypt, chunk_t key);
+
+#endif /** AESNI_KEY_H_ @}*/
diff --git a/src/libstrongswan/plugins/aesni/aesni_plugin.c b/src/libstrongswan/plugins/aesni/aesni_plugin.c
new file mode 100644 (file)
index 0000000..b92419d
--- /dev/null
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include "aesni_plugin.h"
+#include "aesni_cbc.h"
+#include "aesni_ctr.h"
+#include "aesni_ccm.h"
+#include "aesni_gcm.h"
+#include "aesni_xcbc.h"
+#include "aesni_cmac.h"
+
+#include <stdio.h>
+
+#include <library.h>
+#include <utils/debug.h>
+#include <utils/cpu_feature.h>
+
+typedef struct private_aesni_plugin_t private_aesni_plugin_t;
+typedef enum cpuid_feature_t cpuid_feature_t;
+
+/**
+ * Private data of the aesni plugin; it carries no state beyond the
+ * public interface.
+ */
+struct private_aesni_plugin_t {
+
+       /**
+        * Public aesni_plugin_t interface.
+        */
+       aesni_plugin_t public;
+};
+
+METHOD(plugin_t, get_name, char*,
+       private_aesni_plugin_t *this)
+{
+       /* plugin name as referenced in the plugin load list */
+       return "aesni";
+}
+
+METHOD(plugin_t, get_features, int,
+       private_aesni_plugin_t *this, plugin_feature_t *features[])
+{
+       static plugin_feature_t f[] = {
+               PLUGIN_REGISTER(CRYPTER, aesni_cbc_create),
+                       PLUGIN_PROVIDE(CRYPTER, ENCR_AES_CBC, 16),
+                       PLUGIN_PROVIDE(CRYPTER, ENCR_AES_CBC, 24),
+                       PLUGIN_PROVIDE(CRYPTER, ENCR_AES_CBC, 32),
+               PLUGIN_REGISTER(CRYPTER, aesni_ctr_create),
+                       PLUGIN_PROVIDE(CRYPTER, ENCR_AES_CTR, 16),
+                       PLUGIN_PROVIDE(CRYPTER, ENCR_AES_CTR, 24),
+                       PLUGIN_PROVIDE(CRYPTER, ENCR_AES_CTR, 32),
+               PLUGIN_REGISTER(AEAD, aesni_ccm_create),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_CCM_ICV8,  16),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_CCM_ICV12, 16),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_CCM_ICV16, 16),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_CCM_ICV8,  24),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_CCM_ICV12, 24),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_CCM_ICV16, 24),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_CCM_ICV8,  32),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_CCM_ICV12, 32),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_CCM_ICV16, 32),
+               PLUGIN_REGISTER(AEAD, aesni_gcm_create),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_GCM_ICV8,  16),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_GCM_ICV12, 16),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_GCM_ICV16, 16),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_GCM_ICV8,  24),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_GCM_ICV12, 24),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_GCM_ICV16, 24),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_GCM_ICV8,  32),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_GCM_ICV12, 32),
+                       PLUGIN_PROVIDE(AEAD, ENCR_AES_GCM_ICV16, 32),
+               PLUGIN_REGISTER(PRF, aesni_xcbc_prf_create),
+                       PLUGIN_PROVIDE(PRF, PRF_AES128_XCBC),
+               PLUGIN_REGISTER(SIGNER, aesni_xcbc_signer_create),
+                       PLUGIN_PROVIDE(SIGNER, AUTH_AES_XCBC_96),
+               PLUGIN_REGISTER(PRF, aesni_cmac_prf_create),
+                       PLUGIN_PROVIDE(PRF, PRF_AES128_CMAC),
+               PLUGIN_REGISTER(SIGNER, aesni_cmac_signer_create),
+                       PLUGIN_PROVIDE(SIGNER, AUTH_AES_CMAC_96),
+       };
+
+       *features = f;
+       /* GCM multiplication uses PCLMULQDQ, so require it along with AES-NI;
+        * without either, advertise no features at all */
+       if (cpu_feature_available(CPU_FEATURE_AESNI | CPU_FEATURE_PCLMULQDQ))
+       {
+               return countof(f);
+       }
+       return 0;
+}
+
+METHOD(plugin_t, destroy, void,
+       private_aesni_plugin_t *this)
+{
+       free(this);
+}
+
+/*
+ * see header file
+ */
+plugin_t *aesni_plugin_create()
+{
+       private_aesni_plugin_t *this;
+
+       INIT(this,
+               .public = {
+                       .plugin = {
+                               .get_name = _get_name,
+                               .get_features = _get_features,
+                               /* nothing to reload, features are static */
+                               .reload = (void*)return_false,
+                               .destroy = _destroy,
+                       },
+               },
+       );
+
+       return &this->public.plugin;
+}
diff --git a/src/libstrongswan/plugins/aesni/aesni_plugin.h b/src/libstrongswan/plugins/aesni/aesni_plugin.h
new file mode 100644 (file)
index 0000000..4744d6c
--- /dev/null
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+/**
+ * @defgroup aesni_p aesni
+ * @ingroup plugins
+ *
+ * @defgroup aesni_plugin aesni_plugin
+ * @{ @ingroup aesni_p
+ */
+
+#ifndef AESNI_PLUGIN_H_
+#define AESNI_PLUGIN_H_
+
+#include <plugins/plugin.h>
+
+typedef struct aesni_plugin_t aesni_plugin_t;
+
+/**
+ * Plugin providing crypto primitives based on Intel AES-NI instructions.
+ */
+struct aesni_plugin_t {
+
+       /**
+        * Implements plugin_t interface.
+        */
+       plugin_t plugin;
+};
+
+#endif /** AESNI_PLUGIN_H_ @}*/
diff --git a/src/libstrongswan/plugins/aesni/aesni_xcbc.c b/src/libstrongswan/plugins/aesni/aesni_xcbc.c
new file mode 100644 (file)
index 0000000..24a75ce
--- /dev/null
@@ -0,0 +1,367 @@
+/*
+ * Copyright (C) 2008-2015 Martin Willi
+ * Copyright (C) 2012 Tobias Brunner
+ * Hochschule fuer Technik Rapperswil
+ * Copyright (C) 2015 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include "aesni_xcbc.h"
+#include "aesni_key.h"
+
+#include <crypto/prfs/mac_prf.h>
+#include <crypto/signers/mac_signer.h>
+
+typedef struct private_aesni_mac_t private_aesni_mac_t;
+
+/**
+ * Private data of a mac_t object.
+ */
+struct private_aesni_mac_t {
+
+       /**
+        * Public mac_t interface.
+        */
+       mac_t public;
+
+       /**
+        * Key schedule for K1
+        */
+       aesni_key_t *k1;
+
+       /**
+        * k2
+        */
+       __m128i k2;
+
+       /**
+        * k3
+        */
+       __m128i k3;
+
+       /**
+        * E
+        */
+       __m128i e;
+
+       /**
+        * remaining, unprocessed bytes in append mode
+        */
+       u_char rem[AES_BLOCK_SIZE];
+
+       /**
+        * number of bytes used in remaining
+        */
+       int rem_size;
+
+       /**
+        * TRUE if we have zero bytes to xcbc in final()
+        */
+       bool zero;
+};
+
+/**
+ * Feed data into the XCBC state; if out is non-NULL, additionally
+ * finalize the MAC into it (RFC 3566 step (4)) and reset the state
+ * for the next message.
+ *
+ * K1 is always an AES-128 key schedule (set_key() derives it from a
+ * 16-byte key), hence the fixed ks[0..10] ten-round sequences below.
+ */
+METHOD(mac_t, get_mac, bool,
+       private_aesni_mac_t *this, chunk_t data, u_int8_t *out)
+{
+       __m128i *ks, e, *bi;
+       u_int blocks, rem, i;
+
+       /* fail if set_key() was never called (or failed) */
+       if (!this->k1)
+       {
+               return FALSE;
+       }
+
+       ks = this->k1->schedule;
+
+       e = this->e;
+
+       if (data.len)
+       {
+               this->zero = FALSE;
+       }
+
+       if (this->rem_size + data.len > AES_BLOCK_SIZE)
+       {
+               /* (3) For each block M[i], where i = 1 ... n-1:
+                *     XOR M[i] with E[i-1], then encrypt the result with Key K1,
+                *     yielding E[i].
+                */
+
+               /* append data to remaining bytes, process block M[1] */
+               memcpy(this->rem + this->rem_size, data.ptr,
+                          AES_BLOCK_SIZE - this->rem_size);
+               data = chunk_skip(data, AES_BLOCK_SIZE - this->rem_size);
+
+               e = _mm_xor_si128(e, _mm_loadu_si128((__m128i*)this->rem));
+
+               e = _mm_xor_si128(e, ks[0]);
+               e = _mm_aesenc_si128(e, ks[1]);
+               e = _mm_aesenc_si128(e, ks[2]);
+               e = _mm_aesenc_si128(e, ks[3]);
+               e = _mm_aesenc_si128(e, ks[4]);
+               e = _mm_aesenc_si128(e, ks[5]);
+               e = _mm_aesenc_si128(e, ks[6]);
+               e = _mm_aesenc_si128(e, ks[7]);
+               e = _mm_aesenc_si128(e, ks[8]);
+               e = _mm_aesenc_si128(e, ks[9]);
+               e = _mm_aesenclast_si128(e, ks[10]);
+
+               bi = (__m128i*)data.ptr;
+               rem = data.len % AES_BLOCK_SIZE;
+               blocks = data.len / AES_BLOCK_SIZE;
+               if (!rem && blocks)
+               {       /* don't do last block */
+                       rem = AES_BLOCK_SIZE;
+                       blocks--;
+               }
+
+               /* process blocks M[2] ... M[n-1] */
+               for (i = 0; i < blocks; i++)
+               {
+                       /* unaligned load: caller's buffer need not be 16-byte aligned */
+                       e = _mm_xor_si128(e, _mm_loadu_si128(bi + i));
+
+                       e = _mm_xor_si128(e, ks[0]);
+                       e = _mm_aesenc_si128(e, ks[1]);
+                       e = _mm_aesenc_si128(e, ks[2]);
+                       e = _mm_aesenc_si128(e, ks[3]);
+                       e = _mm_aesenc_si128(e, ks[4]);
+                       e = _mm_aesenc_si128(e, ks[5]);
+                       e = _mm_aesenc_si128(e, ks[6]);
+                       e = _mm_aesenc_si128(e, ks[7]);
+                       e = _mm_aesenc_si128(e, ks[8]);
+                       e = _mm_aesenc_si128(e, ks[9]);
+                       e = _mm_aesenclast_si128(e, ks[10]);
+               }
+
+               /* store remaining bytes of block M[n] */
+               memcpy(this->rem, data.ptr + data.len - rem, rem);
+               this->rem_size = rem;
+       }
+       else
+       {
+               /* no complete block, just copy into remaining */
+               memcpy(this->rem + this->rem_size, data.ptr, data.len);
+               this->rem_size += data.len;
+       }
+
+       if (out)
+       {
+               /* (4) For block M[n]: */
+               if (this->rem_size == AES_BLOCK_SIZE && !this->zero)
+               {
+                       /* a) If the blocksize of M[n] is 128 bits:
+                        *    XOR M[n] with E[n-1] and Key K2, then encrypt the result with
+                        *    Key K1, yielding E[n].
+                        */
+                       e = _mm_xor_si128(e, this->k2);
+               }
+               else
+               {
+                       /* b) If the blocksize of M[n] is less than 128 bits:
+                        *
+                        *  i) Pad M[n] with a single "1" bit, followed by the number of
+                        *     "0" bits (possibly none) required to increase M[n]'s
+                        *     blocksize to 128 bits.
+                        */
+                       if (this->rem_size < AES_BLOCK_SIZE)
+                       {
+                               memset(this->rem + this->rem_size, 0,
+                                          AES_BLOCK_SIZE - this->rem_size);
+                               this->rem[this->rem_size] = 0x80;
+                       }
+                       /*  ii) XOR M[n] with E[n-1] and Key K3, then encrypt the result
+                        *      with Key K1, yielding E[n].
+                        */
+                       e = _mm_xor_si128(e, this->k3);
+               }
+               e = _mm_xor_si128(e, _mm_loadu_si128((__m128i*)this->rem));
+
+               e = _mm_xor_si128(e, ks[0]);
+               e = _mm_aesenc_si128(e, ks[1]);
+               e = _mm_aesenc_si128(e, ks[2]);
+               e = _mm_aesenc_si128(e, ks[3]);
+               e = _mm_aesenc_si128(e, ks[4]);
+               e = _mm_aesenc_si128(e, ks[5]);
+               e = _mm_aesenc_si128(e, ks[6]);
+               e = _mm_aesenc_si128(e, ks[7]);
+               e = _mm_aesenc_si128(e, ks[8]);
+               e = _mm_aesenc_si128(e, ks[9]);
+               e = _mm_aesenclast_si128(e, ks[10]);
+               _mm_storeu_si128((__m128i*)out, e);
+
+               /* (2) Define E[0] = 0x00000000000000000000000000000000 */
+               e = _mm_setzero_si128();
+               this->rem_size = 0;
+               this->zero = TRUE;
+       }
+       this->e = e;
+       return TRUE;
+}
+
+/**
+ * XCBC yields a full AES block (16 bytes); the 96-bit truncation for
+ * AUTH_AES_XCBC_96 is applied by the mac_signer wrapper (see
+ * aesni_xcbc_signer_create() passing trunc = 12).
+ */
+METHOD(mac_t, get_mac_size, size_t,
+       private_aesni_mac_t *this)
+{
+       return AES_BLOCK_SIZE;
+}
+
+/**
+ * (Re)key the XCBC MAC: normalize the key per RFC 4434 (zero-pad short
+ * keys, XCBC-shorten long ones), then derive subkeys K1/K2/K3 per
+ * RFC 3566 by encrypting the 0x01../0x02../0x03.. constants under K.
+ */
+METHOD(mac_t, set_key, bool,
+       private_aesni_mac_t *this, chunk_t key)
+{
+       __m128i t1, t2, t3;
+       u_char k1[AES_BLOCK_SIZE];
+       u_int round;
+       chunk_t k;
+
+       /* reset state */
+       this->e = _mm_setzero_si128();
+       this->rem_size = 0;
+       this->zero = TRUE;
+
+       /* Create RFC4434 variable keys if required */
+       if (key.len == AES_BLOCK_SIZE)
+       {
+               k = key;
+       }
+       else if (key.len < AES_BLOCK_SIZE)
+       {       /* pad short keys */
+               k = chunk_alloca(AES_BLOCK_SIZE);
+               memset(k.ptr, 0, k.len);
+               memcpy(k.ptr, key.ptr, key.len);
+       }
+       else
+       {       /* shorten key using XCBC */
+               k = chunk_alloca(AES_BLOCK_SIZE);
+               memset(k.ptr, 0, k.len);
+               /* recurse with the 16-byte zero key (takes the == AES_BLOCK_SIZE
+                * path above), then MAC the long key to derive k */
+               if (!set_key(this, k) || !get_mac(this, key, k.ptr))
+               {
+                       return FALSE;
+               }
+       }
+
+       /*
+        * (1) Derive 3 128-bit keys (K1, K2 and K3) from the 128-bit secret
+        *     key K, as follows:
+        *     K1 = 0x01010101010101010101010101010101 encrypted with Key K
+        *     K2 = 0x02020202020202020202020202020202 encrypted with Key K
+        *     K3 = 0x03030303030303030303030303030303 encrypted with Key K
+        */
+
+       DESTROY_IF(this->k1);
+       /* temporarily install the schedule of K itself to encrypt the
+        * three derivation constants */
+       this->k1 = aesni_key_create(TRUE, k);
+       if (!this->k1)
+       {
+               return FALSE;
+       }
+
+       t1 = _mm_set1_epi8(0x01);
+       t2 = _mm_set1_epi8(0x02);
+       t3 = _mm_set1_epi8(0x03);
+
+       t1 = _mm_xor_si128(t1, this->k1->schedule[0]);
+       t2 = _mm_xor_si128(t2, this->k1->schedule[0]);
+       t3 = _mm_xor_si128(t3, this->k1->schedule[0]);
+
+       for (round = 1; round < this->k1->rounds; round++)
+       {
+               t1 = _mm_aesenc_si128(t1, this->k1->schedule[round]);
+               t2 = _mm_aesenc_si128(t2, this->k1->schedule[round]);
+               t3 = _mm_aesenc_si128(t3, this->k1->schedule[round]);
+       }
+
+       t1 = _mm_aesenclast_si128(t1, this->k1->schedule[this->k1->rounds]);
+       t2 = _mm_aesenclast_si128(t2, this->k1->schedule[this->k1->rounds]);
+       t3 = _mm_aesenclast_si128(t3, this->k1->schedule[this->k1->rounds]);
+
+       _mm_storeu_si128((__m128i*)k1, t1);
+       this->k2 = t2;
+       this->k3 = t3;
+
+       /* replace the schedule of K with the schedule of derived key K1 */
+       this->k1->destroy(this->k1);
+       this->k1 = aesni_key_create(TRUE, chunk_from_thing(k1));
+
+       memwipe(k1, AES_BLOCK_SIZE);
+       return this->k1 != NULL;
+}
+
+/**
+ * Destroy the MAC, wiping the K2/K3 subkeys held in this struct.
+ * (K1 cleanup is delegated to aesni_key_t's destroy(); assumed to
+ * wipe its schedule -- not visible in this file.)
+ */
+METHOD(mac_t, destroy, void,
+       private_aesni_mac_t *this)
+{
+       DESTROY_IF(this->k1);
+       memwipe(&this->k2, sizeof(this->k2));
+       memwipe(&this->k3, sizeof(this->k3));
+       /* allocated with INIT_ALIGN(), so must be freed with free_align() */
+       free_align(this);
+}
+
+/*
+ * Described in header
+ */
+mac_t *aesni_xcbc_create(encryption_algorithm_t algo, size_t key_size)
+{
+       private_aesni_mac_t *this;
+
+       /* NOTE(review): algo and key_size are accepted for interface parity
+        * but not validated; both call sites in this file pass ENCR_AES_CBC
+        * with a key_size of 16 only */
+       /* 16-byte alignment so the __m128i members can be accessed directly */
+       INIT_ALIGN(this, sizeof(__m128i),
+               .public = {
+                       .get_mac = _get_mac,
+                       .get_mac_size = _get_mac_size,
+                       .set_key = _set_key,
+                       .destroy = _destroy,
+               },
+       );
+
+       return &this->public;
+}
+
+/*
+ * Described in header.
+ */
+prf_t *aesni_xcbc_prf_create(pseudo_random_function_t algo)
+{
+       mac_t *xcbc;
+
+       /* only the AES-128 XCBC PRF is supported */
+       switch (algo)
+       {
+               case PRF_AES128_XCBC:
+                       xcbc = aesni_xcbc_create(ENCR_AES_CBC, 16);
+                       break;
+               default:
+                       return NULL;
+       }
+       if (xcbc)
+       {
+               /* wrap the MAC in a generic PRF adapter; presumably takes
+                * ownership of xcbc -- verify against mac_prf.h */
+               return mac_prf_create(xcbc);
+       }
+       return NULL;
+}
+
+/*
+ * Described in header
+ */
+signer_t *aesni_xcbc_signer_create(integrity_algorithm_t algo)
+{
+       size_t trunc;
+       mac_t *xcbc;
+
+       switch (algo)
+       {
+               case AUTH_AES_XCBC_96:
+                       xcbc = aesni_xcbc_create(ENCR_AES_CBC, 16);
+                       /* truncate the 16-byte MAC to 96 bits, per RFC 3566 */
+                       trunc = 12;
+                       break;
+               default:
+                       return NULL;
+       }
+       if (xcbc)
+       {
+               return mac_signer_create(xcbc, trunc);
+       }
+       return NULL;
+}
diff --git a/src/libstrongswan/plugins/aesni/aesni_xcbc.h b/src/libstrongswan/plugins/aesni/aesni_xcbc.h
new file mode 100644 (file)
index 0000000..53f559f
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+/**
+ * @defgroup aesni_xcbc aesni_xcbc
+ * @{ @ingroup aesni
+ */
+
+#ifndef AESNI_XCBC_H_
+#define AESNI_XCBC_H_
+
+#include <crypto/mac.h>
+#include <crypto/prfs/prf.h>
+#include <crypto/signers/signer.h>
+
+/**
+ * Create a generic mac_t object using AESNI XCBC
+ *
+ * @param algo         underlying encryption algorithm
+ * @param key_size     size of encryption key, in bytes
+ */
+mac_t *aesni_xcbc_create(encryption_algorithm_t algo, size_t key_size);
+
+/**
+ * Creates a new prf_t object based AESNI XCBC.
+ *
+ * @param algo         algorithm to implement
+ * @return                     prf_t object, NULL if not supported
+ */
+prf_t *aesni_xcbc_prf_create(pseudo_random_function_t algo);
+
+/**
+ * Creates a new signer_t object based on AESNI XCBC.
+ *
+ * @param algo         algorithm to implement
+ * @return                     signer_t, NULL if not supported
+ */
+signer_t *aesni_xcbc_signer_create(integrity_algorithm_t algo);
+
+#endif /** AESNI_XCBC_H_ @}*/
index 65cdbe9..2d0ce8a 100644 (file)
@@ -437,10 +437,12 @@ bool plugin_feature_load(plugin_t *plugin, plugin_feature_t *feature,
        {
                case FEATURE_CRYPTER:
                        lib->crypto->add_crypter(lib->crypto, feature->arg.crypter.alg,
+                                                               feature->arg.crypter.key_size,
                                                                name, reg->arg.reg.f);
                        break;
                case FEATURE_AEAD:
                        lib->crypto->add_aead(lib->crypto, feature->arg.aead.alg,
+                                                               feature->arg.aead.key_size,
                                                                name, reg->arg.reg.f);
                        break;
                case FEATURE_SIGNER:
index 33c13d9..9b0676b 100644 (file)
@@ -86,6 +86,11 @@ TEST_VECTOR_AEAD(aes_ccm8)
 TEST_VECTOR_AEAD(aes_ccm9)
 TEST_VECTOR_AEAD(aes_ccm10)
 TEST_VECTOR_AEAD(aes_ccm11)
+TEST_VECTOR_AEAD(aes_ccm12)
+TEST_VECTOR_AEAD(aes_ccm13)
+TEST_VECTOR_AEAD(aes_ccm14)
+TEST_VECTOR_AEAD(aes_ccm15)
+TEST_VECTOR_AEAD(aes_ccm16)
 TEST_VECTOR_AEAD(aes_gcm1)
 TEST_VECTOR_AEAD(aes_gcm2)
 TEST_VECTOR_AEAD(aes_gcm3_1)
@@ -100,6 +105,13 @@ TEST_VECTOR_AEAD(aes_gcm13)
 TEST_VECTOR_AEAD(aes_gcm14)
 TEST_VECTOR_AEAD(aes_gcm15)
 TEST_VECTOR_AEAD(aes_gcm16)
+TEST_VECTOR_AEAD(aes_gcm17)
+TEST_VECTOR_AEAD(aes_gcm18)
+TEST_VECTOR_AEAD(aes_gcm19)
+TEST_VECTOR_AEAD(aes_gcm20)
+TEST_VECTOR_AEAD(aes_gcm21)
+TEST_VECTOR_AEAD(aes_gcm22)
+TEST_VECTOR_AEAD(aes_gcm23)
 
 TEST_VECTOR_SIGNER(aes_xcbc_s1)
 TEST_VECTOR_SIGNER(aes_xcbc_s2)
@@ -227,4 +239,3 @@ TEST_VECTOR_RNG(rng_poker_3)
 TEST_VECTOR_RNG(rng_runs_1)
 TEST_VECTOR_RNG(rng_runs_2)
 TEST_VECTOR_RNG(rng_runs_3)
-
index 95c41ec..cb45254 100644 (file)
@@ -166,3 +166,82 @@ aead_test_vector_t aes_ccm11 = {
                          "\x66\xca\x61\x1e\x96\x7a\x61\xb3\x1c\x16\x45\x52\xba\x04\x9c\x9f"
                          "\xb1\xd2\x40\xbc\x52\x7c\x6f\xb1",
 };
+
+/**
+ * The vectors below are defined by ourselves
+ */
+aead_test_vector_t aes_ccm12 = {
+       .alg = ENCR_AES_CCM_ICV8, .key_size = 24, .salt_size = 3,
+       .len = 32, .alen = 27,
+       .key    = "\x58\x5d\xa0\x96\x65\x1a\x04\xd7\x96\xe5\xc5\x68\xaa\x95\x35\xe0"
+                         "\x29\xa0\xba\x9e\x48\x78\xd1\xba\xee\x49\x83",
+       .iv             = "\xe9\xa9\xff\xe9\x57\xba\xfd\x9e",
+       .adata  = "\x44\xa6\x2c\x05\xe9\xe1\x43\xb1\x58\x7c\xf2\x5c\x6d\x39\x0a\x64"
+                         "\xa4\xf0\x13\x05\xd1\x77\x99\x67\x11\xc4\xc6",
+       .plain  = "\x85\x34\x66\x42\xc8\x92\x0f\x36\x58\xe0\x6b\x91\x3c\x98\x5c\xbb"
+                         "\x0a\x85\xcc\x02\xad\x7a\x96\xe9\x65\x43\xa4\xc3\x0f\xdc\x55\x81",
+       .cipher = "\xfb\xe5\x5d\x34\xbe\xe5\xe8\xe7\x5a\xef\x2f\xbf\x1f\x7f\xd4\xb2"
+                         "\x66\xca\x61\x1e\x96\x7a\x61\xb3\x1c\x16\x45\x52\xba\x04\x9c\x9f"
+                         "\x24\x0e\xd1\xa5\x40\x74\xc8\x4e",
+};
+
+aead_test_vector_t aes_ccm13 = {
+       .alg = ENCR_AES_CCM_ICV8, .key_size = 24, .salt_size = 3,
+       .len = 27, .alen = 32,
+       .key    = "\x58\x5d\xa0\x96\x65\x1a\x04\xd7\x96\xe5\xc5\x68\xaa\x95\x35\xe0"
+                         "\x29\xa0\xba\x9e\x48\x78\xd1\xba\xee\x49\x83",
+       .iv             = "\xe9\xa9\xff\xe9\x57\xba\xfd\x9e",
+       .adata  = "\x44\xa6\x2c\x05\xe9\xe1\x43\xb1\x58\x7c\xf2\x5c\x6d\x39\x0a\x64"
+                         "\xa4\xf0\x13\x05\xd1\x77\x99\x67\x11\xc4\xc6\xdb\x00\x56\x36\x61",
+       .plain  = "\x85\x34\x66\x42\xc8\x92\x0f\x36\x58\xe0\x6b\x91\x3c\x98\x5c\xbb"
+                         "\x0a\x85\xcc\x02\xad\x7a\x96\xe9\x65\x43\xa4",
+       .cipher = "\xfb\xe5\x5d\x34\xbe\xe5\xe8\xe7\x5a\xef\x2f\xbf\x1f\x7f\xd4\xb2"
+                         "\x66\xca\x61\x1e\x96\x7a\x61\xb3\x1c\x16\x45\xa6\xe9\x3c\xa8\x50"
+                         "\x4e\x62\x97",
+};
+
+aead_test_vector_t aes_ccm14 = {
+       .alg = ENCR_AES_CCM_ICV8, .key_size = 24, .salt_size = 3,
+       .len = 27, .alen = 27,
+       .key    = "\x58\x5d\xa0\x96\x65\x1a\x04\xd7\x96\xe5\xc5\x68\xaa\x95\x35\xe0"
+                         "\x29\xa0\xba\x9e\x48\x78\xd1\xba\xee\x49\x83",
+       .iv             = "\xe9\xa9\xff\xe9\x57\xba\xfd\x9e",
+       .adata  = "\x44\xa6\x2c\x05\xe9\xe1\x43\xb1\x58\x7c\xf2\x5c\x6d\x39\x0a\x64"
+                         "\xa4\xf0\x13\x05\xd1\x77\x99\x67\x11\xc4\xc6",
+       .plain  = "\x85\x34\x66\x42\xc8\x92\x0f\x36\x58\xe0\x6b\x91\x3c\x98\x5c\xbb"
+                         "\x0a\x85\xcc\x02\xad\x7a\x96\xe9\x65\x43\xa4",
+       .cipher = "\xfb\xe5\x5d\x34\xbe\xe5\xe8\xe7\x5a\xef\x2f\xbf\x1f\x7f\xd4\xb2"
+                         "\x66\xca\x61\x1e\x96\x7a\x61\xb3\x1c\x16\x45\x11\x03\x16\x48\xfb"
+                         "\xb7\xde\xf1",
+};
+
+aead_test_vector_t aes_ccm15 = {
+       .alg = ENCR_AES_CCM_ICV12, .key_size = 16, .salt_size = 3,
+       .len = 32, .alen = 32,
+       .key    = "\x7c\xc8\x18\x3b\x8d\x99\xe0\x7c\x45\x41\xb8\xbd\x5c\xa7\xc2\x32"
+                         "\x8a\xb8\x02\x59\xa4\xfe\xa9\x2c\x09\x75\x9a\x9b\x3c\x9b\x27\x39"
+                         "\xf9\xd9\x4e",
+       .iv             = "\x63\xb5\x3d\x9d\x43\xf6\x1e\x50",
+       .adata  = "\x57\xf5\x6b\x8b\x57\x5c\x3d\x3b\x13\x02\x01\x0c\x83\x4c\x96\x35"
+                         "\x8e\xd6\x39\xcf\x7d\x14\x9b\x94\xb0\x39\x36\xe6\x8f\x57\xe0\x13",
+       .plain  = "\x3b\x6c\x29\x36\xb6\xef\x07\xa6\x83\x72\x07\x4f\xcf\xfa\x66\x89"
+                         "\x5f\xca\xb1\xba\xd5\x8f\x2c\x27\x30\xdb\x75\x09\x93\xd4\x65\xe4",
+       .cipher = "\x2b\x94\x71\x1a\xd3\x28\x21\xe5\xe2\xeb\x75\xe8\x09\x98\x9c\x0a"
+                         "\xc9\xea\x3e\xe4\x3a\xf9\x71\x4c\x4f\x16\x73\x1d\xa5\x10\x93\x5b"
+                         "\x83\xcd\xdd\x30\xb9\x3f\x86\xb3\x14\xbb\x7d\x81",
+};
+
+aead_test_vector_t aes_ccm16 = {
+       .alg = ENCR_AES_CCM_ICV12, .key_size = 24, .salt_size = 3,
+       .len = 32, .alen = 32,
+       .key    = "\x7c\xc8\x18\x3b\x8d\x99\xe0\x7c\x45\x41\xb8\xbd\x5c\xa7\xc2\x32"
+                         "\x8a\xb8\x02\x59\xa4\xfe\xa9\x2c\xf9\xd9\x4e",
+       .iv             = "\x63\xb5\x3d\x9d\x43\xf6\x1e\x50",
+       .adata  = "\x57\xf5\x6b\x8b\x57\x5c\x3d\x3b\x13\x02\x01\x0c\x83\x4c\x96\x35"
+                         "\x8e\xd6\x39\xcf\x7d\x14\x9b\x94\xb0\x39\x36\xe6\x8f\x57\xe0\x13",
+       .plain  = "\x3b\x6c\x29\x36\xb6\xef\x07\xa6\x83\x72\x07\x4f\xcf\xfa\x66\x89"
+                         "\x5f\xca\xb1\xba\xd5\x8f\x2c\x27\x30\xdb\x75\x09\x93\xd4\x65\xe4",
+       .cipher = "\x48\x19\x60\xbb\x65\xa8\x00\xb8\x26\xf1\x7f\x16\x1f\x3c\xfc\x6d"
+                         "\x86\x62\x10\xc5\x51\xcf\xef\x74\xac\xc6\xdf\x28\xac\x36\x6f\xa0"
+                         "\x3a\x38\x24\x50\x68\x0f\x40\x1e\xaf\xea\x42\x16",
+};
index 1f33bcb..f348cd4 100644 (file)
@@ -220,3 +220,153 @@ aead_test_vector_t aes_gcm16 = {
                          "\xc5\xf6\x1e\x63\x93\xba\x7a\x0a\xbc\xc9\xf6\x62\x76\xfc\x6e\xce"
                          "\x0f\x4e\x17\x68\xcd\xdf\x88\x53\xbb\x2d\x55\x1b",
 };
+
+/**
+ * Some self-made vectors for AES-192/256 with ICV8/12
+ */
+aead_test_vector_t aes_gcm17 = {
+       .alg = ENCR_AES_GCM_ICV8, .key_size = 24, .salt_size = 4,
+       .len = 70, .alen = 0,
+       .key    = "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                         "\xfe\xff\xe9\x92\x86\x65\x73\x1c\xca\xfe\xba\xbe",
+       .iv             = "\xfa\xce\xdb\xad\xde\xca\xf8\x88",
+       .plain  = "\xd9\x31\x32\x25\xf8\x84\x06\xe5\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
+                         "\x86\xa7\xa9\x53\x15\x34\xf7\xda\x2e\x4c\x30\x3d\x8a\x31\x8a\x72"
+                         "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25"
+                         "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25"
+                         "\xb1\x6a\xed\xf5\xaa\x0d",
+       .cipher = "\x39\x80\xca\x0b\x3c\x00\xe8\x41\xeb\x06\xfa\xc4\x87\x2a\x27\x57"
+                         "\x85\x9e\x1c\xea\xa6\xef\xd9\x84\x62\x85\x93\xb4\x0c\xa1\xe1\x9c"
+                         "\x7d\x77\x3d\x00\xc1\x44\xc5\x25\xac\x61\x9d\x18\xc8\x4a\x3f\x47"
+                         "\xb5\xb4\xa5\xeb\x10\x86\xcb\xdd\x59\x76\x52\x0d\xff\xa4\x85\x26"
+                         "\x4b\x54\x22\xa0\xc6\x65\x4d\xa8\x46\x73\xec\xc0\x61\x68",
+};
+aead_test_vector_t aes_gcm18 = {
+       .alg = ENCR_AES_GCM_ICV12, .key_size = 24, .salt_size = 4,
+       .len = 70, .alen = 0,
+       .key    = "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                         "\xfe\xff\xe9\x92\x86\x65\x73\x1c\xca\xfe\xba\xbe",
+       .iv             = "\xfa\xce\xdb\xad\xde\xca\xf8\x88",
+       .plain  = "\xd9\x31\x32\x25\xf8\x84\x06\xe5\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
+                         "\x86\xa7\xa9\x53\x15\x34\xf7\xda\x2e\x4c\x30\x3d\x8a\x31\x8a\x72"
+                         "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25"
+                         "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25"
+                         "\xb1\x6a\xed\xf5\xaa\x0d",
+       .cipher = "\x39\x80\xca\x0b\x3c\x00\xe8\x41\xeb\x06\xfa\xc4\x87\x2a\x27\x57"
+                         "\x85\x9e\x1c\xea\xa6\xef\xd9\x84\x62\x85\x93\xb4\x0c\xa1\xe1\x9c"
+                         "\x7d\x77\x3d\x00\xc1\x44\xc5\x25\xac\x61\x9d\x18\xc8\x4a\x3f\x47"
+                         "\xb5\xb4\xa5\xeb\x10\x86\xcb\xdd\x59\x76\x52\x0d\xff\xa4\x85\x26"
+                         "\x4b\x54\x22\xa0\xc6\x65\x4d\xa8\x46\x73\xec\xc0\x61\x68\x0f\x00"
+                         "\x0c\x32",
+};
+aead_test_vector_t aes_gcm19 = {
+       .alg = ENCR_AES_GCM_ICV8, .key_size = 32, .salt_size = 4,
+       .len = 70, .alen = 0,
+       .key    = "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                         "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                         "\xca\xfe\xba\xbe",
+       .iv             = "\xfa\xce\xdb\xad\xde\xca\xf8\x88",
+       .plain  = "\xd9\x31\x32\x25\xf8\x84\x06\xe5\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
+                         "\x86\xa7\xa9\x53\x15\x34\xf7\xda\x2e\x4c\x30\x3d\x8a\x31\x8a\x72"
+                         "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25"
+                         "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25"
+                         "\xb1\x6a\xed\xf5\xaa\x0d",
+       .cipher = "\x52\x2d\xc1\xf0\x99\x56\x7d\x07\xf4\x7f\x37\xa3\x2a\x84\x42\x7d"
+                         "\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9\x75\x98\xa2\xbd\x25\x55\xd1\xaa"
+                         "\x8c\xb0\x8e\x48\x59\x0d\xbb\x3d\xa7\xb0\x8b\x10\x56\x82\x88\x38"
+                         "\x68\xa0\xff\x03\xac\xdf\x95\x0e\x29\x65\x83\x7f\xda\x89\x72\xdd"
+                         "\xd5\xc5\x96\xa3\x4a\xe0\xe6\x2f\x1e\xe2\x04\x80\xd7\xb7",
+};
+aead_test_vector_t aes_gcm20 = {
+       .alg = ENCR_AES_GCM_ICV12, .key_size = 32, .salt_size = 4,
+       .len = 70, .alen = 0,
+       .key    = "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                         "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                         "\xca\xfe\xba\xbe",
+       .iv             = "\xfa\xce\xdb\xad\xde\xca\xf8\x88",
+       .plain  = "\xd9\x31\x32\x25\xf8\x84\x06\xe5\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
+                         "\x86\xa7\xa9\x53\x15\x34\xf7\xda\x2e\x4c\x30\x3d\x8a\x31\x8a\x72"
+                         "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25"
+                         "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25"
+                         "\xb1\x6a\xed\xf5\xaa\x0d",
+       .cipher = "\x52\x2d\xc1\xf0\x99\x56\x7d\x07\xf4\x7f\x37\xa3\x2a\x84\x42\x7d"
+                         "\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9\x75\x98\xa2\xbd\x25\x55\xd1\xaa"
+                         "\x8c\xb0\x8e\x48\x59\x0d\xbb\x3d\xa7\xb0\x8b\x10\x56\x82\x88\x38"
+                         "\x68\xa0\xff\x03\xac\xdf\x95\x0e\x29\x65\x83\x7f\xda\x89\x72\xdd"
+                         "\xd5\xc5\x96\xa3\x4a\xe0\xe6\x2f\x1e\xe2\x04\x80\xd7\xb7\x5b\x65"
+                         "\x9a\xad",
+};
+
+/**
+ * Some self-made vectors using more associated data
+ */
+aead_test_vector_t aes_gcm21 = {
+       .alg = ENCR_AES_GCM_ICV16, .key_size = 16, .salt_size = 4,
+       .len = 70, .alen = 69,
+       .key    = "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                         "\xca\xfe\xba\xbe",
+       .iv             = "\xfa\xce\xdb\xad\xde\xca\xf8\x88",
+       .adata  = "\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xed\xfa\xce\xde\xad\xbe\xef"
+                         "\xab\xad\xda\xd2\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xed\xfa\xce"
+                         "\xde\xad\xbe\xef\xda\xd2\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xda"
+                         "\xd2\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xda\xd2\xfe\xed\xfa\xce"
+                         "\xde\xad\xbe\xef\xfe",
+       .plain  = "\xd9\x31\x32\x25\xf8\x84\x06\xe5\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
+                         "\x86\xa7\xa9\x53\x15\x34\xf7\xda\x2e\x4c\x30\x3d\x8a\x31\x8a\x72"
+                         "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25"
+                         "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25"
+                         "\xb1\x6a\xed\xf5\xaa\x0d",
+       .cipher = "\x42\x83\x1e\xc2\x21\x77\x74\x24\x4b\x72\x21\xb7\x84\xd0\xd4\x9c"
+                         "\xe3\xaa\x21\x2f\x2c\x02\xa4\xe0\x35\xc1\x7e\x23\x29\xac\xa1\x2e"
+                         "\x21\xd5\x14\xb2\x54\x66\x93\x1c\x7d\x8f\x6a\x5a\xac\x84\xaa\x05"
+                         "\xb6\xf5\xea\x59\x55\x6f\x43\x93\xa8\xf4\x95\x8c\x14\x36\x3e\xf5"
+                         "\x6c\xc2\x8a\x31\x64\xff\xe9\x24\x77\xc3\xaf\x6b\x64\xc7\x8b\xb9"
+                         "\xec\xb9\x48\x84\xa2\xdb",
+};
+aead_test_vector_t aes_gcm22 = {
+       .alg = ENCR_AES_GCM_ICV16, .key_size = 24, .salt_size = 4,
+       .len = 70, .alen = 69,
+       .key    = "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                         "\xfe\xff\xe9\x92\x86\x65\x73\x1c\xca\xfe\xba\xbe",
+       .iv             = "\xfa\xce\xdb\xad\xde\xca\xf8\x88",
+       .adata  = "\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xed\xfa\xce\xde\xad\xbe\xef"
+                         "\xab\xad\xda\xd2\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xed\xfa\xce"
+                         "\xde\xad\xbe\xef\xda\xd2\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xda"
+                         "\xd2\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xda\xd2\xfe\xed\xfa\xce"
+                         "\xde\xad\xbe\xef\xfe",
+       .plain  = "\xd9\x31\x32\x25\xf8\x84\x06\xe5\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
+                         "\x86\xa7\xa9\x53\x15\x34\xf7\xda\x2e\x4c\x30\x3d\x8a\x31\x8a\x72"
+                         "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25"
+                         "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25"
+                         "\xb1\x6a\xed\xf5\xaa\x0d",
+       .cipher = "\x39\x80\xca\x0b\x3c\x00\xe8\x41\xeb\x06\xfa\xc4\x87\x2a\x27\x57"
+                         "\x85\x9e\x1c\xea\xa6\xef\xd9\x84\x62\x85\x93\xb4\x0c\xa1\xe1\x9c"
+                         "\x7d\x77\x3d\x00\xc1\x44\xc5\x25\xac\x61\x9d\x18\xc8\x4a\x3f\x47"
+                         "\xb5\xb4\xa5\xeb\x10\x86\xcb\xdd\x59\x76\x52\x0d\xff\xa4\x85\x26"
+                         "\x4b\x54\x22\xa0\xc6\x65\x82\x33\xf3\x2d\x00\xe5\x03\x29\x8f\x7f"
+                         "\x70\x74\xe6\xfe\x60\x75",
+};
+aead_test_vector_t aes_gcm23 = {
+       .alg = ENCR_AES_GCM_ICV16, .key_size = 32, .salt_size = 4,
+       .len = 70, .alen = 69,
+       .key    = "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                         "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                         "\xca\xfe\xba\xbe",
+       .iv             = "\xfa\xce\xdb\xad\xde\xca\xf8\x88",
+       .adata  = "\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xed\xfa\xce\xde\xad\xbe\xef"
+                         "\xab\xad\xda\xd2\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xed\xfa\xce"
+                         "\xde\xad\xbe\xef\xda\xd2\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xda"
+                         "\xd2\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xda\xd2\xfe\xed\xfa\xce"
+                         "\xde\xad\xbe\xef\xfe",
+       .plain  = "\xd9\x31\x32\x25\xf8\x84\x06\xe5\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
+                         "\x86\xa7\xa9\x53\x15\x34\xf7\xda\x2e\x4c\x30\x3d\x8a\x31\x8a\x72"
+                         "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25"
+                         "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25"
+                         "\xb1\x6a\xed\xf5\xaa\x0d",
+       .cipher = "\x52\x2d\xc1\xf0\x99\x56\x7d\x07\xf4\x7f\x37\xa3\x2a\x84\x42\x7d"
+                         "\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9\x75\x98\xa2\xbd\x25\x55\xd1\xaa"
+                         "\x8c\xb0\x8e\x48\x59\x0d\xbb\x3d\xa7\xb0\x8b\x10\x56\x82\x88\x38"
+                         "\x68\xa0\xff\x03\xac\xdf\x95\x0e\x29\x65\x83\x7f\xda\x89\x72\xdd"
+                         "\xd5\xc5\x96\xa3\x4a\xe0\xa8\xb6\x0f\xfe\xd5\xe5\x33\xf4\x37\x74"
+                         "\x83\x93\xf8\xaf\x80\x43",
+};
index f151fb3..b38f2cb 100644 (file)
@@ -229,6 +229,41 @@ START_TEST(test_strpfx)
 END_TEST
 
 /*******************************************************************************
+ * malloc_align/free_align
+ */
+
+START_TEST(test_malloc_align)
+{
+       void *ptr[128][256];
+       int size, align;
+
+       /* allocate every combination of size 0..127 and alignment 0..255 */
+       for (size = 0; size < countof(ptr); size++)
+       {
+               for (align = 0; align < countof(ptr[0]); align++)
+               {
+                       ptr[size][align] = malloc_align(size, align);
+                       if (align)
+                       {
+                               /* the returned pointer must honor the requested alignment */
+                               ck_assert((uintptr_t)ptr[size][align] % align == 0);
+                       }
+                       if (size)
+                       {
+                               /* non-empty allocations must succeed and be fully writable */
+                               ck_assert(ptr[size][align]);
+                               memset(ptr[size][align], 0xEF, size);
+                       }
+               }
+       }
+       /* free in a second pass so all allocations are live simultaneously,
+        * stressing the allocator's bookkeeping */
+       for (size = 0; size < countof(ptr); size++)
+       {
+               for (align = 0; align < countof(ptr[0]); align++)
+               {
+                       free_align(ptr[size][align]);
+               }
+       }
+}
+END_TEST
+
+/*******************************************************************************
  * memxor
  */
 
@@ -816,6 +851,10 @@ Suite *utils_suite_create()
        tcase_add_loop_test(tc, test_strpfx, 0, countof(strpfx_data));
        suite_add_tcase(s, tc);
 
+       tc = tcase_create("malloc_align");
+       tcase_add_test(tc, test_malloc_align);
+       suite_add_tcase(s, tc);
+
        tc = tcase_create("memxor");
        tcase_add_test(tc, test_memxor);
        tcase_add_test(tc, test_memxor_aligned);
index da57ab4..aedd8c0 100644 (file)
@@ -313,7 +313,7 @@ void test_fail_if_worker_failed();
        test_fail_if_worker_failed(); \
        if (!(x)) \
        { \
-               test_fail_msg(__FILE__, __LINE__, #x); \
+               test_fail_msg(__FILE__, __LINE__, "%s", #x); \
        } \
 })
 
@@ -329,7 +329,7 @@ void test_fail_if_worker_failed();
        test_fail_if_worker_failed(); \
        if (!(x)) \
        { \
-               test_fail_msg(__FILE__, __LINE__, #x ": " fmt, ##__VA_ARGS__); \
+               test_fail_msg(__FILE__, __LINE__, "%s: " fmt, #x, ##__VA_ARGS__); \
        } \
 })
 
@@ -349,7 +349,7 @@ void test_fail_if_worker_failed();
        test_fail_if_worker_failed(); \
        if (x) \
        { \
-               test_fail_msg(__FILE__, __LINE__, #x ": " fmt, ##__VA_ARGS__); \
+		test_fail_msg(__FILE__, __LINE__, "%s: " fmt, #x, ##__VA_ARGS__); \
        } \
 })
 #define fail_unless test_assert_msg
index 3d5e3df..119c656 100644 (file)
@@ -61,6 +61,50 @@ ENUM(status_names, SUCCESS, NEED_MORE,
 /**
  * Described in header.
  */
+void* malloc_align(size_t size, u_int8_t align)
+{
+       u_int8_t pad;
+       void *ptr;
+
+       if (align == 0)
+       {
+               align = 1;
+       }
+       ptr = malloc(align + sizeof(pad) + size);
+       if (!ptr)
+       {
+               return NULL;
+       }
+       /* store padding length just before data, down to the allocation boundary
+        * to do some verification during free_align() */
+       pad = align - ((uintptr_t)ptr % align);
+       memset(ptr, pad, pad);
+       return ptr + pad;
+}
+
+/**
+ * Described in header.
+ */
+void free_align(void *ptr)
+{
+       u_int8_t pad, *pos;
+
+       pos = ptr - 1;
+       /* verify padding to check any corruption */
+       for (pad = *pos; (void*)pos >= ptr - pad; pos--)
+       {
+               if (*pos != pad)
+               {
+                       DBG1(DBG_LIB, "!!!! invalid free_align() !!!!");
+                       return;
+               }
+       }
+       free(ptr - pad);
+}
+
+/**
+ * Described in header.
+ */
 void memxor(u_int8_t dst[], u_int8_t src[], size_t n)
 {
        int m, i;
index 029a375..c42f881 100644 (file)
@@ -274,6 +274,50 @@ static inline void *memset_noop(void *s, int c, size_t n)
                                                   *(this) = (typeof(*(this))){ __VA_ARGS__ }; }
 
 /**
+ * Aligning version of INIT().
+ *
+ * The returned pointer must be freed using free_align(), not free().
+ *
+ * @param this         object to allocate/initialize
+ * @param align                alignment for allocation, in bytes
+ * @param ...          initializer
+ */
+#define INIT_ALIGN(this, align, ...) { \
+                                               (this) = malloc_align(sizeof(*(this)), align); \
+                                               *(this) = (typeof(*(this))){ __VA_ARGS__ }; }
+
+/**
+ * Object allocation/initialization macro, with extra allocated bytes at tail.
+ *
+ * The extra space gets zero-initialized.
+ *
+ * @param this         pointer to object to allocate memory for
+ * @param extra                number of bytes to allocate at end of this
+ * @param ...          initializer
+ */
+#define INIT_EXTRA(this, extra, ...) { \
+                                               typeof(extra) _extra = (extra); \
+                                               (this) = malloc(sizeof(*(this)) + _extra); \
+                                               *(this) = (typeof(*(this))){ __VA_ARGS__ }; \
+                                               memset((this) + 1, 0, _extra); }
+
+/**
+ * Aligning version of INIT_EXTRA().
+ *
+ * The returned pointer must be freed using free_align(), not free().
+ *
+ * @param this         object to allocate/initialize
+ * @param extra                number of bytes to allocate at end of this
+ * @param align                alignment for allocation, in bytes
+ * @param ...          initializer
+ */
+#define INIT_EXTRA_ALIGN(this, extra, align, ...) { \
+                                               typeof(extra) _extra = (extra); \
+                                               (this) = malloc_align(sizeof(*(this)) + _extra, align); \
+                                               *(this) = (typeof(*(this))){ __VA_ARGS__ }; \
+                                               memset((this) + 1, 0, _extra); }
+
+/**
  * Method declaration/definition macro, providing private and public interface.
  *
  * Defines a method name with this as first parameter and a return value ret,
@@ -552,6 +596,24 @@ typedef struct timespec timespec_t;
 typedef struct sockaddr sockaddr_t;
 
 /**
+ * malloc(), but returns aligned memory.
+ *
+ * The returned pointer must be freed using free_align(), not free().
+ *
+ * @param size                 size of allocated data
+ * @param align                        alignment, up to 255 bytes, usually a power of 2
+ * @return                             allocated hunk, aligned to align bytes
+ */
+void* malloc_align(size_t size, u_int8_t align);
+
+/**
+ * Free a hunk allocated by malloc_align().
+ *
+ * @param ptr                  hunk to free
+ */
+void free_align(void *ptr);
+
+/**
  * Same as memcpy, but XORs src into dst instead of copy
  */
 void memxor(u_int8_t dest[], u_int8_t src[], size_t n);