libhydra: Move all kernel plugins to libcharon
author Tobias Brunner <tobias@strongswan.org>
Fri, 12 Feb 2016 14:21:54 +0000 (15:21 +0100)
committer Tobias Brunner <tobias@strongswan.org>
Thu, 3 Mar 2016 16:36:11 +0000 (17:36 +0100)
51 files changed:
configure.ac
src/libcharon/Android.mk
src/libcharon/Makefile.am
src/libcharon/plugins/kernel_netlink/.gitignore [new file with mode: 0644]
src/libcharon/plugins/kernel_netlink/Makefile.am [new file with mode: 0644]
src/libcharon/plugins/kernel_netlink/kernel_netlink_ipsec.c [new file with mode: 0644]
src/libcharon/plugins/kernel_netlink/kernel_netlink_ipsec.h [new file with mode: 0644]
src/libcharon/plugins/kernel_netlink/kernel_netlink_net.c [new file with mode: 0644]
src/libcharon/plugins/kernel_netlink/kernel_netlink_net.h [new file with mode: 0644]
src/libcharon/plugins/kernel_netlink/kernel_netlink_plugin.c [new file with mode: 0644]
src/libcharon/plugins/kernel_netlink/kernel_netlink_plugin.h [new file with mode: 0644]
src/libcharon/plugins/kernel_netlink/kernel_netlink_shared.c [new file with mode: 0644]
src/libcharon/plugins/kernel_netlink/kernel_netlink_shared.h [new file with mode: 0644]
src/libcharon/plugins/kernel_netlink/suites/test_socket.c [new file with mode: 0644]
src/libcharon/plugins/kernel_netlink/tests.c [new file with mode: 0644]
src/libcharon/plugins/kernel_netlink/tests.h [new file with mode: 0644]
src/libcharon/plugins/kernel_pfkey/Makefile.am [new file with mode: 0644]
src/libcharon/plugins/kernel_pfkey/kernel_pfkey_ipsec.c [new file with mode: 0644]
src/libcharon/plugins/kernel_pfkey/kernel_pfkey_ipsec.h [new file with mode: 0644]
src/libcharon/plugins/kernel_pfkey/kernel_pfkey_plugin.c [new file with mode: 0644]
src/libcharon/plugins/kernel_pfkey/kernel_pfkey_plugin.h [new file with mode: 0644]
src/libcharon/plugins/kernel_pfroute/Makefile.am [new file with mode: 0644]
src/libcharon/plugins/kernel_pfroute/kernel_pfroute_net.c [new file with mode: 0644]
src/libcharon/plugins/kernel_pfroute/kernel_pfroute_net.h [new file with mode: 0644]
src/libcharon/plugins/kernel_pfroute/kernel_pfroute_plugin.c [new file with mode: 0644]
src/libcharon/plugins/kernel_pfroute/kernel_pfroute_plugin.h [new file with mode: 0644]
src/libhydra/Android.mk
src/libhydra/Makefile.am
src/libhydra/plugins/kernel_netlink/.gitignore [deleted file]
src/libhydra/plugins/kernel_netlink/Makefile.am [deleted file]
src/libhydra/plugins/kernel_netlink/kernel_netlink_ipsec.c [deleted file]
src/libhydra/plugins/kernel_netlink/kernel_netlink_ipsec.h [deleted file]
src/libhydra/plugins/kernel_netlink/kernel_netlink_net.c [deleted file]
src/libhydra/plugins/kernel_netlink/kernel_netlink_net.h [deleted file]
src/libhydra/plugins/kernel_netlink/kernel_netlink_plugin.c [deleted file]
src/libhydra/plugins/kernel_netlink/kernel_netlink_plugin.h [deleted file]
src/libhydra/plugins/kernel_netlink/kernel_netlink_shared.c [deleted file]
src/libhydra/plugins/kernel_netlink/kernel_netlink_shared.h [deleted file]
src/libhydra/plugins/kernel_netlink/suites/test_socket.c [deleted file]
src/libhydra/plugins/kernel_netlink/tests.c [deleted file]
src/libhydra/plugins/kernel_netlink/tests.h [deleted file]
src/libhydra/plugins/kernel_pfkey/Makefile.am [deleted file]
src/libhydra/plugins/kernel_pfkey/kernel_pfkey_ipsec.c [deleted file]
src/libhydra/plugins/kernel_pfkey/kernel_pfkey_ipsec.h [deleted file]
src/libhydra/plugins/kernel_pfkey/kernel_pfkey_plugin.c [deleted file]
src/libhydra/plugins/kernel_pfkey/kernel_pfkey_plugin.h [deleted file]
src/libhydra/plugins/kernel_pfroute/Makefile.am [deleted file]
src/libhydra/plugins/kernel_pfroute/kernel_pfroute_net.c [deleted file]
src/libhydra/plugins/kernel_pfroute/kernel_pfroute_net.h [deleted file]
src/libhydra/plugins/kernel_pfroute/kernel_pfroute_plugin.c [deleted file]
src/libhydra/plugins/kernel_pfroute/kernel_pfroute_plugin.h [deleted file]

index 6278076..b1c91e2 100644 (file)
@@ -1280,9 +1280,8 @@ cmd_plugins=
 aikgen_plugins=
 
 # location specific lists for checksumming,
-# for src/libcharon, src/libhydra, src/libstrongswan and src/libtnccs
+# for src/libcharon, src/libstrongswan and src/libtnccs
 c_plugins=
-h_plugins=
 s_plugins=
 t_plugins=
 
@@ -1347,9 +1346,9 @@ ADD_PLUGIN([load-tester],          [c charon])
 ADD_PLUGIN([kernel-libipsec],      [c charon cmd])
 ADD_PLUGIN([kernel-wfp],           [c charon])
 ADD_PLUGIN([kernel-iph],           [c charon])
-ADD_PLUGIN([kernel-pfkey],         [h charon starter nm cmd])
-ADD_PLUGIN([kernel-pfroute],       [h charon starter nm cmd])
-ADD_PLUGIN([kernel-netlink],       [h charon starter nm cmd])
+ADD_PLUGIN([kernel-pfkey],         [c charon starter nm cmd])
+ADD_PLUGIN([kernel-pfroute],       [c charon starter nm cmd])
+ADD_PLUGIN([kernel-netlink],       [c charon starter nm cmd])
 ADD_PLUGIN([resolve],              [c charon cmd])
 ADD_PLUGIN([socket-default],       [c charon nm cmd])
 ADD_PLUGIN([socket-dynamic],       [c charon cmd])
@@ -1511,6 +1510,9 @@ AM_CONDITIONAL(USE_UPDOWN, test x$updown = xtrue)
 AM_CONDITIONAL(USE_DHCP, test x$dhcp = xtrue)
 AM_CONDITIONAL(USE_LOAD_TESTER, test x$load_tester = xtrue)
 AM_CONDITIONAL(USE_HA, test x$ha = xtrue)
+AM_CONDITIONAL(USE_KERNEL_NETLINK, test x$kernel_netlink = xtrue)
+AM_CONDITIONAL(USE_KERNEL_PFKEY, test x$kernel_pfkey = xtrue)
+AM_CONDITIONAL(USE_KERNEL_PFROUTE, test x$kernel_pfroute = xtrue)
 AM_CONDITIONAL(USE_KERNEL_LIBIPSEC, test x$kernel_libipsec = xtrue)
 AM_CONDITIONAL(USE_KERNEL_WFP, test x$kernel_wfp = xtrue)
 AM_CONDITIONAL(USE_KERNEL_IPH, test x$kernel_iph = xtrue)
@@ -1578,12 +1580,6 @@ AM_CONDITIONAL(USE_RESOLVE, test x$resolve = xtrue)
 AM_CONDITIONAL(USE_ATTR, test x$attr = xtrue)
 AM_CONDITIONAL(USE_ATTR_SQL, test x$attr_sql = xtrue)
 
-#  hydra plugins
-# ---------------
-AM_CONDITIONAL(USE_KERNEL_NETLINK, test x$kernel_netlink = xtrue)
-AM_CONDITIONAL(USE_KERNEL_PFKEY, test x$kernel_pfkey = xtrue)
-AM_CONDITIONAL(USE_KERNEL_PFROUTE, test x$kernel_pfroute = xtrue)
-
 #  other options
 # ---------------
 AM_CONDITIONAL(USE_LEAK_DETECTIVE, test x$leak_detective = xtrue)
@@ -1749,9 +1745,6 @@ AC_CONFIG_FILES([
        src/libstrongswan/plugins/test_vectors/Makefile
        src/libstrongswan/tests/Makefile
        src/libhydra/Makefile
-       src/libhydra/plugins/kernel_netlink/Makefile
-       src/libhydra/plugins/kernel_pfkey/Makefile
-       src/libhydra/plugins/kernel_pfroute/Makefile
        src/libhydra/tests/Makefile
        src/libipsec/Makefile
        src/libipsec/tests/Makefile
@@ -1828,6 +1821,9 @@ AC_CONFIG_FILES([
        src/libcharon/plugins/unity/Makefile
        src/libcharon/plugins/uci/Makefile
        src/libcharon/plugins/ha/Makefile
+       src/libcharon/plugins/kernel_netlink/Makefile
+       src/libcharon/plugins/kernel_pfkey/Makefile
+       src/libcharon/plugins/kernel_pfroute/Makefile
        src/libcharon/plugins/kernel_libipsec/Makefile
        src/libcharon/plugins/kernel_wfp/Makefile
        src/libcharon/plugins/kernel_iph/Makefile
@@ -1922,6 +1918,5 @@ AC_MSG_RESULT([-----------------------------------------------------])
 
 AC_MSG_RESULT([libstrongswan:$s_plugins])
 AC_MSG_RESULT([libcharon:    $c_plugins])
-AC_MSG_RESULT([libhydra:     $h_plugins])
 AC_MSG_RESULT([libtnccs:     $t_plugins])
 AC_MSG_RESULT([])
index 1008579..65d0e2b 100644 (file)
@@ -216,6 +216,10 @@ endif
 
 LOCAL_SRC_FILES += $(call add_plugin, load-tester)
 
+LOCAL_SRC_FILES += $(call add_plugin, kernel-pfkey)
+
+LOCAL_SRC_FILES += $(call add_plugin, kernel-netlink)
+
 LOCAL_SRC_FILES += $(call add_plugin, socket-default)
 
 LOCAL_SRC_FILES += $(call add_plugin, socket-dynamic)
index 66fce81..0eee3c2 100644 (file)
@@ -512,6 +512,27 @@ if MONOLITHIC
 endif
 endif
 
+if USE_KERNEL_PFKEY
+  SUBDIRS += plugins/kernel_pfkey
+if MONOLITHIC
+  libcharon_la_LIBADD += plugins/kernel_pfkey/libstrongswan-kernel-pfkey.la
+endif
+endif
+
+if USE_KERNEL_PFROUTE
+  SUBDIRS += plugins/kernel_pfroute
+if MONOLITHIC
+  libcharon_la_LIBADD += plugins/kernel_pfroute/libstrongswan-kernel-pfroute.la
+endif
+endif
+
+if USE_KERNEL_NETLINK
+  SUBDIRS += plugins/kernel_netlink
+if MONOLITHIC
+  libcharon_la_LIBADD += plugins/kernel_netlink/libstrongswan-kernel-netlink.la
+endif
+endif
+
 if USE_KERNEL_LIBIPSEC
   SUBDIRS += plugins/kernel_libipsec
 if MONOLITHIC
diff --git a/src/libcharon/plugins/kernel_netlink/.gitignore b/src/libcharon/plugins/kernel_netlink/.gitignore
new file mode 100644 (file)
index 0000000..2b29f27
--- /dev/null
@@ -0,0 +1 @@
+tests
diff --git a/src/libcharon/plugins/kernel_netlink/Makefile.am b/src/libcharon/plugins/kernel_netlink/Makefile.am
new file mode 100644 (file)
index 0000000..cc88554
--- /dev/null
@@ -0,0 +1,44 @@
+AM_CPPFLAGS = \
+       -I${linux_headers} \
+       -I$(top_srcdir)/src/libstrongswan \
+       -I$(top_srcdir)/src/libhydra \
+       -DROUTING_TABLE=${routing_table} \
+       -DROUTING_TABLE_PRIO=${routing_table_prio}
+
+AM_CFLAGS = \
+       $(PLUGIN_CFLAGS)
+
+if MONOLITHIC
+noinst_LTLIBRARIES = libstrongswan-kernel-netlink.la
+else
+plugin_LTLIBRARIES = libstrongswan-kernel-netlink.la
+endif
+
+libstrongswan_kernel_netlink_la_SOURCES = \
+       kernel_netlink_plugin.h kernel_netlink_plugin.c \
+       kernel_netlink_ipsec.h kernel_netlink_ipsec.c \
+       kernel_netlink_net.h kernel_netlink_net.c \
+       kernel_netlink_shared.h kernel_netlink_shared.c
+
+libstrongswan_kernel_netlink_la_LDFLAGS = -module -avoid-version
+
+
+TESTS = tests
+
+check_PROGRAMS = $(TESTS)
+
+tests_SOURCES = \
+       tests.h tests.c \
+       suites/test_socket.c \
+       kernel_netlink_shared.c
+
+tests_CFLAGS = \
+       -I$(top_srcdir)/src/libstrongswan \
+       -I$(top_srcdir)/src/libstrongswan/tests \
+       -DNETLINK_MSG_LOSS_HOOK=netlink_msg_loss \
+       @COVERAGE_CFLAGS@
+
+tests_LDFLAGS = @COVERAGE_LDFLAGS@
+tests_LDADD = \
+       $(top_builddir)/src/libstrongswan/libstrongswan.la \
+       $(top_builddir)/src/libstrongswan/tests/libtest.la
diff --git a/src/libcharon/plugins/kernel_netlink/kernel_netlink_ipsec.c b/src/libcharon/plugins/kernel_netlink/kernel_netlink_ipsec.c
new file mode 100644 (file)
index 0000000..8c506d9
--- /dev/null
@@ -0,0 +1,2966 @@
+/*
+ * Copyright (C) 2006-2015 Tobias Brunner
+ * Copyright (C) 2005-2009 Martin Willi
+ * Copyright (C) 2008 Andreas Steffen
+ * Copyright (C) 2006-2007 Fabian Hartmann, Noah Heusser
+ * Copyright (C) 2006 Daniel Roethlisberger
+ * Copyright (C) 2005 Jan Hutter
+ * Hochschule fuer Technik Rapperswil
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <stdint.h>
+#include <linux/ipsec.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/xfrm.h>
+#include <linux/udp.h>
+#include <unistd.h>
+#include <time.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include "kernel_netlink_ipsec.h"
+#include "kernel_netlink_shared.h"
+
+#include <hydra.h>
+#include <utils/debug.h>
+#include <threading/mutex.h>
+#include <collections/array.h>
+#include <collections/hashtable.h>
+#include <collections/linked_list.h>
+
+/** Required for Linux 2.6.26 kernel and later */
+#ifndef XFRM_STATE_AF_UNSPEC
+#define XFRM_STATE_AF_UNSPEC 32
+#endif
+
+/** From linux/in.h */
+#ifndef IP_XFRM_POLICY
+#define IP_XFRM_POLICY 17
+#endif
+
+/** Missing on uclibc */
+#ifndef IPV6_XFRM_POLICY
+#define IPV6_XFRM_POLICY 34
+#endif /*IPV6_XFRM_POLICY*/
+
+/* from linux/udp.h */
+#ifndef UDP_ENCAP
+#define UDP_ENCAP 100
+#endif
+
+#ifndef UDP_ENCAP_ESPINUDP
+#define UDP_ENCAP_ESPINUDP 2
+#endif
+
+/* this is not defined on some platforms */
+#ifndef SOL_UDP
+#define SOL_UDP IPPROTO_UDP
+#endif
+
+/** Base priority for installed policies */
+#define PRIO_BASE 384
+
+/** Default lifetime of an acquire XFRM state (in seconds) */
+#define DEFAULT_ACQUIRE_LIFETIME 165
+
+/**
+ * Map the limit for bytes and packets to XFRM_INF by default
+ */
+#define XFRM_LIMIT(x) ((x) == 0 ? XFRM_INF : (x))
+
+/**
+ * Create ORable bitfield of XFRM NL groups
+ */
+#define XFRMNLGRP(x) (1<<(XFRMNLGRP_##x-1))
+
+/**
+ * Returns a pointer to the first rtattr following the nlmsghdr *nlh and the
+ * 'usual' netlink data x like 'struct xfrm_usersa_info'
+ */
+#define XFRM_RTA(nlh, x) ((struct rtattr*)(NLMSG_DATA(nlh) + \
+                                                                                  NLMSG_ALIGN(sizeof(x))))
+/**
+ * Returns the total size of attached rta data
+ * (after 'usual' netlink data x like 'struct xfrm_usersa_info')
+ */
+#define XFRM_PAYLOAD(nlh, x) NLMSG_PAYLOAD(nlh, sizeof(x))
+
+typedef struct kernel_algorithm_t kernel_algorithm_t;
+
+/**
+ * Mapping of an IKEv2 algorithm identifier to the name of the corresponding
+ * algorithm in the Linux crypto API
+ */
+struct kernel_algorithm_t {
+       /**
+        * Identifier specified in IKEv2
+        */
+       int ikev2;
+
+       /**
+        * Name of the algorithm in linux crypto API
+        */
+       char *name;
+};
+
+/**
+ * Names for the XFRM netlink message types XFRM_MSG_NEWSA through
+ * XFRM_MSG_MAPPING.
+ *
+ * NOTE(review): the ENUM macro is defined outside this file; presumably the
+ * strings are matched to the constants by position, so confirm before
+ * reordering or inserting entries.
+ */
+ENUM(xfrm_msg_names, XFRM_MSG_NEWSA, XFRM_MSG_MAPPING,
+       "XFRM_MSG_NEWSA",
+       "XFRM_MSG_DELSA",
+       "XFRM_MSG_GETSA",
+       "XFRM_MSG_NEWPOLICY",
+       "XFRM_MSG_DELPOLICY",
+       "XFRM_MSG_GETPOLICY",
+       "XFRM_MSG_ALLOCSPI",
+       "XFRM_MSG_ACQUIRE",
+       "XFRM_MSG_EXPIRE",
+       "XFRM_MSG_UPDPOLICY",
+       "XFRM_MSG_UPDSA",
+       "XFRM_MSG_POLEXPIRE",
+       "XFRM_MSG_FLUSHSA",
+       "XFRM_MSG_FLUSHPOLICY",
+       "XFRM_MSG_NEWAE",
+       "XFRM_MSG_GETAE",
+       "XFRM_MSG_REPORT",
+       "XFRM_MSG_MIGRATE",
+       "XFRM_MSG_NEWSADINFO",
+       "XFRM_MSG_GETSADINFO",
+       "XFRM_MSG_NEWSPDINFO",
+       "XFRM_MSG_GETSPDINFO",
+       "XFRM_MSG_MAPPING"
+);
+
+/**
+ * Names for the XFRM netlink attribute types XFRMA_UNSPEC through
+ * XFRMA_REPLAY_ESN_VAL.
+ *
+ * NOTE(review): the ENUM macro is defined outside this file; presumably the
+ * strings are matched to the constants by position, so confirm before
+ * reordering or inserting entries.
+ */
+ENUM(xfrm_attr_type_names, XFRMA_UNSPEC, XFRMA_REPLAY_ESN_VAL,
+       "XFRMA_UNSPEC",
+       "XFRMA_ALG_AUTH",
+       "XFRMA_ALG_CRYPT",
+       "XFRMA_ALG_COMP",
+       "XFRMA_ENCAP",
+       "XFRMA_TMPL",
+       "XFRMA_SA",
+       "XFRMA_POLICY",
+       "XFRMA_SEC_CTX",
+       "XFRMA_LTIME_VAL",
+       "XFRMA_REPLAY_VAL",
+       "XFRMA_REPLAY_THRESH",
+       "XFRMA_ETIMER_THRESH",
+       "XFRMA_SRCADDR",
+       "XFRMA_COADDR",
+       "XFRMA_LASTUSED",
+       "XFRMA_POLICY_TYPE",
+       "XFRMA_MIGRATE",
+       "XFRMA_ALG_AEAD",
+       "XFRMA_KMADDRESS",
+       "XFRMA_ALG_AUTH_TRUNC",
+       "XFRMA_MARK",
+       "XFRMA_TFCPAD",
+       "XFRMA_REPLAY_ESN_VAL",
+);
+
+/**
+ * Algorithms for encryption
+ *
+ * Maps IKEv2 encryption identifiers to Linux crypto API names. Entries that
+ * are commented out (name "***") have no kernel name assigned in this table.
+ * The ICV8/12/16 variants of AES-CCM and of AES-GCM each share a single
+ * kernel name.
+ */
+static kernel_algorithm_t encryption_algs[] = {
+/*     {ENCR_DES_IV64,                         "***"                           }, */
+       {ENCR_DES,                                      "des"                           },
+       {ENCR_3DES,                                     "des3_ede"                      },
+/*     {ENCR_RC5,                                      "***"                           }, */
+/*     {ENCR_IDEA,                                     "***"                           }, */
+       {ENCR_CAST,                                     "cast5"                         },
+       {ENCR_BLOWFISH,                         "blowfish"                      },
+/*     {ENCR_3IDEA,                            "***"                           }, */
+/*     {ENCR_DES_IV32,                         "***"                           }, */
+       {ENCR_NULL,                                     "cipher_null"           },
+       {ENCR_AES_CBC,                          "aes"                           },
+       {ENCR_AES_CTR,                          "rfc3686(ctr(aes))"     },
+       {ENCR_AES_CCM_ICV8,                     "rfc4309(ccm(aes))"     },
+       {ENCR_AES_CCM_ICV12,            "rfc4309(ccm(aes))"     },
+       {ENCR_AES_CCM_ICV16,            "rfc4309(ccm(aes))"     },
+       {ENCR_AES_GCM_ICV8,                     "rfc4106(gcm(aes))"     },
+       {ENCR_AES_GCM_ICV12,            "rfc4106(gcm(aes))"     },
+       {ENCR_AES_GCM_ICV16,            "rfc4106(gcm(aes))"     },
+       {ENCR_NULL_AUTH_AES_GMAC,       "rfc4543(gcm(aes))"     },
+       {ENCR_CAMELLIA_CBC,                     "cbc(camellia)"         },
+/*     {ENCR_CAMELLIA_CTR,                     "***"                           }, */
+/*     {ENCR_CAMELLIA_CCM_ICV8,        "***"                           }, */
+/*     {ENCR_CAMELLIA_CCM_ICV12,       "***"                           }, */
+/*     {ENCR_CAMELLIA_CCM_ICV16,       "***"                           }, */
+       {ENCR_SERPENT_CBC,                      "serpent"                       },
+       {ENCR_TWOFISH_CBC,                      "twofish"                       },
+       {ENCR_CHACHA20_POLY1305,        "rfc7539esp(chacha20,poly1305)"},
+};
+
+/**
+ * Algorithms for integrity protection
+ *
+ * Maps IKEv2 integrity identifiers to Linux crypto API names. Note that the
+ * _96 variants of MD5, SHA1 and SHA2-256 are listed with the plain digest
+ * name, while the other HMAC variants use the "hmac(...)" form. Entries that
+ * are commented out (name "***") have no kernel name assigned in this table.
+ */
+static kernel_algorithm_t integrity_algs[] = {
+       {AUTH_HMAC_MD5_96,                      "md5"                           },
+       {AUTH_HMAC_MD5_128,                     "hmac(md5)"                     },
+       {AUTH_HMAC_SHA1_96,                     "sha1"                          },
+       {AUTH_HMAC_SHA1_160,            "hmac(sha1)"            },
+       {AUTH_HMAC_SHA2_256_96,         "sha256"                        },
+       {AUTH_HMAC_SHA2_256_128,        "hmac(sha256)"          },
+       {AUTH_HMAC_SHA2_384_192,        "hmac(sha384)"          },
+       {AUTH_HMAC_SHA2_512_256,        "hmac(sha512)"          },
+/*     {AUTH_DES_MAC,                          "***"                           }, */
+/*     {AUTH_KPDK_MD5,                         "***"                           }, */
+       {AUTH_AES_XCBC_96,                      "xcbc(aes)"                     },
+};
+
+/**
+ * Algorithms for IPComp
+ *
+ * Maps IPComp transform identifiers to Linux crypto API names; IPCOMP_OUI is
+ * commented out and therefore not mapped by this table.
+ */
+static kernel_algorithm_t compression_algs[] = {
+/*     {IPCOMP_OUI,                            "***"                           }, */
+       {IPCOMP_DEFLATE,                        "deflate"                       },
+       {IPCOMP_LZS,                            "lzs"                           },
+       {IPCOMP_LZJH,                           "lzjh"                          },
+};
+
+/**
+ * Look up the Linux crypto API name of an IKEv2 algorithm
+ *
+ * The static table matching the transform type is searched first. If the
+ * identifier is not listed there, the lookup is delegated to
+ * hydra->kernel_interface->lookup_algorithm().
+ *
+ * @param type         transform type, selects the table to search
+ * @param ikev2        IKEv2 algorithm identifier
+ * @return             algorithm name, NULL if the transform type is not
+ *                     handled or no name is known for the identifier
+ */
+static char* lookup_algorithm(transform_type_t type, int ikev2)
+{
+       kernel_algorithm_t *algs, *alg;
+       char *name;
+       int count;
+
+       if (type == ENCRYPTION_ALGORITHM)
+       {
+               algs = encryption_algs;
+               count = countof(encryption_algs);
+       }
+       else if (type == INTEGRITY_ALGORITHM)
+       {
+               algs = integrity_algs;
+               count = countof(integrity_algs);
+       }
+       else if (type == COMPRESSION_ALGORITHM)
+       {
+               algs = compression_algs;
+               count = countof(compression_algs);
+       }
+       else
+       {
+               return NULL;
+       }
+       for (alg = algs; alg < algs + count; alg++)
+       {
+               if (alg->ikev2 == ikev2)
+               {
+                       return alg->name;
+               }
+       }
+       /* not in our own tables, ask the kernel interface */
+       if (!hydra->kernel_interface->lookup_algorithm(hydra->kernel_interface,
+                                                       ikev2, type, NULL, &name))
+       {
+               return NULL;
+       }
+       return name;
+}
+
+typedef struct private_kernel_netlink_ipsec_t private_kernel_netlink_ipsec_t;
+
+/**
+ * Private variables and functions of kernel_netlink_ipsec class.
+ */
+struct private_kernel_netlink_ipsec_t {
+       /**
+        * Public part of the kernel_netlink_ipsec_t object
+        */
+       kernel_netlink_ipsec_t public;
+
+       /**
+        * Mutex to lock access to installed policies
+        */
+       mutex_t *mutex;
+
+       /**
+        * Hash table of installed policies (policy_entry_t)
+        */
+       hashtable_t *policies;
+
+       /**
+        * Hash table of IPsec SAs using policies (ipsec_sa_t)
+        */
+       hashtable_t *sas;
+
+       /**
+        * Netlink xfrm socket (IPsec)
+        */
+       netlink_socket_t *socket_xfrm;
+
+       /**
+        * Netlink xfrm socket to receive acquire and expire events
+        */
+       int socket_xfrm_events;
+
+       /**
+        * Whether to install routes along policies
+        */
+       bool install_routes;
+
+       /**
+        * Whether to set protocol and ports on selector installed with transport
+        * mode IPsec SAs
+        */
+       bool proto_port_transport;
+
+       /**
+        * Whether to always use UPDATE to install policies
+        */
+       bool policy_update;
+
+       /**
+        * Installed port based IKE bypass policies, as bypass_t
+        */
+       array_t *bypass;
+};
+
+typedef struct route_entry_t route_entry_t;
+
+/**
+ * Installed routing entry
+ *
+ * All members are released by route_entry_destroy().
+ */
+struct route_entry_t {
+       /** Name of the interface the route is bound to */
+       char *if_name;
+
+       /** Source ip of the route (destroyed unconditionally, so never NULL) */
+       host_t *src_ip;
+
+       /** Gateway for this route (optional, may be NULL) */
+       host_t *gateway;
+
+       /** Destination net */
+       chunk_t dst_net;
+
+       /** Destination net prefixlen */
+       u_int8_t prefixlen;
+};
+
+/**
+ * Destroy a route_entry_t object
+ *
+ * Releases the destination net, the optional gateway, the source address and
+ * the interface name before freeing the entry itself.
+ *
+ * @param this         route entry to destroy
+ */
+static void route_entry_destroy(route_entry_t *this)
+{
+       chunk_free(&this->dst_net);
+       /* the gateway is the only optional member */
+       DESTROY_IF(this->gateway);
+       this->src_ip->destroy(this->src_ip);
+       free(this->if_name);
+       free(this);
+}
+
+/**
+ * Compare two route_entry_t objects
+ *
+ * Two entries are equal if they have the same interface name, source address,
+ * gateway, destination net and prefix length. Entries without an interface
+ * name never match. The gateway is optional (route_entry_destroy() releases
+ * it with DESTROY_IF), so it is only dereferenced if set on both entries;
+ * entries match with regard to the gateway if neither has one or if both
+ * gateways have the same IP.
+ *
+ * @param a            first route entry
+ * @param b            second route entry
+ * @return             non-zero if the entries describe the same route
+ */
+static bool route_entry_equals(route_entry_t *a, route_entry_t *b)
+{
+       return a->if_name && b->if_name && streq(a->if_name, b->if_name) &&
+                  a->src_ip->ip_equals(a->src_ip, b->src_ip) &&
+                  ((!a->gateway && !b->gateway) ||
+                       (a->gateway && b->gateway &&
+                        a->gateway->ip_equals(a->gateway, b->gateway))) &&
+                  chunk_equals(a->dst_net, b->dst_net) && a->prefixlen == b->prefixlen;
+}
+
+typedef struct ipsec_sa_t ipsec_sa_t;
+
+/**
+ * IPsec SA assigned to a policy.
+ *
+ * Objects are shared between policies: ipsec_sa_create() returns an existing
+ * entry with the same addresses, mark and config if there is one, and
+ * ipsec_sa_destroy() releases it again.
+ */
+struct ipsec_sa_t {
+       /** Source address of this SA */
+       host_t *src;
+
+       /** Destination address of this SA */
+       host_t *dst;
+
+       /** Optional mark (hashed and compared bytewise) */
+       mark_t mark;
+
+       /** Description of this SA (hashed and compared bytewise) */
+       ipsec_sa_cfg_t cfg;
+
+       /** Reference count for this SA */
+       refcount_t refcount;
+};
+
+/**
+ * Hash function for ipsec_sa_t objects
+ *
+ * Chains the hashes of the raw config, the raw mark, the destination address
+ * and the source address (in that order).
+ *
+ * @param sa           SA to hash
+ * @return             hash value
+ */
+static u_int ipsec_sa_hash(ipsec_sa_t *sa)
+{
+       u_int hash;
+
+       hash = chunk_hash(chunk_from_thing(sa->cfg));
+       hash = chunk_hash_inc(chunk_from_thing(sa->mark), hash);
+       hash = chunk_hash_inc(sa->dst->get_address(sa->dst), hash);
+       return chunk_hash_inc(sa->src->get_address(sa->src), hash);
+}
+
+/**
+ * Equality function for ipsec_sa_t objects
+ *
+ * Addresses are compared by IP, mark and config are compared bytewise.
+ *
+ * @param sa           first SA
+ * @param other_sa     second SA
+ * @return             non-zero if both objects describe the same SA
+ */
+static bool ipsec_sa_equals(ipsec_sa_t *sa, ipsec_sa_t *other_sa)
+{
+       bool equal;
+
+       equal = sa->src->ip_equals(sa->src, other_sa->src);
+       equal = equal && sa->dst->ip_equals(sa->dst, other_sa->dst);
+       equal = equal && memeq(&sa->mark, &other_sa->mark, sizeof(sa->mark));
+       equal = equal && memeq(&sa->cfg, &other_sa->cfg, sizeof(sa->cfg));
+       return equal;
+}
+
+/**
+ * Allocate or reference an IPsec SA object
+ *
+ * A temporary object that only borrows src and dst is used to look up an
+ * existing SA. If none is found, the temporary object gets its own clones of
+ * the addresses and is stored in the table, otherwise it is freed and the
+ * existing SA is used. In both cases a reference is acquired for the caller.
+ *
+ * @param this         kernel interface holding the table of SAs
+ * @param src          source address of the SA (cloned if stored)
+ * @param dst          destination address of the SA (cloned if stored)
+ * @param mark         optional mark
+ * @param cfg          description of the SA, copied
+ * @return             referenced SA, release with ipsec_sa_destroy()
+ */
+static ipsec_sa_t *ipsec_sa_create(private_kernel_netlink_ipsec_t *this,
+                                                                  host_t *src, host_t *dst, mark_t mark,
+                                                                  ipsec_sa_cfg_t *cfg)
+{
+       ipsec_sa_t *entry, *existing;
+
+       INIT(entry,
+               .src = src,
+               .dst = dst,
+               .mark = mark,
+               .cfg = *cfg,
+       );
+       existing = this->sas->get(this->sas, entry);
+       if (existing)
+       {
+               /* the lookup object never owned src/dst, so just drop it */
+               free(entry);
+               entry = existing;
+       }
+       else
+       {
+               entry->src = src->clone(src);
+               entry->dst = dst->clone(dst);
+               this->sas->put(this->sas, entry, entry);
+       }
+       ref_get(&entry->refcount);
+       return entry;
+}
+
+/**
+ * Release and destroy an IPsec SA object
+ *
+ * Drops one reference. Only if ref_put() reports so, the SA is removed from
+ * the table and destroyed together with its addresses.
+ *
+ * @param this         kernel interface holding the table of SAs
+ * @param sa           SA to release
+ */
+static void ipsec_sa_destroy(private_kernel_netlink_ipsec_t *this,
+                                                        ipsec_sa_t *sa)
+{
+       if (!ref_put(&sa->refcount))
+       {
+               return;
+       }
+       this->sas->remove(this->sas, sa);
+       DESTROY_IF(sa->src);
+       DESTROY_IF(sa->dst);
+       free(sa);
+}
+
+typedef struct policy_sa_t policy_sa_t;
+typedef struct policy_sa_fwd_t policy_sa_fwd_t;
+
+/**
+ * Mapping between a policy and an IPsec SA.
+ *
+ * Created by policy_sa_create() and released by policy_sa_destroy(). For
+ * forward policies this is embedded as first member of policy_sa_fwd_t.
+ */
+struct policy_sa_t {
+       /** Priority assigned to the policy when installed with this SA */
+       u_int32_t priority;
+
+       /** Type of the policy */
+       policy_type_t type;
+
+       /** Assigned SA (referenced via ipsec_sa_create()) */
+       ipsec_sa_t *sa;
+};
+
+/**
+ * For forward policies we also cache the traffic selectors in order to install
+ * the route.
+ *
+ * The generic part has to remain the first member, policy_sa_destroy() casts
+ * a policy_sa_t pointer back to this type.
+ */
+struct policy_sa_fwd_t {
+       /** Generic interface */
+       policy_sa_t generic;
+
+       /** Source traffic selector of this policy (owned clone) */
+       traffic_selector_t *src_ts;
+
+       /** Destination traffic selector of this policy (owned clone) */
+       traffic_selector_t *dst_ts;
+};
+
+/**
+ * Create a policy_sa(_fwd)_t object
+ *
+ * For POLICY_FWD a policy_sa_fwd_t holding clones of both traffic selectors
+ * is allocated and its generic part is returned, for all other directions a
+ * plain policy_sa_t is allocated. The SA is obtained from ipsec_sa_create().
+ *
+ * @param this         kernel interface holding the table of SAs
+ * @param dir          direction of the policy
+ * @param type         type of the policy
+ * @param src          source address of the SA
+ * @param dst          destination address of the SA
+ * @param src_ts       source traffic selector, cloned for forward policies
+ * @param dst_ts       destination traffic selector, cloned for forward policies
+ * @param mark         optional mark
+ * @param cfg          description of the SA
+ * @return             new mapping, release with policy_sa_destroy()
+ */
+static policy_sa_t *policy_sa_create(private_kernel_netlink_ipsec_t *this,
+       policy_dir_t dir, policy_type_t type, host_t *src, host_t *dst,
+       traffic_selector_t *src_ts, traffic_selector_t *dst_ts, mark_t mark,
+       ipsec_sa_cfg_t *cfg)
+{
+       policy_sa_fwd_t *fwd;
+       policy_sa_t *mapping;
+
+       if (dir != POLICY_FWD)
+       {
+               INIT(mapping, .priority = 0);
+       }
+       else
+       {
+               INIT(fwd,
+                       .src_ts = src_ts->clone(src_ts),
+                       .dst_ts = dst_ts->clone(dst_ts),
+               );
+               mapping = &fwd->generic;
+       }
+       mapping->type = type;
+       mapping->sa = ipsec_sa_create(this, src, dst, mark, cfg);
+       return mapping;
+}
+
+/**
+ * Destroy a policy_sa(_fwd)_t object
+ *
+ * The direction is passed by reference as this function is also used as
+ * callback for linked_list_t.invoke_function().
+ *
+ * @param policy       mapping to destroy
+ * @param dir          direction of the policy the mapping belongs to
+ * @param this         kernel interface holding the table of SAs
+ */
+static void policy_sa_destroy(policy_sa_t *policy, policy_dir_t *dir,
+                                                         private_kernel_netlink_ipsec_t *this)
+{
+       policy_sa_fwd_t *fwd;
+
+       if (*dir == POLICY_FWD)
+       {
+               /* forward policies are allocated as policy_sa_fwd_t, with the
+                * generic part as first member */
+               fwd = (policy_sa_fwd_t*)policy;
+               fwd->dst_ts->destroy(fwd->dst_ts);
+               fwd->src_ts->destroy(fwd->src_ts);
+       }
+       ipsec_sa_destroy(this, policy->sa);
+       free(policy);
+}
+
+typedef struct policy_entry_t policy_entry_t;
+
+/**
+ * Installed kernel policy.
+ *
+ * Selector, mark and direction identify an entry, see policy_equals().
+ */
+struct policy_entry_t {
+
+       /** Direction of this policy: in, out, forward */
+       u_int8_t direction;
+
+       /** Parameters of installed policy */
+       struct xfrm_selector sel;
+
+       /** Optional mark */
+       u_int32_t mark;
+
+       /** Associated route installed for this policy (may be NULL) */
+       route_entry_t *route;
+
+       /** List of SAs this policy is used by (policy_sa_t), ordered by priority */
+       linked_list_t *used_by;
+
+       /** reqid for this policy */
+       u_int32_t reqid;
+};
+
+/**
+ * Destroy a policy_entry_t object
+ *
+ * Destroys the attached route (if any), releases every mapping in the used_by
+ * list via policy_sa_destroy() and finally frees the entry itself.
+ *
+ * @param this         kernel interface the SAs of the policy are registered with
+ * @param policy       policy entry to destroy
+ */
+static void policy_entry_destroy(private_kernel_netlink_ipsec_t *this,
+                                                                policy_entry_t *policy)
+{
+       /* policy_sa_destroy() dereferences its argument as policy_dir_t, so hand
+        * it an object of that type instead of a pointer to the narrower
+        * u_int8_t member, which would make it read adjacent bytes too */
+       policy_dir_t dir = policy->direction;
+
+       if (policy->route)
+       {
+               route_entry_destroy(policy->route);
+       }
+       if (policy->used_by)
+       {
+               policy->used_by->invoke_function(policy->used_by,
+                                                                               (linked_list_invoke_t)policy_sa_destroy,
+                                                                                &dir, this);
+               policy->used_by->destroy(policy->used_by);
+       }
+       free(policy);
+}
+
+/**
+ * Hash function for policy_entry_t objects
+ *
+ * Only the selector and the mark are hashed. The direction, which
+ * policy_equals() compares too, is not part of the hash.
+ *
+ * @param key          policy entry to hash
+ * @return             hash value
+ */
+static u_int policy_hash(policy_entry_t *key)
+{
+       u_int mark_hash;
+
+       mark_hash = chunk_hash(chunk_from_thing(key->mark));
+       return chunk_hash_inc(chunk_from_thing(key->sel), mark_hash);
+}
+
+/**
+ * Equality function for policy_entry_t objects
+ *
+ * Entries are equal if direction and mark match and the selectors are
+ * bytewise identical.
+ *
+ * @param key          first policy entry
+ * @param other_key    second policy entry
+ * @return             non-zero if both entries identify the same policy
+ */
+static bool policy_equals(policy_entry_t *key, policy_entry_t *other_key)
+{
+       return key->direction == other_key->direction &&
+                  key->mark == other_key->mark &&
+                  memeq(&key->sel, &other_key->sel, sizeof(key->sel));
+}
+
+/**
+ * Calculate the priority of a policy
+ *
+ * PRIO_BASE is used as is for POLICY_PRIORITY_PASS and doubled once for
+ * POLICY_PRIORITY_DEFAULT, twice for POLICY_PRIORITY_ROUTED and three times
+ * for POLICY_PRIORITY_FALLBACK. Both prefix lengths of the selector are
+ * subtracted, so longer prefixes yield a numerically smaller value. The two
+ * least significant bits of the result flag selectors without port masks (2)
+ * and without protocol (1).
+ *
+ * @param policy       policy entry whose selector is evaluated
+ * @param prio         priority class of the policy
+ * @return             calculated priority
+ */
+static inline u_int32_t get_priority(policy_entry_t *policy,
+                                                                        policy_priority_t prio)
+{
+       u_int32_t priority = PRIO_BASE;
+       int shift = 0;
+
+       switch (prio)
+       {
+               case POLICY_PRIORITY_FALLBACK:
+                       shift = 3;
+                       break;
+               case POLICY_PRIORITY_ROUTED:
+                       shift = 2;
+                       break;
+               case POLICY_PRIORITY_DEFAULT:
+                       shift = 1;
+                       break;
+               case POLICY_PRIORITY_PASS:
+                       break;
+       }
+       priority <<= shift;
+       /* the more specific the selector, the smaller the value */
+       priority -= policy->sel.prefixlen_s + policy->sel.prefixlen_d;
+       /* reserve the two lowest bits for the flags below */
+       priority <<= 2;
+       if (!policy->sel.sport_mask && !policy->sel.dport_mask)
+       {
+               priority += 2;
+       }
+       if (!policy->sel.proto)
+       {
+               priority += 1;
+       }
+       return priority;
+}
+
+/**
+ * Convert the general ipsec mode to the one defined in xfrm.h
+ *
+ * @param mode         IPsec mode
+ * @return             XFRM_MODE_* constant for transport, tunnel and BEET
+ *                     mode, any other value is returned as is
+ */
+static u_int8_t mode2kernel(ipsec_mode_t mode)
+{
+       if (mode == MODE_TRANSPORT)
+       {
+               return XFRM_MODE_TRANSPORT;
+       }
+       if (mode == MODE_TUNNEL)
+       {
+               return XFRM_MODE_TUNNEL;
+       }
+       if (mode == MODE_BEET)
+       {
+               return XFRM_MODE_BEET;
+       }
+       /* no mapping known, pass the value through */
+       return mode;
+}
+
+/**
+ * Convert a host_t to a struct xfrm_address
+ *
+ * Copies the raw address of the host, but never more than fits into an
+ * xfrm_address_t. Bytes beyond the address length are left untouched.
+ *
+ * @param host         host to get the address from
+ * @param xfrm         address to write to
+ */
+static void host2xfrm(host_t *host, xfrm_address_t *xfrm)
+{
+       chunk_t addr = host->get_address(host);
+       size_t len = addr.len;
+
+       if (len > sizeof(xfrm_address_t))
+       {
+               len = sizeof(xfrm_address_t);
+       }
+       memcpy(xfrm, addr.ptr, len);
+}
+
+/**
+ * Convert a struct xfrm_address to a host_t
+ *
+ * @param family       address family, AF_INET or AF_INET6
+ * @param xfrm         address to convert
+ * @param port         port in network byte order
+ * @return             host created from the address, NULL for any other
+ *                     address family
+ */
+static host_t* xfrm2host(int family, xfrm_address_t *xfrm, u_int16_t port)
+{
+       u_char *addr;
+       size_t len;
+
+       if (family == AF_INET)
+       {
+               addr = (u_char*)&xfrm->a4;
+               len = sizeof(xfrm->a4);
+       }
+       else if (family == AF_INET6)
+       {
+               addr = (u_char*)&xfrm->a6;
+               len = sizeof(xfrm->a6);
+       }
+       else
+       {
+               return NULL;
+       }
+       return host_create_from_chunk(family, chunk_create(addr, len),
+                                                                 ntohs(port));
+}
+
+/**
+ * Convert a traffic selector address range to subnet and its mask.
+ *
+ * @param ts           traffic selector to convert
+ * @param net          receives the raw network address of the subnet
+ * @param mask         receives the prefix length, as set by ts->to_subnet()
+ */
+static void ts2subnet(traffic_selector_t* ts,
+                                         xfrm_address_t *net, u_int8_t *mask)
+{
+       host_t *subnet;
+       chunk_t addr;
+
+       ts->to_subnet(ts, &subnet, mask);
+       addr = subnet->get_address(subnet);
+       memcpy(net, addr.ptr, addr.len);
+       /* the host was only needed to get at the raw address */
+       subnet->destroy(subnet);
+}
+
+/**
+ * Convert a traffic selector port range to port/portmask
+ *
+ * Linux does not seem to accept complex portmasks, only any or a specific
+ * port is allowed. So a selector for a single port results in that port (in
+ * network byte order) with a full mask, while a port range is mapped to any
+ * port, i.e. port and mask are both set to 0.
+ *
+ * @param ts           traffic selector to get the port range from
+ * @param port         receives the port in network byte order, or 0
+ * @param mask         receives the port mask, either all ones or 0
+ */
+static void ts2ports(traffic_selector_t* ts,
+                                        u_int16_t *port, u_int16_t *mask)
+{
+       u_int16_t first = ts->get_from_port(ts);
+       u_int16_t last = ts->get_to_port(ts);
+
+       if (first != last)
+       {
+               /* port range, match any port */
+               *port = 0;
+               *mask = 0;
+               return;
+       }
+       *port = htons(first);
+       *mask = ~0;
+}
+
+/**
+ * Build an xfrm_selector from a source/destination traffic selector pair.
+ *
+ * The address family is taken from the type of the source selector.  For
+ * ICMP and ICMPv6 the port fields are rewritten to carry message type and
+ * code instead of ports.
+ */
+static struct xfrm_selector ts2selector(traffic_selector_t *src,
+                                                                               traffic_selector_t *dst)
+{
+       struct xfrm_selector sel;
+       u_int8_t src_proto, dst_proto;
+       u_int16_t port;
+       bool icmp;
+
+       memset(&sel, 0, sizeof(sel));
+       if (src->get_type(src) == TS_IPV4_ADDR_RANGE)
+       {
+               sel.family = AF_INET;
+       }
+       else
+       {
+               sel.family = AF_INET6;
+       }
+       /* one side may use protocol "any" (0), so the larger value is the more
+        * restrictive one */
+       src_proto = src->get_protocol(src);
+       dst_proto = dst->get_protocol(dst);
+       sel.proto = (src_proto > dst_proto) ? src_proto : dst_proto;
+
+       ts2subnet(dst, &sel.daddr, &sel.prefixlen_d);
+       ts2subnet(src, &sel.saddr, &sel.prefixlen_s);
+       ts2ports(dst, &sel.dport, &sel.dport_mask);
+       ts2ports(src, &sel.sport, &sel.sport_mask);
+
+       icmp = (sel.proto == IPPROTO_ICMP || sel.proto == IPPROTO_ICMPV6);
+       if (icmp && (sel.dport || sel.sport))
+       {
+               /* for ICMP the kernel wants the message type in the source port
+                * field and the message code in the destination port field */
+               port = ntohs(max(sel.dport, sel.sport));
+               sel.sport = htons(traffic_selector_icmp_type(port));
+               sel.sport_mask = sel.sport ? ~0 : 0;
+               sel.dport = htons(traffic_selector_icmp_code(port));
+               sel.dport_mask = sel.dport ? ~0 : 0;
+       }
+       sel.ifindex = 0;
+       sel.user = 0;
+
+       return sel;
+}
+
+/**
+ * Convert an xfrm_selector to a src|dst traffic_selector
+ *
+ * If src is TRUE the source address, prefix length and port of the selector
+ * are converted, otherwise the destination ones.  A port is only taken over
+ * if the corresponding port mask is set; a port of 0 results in the range
+ * 0-65535.  For ICMP[v6] the "port" carries message type and code instead.
+ *
+ * Returns NULL if neither the family field nor the source prefix length
+ * identifies the selector as IPv4 or IPv6.
+ */
+static traffic_selector_t* selector2ts(struct xfrm_selector *sel, bool src)
+{
+       u_char *addr;
+       u_int8_t prefixlen;
+       u_int16_t port = 0;
+       host_t *host = NULL;
+
+       if (src)
+       {
+               addr = (u_char*)&sel->saddr;
+               prefixlen = sel->prefixlen_s;
+               if (sel->sport_mask)
+               {
+                       port = ntohs(sel->sport);
+               }
+       }
+       else
+       {
+               addr = (u_char*)&sel->daddr;
+               prefixlen = sel->prefixlen_d;
+               if (sel->dport_mask)
+               {
+                       port = ntohs(sel->dport);
+               }
+       }
+       if (sel->proto == IPPROTO_ICMP || sel->proto == IPPROTO_ICMPV6)
+       {       /* the kernel supplies the ICMP[v6] message type and code in the
+                * source and destination ports (both in network order).  Convert
+                * each to host order first so that combining them as type << 8 |
+                * code does not depend on the byte order of the host */
+               port = ((ntohs(sel->sport) & 0xff) << 8) |
+                               (ntohs(sel->dport) & 0xff);
+       }
+       /* The Linux 2.6 kernel does not set the selector's family field,
+        * so as a kludge we additionally test the prefix length.
+        */
+       if (sel->family == AF_INET || sel->prefixlen_s == 32)
+       {
+               host = host_create_from_chunk(AF_INET, chunk_create(addr, 4), 0);
+       }
+       else if (sel->family == AF_INET6 || sel->prefixlen_s == 128)
+       {
+               host = host_create_from_chunk(AF_INET6, chunk_create(addr, 16), 0);
+       }
+
+       if (host)
+       {
+               return traffic_selector_create_from_subnet(host, prefixlen,
+                                                                                       sel->proto, port, port ?: 65535);
+       }
+       return NULL;
+}
+
+/**
+ * Handle a XFRM_MSG_ACQUIRE sent by the kernel.
+ *
+ * The reqid and protocol are read from the XFRMA_TMPL attribute (the last
+ * one wins if there are several).  Unless the protocol is 0, ESP or AH the
+ * message is dropped; otherwise traffic selectors built from the acquire's
+ * selector are passed to the kernel interface together with the reqid.
+ */
+static void process_acquire(private_kernel_netlink_ipsec_t *this,
+                                                       struct nlmsghdr *hdr)
+{
+       struct xfrm_user_acquire *acquire = NLMSG_DATA(hdr);
+       struct rtattr *rta = XFRM_RTA(hdr, struct xfrm_user_acquire);
+       size_t rtasize = XFRM_PAYLOAD(hdr, struct xfrm_user_acquire);
+       traffic_selector_t *src_ts, *dst_ts;
+       u_int32_t reqid = 0;
+       int proto = 0;
+
+       DBG2(DBG_KNL, "received a XFRM_MSG_ACQUIRE");
+
+       for (; RTA_OK(rta, rtasize); rta = RTA_NEXT(rta, rtasize))
+       {
+               DBG2(DBG_KNL, "  %N", xfrm_attr_type_names, rta->rta_type);
+
+               if (rta->rta_type == XFRMA_TMPL)
+               {
+                       struct xfrm_user_tmpl *tmpl = (struct xfrm_user_tmpl*)RTA_DATA(rta);
+
+                       reqid = tmpl->reqid;
+                       proto = tmpl->id.proto;
+               }
+       }
+       if (proto != 0 && proto != IPPROTO_ESP && proto != IPPROTO_AH)
+       {       /* acquire for AH/ESP only, not for IPCOMP */
+               return;
+       }
+       src_ts = selector2ts(&acquire->sel, TRUE);
+       dst_ts = selector2ts(&acquire->sel, FALSE);
+
+       hydra->kernel_interface->acquire(hydra->kernel_interface, reqid, src_ts,
+                                                                        dst_ts);
+}
+
+/**
+ * Handle a XFRM_MSG_EXPIRE sent by the kernel.
+ *
+ * Only expires of ESP and AH states whose destination address can be
+ * converted are reported to the kernel interface; everything else is
+ * ignored.
+ */
+static void process_expire(private_kernel_netlink_ipsec_t *this,
+                                                  struct nlmsghdr *hdr)
+{
+       struct xfrm_user_expire *expire = NLMSG_DATA(hdr);
+       u_int8_t protocol = expire->state.id.proto;
+       u_int32_t spi = expire->state.id.spi;
+       host_t *dst;
+
+       DBG2(DBG_KNL, "received a XFRM_MSG_EXPIRE");
+
+       if (protocol != IPPROTO_ESP && protocol != IPPROTO_AH)
+       {
+               return;
+       }
+       dst = xfrm2host(expire->state.family, &expire->state.id.daddr, 0);
+       if (!dst)
+       {
+               return;
+       }
+       /* a non-zero "hard" field marks a hard expire */
+       hydra->kernel_interface->expire(hydra->kernel_interface, protocol,
+                                                                       spi, dst, expire->hard != 0);
+       dst->destroy(dst);
+}
+
+/**
+ * Process a XFRM_MSG_MIGRATE from kernel
+ *
+ * Traffic selectors and direction are taken from the policy id of the
+ * message, the local/remote addresses from the XFRMA_KMADDRESS attribute and
+ * the reqid from the XFRMA_MIGRATE attribute.  The old/new addresses of the
+ * XFRMA_MIGRATE attribute are only logged.
+ *
+ * If selectors and both addresses are available they are handed to the
+ * kernel interface (and not destroyed here), otherwise whatever was created
+ * is destroyed again.
+ */
+static void process_migrate(private_kernel_netlink_ipsec_t *this,
+                                                       struct nlmsghdr *hdr)
+{
+       struct xfrm_userpolicy_id *policy_id;
+       struct rtattr *rta;
+       size_t rtasize;
+       traffic_selector_t *src_ts, *dst_ts;
+       host_t *local = NULL, *remote = NULL;
+       host_t *old_src = NULL, *old_dst = NULL;
+       host_t *new_src = NULL, *new_dst = NULL;
+       u_int32_t reqid = 0;
+       policy_dir_t dir;
+
+       policy_id = NLMSG_DATA(hdr);
+       rta     = XFRM_RTA(hdr, struct xfrm_userpolicy_id);
+       rtasize = XFRM_PAYLOAD(hdr, struct xfrm_userpolicy_id);
+
+       DBG2(DBG_KNL, "received a XFRM_MSG_MIGRATE");
+
+       src_ts = selector2ts(&policy_id->sel, TRUE);
+       dst_ts = selector2ts(&policy_id->sel, FALSE);
+       dir = (policy_dir_t)policy_id->dir;
+
+       /* %N consumes two arguments: the enum name table and the value */
+       DBG2(DBG_KNL, "  policy: %R === %R %N", src_ts, dst_ts, policy_dir_names,
+                dir);
+
+       while (RTA_OK(rta, rtasize))
+       {
+               DBG2(DBG_KNL, "  %N", xfrm_attr_type_names, rta->rta_type);
+               if (rta->rta_type == XFRMA_KMADDRESS)
+               {
+                       struct xfrm_user_kmaddress *kmaddress;
+
+                       kmaddress = (struct xfrm_user_kmaddress*)RTA_DATA(rta);
+                       local  = xfrm2host(kmaddress->family, &kmaddress->local, 0);
+                       remote = xfrm2host(kmaddress->family, &kmaddress->remote, 0);
+                       DBG2(DBG_KNL, "  kmaddress: %H...%H", local, remote);
+               }
+               else if (rta->rta_type == XFRMA_MIGRATE)
+               {
+                       struct xfrm_user_migrate *migrate;
+
+                       migrate = (struct xfrm_user_migrate*)RTA_DATA(rta);
+                       old_src = xfrm2host(migrate->old_family, &migrate->old_saddr, 0);
+                       old_dst = xfrm2host(migrate->old_family, &migrate->old_daddr, 0);
+                       new_src = xfrm2host(migrate->new_family, &migrate->new_saddr, 0);
+                       new_dst = xfrm2host(migrate->new_family, &migrate->new_daddr, 0);
+                       reqid = migrate->reqid;
+                       DBG2(DBG_KNL, "  migrate %H...%H to %H...%H, reqid {%u}",
+                                                  old_src, old_dst, new_src, new_dst, reqid);
+                       /* these hosts are used for logging only */
+                       DESTROY_IF(old_src);
+                       DESTROY_IF(old_dst);
+                       DESTROY_IF(new_src);
+                       DESTROY_IF(new_dst);
+               }
+               rta = RTA_NEXT(rta, rtasize);
+       }
+
+       if (src_ts && dst_ts && local && remote)
+       {
+               hydra->kernel_interface->migrate(hydra->kernel_interface, reqid,
+                                                                                src_ts, dst_ts, dir, local, remote);
+       }
+       else
+       {
+               DESTROY_IF(src_ts);
+               DESTROY_IF(dst_ts);
+               DESTROY_IF(local);
+               DESTROY_IF(remote);
+       }
+}
+
+/**
+ * Handle a XFRM_MSG_MAPPING sent by the kernel.
+ *
+ * For ESP states the SA's destination and the new source address/port from
+ * the message are converted to hosts and reported to the kernel interface.
+ * Other protocols and unconvertible addresses are ignored.
+ */
+static void process_mapping(private_kernel_netlink_ipsec_t *this,
+                                                       struct nlmsghdr *hdr)
+{
+       struct xfrm_user_mapping *mapping = NLMSG_DATA(hdr);
+       u_int32_t spi = mapping->id.spi;
+       host_t *dst, *mapped;
+
+       DBG2(DBG_KNL, "received a XFRM_MSG_MAPPING");
+
+       if (mapping->id.proto != IPPROTO_ESP)
+       {
+               return;
+       }
+       dst = xfrm2host(mapping->id.family, &mapping->id.daddr, 0);
+       if (!dst)
+       {
+               return;
+       }
+       mapped = xfrm2host(mapping->id.family, &mapping->new_saddr,
+                                          mapping->new_sport);
+       if (mapped)
+       {
+               hydra->kernel_interface->mapping(hydra->kernel_interface,
+                                                                                IPPROTO_ESP, spi, dst, mapped);
+               mapped->destroy(mapped);
+       }
+       dst->destroy(dst);
+}
+
+/**
+ * Read pending messages from the XFRM event socket and dispatch them.
+ *
+ * One datagram is received without blocking.  Datagrams that do not carry
+ * netlink pid 0 as sender are dropped, every message in an accepted datagram
+ * is passed to the process_*() handler for its type.  The function returns
+ * TRUE in all cases, including receive errors.
+ */
+static bool receive_events(private_kernel_netlink_ipsec_t *this, int fd,
+                                                  watcher_event_t event)
+{
+       char response[1024];
+       struct nlmsghdr *msg;
+       struct sockaddr_nl sender;
+       socklen_t sender_len = sizeof(sender);
+       int received;
+
+       received = recvfrom(this->socket_xfrm_events, response, sizeof(response),
+                                               MSG_DONTWAIT, (struct sockaddr*)&sender, &sender_len);
+       if (received < 0)
+       {
+               if (errno != EINTR && errno != EAGAIN)
+               {
+                       DBG1(DBG_KNL, "unable to receive from xfrm event socket");
+                       sleep(1);
+               }
+               /* interrupted or no data ready: simply try again next time */
+               return TRUE;
+       }
+       if (sender.nl_pid != 0)
+       {       /* not from kernel. not interested, try another one */
+               return TRUE;
+       }
+
+       for (msg = (struct nlmsghdr*)response; NLMSG_OK(msg, received);
+                msg = NLMSG_NEXT(msg, received))
+       {
+               switch (msg->nlmsg_type)
+               {
+                       case XFRM_MSG_ACQUIRE:
+                               process_acquire(this, msg);
+                               break;
+                       case XFRM_MSG_EXPIRE:
+                               process_expire(this, msg);
+                               break;
+                       case XFRM_MSG_MIGRATE:
+                               process_migrate(this, msg);
+                               break;
+                       case XFRM_MSG_MAPPING:
+                               process_mapping(this, msg);
+                               break;
+                       default:
+                               DBG1(DBG_KNL, "received unknown event from xfrm event "
+                                                         "socket: %d", msg->nlmsg_type);
+                               break;
+               }
+       }
+       return TRUE;
+}
+
+METHOD(kernel_ipsec_t, get_features, kernel_feature_t,
+       private_kernel_netlink_ipsec_t *this)
+{
+       /* the only feature flag reported by this backend */
+       kernel_feature_t features = KERNEL_ESP_V3_TFC;
+
+       return features;
+}
+
+/**
+ * Ask the kernel to allocate an SPI for the given protocol.
+ *
+ * Sends a XFRM_MSG_ALLOCSPI request with the range min..max and takes the
+ * SPI from the XFRM_MSG_NEWSA response.  On success the SPI is stored in
+ * *spi exactly as found in the kernel's reply and SUCCESS is returned.  If
+ * sending fails, the kernel answers with an error, or the SPI is 0, FAILED
+ * is returned and *spi is left untouched.
+ */
+static status_t get_spi_internal(private_kernel_netlink_ipsec_t *this,
+       host_t *src, host_t *dst, u_int8_t proto, u_int32_t min, u_int32_t max,
+       u_int32_t *spi)
+{
+       netlink_buf_t request;
+       struct nlmsghdr *hdr, *msg, *out;
+       struct xfrm_userspi_info *userspi;
+       u_int32_t received_spi = 0;
+       size_t len;
+
+       memset(&request, 0, sizeof(request));
+
+       hdr = &request.hdr;
+       hdr->nlmsg_type = XFRM_MSG_ALLOCSPI;
+       hdr->nlmsg_flags = NLM_F_REQUEST;
+       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_userspi_info));
+
+       userspi = NLMSG_DATA(hdr);
+       userspi->min = min;
+       userspi->max = max;
+       userspi->info.family = src->get_family(src);
+       userspi->info.mode = XFRM_MODE_TUNNEL;
+       userspi->info.id.proto = proto;
+       host2xfrm(src, &userspi->info.saddr);
+       host2xfrm(dst, &userspi->info.id.daddr);
+
+       if (this->socket_xfrm->send(this->socket_xfrm, hdr, &out, &len) != SUCCESS)
+       {
+               return FAILED;
+       }
+       /* skip unrelated messages, stop at the first NEWSA, ERROR or DONE */
+       for (msg = out; NLMSG_OK(msg, len); msg = NLMSG_NEXT(msg, len))
+       {
+               if (msg->nlmsg_type == XFRM_MSG_NEWSA)
+               {
+                       struct xfrm_usersa_info *usersa = NLMSG_DATA(msg);
+
+                       received_spi = usersa->id.spi;
+                       break;
+               }
+               if (msg->nlmsg_type == NLMSG_ERROR)
+               {
+                       struct nlmsgerr *err = NLMSG_DATA(msg);
+
+                       DBG1(DBG_KNL, "allocating SPI failed: %s (%d)",
+                                strerror(-err->error), -err->error);
+                       break;
+               }
+               if (msg->nlmsg_type == NLMSG_DONE)
+               {
+                       break;
+               }
+       }
+       free(out);
+
+       if (received_spi)
+       {
+               *spi = received_spi;
+               return SUCCESS;
+       }
+       return FAILED;
+}
+
+METHOD(kernel_ipsec_t, get_spi, status_t,
+       private_kernel_netlink_ipsec_t *this, host_t *src, host_t *dst,
+       u_int8_t protocol, u_int32_t *spi)
+{
+       status_t status;
+
+       /* request an SPI from the range 0xc0000000-0xcFFFFFFF */
+       status = get_spi_internal(this, src, dst, protocol,
+                                                         0xc0000000, 0xcFFFFFFF, spi);
+       if (status == SUCCESS)
+       {
+               DBG2(DBG_KNL, "got SPI %.8x", ntohl(*spi));
+               return SUCCESS;
+       }
+       DBG1(DBG_KNL, "unable to get SPI");
+       return FAILED;
+}
+
+METHOD(kernel_ipsec_t, get_cpi, status_t,
+       private_kernel_netlink_ipsec_t *this, host_t *src, host_t *dst,
+       u_int16_t *cpi)
+{
+       u_int32_t received_spi = 0;
+       u_int16_t host_cpi;
+
+       /* a CPI is allocated like an SPI for IPComp, range 0x100-0xEFFF */
+       if (get_spi_internal(this, src, dst, IPPROTO_COMP,
+                                                0x100, 0xEFFF, &received_spi) == SUCCESS)
+       {
+               /* narrow the 32-bit value to 16 bits, store in network order */
+               host_cpi = (u_int16_t)ntohl(received_spi);
+               *cpi = htons(host_cpi);
+
+               DBG2(DBG_KNL, "got CPI %.4x", host_cpi);
+               return SUCCESS;
+       }
+       DBG1(DBG_KNL, "unable to get CPI");
+       return FAILED;
+}
+
+/**
+ * Append an XFRMA_MARK attribute to a message if a mark value is set.
+ *
+ * Returns FALSE only if the attribute could not be reserved in the buffer;
+ * a mark value of 0 adds nothing and returns TRUE.
+ */
+static bool add_mark(struct nlmsghdr *hdr, int buflen, mark_t mark)
+{
+       struct xfrm_mark *xmrk;
+
+       if (!mark.value)
+       {
+               return TRUE;
+       }
+       xmrk = netlink_reserve(hdr, buflen, XFRMA_MARK, sizeof(*xmrk));
+       if (!xmrk)
+       {
+               return FALSE;
+       }
+       xmrk->v = mark.value;
+       xmrk->m = mark.mask;
+       return TRUE;
+}
+
+/**
+ * Install (or, if update is set, update) an SA in the kernel's SAD.
+ *
+ * Builds a XFRM_MSG_NEWSA/XFRM_MSG_UPDSA request from the arguments, adding
+ * attributes for the encryption/AEAD, integrity and compression algorithms,
+ * UDP encapsulation, mark, TFC padding and replay window as required, and
+ * sends it to the kernel.  If IPComp is negotiated (cpi != 0), a separate
+ * IPComp SA is installed first through a recursive call.
+ *
+ * Returns SUCCESS if the kernel acknowledged the request, FAILED otherwise.
+ * The request buffer is wiped before returning as it contains key material.
+ */
+METHOD(kernel_ipsec_t, add_sa, status_t,
+       private_kernel_netlink_ipsec_t *this, host_t *src, host_t *dst,
+       u_int32_t spi, u_int8_t protocol, u_int32_t reqid, mark_t mark,
+       u_int32_t tfc, lifetime_cfg_t *lifetime, u_int16_t enc_alg, chunk_t enc_key,
+       u_int16_t int_alg, chunk_t int_key, ipsec_mode_t mode,
+       u_int16_t ipcomp, u_int16_t cpi, u_int32_t replay_window,
+       bool initiator, bool encap, bool esn, bool inbound, bool update,
+       linked_list_t* src_ts, linked_list_t* dst_ts)
+{
+       netlink_buf_t request;
+       char *alg_name;
+       struct nlmsghdr *hdr;
+       struct xfrm_usersa_info *sa;
+       u_int16_t icv_size = 64;
+       ipsec_mode_t original_mode = mode;
+       traffic_selector_t *first_src_ts, *first_dst_ts;
+       status_t status = FAILED;
+
+       /* if IPComp is used, we install an additional IPComp SA. if the cpi is 0
+        * we are in the recursive call below */
+       if (ipcomp != IPCOMP_NONE && cpi != 0)
+       {
+               lifetime_cfg_t lft = {{0,0,0},{0,0,0},{0,0,0}};
+               /* NOTE(review): the result of installing the IPComp SA is not
+                * checked here - confirm that this is intended */
+               add_sa(this, src, dst, htonl(ntohs(cpi)), IPPROTO_COMP, reqid, mark,
+                          tfc, &lft, ENCR_UNDEFINED, chunk_empty, AUTH_UNDEFINED,
+                          chunk_empty, mode, ipcomp, 0, 0, initiator, FALSE, FALSE,
+                          inbound, update, src_ts, dst_ts);
+               ipcomp = IPCOMP_NONE;
+               /* use transport mode ESP SA, IPComp uses tunnel mode */
+               mode = MODE_TRANSPORT;
+       }
+
+       memset(&request, 0, sizeof(request));
+
+       DBG2(DBG_KNL, "adding SAD entry with SPI %.8x and reqid {%u}  (mark "
+                                 "%u/0x%08x)", ntohl(spi), reqid, mark.value, mark.mask);
+
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+       hdr->nlmsg_type = update ? XFRM_MSG_UPDSA : XFRM_MSG_NEWSA;
+       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_usersa_info));
+
+       sa = NLMSG_DATA(hdr);
+       host2xfrm(src, &sa->saddr);
+       host2xfrm(dst, &sa->id.daddr);
+       sa->id.spi = spi;
+       sa->id.proto = protocol;
+       sa->family = src->get_family(src);
+       sa->mode = mode2kernel(mode);
+       switch (mode)
+       {
+               case MODE_TUNNEL:
+                       sa->flags |= XFRM_STATE_AF_UNSPEC;
+                       break;
+               case MODE_BEET:
+               case MODE_TRANSPORT:
+                       if (original_mode == MODE_TUNNEL)
+                       {       /* don't install selectors for switched SAs.  because only one
+                                * selector can be installed other traffic would get dropped */
+                               break;
+                       }
+                       if (src_ts->get_first(src_ts, (void**)&first_src_ts) == SUCCESS &&
+                               dst_ts->get_first(dst_ts, (void**)&first_dst_ts) == SUCCESS)
+                       {
+                               sa->sel = ts2selector(first_src_ts, first_dst_ts);
+                               if (!this->proto_port_transport)
+                               {
+                                       /* don't install proto/port on SA. This would break
+                                        * potential secondary SAs for the same address using a
+                                        * different proto/port. */
+                                       sa->sel.proto = 0;
+                                       sa->sel.dport = sa->sel.dport_mask = 0;
+                                       sa->sel.sport = sa->sel.sport_mask = 0;
+                               }
+                       }
+                       break;
+               default:
+                       break;
+       }
+
+       sa->reqid = reqid;
+       sa->lft.soft_byte_limit = XFRM_LIMIT(lifetime->bytes.rekey);
+       sa->lft.hard_byte_limit = XFRM_LIMIT(lifetime->bytes.life);
+       sa->lft.soft_packet_limit = XFRM_LIMIT(lifetime->packets.rekey);
+       sa->lft.hard_packet_limit = XFRM_LIMIT(lifetime->packets.life);
+       /* we use lifetimes since added, not since used */
+       sa->lft.soft_add_expires_seconds = lifetime->time.rekey;
+       sa->lft.hard_add_expires_seconds = lifetime->time.life;
+       sa->lft.soft_use_expires_seconds = 0;
+       sa->lft.hard_use_expires_seconds = 0;
+
+       /* icv_size starts at 64 bits; the cases below fall through so that the
+        * ICV12 variants end up with 96 and the ICV16 variants with 128 bits */
+       switch (enc_alg)
+       {
+               case ENCR_UNDEFINED:
+                       /* no encryption */
+                       break;
+               case ENCR_AES_CCM_ICV16:
+               case ENCR_AES_GCM_ICV16:
+               case ENCR_NULL_AUTH_AES_GMAC:
+               case ENCR_CAMELLIA_CCM_ICV16:
+               case ENCR_CHACHA20_POLY1305:
+                       icv_size += 32;
+                       /* FALL */
+               case ENCR_AES_CCM_ICV12:
+               case ENCR_AES_GCM_ICV12:
+               case ENCR_CAMELLIA_CCM_ICV12:
+                       icv_size += 32;
+                       /* FALL */
+               case ENCR_AES_CCM_ICV8:
+               case ENCR_AES_GCM_ICV8:
+               case ENCR_CAMELLIA_CCM_ICV8:
+               {
+                       struct xfrm_algo_aead *algo;
+
+                       alg_name = lookup_algorithm(ENCRYPTION_ALGORITHM, enc_alg);
+                       if (alg_name == NULL)
+                       {
+                               DBG1(DBG_KNL, "algorithm %N not supported by kernel!",
+                                                encryption_algorithm_names, enc_alg);
+                               goto failed;
+                       }
+                       /* the length is a size_t, convert it to match %d */
+                       DBG2(DBG_KNL, "  using encryption algorithm %N with key size %d",
+                                encryption_algorithm_names, enc_alg,
+                                (int)(enc_key.len * 8));
+
+                       algo = netlink_reserve(hdr, sizeof(request), XFRMA_ALG_AEAD,
+                                                                  sizeof(*algo) + enc_key.len);
+                       if (!algo)
+                       {
+                               goto failed;
+                       }
+                       algo->alg_key_len = enc_key.len * 8;
+                       algo->alg_icv_len = icv_size;
+                       strncpy(algo->alg_name, alg_name, sizeof(algo->alg_name));
+                       algo->alg_name[sizeof(algo->alg_name) - 1] = '\0';
+                       memcpy(algo->alg_key, enc_key.ptr, enc_key.len);
+                       break;
+               }
+               default:
+               {
+                       struct xfrm_algo *algo;
+
+                       alg_name = lookup_algorithm(ENCRYPTION_ALGORITHM, enc_alg);
+                       if (alg_name == NULL)
+                       {
+                               DBG1(DBG_KNL, "algorithm %N not supported by kernel!",
+                                        encryption_algorithm_names, enc_alg);
+                               goto failed;
+                       }
+                       /* the length is a size_t, convert it to match %d */
+                       DBG2(DBG_KNL, "  using encryption algorithm %N with key size %d",
+                                encryption_algorithm_names, enc_alg,
+                                (int)(enc_key.len * 8));
+
+                       algo = netlink_reserve(hdr, sizeof(request), XFRMA_ALG_CRYPT,
+                                                                  sizeof(*algo) + enc_key.len);
+                       if (!algo)
+                       {
+                               goto failed;
+                       }
+                       algo->alg_key_len = enc_key.len * 8;
+                       strncpy(algo->alg_name, alg_name, sizeof(algo->alg_name));
+                       algo->alg_name[sizeof(algo->alg_name) - 1] = '\0';
+                       memcpy(algo->alg_key, enc_key.ptr, enc_key.len);
+               }
+       }
+
+       if (int_alg != AUTH_UNDEFINED)
+       {
+               u_int trunc_len = 0;
+
+               alg_name = lookup_algorithm(INTEGRITY_ALGORITHM, int_alg);
+               if (alg_name == NULL)
+               {
+                       DBG1(DBG_KNL, "algorithm %N not supported by kernel!",
+                                integrity_algorithm_names, int_alg);
+                       goto failed;
+               }
+               /* the length is a size_t, convert it to match %d */
+               DBG2(DBG_KNL, "  using integrity algorithm %N with key size %d",
+                        integrity_algorithm_names, int_alg, (int)(int_key.len * 8));
+
+               switch (int_alg)
+               {
+                       case AUTH_HMAC_MD5_128:
+                       case AUTH_HMAC_SHA2_256_128:
+                               trunc_len = 128;
+                               break;
+                       case AUTH_HMAC_SHA1_160:
+                               trunc_len = 160;
+                               break;
+                       default:
+                               break;
+               }
+
+               if (trunc_len)
+               {
+                       struct xfrm_algo_auth* algo;
+
+                       /* the kernel uses SHA256 with 96 bit truncation by default,
+                        * use specified truncation size supported by newer kernels.
+                        * also use this for untruncated MD5 and SHA1. */
+                       algo = netlink_reserve(hdr, sizeof(request), XFRMA_ALG_AUTH_TRUNC,
+                                                                  sizeof(*algo) + int_key.len);
+                       if (!algo)
+                       {
+                               goto failed;
+                       }
+                       algo->alg_key_len = int_key.len * 8;
+                       algo->alg_trunc_len = trunc_len;
+                       strncpy(algo->alg_name, alg_name, sizeof(algo->alg_name));
+                       algo->alg_name[sizeof(algo->alg_name) - 1] = '\0';
+                       memcpy(algo->alg_key, int_key.ptr, int_key.len);
+               }
+               else
+               {
+                       struct xfrm_algo* algo;
+
+                       algo = netlink_reserve(hdr, sizeof(request), XFRMA_ALG_AUTH,
+                                                                  sizeof(*algo) + int_key.len);
+                       if (!algo)
+                       {
+                               goto failed;
+                       }
+                       algo->alg_key_len = int_key.len * 8;
+                       strncpy(algo->alg_name, alg_name, sizeof(algo->alg_name));
+                       algo->alg_name[sizeof(algo->alg_name) - 1] = '\0';
+                       memcpy(algo->alg_key, int_key.ptr, int_key.len);
+               }
+       }
+
+       if (ipcomp != IPCOMP_NONE)
+       {
+               struct xfrm_algo* algo;
+
+               alg_name = lookup_algorithm(COMPRESSION_ALGORITHM, ipcomp);
+               if (alg_name == NULL)
+               {
+                       DBG1(DBG_KNL, "algorithm %N not supported by kernel!",
+                                ipcomp_transform_names, ipcomp);
+                       goto failed;
+               }
+               DBG2(DBG_KNL, "  using compression algorithm %N",
+                        ipcomp_transform_names, ipcomp);
+
+               algo = netlink_reserve(hdr, sizeof(request), XFRMA_ALG_COMP,
+                                                          sizeof(*algo));
+               if (!algo)
+               {
+                       goto failed;
+               }
+               algo->alg_key_len = 0;
+               strncpy(algo->alg_name, alg_name, sizeof(algo->alg_name));
+               algo->alg_name[sizeof(algo->alg_name) - 1] = '\0';
+       }
+
+       if (encap)
+       {
+               struct xfrm_encap_tmpl *tmpl;
+
+               tmpl = netlink_reserve(hdr, sizeof(request), XFRMA_ENCAP, sizeof(*tmpl));
+               if (!tmpl)
+               {
+                       goto failed;
+               }
+               tmpl->encap_type = UDP_ENCAP_ESPINUDP;
+               tmpl->encap_sport = htons(src->get_port(src));
+               tmpl->encap_dport = htons(dst->get_port(dst));
+               memset(&tmpl->encap_oa, 0, sizeof (xfrm_address_t));
+               /* encap_oa could probably be derived from the
+                * traffic selectors [rfc4306, p39]. In the netlink kernel
+                * implementation pluto does the same as we do here but it uses
+                * encap_oa in the pfkey implementation.
+                * BUT as /usr/src/linux/net/key/af_key.c indicates the kernel ignores
+                * it anyway
+                *   -> does that mean that NAT-T encap doesn't work in transport mode?
+                * No. The reason the kernel ignores NAT-OA is that it recomputes
+                * (or, rather, just ignores) the checksum. If packets pass the IPsec
+                * checks it marks them "checksum ok" so OA isn't needed. */
+       }
+
+       if (!add_mark(hdr, sizeof(request), mark))
+       {
+               goto failed;
+       }
+
+       if (tfc && protocol == IPPROTO_ESP && mode == MODE_TUNNEL)
+       {       /* the kernel supports TFC padding only for tunnel mode ESP SAs */
+               u_int32_t *tfcpad;
+
+               tfcpad = netlink_reserve(hdr, sizeof(request), XFRMA_TFCPAD,
+                                                                sizeof(*tfcpad));
+               if (!tfcpad)
+               {
+                       goto failed;
+               }
+               *tfcpad = tfc;
+       }
+
+       if (protocol != IPPROTO_COMP)
+       {
+               if (replay_window != 0 && (esn || replay_window > 32))
+               {
+                       /* for ESN or larger replay windows we need the new
+                        * XFRMA_REPLAY_ESN_VAL attribute to configure a bitmap */
+                       struct xfrm_replay_state_esn *replay;
+                       u_int32_t bmp_size;
+
+                       bmp_size = round_up(replay_window, sizeof(u_int32_t) * 8) / 8;
+                       replay = netlink_reserve(hdr, sizeof(request), XFRMA_REPLAY_ESN_VAL,
+                                                                        sizeof(*replay) + bmp_size);
+                       if (!replay)
+                       {
+                               goto failed;
+                       }
+                       /* bmp_len contains number of __u32's */
+                       replay->bmp_len = bmp_size / sizeof(u_int32_t);
+                       replay->replay_window = replay_window;
+                       DBG2(DBG_KNL, "  using replay window of %u packets", replay_window);
+
+                       if (esn)
+                       {
+                               DBG2(DBG_KNL, "  using extended sequence numbers (ESN)");
+                               sa->flags |= XFRM_STATE_ESN;
+                       }
+               }
+               else
+               {
+                       DBG2(DBG_KNL, "  using replay window of %u packets", replay_window);
+                       sa->replay_window = replay_window;
+               }
+       }
+
+       if (this->socket_xfrm->send_ack(this->socket_xfrm, hdr) != SUCCESS)
+       {
+               if (mark.value)
+               {
+                       DBG1(DBG_KNL, "unable to add SAD entry with SPI %.8x  "
+                                                 "(mark %u/0x%08x)", ntohl(spi), mark.value, mark.mask);
+               }
+               else
+               {
+                       DBG1(DBG_KNL, "unable to add SAD entry with SPI %.8x", ntohl(spi));
+               }
+               goto failed;
+       }
+
+       status = SUCCESS;
+
+failed:
+       /* the request holds key material, wipe it on every exit path */
+       memwipe(&request, sizeof(request));
+       return status;
+}
+
+/**
+ * Get the replay state (i.e. sequence numbers) and the current lifetime
+ * values of an SA.
+ *
+ * Sends an XFRM_MSG_GETAE request for the SA identified by SPI, protocol,
+ * destination address and mark and copies the XFRMA_LTIME_VAL,
+ * XFRMA_REPLAY_VAL and XFRMA_REPLAY_ESN_VAL attributes of the
+ * XFRM_MSG_NEWAE response into newly allocated buffers.
+ *
+ * A buffer already referenced by an output pointer is freed before it gets
+ * replaced. Output pointers are not touched if the respective attribute is
+ * not found (or has an unexpected size), and are set to NULL if allocating
+ * the copy fails.
+ *
+ * @param this                         kernel interface
+ * @param spi                          SPI of the SA, as stored in the SA id
+ * @param protocol                     protocol of the SA
+ * @param dst                          destination address of the SA
+ * @param mark                         mark of the SA
+ * @param replay_esn[out]      allocated copy of the ESN replay state
+ * @param replay_esn_len[out] length of the ESN replay state in bytes
+ * @param replay[out]          allocated copy of the legacy replay state
+ * @param lifetime[out]                allocated copy of the current lifetime
+ */
+static void get_replay_state(private_kernel_netlink_ipsec_t *this,
+                                                        u_int32_t spi, u_int8_t protocol,
+                                                        host_t *dst, mark_t mark,
+                                                        struct xfrm_replay_state_esn **replay_esn,
+                                                        u_int32_t *replay_esn_len,
+                                                        struct xfrm_replay_state **replay,
+                                                        struct xfrm_lifetime_cur **lifetime)
+{
+       netlink_buf_t request;
+       struct nlmsghdr *hdr, *out = NULL;
+       struct xfrm_aevent_id *out_aevent = NULL, *aevent_id;
+       size_t len = 0;
+       struct rtattr *rta;
+       size_t rtasize;
+
+       memset(&request, 0, sizeof(request));
+
+       DBG2(DBG_KNL, "querying replay state from SAD entry with SPI %.8x",
+                                  ntohl(spi));
+
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST;
+       hdr->nlmsg_type = XFRM_MSG_GETAE;
+       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_aevent_id));
+
+       aevent_id = NLMSG_DATA(hdr);
+       aevent_id->flags = XFRM_AE_RVAL;
+
+       host2xfrm(dst, &aevent_id->sa_id.daddr);
+       aevent_id->sa_id.spi = spi;
+       aevent_id->sa_id.proto = protocol;
+       aevent_id->sa_id.family = dst->get_family(dst);
+
+       if (!add_mark(hdr, sizeof(request), mark))
+       {
+               return;
+       }
+
+       if (this->socket_xfrm->send(this->socket_xfrm, hdr, &out, &len) == SUCCESS)
+       {
+               hdr = out;
+               while (NLMSG_OK(hdr, len))
+               {
+                       switch (hdr->nlmsg_type)
+                       {
+                               case XFRM_MSG_NEWAE:
+                               {
+                                       out_aevent = NLMSG_DATA(hdr);
+                                       break;
+                               }
+                               case NLMSG_ERROR:
+                               {
+                                       struct nlmsgerr *err = NLMSG_DATA(hdr);
+                                       DBG1(DBG_KNL, "querying replay state from SAD entry "
+                                                                 "failed: %s (%d)", strerror(-err->error),
+                                                                 -err->error);
+                                       break;
+                               }
+                               default:
+                                       /* skip unrelated messages */
+                                       hdr = NLMSG_NEXT(hdr, len);
+                                       continue;
+                               case NLMSG_DONE:
+                                       break;
+                       }
+                       break;
+               }
+       }
+
+       if (out_aevent)
+       {
+               /* hdr still points to the XFRM_MSG_NEWAE message found above, which
+                * is not necessarily the first message in the response buffer */
+               rta = XFRM_RTA(hdr, struct xfrm_aevent_id);
+               rtasize = XFRM_PAYLOAD(hdr, struct xfrm_aevent_id);
+               while (RTA_OK(rta, rtasize))
+               {
+                       size_t payload = RTA_PAYLOAD(rta);
+
+                       if (rta->rta_type == XFRMA_LTIME_VAL &&
+                               payload == sizeof(**lifetime))
+                       {
+                               free(*lifetime);
+                               *lifetime = malloc(payload);
+                               if (*lifetime)
+                               {
+                                       memcpy(*lifetime, RTA_DATA(rta), payload);
+                               }
+                       }
+                       if (rta->rta_type == XFRMA_REPLAY_VAL &&
+                               payload == sizeof(**replay))
+                       {
+                               free(*replay);
+                               *replay = malloc(payload);
+                               if (*replay)
+                               {
+                                       memcpy(*replay, RTA_DATA(rta), payload);
+                               }
+                       }
+                       if (rta->rta_type == XFRMA_REPLAY_ESN_VAL &&
+                               payload >= sizeof(**replay_esn))
+                       {
+                               /* the ESN state has a variable length bitmap attached */
+                               free(*replay_esn);
+                               *replay_esn = malloc(payload);
+                               if (*replay_esn)
+                               {
+                                       *replay_esn_len = payload;
+                                       memcpy(*replay_esn, RTA_DATA(rta), payload);
+                               }
+                               else
+                               {
+                                       *replay_esn_len = 0;
+                               }
+                       }
+                       rta = RTA_NEXT(rta, rtasize);
+               }
+       }
+       free(out);
+}
+
+/**
+ * Query the usage statistics of an SA from the kernel.
+ *
+ * Sends an XFRM_MSG_GETSA request for the SA identified by SPI, protocol,
+ * destination address and mark and reads the current lifetime counters from
+ * the XFRM_MSG_NEWSA response. Each of the output pointers may be NULL if the
+ * caller is not interested in the value.
+ *
+ * @param src          source address of the SA (not used here)
+ * @param dst          destination address of the SA
+ * @param spi          SPI of the SA, as stored in the SA id
+ * @param protocol     protocol of the SA
+ * @param mark         mark of the SA
+ * @param bytes[out]   number of bytes processed by the SA
+ * @param packets[out] number of packets processed by the SA
+ * @param time[out]    always set to 0, see comment below
+ * @return                     SUCCESS if the SA was found, FAILED otherwise
+ */
+METHOD(kernel_ipsec_t, query_sa, status_t,
+       private_kernel_netlink_ipsec_t *this, host_t *src, host_t *dst,
+       u_int32_t spi, u_int8_t protocol, mark_t mark,
+       u_int64_t *bytes, u_int64_t *packets, time_t *time)
+{
+       netlink_buf_t request;
+       struct nlmsghdr *out = NULL, *hdr;
+       struct xfrm_usersa_id *sa_id;
+       struct xfrm_usersa_info *sa = NULL;
+       status_t status = FAILED;
+       /* initialized as it is passed to memwipe() even if sending fails */
+       size_t len = 0, remaining;
+
+       memset(&request, 0, sizeof(request));
+
+       DBG2(DBG_KNL, "querying SAD entry with SPI %.8x  (mark %u/0x%08x)",
+                                  ntohl(spi), mark.value, mark.mask);
+
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST;
+       hdr->nlmsg_type = XFRM_MSG_GETSA;
+       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_usersa_id));
+
+       sa_id = NLMSG_DATA(hdr);
+       host2xfrm(dst, &sa_id->daddr);
+       sa_id->spi = spi;
+       sa_id->proto = protocol;
+       sa_id->family = dst->get_family(dst);
+
+       if (!add_mark(hdr, sizeof(request), mark))
+       {
+               return FAILED;
+       }
+
+       if (this->socket_xfrm->send(this->socket_xfrm, hdr, &out, &len) == SUCCESS)
+       {
+               /* iterate with a copy of the length as NLMSG_NEXT() decrements it,
+                * the full length is needed to wipe the whole response below */
+               remaining = len;
+               hdr = out;
+               while (NLMSG_OK(hdr, remaining))
+               {
+                       switch (hdr->nlmsg_type)
+                       {
+                               case XFRM_MSG_NEWSA:
+                               {
+                                       sa = NLMSG_DATA(hdr);
+                                       break;
+                               }
+                               case NLMSG_ERROR:
+                               {
+                                       struct nlmsgerr *err = NLMSG_DATA(hdr);
+
+                                       if (mark.value)
+                                       {
+                                               DBG1(DBG_KNL, "querying SAD entry with SPI %.8x  "
+                                                                         "(mark %u/0x%08x) failed: %s (%d)",
+                                                                          ntohl(spi), mark.value, mark.mask,
+                                                                          strerror(-err->error), -err->error);
+                                       }
+                                       else
+                                       {
+                                               DBG1(DBG_KNL, "querying SAD entry with SPI %.8x "
+                                                                         "failed: %s (%d)", ntohl(spi),
+                                                                          strerror(-err->error), -err->error);
+                                       }
+                                       break;
+                               }
+                               default:
+                                       /* skip unrelated messages */
+                                       hdr = NLMSG_NEXT(hdr, remaining);
+                                       continue;
+                               case NLMSG_DONE:
+                                       break;
+                       }
+                       break;
+               }
+       }
+
+       if (sa == NULL)
+       {
+               DBG2(DBG_KNL, "unable to query SAD entry with SPI %.8x", ntohl(spi));
+       }
+       else
+       {
+               if (bytes)
+               {
+                       *bytes = sa->curlft.bytes;
+               }
+               if (packets)
+               {
+                       *packets = sa->curlft.packets;
+               }
+               if (time)
+               {       /* curlft contains an "use" time, but that contains a timestamp
+                        * of the first use, not the last. Last use time must be queried
+                        * on the policy on Linux */
+                       *time = 0;
+               }
+               status = SUCCESS;
+       }
+       /* the response is wiped completely before it is released */
+       memwipe(out, len);
+       free(out);
+       return status;
+}
+
+/**
+ * Delete an SA (and the IPComp SA associated with it, if any) from the kernel.
+ *
+ * If a CPI is given, the IPComp SA is deleted first via a recursive call
+ * whose result is ignored. Afterwards an XFRM_MSG_DELSA request for the SA
+ * identified by SPI, protocol, destination address and mark is sent.
+ *
+ * @return             SUCCESS if the SA was deleted, NOT_FOUND if the kernel
+ *                             reported that, FAILED otherwise
+ */
+METHOD(kernel_ipsec_t, del_sa, status_t,
+       private_kernel_netlink_ipsec_t *this, host_t *src, host_t *dst,
+       u_int32_t spi, u_int8_t protocol, u_int16_t cpi, mark_t mark)
+{
+       netlink_buf_t msg;
+       struct nlmsghdr *nlh = &msg.hdr;
+       struct xfrm_usersa_id *id;
+       status_t result;
+
+       if (cpi)
+       {       /* an additional IPComp SA exists, get rid of that one first */
+               del_sa(this, src, dst, htonl(ntohs(cpi)), IPPROTO_COMP, 0, mark);
+       }
+
+       DBG2(DBG_KNL, "deleting SAD entry with SPI %.8x  (mark %u/0x%08x)",
+                ntohl(spi), mark.value, mark.mask);
+
+       /* build the delete request, the kernel has to acknowledge it */
+       memset(&msg, 0, sizeof(msg));
+       nlh->nlmsg_type = XFRM_MSG_DELSA;
+       nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+       nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_usersa_id));
+
+       /* identify the SA by destination, SPI and protocol */
+       id = NLMSG_DATA(nlh);
+       id->family = dst->get_family(dst);
+       id->proto = protocol;
+       id->spi = spi;
+       host2xfrm(dst, &id->daddr);
+
+       if (!add_mark(nlh, sizeof(msg), mark))
+       {
+               return FAILED;
+       }
+
+       result = this->socket_xfrm->send_ack(this->socket_xfrm, nlh);
+       if (result == SUCCESS)
+       {
+               DBG2(DBG_KNL, "deleted SAD entry with SPI %.8x (mark %u/0x%08x)",
+                        ntohl(spi), mark.value, mark.mask);
+               return SUCCESS;
+       }
+       if (result == NOT_FOUND)
+       {       /* passed on without logging anything */
+               return NOT_FOUND;
+       }
+
+       /* any other result is treated as a failure */
+       if (mark.value)
+       {
+               DBG1(DBG_KNL, "unable to delete SAD entry with SPI %.8x "
+                        "(mark %u/0x%08x)", ntohl(spi), mark.value, mark.mask);
+       }
+       else
+       {
+               DBG1(DBG_KNL, "unable to delete SAD entry with SPI %.8x",
+                        ntohl(spi));
+       }
+       return FAILED;
+}
+
+/**
+ * Update the addresses and/or UDP encapsulation of an installed SA.
+ *
+ * The existing SA is queried with XFRM_MSG_GETSA, its replay state and
+ * lifetime values are fetched via get_replay_state(), then the old SA is
+ * deleted and a copy of it with the new addresses (and added, updated or
+ * removed XFRMA_ENCAP attribute) is installed with XFRM_MSG_NEWSA.
+ *
+ * If a CPI is given, the associated IPComp SA is updated first via a
+ * recursive call whose result is ignored.
+ *
+ * @param spi                  SPI of the SA, as stored in the SA id
+ * @param protocol             protocol of the SA
+ * @param cpi                  CPI of the associated IPComp SA, 0 if none
+ * @param src                  current source address
+ * @param dst                  current destination address
+ * @param new_src              new source address (its port is used for encap)
+ * @param new_dst              new destination address (its port is used for encap)
+ * @param old_encap            not used here
+ * @param new_encap            TRUE if the updated SA uses UDP encapsulation
+ * @param mark                 mark of the SA
+ * @return                             SUCCESS if the updated SA was installed, FAILED
+ *                                             otherwise
+ */
+METHOD(kernel_ipsec_t, update_sa, status_t,
+       private_kernel_netlink_ipsec_t *this, u_int32_t spi, u_int8_t protocol,
+       u_int16_t cpi, host_t *src, host_t *dst, host_t *new_src, host_t *new_dst,
+       bool old_encap, bool new_encap, mark_t mark)
+{
+       netlink_buf_t request;
+       struct nlmsghdr *hdr, *out = NULL, *out_hdr = NULL;
+       struct xfrm_usersa_id *sa_id;
+       struct xfrm_usersa_info *out_sa = NULL, *sa;
+       /* initialized as it is passed to memwipe() even if sending fails */
+       size_t len = 0, remaining;
+       struct rtattr *rta;
+       size_t rtasize;
+       struct xfrm_encap_tmpl* tmpl = NULL;
+       struct xfrm_replay_state *replay = NULL;
+       struct xfrm_replay_state_esn *replay_esn = NULL;
+       struct xfrm_lifetime_cur *lifetime = NULL;
+       u_int32_t replay_esn_len = 0;
+       status_t status = FAILED;
+
+       /* if IPComp is used, we first update the IPComp SA */
+       if (cpi)
+       {
+               update_sa(this, htonl(ntohs(cpi)), IPPROTO_COMP, 0,
+                                 src, dst, new_src, new_dst, FALSE, FALSE, mark);
+       }
+
+       memset(&request, 0, sizeof(request));
+
+       DBG2(DBG_KNL, "querying SAD entry with SPI %.8x for update", ntohl(spi));
+
+       /* query the existing SA first */
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST;
+       hdr->nlmsg_type = XFRM_MSG_GETSA;
+       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_usersa_id));
+
+       sa_id = NLMSG_DATA(hdr);
+       host2xfrm(dst, &sa_id->daddr);
+       sa_id->spi = spi;
+       sa_id->proto = protocol;
+       sa_id->family = dst->get_family(dst);
+
+       if (!add_mark(hdr, sizeof(request), mark))
+       {
+               return FAILED;
+       }
+
+       if (this->socket_xfrm->send(this->socket_xfrm, hdr, &out, &len) == SUCCESS)
+       {
+               /* iterate with a copy of the length as NLMSG_NEXT() decrements it,
+                * the full length is needed to wipe the whole response below */
+               remaining = len;
+               hdr = out;
+               while (NLMSG_OK(hdr, remaining))
+               {
+                       switch (hdr->nlmsg_type)
+                       {
+                               case XFRM_MSG_NEWSA:
+                               {
+                                       /* remember the header, the message is not necessarily
+                                        * the first one in the response */
+                                       out_hdr = hdr;
+                                       out_sa = NLMSG_DATA(hdr);
+                                       break;
+                               }
+                               case NLMSG_ERROR:
+                               {
+                                       struct nlmsgerr *err = NLMSG_DATA(hdr);
+                                       DBG1(DBG_KNL, "querying SAD entry failed: %s (%d)",
+                                                strerror(-err->error), -err->error);
+                                       break;
+                               }
+                               default:
+                                       /* skip unrelated messages */
+                                       hdr = NLMSG_NEXT(hdr, remaining);
+                                       continue;
+                               case NLMSG_DONE:
+                                       break;
+                       }
+                       break;
+               }
+       }
+       if (out_sa == NULL)
+       {
+               DBG1(DBG_KNL, "unable to update SAD entry with SPI %.8x", ntohl(spi));
+               goto failed;
+       }
+
+       /* fetch replay state and lifetime before the SA is deleted */
+       get_replay_state(this, spi, protocol, dst, mark, &replay_esn,
+                                        &replay_esn_len, &replay, &lifetime);
+
+       /* delete the old SA (without affecting the IPComp SA) */
+       if (del_sa(this, src, dst, spi, protocol, 0, mark) != SUCCESS)
+       {
+               DBG1(DBG_KNL, "unable to delete old SAD entry with SPI %.8x",
+                                          ntohl(spi));
+               goto failed;
+       }
+
+       DBG2(DBG_KNL, "updating SAD entry with SPI %.8x from %#H..%#H to %#H..%#H",
+                                  ntohl(spi), src, dst, new_src, new_dst);
+       /* copy over the SA from the response to the request, the request buffer
+        * is reused for the XFRM_MSG_NEWSA message */
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+       hdr->nlmsg_type = XFRM_MSG_NEWSA;
+       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_usersa_info));
+       sa = NLMSG_DATA(hdr);
+       memcpy(sa, out_sa, sizeof(struct xfrm_usersa_info));
+       sa->family = new_dst->get_family(new_dst);
+
+       if (!src->ip_equals(src, new_src))
+       {
+               host2xfrm(new_src, &sa->saddr);
+       }
+       if (!dst->ip_equals(dst, new_dst))
+       {
+               host2xfrm(new_dst, &sa->id.daddr);
+       }
+
+       rta = XFRM_RTA(out_hdr, struct xfrm_usersa_info);
+       rtasize = XFRM_PAYLOAD(out_hdr, struct xfrm_usersa_info);
+       while (RTA_OK(rta, rtasize))
+       {
+               /* copy all attributes, but not XFRMA_ENCAP if we are disabling it */
+               if (rta->rta_type != XFRMA_ENCAP || new_encap)
+               {
+                       if (rta->rta_type == XFRMA_ENCAP)
+                       {       /* update encap tmpl, modified in the response before it
+                                * is copied to the request */
+                               tmpl = RTA_DATA(rta);
+                               tmpl->encap_sport = htons(new_src->get_port(new_src));
+                               tmpl->encap_dport = htons(new_dst->get_port(new_dst));
+                       }
+                       netlink_add_attribute(hdr, rta->rta_type,
+                                                                 chunk_create(RTA_DATA(rta), RTA_PAYLOAD(rta)),
+                                                                 sizeof(request));
+               }
+               rta = RTA_NEXT(rta, rtasize);
+       }
+
+       if (tmpl == NULL && new_encap)
+       {       /* add tmpl if we are enabling it */
+               tmpl = netlink_reserve(hdr, sizeof(request), XFRMA_ENCAP, sizeof(*tmpl));
+               if (!tmpl)
+               {
+                       goto failed;
+               }
+               tmpl->encap_type = UDP_ENCAP_ESPINUDP;
+               tmpl->encap_sport = htons(new_src->get_port(new_src));
+               tmpl->encap_dport = htons(new_dst->get_port(new_dst));
+               memset(&tmpl->encap_oa, 0, sizeof (xfrm_address_t));
+       }
+
+       /* the ESN replay state is preferred over the legacy replay state */
+       if (replay_esn)
+       {
+               struct xfrm_replay_state_esn *state;
+
+               state = netlink_reserve(hdr, sizeof(request), XFRMA_REPLAY_ESN_VAL,
+                                                               replay_esn_len);
+               if (!state)
+               {
+                       goto failed;
+               }
+               memcpy(state, replay_esn, replay_esn_len);
+       }
+       else if (replay)
+       {
+               struct xfrm_replay_state *state;
+
+               state = netlink_reserve(hdr, sizeof(request), XFRMA_REPLAY_VAL,
+                                                               sizeof(*state));
+               if (!state)
+               {
+                       goto failed;
+               }
+               memcpy(state, replay, sizeof(*state));
+       }
+       else
+       {       /* not fatal, the SA is installed without replay state */
+               DBG1(DBG_KNL, "unable to copy replay state from old SAD entry with "
+                        "SPI %.8x", ntohl(spi));
+       }
+       if (lifetime)
+       {
+               struct xfrm_lifetime_cur *state;
+
+               state = netlink_reserve(hdr, sizeof(request), XFRMA_LTIME_VAL,
+                                                               sizeof(*state));
+               if (!state)
+               {
+                       goto failed;
+               }
+               memcpy(state, lifetime, sizeof(*state));
+       }
+       else
+       {       /* not fatal, the SA is installed without usage stats */
+               DBG1(DBG_KNL, "unable to copy usage stats from old SAD entry with "
+                        "SPI %.8x", ntohl(spi));
+       }
+
+       if (this->socket_xfrm->send_ack(this->socket_xfrm, hdr) != SUCCESS)
+       {
+               DBG1(DBG_KNL, "unable to update SAD entry with SPI %.8x", ntohl(spi));
+               goto failed;
+       }
+
+       status = SUCCESS;
+failed:
+       free(replay);
+       free(replay_esn);
+       free(lifetime);
+       /* request and response are wiped completely before they are released */
+       memwipe(out, len);
+       memwipe(&request, sizeof(request));
+       free(out);
+
+       return status;
+}
+
+/**
+ * Flush all AH, ESP and IPComp SAs from the kernel.
+ *
+ * One XFRM_MSG_FLUSHSA request is sent per protocol, in the order given
+ * above. The first request that is not acknowledged aborts the operation.
+ *
+ * @return             SUCCESS if all requests were acknowledged, FAILED otherwise
+ */
+METHOD(kernel_ipsec_t, flush_sas, status_t,
+       private_kernel_netlink_ipsec_t *this)
+{
+       struct {
+               u_int8_t proto;
+               char *name;
+       } protos[] = {
+               { IPPROTO_AH, "AH" },
+               { IPPROTO_ESP, "ESP" },
+               { IPPROTO_COMP, "IPComp" },
+       };
+       netlink_buf_t msg;
+       struct nlmsghdr *nlh = &msg.hdr;
+       struct xfrm_usersa_flush *payload;
+       int idx = 0;
+
+       /* the message is built once, only the protocol changes per request */
+       memset(&msg, 0, sizeof(msg));
+       nlh->nlmsg_type = XFRM_MSG_FLUSHSA;
+       nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+       nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_usersa_flush));
+       payload = NLMSG_DATA(nlh);
+
+       while (idx < countof(protos))
+       {
+               DBG2(DBG_KNL, "flushing all %s SAD entries", protos[idx].name);
+
+               payload->proto = protos[idx].proto;
+               if (this->socket_xfrm->send_ack(this->socket_xfrm, nlh) != SUCCESS)
+               {
+                       DBG1(DBG_KNL, "unable to flush %s SAD entries", protos[idx].name);
+                       return FAILED;
+               }
+               idx++;
+       }
+       return SUCCESS;
+}
+
+/**
+ * Add or update a policy in the kernel.
+ *
+ * Builds an XFRM_MSG_NEWPOLICY or XFRM_MSG_UPDPOLICY request from the given
+ * policy and mapping and sends it to the kernel. If the kernel acknowledged
+ * the request, the policy is looked up again and, for forward policies, a
+ * route is installed if that is enabled.
+ *
+ * Note: The mutex has to be locked when entering this function
+ * and is unlocked here in any case.
+ *
+ * @param this         kernel interface
+ * @param policy       policy entry providing selector and direction
+ * @param mapping      mapping providing priority, type and SA information
+ * @param update       TRUE to send XFRM_MSG_UPDPOLICY, FALSE to send
+ *                                     XFRM_MSG_NEWPOLICY (retried as update if it exists)
+ * @return                     FAILED if the request could not be built or was not
+ *                                     acknowledged, SUCCESS otherwise (also if no route was
+ *                                     installed)
+ */
+static status_t add_policy_internal(private_kernel_netlink_ipsec_t *this,
+       policy_entry_t *policy, policy_sa_t *mapping, bool update)
+{
+       netlink_buf_t request;
+       policy_entry_t clone;
+       ipsec_sa_t *ipsec = mapping->sa;
+       struct xfrm_userpolicy_info *policy_info;
+       struct nlmsghdr *hdr;
+       status_t status;
+       int i;
+
+       /* clone the policy so we are able to check it out again later */
+       memcpy(&clone, policy, sizeof(policy_entry_t));
+
+       memset(&request, 0, sizeof(request));
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+       hdr->nlmsg_type = update ? XFRM_MSG_UPDPOLICY : XFRM_MSG_NEWPOLICY;
+       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_info));
+
+       policy_info = NLMSG_DATA(hdr);
+       policy_info->sel = policy->sel;
+       policy_info->dir = policy->direction;
+
+       /* the priority is taken from the mapping (presumably calculated by the
+        * caller based on selector size, small size = high prio) */
+       policy_info->priority = mapping->priority;
+       /* everything but drop policies allows the traffic */
+       policy_info->action = mapping->type != POLICY_DROP ? XFRM_POLICY_ALLOW
+                                                                                                          : XFRM_POLICY_BLOCK;
+       policy_info->share = XFRM_SHARE_ANY;
+
+       /* policies don't expire */
+       policy_info->lft.soft_byte_limit = XFRM_INF;
+       policy_info->lft.soft_packet_limit = XFRM_INF;
+       policy_info->lft.hard_byte_limit = XFRM_INF;
+       policy_info->lft.hard_packet_limit = XFRM_INF;
+       policy_info->lft.soft_add_expires_seconds = 0;
+       policy_info->lft.hard_add_expires_seconds = 0;
+       policy_info->lft.soft_use_expires_seconds = 0;
+       policy_info->lft.hard_use_expires_seconds = 0;
+
+       if (mapping->type == POLICY_IPSEC)
+       {
+               struct xfrm_user_tmpl *tmpl;
+               /* protocols in the order their templates are added */
+               struct {
+                       u_int8_t proto;
+                       bool use;
+               } protos[] = {
+                       { IPPROTO_COMP, ipsec->cfg.ipcomp.transform != IPCOMP_NONE },
+                       { IPPROTO_ESP, ipsec->cfg.esp.use },
+                       { IPPROTO_AH, ipsec->cfg.ah.use },
+               };
+               ipsec_mode_t proto_mode = ipsec->cfg.mode;
+               int count = 0;
+
+               /* count the used protocols to reserve one template for each */
+               for (i = 0; i < countof(protos); i++)
+               {
+                       if (protos[i].use)
+                       {
+                               count++;
+                       }
+               }
+               tmpl = netlink_reserve(hdr, sizeof(request), XFRMA_TMPL,
+                                                          count * sizeof(*tmpl));
+               if (!tmpl)
+               {
+                       this->mutex->unlock(this->mutex);
+                       return FAILED;
+               }
+
+               for (i = 0; i < countof(protos); i++)
+               {
+                       if (!protos[i].use)
+                       {
+                               continue;
+                       }
+                       tmpl->reqid = ipsec->cfg.reqid;
+                       tmpl->id.proto = protos[i].proto;
+                       tmpl->aalgos = tmpl->ealgos = tmpl->calgos = ~0;
+                       tmpl->mode = mode2kernel(proto_mode);
+                       /* the IPComp template is optional unless this is an outbound
+                        * policy */
+                       tmpl->optional = protos[i].proto == IPPROTO_COMP &&
+                                                        policy->direction != POLICY_OUT;
+                       tmpl->family = ipsec->src->get_family(ipsec->src);
+
+                       if (proto_mode == MODE_TUNNEL || proto_mode == MODE_BEET)
+                       {       /* addresses are only set for tunnel and BEET mode */
+                               host2xfrm(ipsec->src, &tmpl->saddr);
+                               host2xfrm(ipsec->dst, &tmpl->id.daddr);
+                       }
+
+                       tmpl++;
+
+                       /* use transport mode for other SAs */
+                       proto_mode = MODE_TRANSPORT;
+               }
+       }
+
+       if (!add_mark(hdr, sizeof(request), ipsec->mark))
+       {
+               this->mutex->unlock(this->mutex);
+               return FAILED;
+       }
+       /* the mutex is not held while communicating with the kernel */
+       this->mutex->unlock(this->mutex);
+
+       status = this->socket_xfrm->send_ack(this->socket_xfrm, hdr);
+       if (status == ALREADY_DONE && !update)
+       {       /* retry the same request as an update */
+               DBG1(DBG_KNL, "policy already exists, try to update it");
+               hdr->nlmsg_type = XFRM_MSG_UPDPOLICY;
+               status = this->socket_xfrm->send_ack(this->socket_xfrm, hdr);
+       }
+       if (status != SUCCESS)
+       {
+               return FAILED;
+       }
+
+       /* find the policy again, using the clone as lookup key, and continue
+        * with the first mapping in its used_by list */
+       this->mutex->lock(this->mutex);
+       policy = this->policies->get(this->policies, &clone);
+       if (!policy ||
+                policy->used_by->find_first(policy->used_by,
+                                                                        NULL, (void**)&mapping) != SUCCESS)
+       {       /* policy or mapping is already gone, ignore */
+               this->mutex->unlock(this->mutex);
+               return SUCCESS;
+       }
+
+       /* NOTE(review): ipsec was read from the mapping passed by the caller
+        * before the mutex was released, while mapping has been looked up again
+        * above - confirm the SA referenced by ipsec can't be destroyed (or
+        * differ from mapping->sa) in the meantime */
+
+       /* install a route, if:
+        * - this is a forward policy (to just get one for each child)
+        * - we are in tunnel/BEET mode or install a bypass policy
+        * - routing is not disabled via strongswan.conf
+        */
+       if (policy->direction == POLICY_FWD && this->install_routes &&
+               (mapping->type != POLICY_IPSEC || ipsec->cfg.mode != MODE_TRANSPORT))
+       {
+               policy_sa_fwd_t *fwd = (policy_sa_fwd_t*)mapping;
+               route_entry_t *route;
+               host_t *iface;
+
+               INIT(route,
+                       .prefixlen = policy->sel.prefixlen_s,
+               );
+
+               /* a route is only installed if a source address is found for the
+                * destination traffic selector */
+               if (hydra->kernel_interface->get_address_by_ts(hydra->kernel_interface,
+                               fwd->dst_ts, &route->src_ip, NULL) == SUCCESS)
+               {
+                       /* get the nexthop to src (src as we are in POLICY_FWD) */
+                       if (!ipsec->src->is_anyaddr(ipsec->src))
+                       {
+                               route->gateway = hydra->kernel_interface->get_nexthop(
+                                                                                       hydra->kernel_interface, ipsec->src,
+                                                                                       -1, ipsec->dst);
+                       }
+                       else
+                       {       /* for shunt policies */
+                               iface = xfrm2host(policy->sel.family, &policy->sel.saddr, 0);
+                               route->gateway = hydra->kernel_interface->get_nexthop(
+                                                                               hydra->kernel_interface, iface,
+                                                                               policy->sel.prefixlen_s, route->src_ip);
+                               iface->destroy(iface);
+                       }
+                       /* the destination net is the source address of the selector,
+                        * 4 bytes for IPv4 and 16 bytes otherwise */
+                       route->dst_net = chunk_alloc(policy->sel.family == AF_INET ? 4 : 16);
+                       memcpy(route->dst_net.ptr, &policy->sel.saddr, route->dst_net.len);
+
+                       /* get the interface to install the route for. If we have a local
+                        * address, use it. Otherwise (for shunt policies) use the
+                        * routes source address. */
+                       iface = ipsec->dst;
+                       if (iface->is_anyaddr(iface))
+                       {
+                               iface = route->src_ip;
+                       }
+                       /* install route via outgoing interface */
+                       if (!hydra->kernel_interface->get_interface(hydra->kernel_interface,
+                                                                                                               iface, &route->if_name))
+                       {       /* no interface found, the policy is installed anyway */
+                               this->mutex->unlock(this->mutex);
+                               route_entry_destroy(route);
+                               return SUCCESS;
+                       }
+
+                       if (policy->route)
+                       {
+                               route_entry_t *old = policy->route;
+                               if (route_entry_equals(old, route))
+                               {       /* the same route is already cached, keep it */
+                                       this->mutex->unlock(this->mutex);
+                                       route_entry_destroy(route);
+                                       return SUCCESS;
+                               }
+                               /* uninstall previously installed route */
+                               if (hydra->kernel_interface->del_route(hydra->kernel_interface,
+                                               old->dst_net, old->prefixlen, old->gateway,
+                                               old->src_ip, old->if_name) != SUCCESS)
+                               {
+                                       DBG1(DBG_KNL, "error uninstalling route installed with "
+                                                                 "policy %R === %R %N", fwd->src_ts,
+                                                                  fwd->dst_ts, policy_dir_names,
+                                                                  policy->direction);
+                               }
+                               /* the cached entry is dropped even if uninstalling failed */
+                               route_entry_destroy(old);
+                               policy->route = NULL;
+                       }
+
+                       DBG2(DBG_KNL, "installing route: %R via %H src %H dev %s",
+                                fwd->src_ts, route->gateway, route->src_ip, route->if_name);
+                       switch (hydra->kernel_interface->add_route(
+                                                               hydra->kernel_interface, route->dst_net,
+                                                               route->prefixlen, route->gateway,
+                                                               route->src_ip, route->if_name))
+                       {
+                               default:
+                                       DBG1(DBG_KNL, "unable to install source route for %H",
+                                                                  route->src_ip);
+                                       /* FALL */
+                               case ALREADY_DONE:
+                                       /* route exists, do not uninstall */
+                                       route_entry_destroy(route);
+                                       break;
+                               case SUCCESS:
+                                       /* cache the installed route */
+                                       policy->route = route;
+                                       break;
+                       }
+               }
+               else
+               {       /* only prefixlen has been set so far, plain free() is used */
+                       free(route);
+               }
+       }
+       this->mutex->unlock(this->mutex);
+       return SUCCESS;
+}
+
+METHOD(kernel_ipsec_t, add_policy, status_t,
+       private_kernel_netlink_ipsec_t *this, host_t *src, host_t *dst,
+       traffic_selector_t *src_ts, traffic_selector_t *dst_ts,
+       policy_dir_t direction, policy_type_t type, ipsec_sa_cfg_t *sa,
+       mark_t mark, policy_priority_t priority)
+{      /* Registers a policy for the given selectors, direction and mark in
+        * this->policies (an existing matching entry is reused), adds a
+        * policy_sa_t for the SA to the entry's used_by list, in front of the
+        * first entry whose priority value is not smaller, and calls
+        * add_policy_internal() only if the new SA ended up at the head of
+        * that list.
+        *
+        * Returns INVALID_STATE if a matching policy with a different non-zero
+        * reqid exists, FAILED if add_policy_internal() fails and SUCCESS
+        * otherwise. */
+       policy_entry_t *policy, *current;
+       policy_sa_t *assigned_sa, *current_sa;
+       enumerator_t *enumerator;
+       bool found = FALSE, update = TRUE;
+
+       /* create a policy, it first serves as key for the lookup below */
+       INIT(policy,
+               .sel = ts2selector(src_ts, dst_ts),
+               .mark = mark.value & mark.mask,
+               .direction = direction,
+               .reqid = sa->reqid,
+       );
+
+       /* find the policy, which matches EXACTLY */
+       this->mutex->lock(this->mutex);
+       current = this->policies->get(this->policies, policy);
+       if (current)
+       {
+               if (current->reqid && sa->reqid && current->reqid != sa->reqid)
+               {       /* both reqids are set but differ, refuse to share the policy */
+                       DBG1(DBG_CFG, "unable to install policy %R === %R %N (mark "
+                                "%u/0x%08x) for reqid %u, the same policy for reqid %u exists",
+                                src_ts, dst_ts, policy_dir_names, direction,
+                                mark.value, mark.mask, sa->reqid, current->reqid);
+                       policy_entry_destroy(this, policy);
+                       this->mutex->unlock(this->mutex);
+                       return INVALID_STATE;
+               }
+               /* use existing policy, the lookup key is not needed anymore */
+               DBG2(DBG_KNL, "policy %R === %R %N  (mark %u/0x%08x) "
+                                         "already exists, increasing refcount",
+                                          src_ts, dst_ts, policy_dir_names, direction,
+                                          mark.value, mark.mask);
+               policy_entry_destroy(this, policy);
+               policy = current;
+               found = TRUE;
+       }
+       else
+       {       /* use the new one, if we have no such policy */
+               policy->used_by = linked_list_create();
+               this->policies->put(this->policies, policy, policy);
+       }
+
+       /* cache the assigned IPsec SA */
+       assigned_sa = policy_sa_create(this, direction, type, src, dst, src_ts,
+                                                                  dst_ts, mark, sa);
+       assigned_sa->priority = get_priority(policy, priority);
+
+       /* insert the SA according to its priority, update stays TRUE only if
+        * no existing entry is skipped, i.e. the new SA becomes the first one */
+       enumerator = policy->used_by->create_enumerator(policy->used_by);
+       while (enumerator->enumerate(enumerator, (void**)&current_sa))
+       {
+               if (current_sa->priority >= assigned_sa->priority)
+               {
+                       break;
+               }
+               update = FALSE;
+       }
+       policy->used_by->insert_before(policy->used_by, enumerator,
+                                                                  assigned_sa);
+       enumerator->destroy(enumerator);
+
+       if (!update)
+       {       /* we don't update the policy if the priority is lower than that of
+                * the currently installed one */
+               this->mutex->unlock(this->mutex);
+               return SUCCESS;
+       }
+
+       if (this->policy_update)
+       {       /* with the policy_update setting new policies are handled like
+                * updates of existing ones */
+               found = TRUE;
+       }
+
+       DBG2(DBG_KNL, "%s policy %R === %R %N  (mark %u/0x%08x)",
+                                  found ? "updating" : "adding", src_ts, dst_ts,
+                                  policy_dir_names, direction, mark.value, mark.mask);
+
+       if (add_policy_internal(this, policy, assigned_sa, found) != SUCCESS)
+       {       /* NOTE(review): the mutex locked above is not released in this
+                * function from here on; presumably add_policy_internal() unlocks
+                * it on every path - confirm */
+               DBG1(DBG_KNL, "unable to %s policy %R === %R %N",
+                                          found ? "update" : "add", src_ts, dst_ts,
+                                          policy_dir_names, direction);
+               return FAILED;
+       }
+       return SUCCESS;
+}
+
+METHOD(kernel_ipsec_t, query_policy, status_t,
+       private_kernel_netlink_ipsec_t *this, traffic_selector_t *src_ts,
+       traffic_selector_t *dst_ts, policy_dir_t direction, mark_t mark,
+       time_t *use_time)
+{      /* Sends an XFRM_MSG_GETPOLICY request for the policy identified by the
+        * given selectors, direction and mark and stores the use time found in
+        * the XFRM_MSG_NEWPOLICY response in *use_time, converted to monotonic
+        * time (0 if the response contains no use time).
+        *
+        * Returns FAILED if the mark can't be added to the request or if no
+        * policy was received, SUCCESS otherwise. */
+       netlink_buf_t request;
+       struct nlmsghdr *out = NULL, *hdr;
+       struct xfrm_userpolicy_id *policy_id;
+       struct xfrm_userpolicy_info *policy = NULL;
+       size_t len;
+
+       memset(&request, 0, sizeof(request));
+
+       DBG2(DBG_KNL, "querying policy %R === %R %N  (mark %u/0x%08x)",
+                                  src_ts, dst_ts, policy_dir_names, direction,
+                                  mark.value, mark.mask);
+
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST;
+       hdr->nlmsg_type = XFRM_MSG_GETPOLICY;
+       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_id));
+
+       policy_id = NLMSG_DATA(hdr);
+       policy_id->sel = ts2selector(src_ts, dst_ts);
+       policy_id->dir = direction;
+
+       if (!add_mark(hdr, sizeof(request), mark))
+       {
+               return FAILED;
+       }
+
+       if (this->socket_xfrm->send(this->socket_xfrm, hdr, &out, &len) == SUCCESS)
+       {
+               hdr = out;
+               while (NLMSG_OK(hdr, len))
+               {
+                       switch (hdr->nlmsg_type)
+                       {
+                               case XFRM_MSG_NEWPOLICY:
+                               {       /* policy points into the response buffer "out" */
+                                       policy = NLMSG_DATA(hdr);
+                                       break;
+                               }
+                               case NLMSG_ERROR:
+                               {
+                                       struct nlmsgerr *err = NLMSG_DATA(hdr);
+                                       DBG1(DBG_KNL, "querying policy failed: %s (%d)",
+                                                                  strerror(-err->error), -err->error);
+                                       break;
+                               }
+                               default:
+                                       hdr = NLMSG_NEXT(hdr, len);
+                                       continue; /* skip messages of other types */
+                               case NLMSG_DONE:
+                                       break;
+                       }
+                       break; /* stop at the first NEWPOLICY, ERROR or DONE message */
+               }
+       }
+
+       if (policy == NULL)
+       {
+               DBG2(DBG_KNL, "unable to query policy %R === %R %N", src_ts, dst_ts,
+                                          policy_dir_names, direction);
+               free(out);
+               return FAILED;
+       }
+
+       if (policy->curlft.use_time)
+       {
+               /* we need the monotonic time, but the kernel returns system time. */
+               *use_time = time_monotonic(NULL) - (time(NULL) - policy->curlft.use_time);
+       }
+       else
+       {
+               *use_time = 0;
+       }
+
+       free(out); /* only now, as policy referenced data in this buffer */
+       return SUCCESS;
+}
+
+METHOD(kernel_ipsec_t, del_policy, status_t,
+       private_kernel_netlink_ipsec_t *this, host_t *src, host_t *dst,
+       traffic_selector_t *src_ts, traffic_selector_t *dst_ts,
+       policy_dir_t direction, policy_type_t type, ipsec_sa_cfg_t *sa,
+       mark_t mark, policy_priority_t prio)
+{      /* Removes the mapping for the given SA from the used_by list of the
+        * matching policy. If other mappings remain, the policy is kept and
+        * only re-installed via add_policy_internal() with the first remaining
+        * mapping if the removed mapping was the first one in the list.
+        * Otherwise the policy is removed from this->policies, a route cached
+        * on it is uninstalled and an XFRM_MSG_DELPOLICY request is sent.
+        *
+        * Returns NOT_FOUND if no such policy is known, FAILED if updating or
+        * deleting the policy fails and SUCCESS otherwise. */
+       policy_entry_t *current, policy;
+       enumerator_t *enumerator;
+       policy_sa_t *mapping;
+       netlink_buf_t request;
+       struct nlmsghdr *hdr;
+       struct xfrm_userpolicy_id *policy_id;
+       bool is_installed = TRUE;
+       u_int32_t priority;
+       ipsec_sa_t assigned_sa = {
+               .src = src,
+               .dst = dst,
+               .mark = mark,
+               .cfg = *sa,
+       };
+
+       DBG2(DBG_KNL, "deleting policy %R === %R %N  (mark %u/0x%08x)",
+                                  src_ts, dst_ts, policy_dir_names, direction,
+                                  mark.value, mark.mask);
+
+       /* create a policy on the stack, only used as key for the lookup */
+       memset(&policy, 0, sizeof(policy_entry_t));
+       policy.sel = ts2selector(src_ts, dst_ts);
+       policy.mark = mark.value & mark.mask;
+       policy.direction = direction;
+
+       /* find the policy */
+       this->mutex->lock(this->mutex);
+       current = this->policies->get(this->policies, &policy);
+       if (!current)
+       {
+               if (mark.value)
+               {
+                       DBG1(DBG_KNL, "deleting policy %R === %R %N  (mark %u/0x%08x) "
+                                                 "failed, not found", src_ts, dst_ts, policy_dir_names,
+                                                  direction, mark.value, mark.mask);
+               }
+               else
+               {
+                       DBG1(DBG_KNL, "deleting policy %R === %R %N failed, not found",
+                                                  src_ts, dst_ts, policy_dir_names, direction);
+               }
+               this->mutex->unlock(this->mutex);
+               return NOT_FOUND;
+       }
+
+       /* remove the first mapping that matches priority, type and SA,
+        * is_installed stays TRUE only if no other entry had to be skipped */
+       priority = get_priority(current, prio);
+       enumerator = current->used_by->create_enumerator(current->used_by);
+       while (enumerator->enumerate(enumerator, (void**)&mapping))
+       {
+               if (priority == mapping->priority && type == mapping->type &&
+                       ipsec_sa_equals(mapping->sa, &assigned_sa))
+               {
+                       current->used_by->remove_at(current->used_by, enumerator);
+                       policy_sa_destroy(mapping, &direction, this);
+                       break;
+               }
+               is_installed = FALSE;
+       }
+       enumerator->destroy(enumerator);
+
+       if (current->used_by->get_count(current->used_by) > 0)
+       {       /* policy is used by more SAs, keep in kernel */
+               DBG2(DBG_KNL, "policy still used by another CHILD_SA, not removed");
+               if (!is_installed)
+               {       /* no need to update as the policy was not installed for this SA */
+                       this->mutex->unlock(this->mutex);
+                       return SUCCESS;
+               }
+
+               DBG2(DBG_KNL, "updating policy %R === %R %N  (mark %u/0x%08x)",
+                                          src_ts, dst_ts, policy_dir_names, direction,
+                                          mark.value, mark.mask);
+
+               current->used_by->get_first(current->used_by, (void**)&mapping);
+               if (add_policy_internal(this, current, mapping, TRUE) != SUCCESS)
+               {       /* NOTE(review): the mutex is not released in this branch;
+                        * presumably add_policy_internal() unlocks it on every
+                        * path - confirm */
+                       DBG1(DBG_KNL, "unable to update policy %R === %R %N",
+                                                  src_ts, dst_ts, policy_dir_names, direction);
+                       return FAILED;
+               }
+               return SUCCESS;
+       }
+
+       memset(&request, 0, sizeof(request));
+
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+       hdr->nlmsg_type = XFRM_MSG_DELPOLICY;
+       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_id));
+
+       policy_id = NLMSG_DATA(hdr);
+       policy_id->sel = current->sel; /* copied, as current is destroyed below */
+       policy_id->dir = direction;
+
+       if (!add_mark(hdr, sizeof(request), mark))
+       {
+               this->mutex->unlock(this->mutex);
+               return FAILED;
+       }
+
+       if (current->route)
+       {       /* a failure to uninstall the route is logged but not fatal */
+               route_entry_t *route = current->route;
+               if (hydra->kernel_interface->del_route(hydra->kernel_interface,
+                               route->dst_net, route->prefixlen, route->gateway,
+                               route->src_ip, route->if_name) != SUCCESS)
+               {
+                       DBG1(DBG_KNL, "error uninstalling route installed with "
+                                                 "policy %R === %R %N", src_ts, dst_ts,
+                                                  policy_dir_names, direction);
+               }
+       }
+
+       this->policies->remove(this->policies, current);
+       policy_entry_destroy(this, current);
+       this->mutex->unlock(this->mutex);
+
+       if (this->socket_xfrm->send_ack(this->socket_xfrm, hdr) != SUCCESS)
+       {       /* the request is sent after the entry was removed and the mutex
+                * released, hdr refers to the local request buffer */
+               if (mark.value)
+               {
+                       DBG1(DBG_KNL, "unable to delete policy %R === %R %N  "
+                                                 "(mark %u/0x%08x)", src_ts, dst_ts, policy_dir_names,
+                                                  direction, mark.value, mark.mask);
+               }
+               else
+               {
+                       DBG1(DBG_KNL, "unable to delete policy %R === %R %N",
+                                                  src_ts, dst_ts, policy_dir_names, direction);
+               }
+               return FAILED;
+       }
+       return SUCCESS;
+}
+
+METHOD(kernel_ipsec_t, flush_policies, status_t,
+       private_kernel_netlink_ipsec_t *this)
+{      /* Sends an XFRM_MSG_FLUSHPOLICY request without payload (NLM_F_ACK is
+        * set) via send_ack(). Returns FAILED if that does not succeed,
+        * SUCCESS otherwise. */
+       netlink_buf_t request;
+       struct nlmsghdr *hdr;
+
+       memset(&request, 0, sizeof(request));
+
+       DBG2(DBG_KNL, "flushing all policies from SPD");
+
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+       hdr->nlmsg_type = XFRM_MSG_FLUSHPOLICY;
+       hdr->nlmsg_len = NLMSG_LENGTH(0); /* no data associated */
+
+       /* by adding an rtattr of type  XFRMA_POLICY_TYPE we could restrict this
+        * to main or sub policies (default is main) */
+
+       if (this->socket_xfrm->send_ack(this->socket_xfrm, hdr) != SUCCESS)
+       {
+               DBG1(DBG_KNL, "unable to flush SPD entries");
+               return FAILED;
+       }
+       return SUCCESS;
+}
+
+/**
+ * Bypass socket using a per-socket policy
+ *
+ * Sets an XFRM_POLICY_ALLOW policy for the given address family on the socket
+ * with setsockopt(), first for XFRM_POLICY_OUT and then for XFRM_POLICY_IN.
+ *
+ * @param this         not used by this function
+ * @param fd           socket to set the policies on
+ * @param family       AF_INET or AF_INET6
+ * @return             TRUE if both policies were set, FALSE for any other
+ *                     family or if one of the setsockopt() calls fails
+ */
+static bool add_socket_bypass(private_kernel_netlink_ipsec_t *this,
+                                                         int fd, int family)
+{
+       struct xfrm_userpolicy_info policy;
+       u_int sol, ipsec_policy;
+
+       switch (family)
+       {       /* socket option level and name depend on the address family */
+               case AF_INET:
+                       sol = SOL_IP;
+                       ipsec_policy = IP_XFRM_POLICY;
+                       break;
+               case AF_INET6:
+                       sol = SOL_IPV6;
+                       ipsec_policy = IPV6_XFRM_POLICY;
+                       break;
+               default:
+                       return FALSE;
+       }
+
+       memset(&policy, 0, sizeof(policy));
+       policy.action = XFRM_POLICY_ALLOW;
+       policy.sel.family = family;
+
+       policy.dir = XFRM_POLICY_OUT;
+       if (setsockopt(fd, sol, ipsec_policy, &policy, sizeof(policy)) < 0)
+       {
+               DBG1(DBG_KNL, "unable to set IPSEC_POLICY on socket: %s",
+                                          strerror(errno));
+               return FALSE;
+       }
+       policy.dir = XFRM_POLICY_IN; /* same policy, only the direction changes */
+       if (setsockopt(fd, sol, ipsec_policy, &policy, sizeof(policy)) < 0)
+       {
+               DBG1(DBG_KNL, "unable to set IPSEC_POLICY on socket: %s",
+                                          strerror(errno));
+               return FALSE;
+       }
+       return TRUE;
+}
+
+/**
+ * Port based IKE bypass policy
+ *
+ * Describes the selector of an installed bypass policy pair. Entries are
+ * appended to this->bypass by add_port_bypass() and passed to
+ * remove_port_bypass() when the array is destroyed.
+ */
+typedef struct {
+       /** address family */
+       int family;
+       /** layer 4 protocol */
+       int proto;
+       /** port number, network order */
+       u_int16_t port;
+} bypass_t;
+
+/**
+ * Add or remove a bypass policy to/from the kernel
+ *
+ * For XFRM_MSG_NEWPOLICY an XFRM_POLICY_ALLOW policy with priority 32 and
+ * XFRM_INF byte/packet limits is built. Any other type uses the
+ * xfrm_userpolicy_id layout, which is meant for XFRM_MSG_DELPOLICY. The
+ * selector matches family and protocol of the bypass plus its port, as
+ * destination port for POLICY_IN and as source port for other directions.
+ *
+ * @param this         instance whose socket_xfrm is used to send the request
+ * @param type         XFRM_MSG_NEWPOLICY or XFRM_MSG_DELPOLICY
+ * @param dir          direction of the policy
+ * @param bypass       family, protocol and port to match
+ * @return             TRUE if send_ack() returned SUCCESS
+ */
+static bool manage_bypass(private_kernel_netlink_ipsec_t *this,
+                                                 int type, policy_dir_t dir, bypass_t *bypass)
+{
+       netlink_buf_t request;
+       struct xfrm_selector *sel;
+       struct nlmsghdr *hdr;
+
+       memset(&request, 0, sizeof(request));
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+       hdr->nlmsg_type = type;
+
+       if (type == XFRM_MSG_NEWPOLICY)
+       {
+               struct xfrm_userpolicy_info *policy;
+
+               hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_info));
+
+               policy = NLMSG_DATA(hdr);
+               policy->dir = dir;
+               policy->priority = 32;
+               policy->action = XFRM_POLICY_ALLOW;
+               policy->share = XFRM_SHARE_ANY;
+
+               policy->lft.soft_byte_limit = XFRM_INF;
+               policy->lft.soft_packet_limit = XFRM_INF;
+               policy->lft.hard_byte_limit = XFRM_INF;
+               policy->lft.hard_packet_limit = XFRM_INF;
+
+               sel = &policy->sel;
+       }
+       else /* XFRM_MSG_DELPOLICY */
+       {
+               struct xfrm_userpolicy_id *policy;
+
+               hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_id));
+
+               policy = NLMSG_DATA(hdr);
+               policy->dir = dir;
+
+               sel = &policy->sel;
+       }
+
+       /* the selector is filled the same way for both message layouts */
+       sel->family = bypass->family;
+       sel->proto = bypass->proto;
+       if (dir == POLICY_IN)
+       {
+               sel->dport = bypass->port;
+               sel->dport_mask = 0xffff;
+       }
+       else
+       {
+               sel->sport = bypass->port;
+               sel->sport_mask = 0xffff;
+       }
+       return this->socket_xfrm->send_ack(this->socket_xfrm, hdr) == SUCCESS;
+}
+
+/**
+ * Bypass socket using a port-based bypass policy
+ *
+ * Reads the local port of the socket with getsockname() and its protocol with
+ * SO_PROTOCOL (UDP is used if that option is not defined or the query fails),
+ * then installs an inbound and an outbound bypass policy for them. If the
+ * outbound policy can't be installed, the inbound one is removed again. On
+ * success the bypass is appended to this->bypass.
+ *
+ * @param this         instance to install the policies with
+ * @param fd           socket to read local port and protocol from
+ * @param family       AF_INET or AF_INET6
+ * @return             TRUE if both policies were installed, FALSE if
+ *                     getsockname() fails, for any other family, or if a
+ *                     policy can't be installed
+ */
+static bool add_port_bypass(private_kernel_netlink_ipsec_t *this,
+                                                       int fd, int family)
+{
+       union {
+               struct sockaddr sa;
+               struct sockaddr_in in;
+               struct sockaddr_in6 in6;
+       } saddr;
+       socklen_t len;
+       bypass_t bypass = {
+               .family = family,
+       };
+
+       len = sizeof(saddr);
+       if (getsockname(fd, &saddr.sa, &len) != 0)
+       {
+               return FALSE;
+       }
+#ifdef SO_PROTOCOL /* since 2.6.32 */
+       len = sizeof(bypass.proto);
+       if (getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &bypass.proto, &len) != 0)
+#endif
+       {       /* assume UDP if SO_PROTOCOL not supported, without the define this
+                * block is executed unconditionally */
+               bypass.proto = IPPROTO_UDP;
+       }
+       switch (family)
+       {       /* the port is copied as is, i.e. in network order */
+               case AF_INET:
+                       bypass.port = saddr.in.sin_port;
+                       break;
+               case AF_INET6:
+                       bypass.port = saddr.in6.sin6_port;
+                       break;
+               default:
+                       return FALSE;
+       }
+
+       if (!manage_bypass(this, XFRM_MSG_NEWPOLICY, POLICY_IN, &bypass))
+       {
+               return FALSE;
+       }
+       if (!manage_bypass(this, XFRM_MSG_NEWPOLICY, POLICY_OUT, &bypass))
+       {       /* roll back the inbound policy installed above */
+               manage_bypass(this, XFRM_MSG_DELPOLICY, POLICY_IN, &bypass);
+               return FALSE;
+       }
+       array_insert(this->bypass, ARRAY_TAIL, &bypass);
+
+       return TRUE;
+}
+
+/**
+ * Remove installed port based bypass policy
+ *
+ * Sends XFRM_MSG_DELPOLICY requests for the outbound and the inbound policy
+ * of the given bypass, the results are ignored. Passed to
+ * array_destroy_function() as callback, the index is not used.
+ */
+static void remove_port_bypass(bypass_t *bypass, int idx,
+                                                          private_kernel_netlink_ipsec_t *this)
+{
+       manage_bypass(this, XFRM_MSG_DELPOLICY, POLICY_OUT, bypass);
+       manage_bypass(this, XFRM_MSG_DELPOLICY, POLICY_IN, bypass);
+}
+
+METHOD(kernel_ipsec_t, bypass_socket, bool,
+       private_kernel_netlink_ipsec_t *this, int fd, int family)
+{      /* Installs port-based bypass policies if the setting
+        * <ns>.plugins.kernel-netlink.port_bypass is enabled (it defaults to
+        * FALSE and is read on every call), per-socket policies otherwise.
+        * Returns the result of the respective helper. */
+       if (lib->settings->get_bool(lib->settings,
+                                       "%s.plugins.kernel-netlink.port_bypass", FALSE, lib->ns))
+       {
+               return add_port_bypass(this, fd, family);
+       }
+       return add_socket_bypass(this, fd, family);
+}
+
+METHOD(kernel_ipsec_t, enable_udp_decap, bool,
+       private_kernel_netlink_ipsec_t *this, int fd, int family, u_int16_t port)
+{      /* Sets the UDP_ENCAP socket option of fd to UDP_ENCAP_ESPINUDP, family
+        * and port are not used. Returns FALSE if setsockopt() fails, TRUE
+        * otherwise. */
+       int type = UDP_ENCAP_ESPINUDP;
+
+       if (setsockopt(fd, SOL_UDP, UDP_ENCAP, &type, sizeof(type)) < 0)
+       {
+               DBG1(DBG_KNL, "unable to set UDP_ENCAP: %s", strerror(errno));
+               return FALSE;
+       }
+       return TRUE;
+}
+
+METHOD(kernel_ipsec_t, destroy, void,
+       private_kernel_netlink_ipsec_t *this)
+{      /* Releases everything owned by this instance and frees it. It is also
+        * called by kernel_netlink_ipsec_create() on its failure paths, so the
+        * event socket is only closed if it was opened (> 0) and socket_xfrm
+        * is released with DESTROY_IF. */
+       enumerator_t *enumerator;
+       policy_entry_t *policy;
+
+       array_destroy_function(this->bypass,
+                                                  (array_callback_t)remove_port_bypass, this);
+       if (this->socket_xfrm_events > 0)
+       {       /* unregister from the watcher before closing the descriptor */
+               lib->watcher->remove(lib->watcher, this->socket_xfrm_events);
+               close(this->socket_xfrm_events);
+       }
+       DESTROY_IF(this->socket_xfrm); /* after the bypass removal that uses it */
+       enumerator = this->policies->create_enumerator(this->policies);
+       while (enumerator->enumerate(enumerator, &policy, &policy))
+       {       /* key and value are the same object, see add_policy() */
+               policy_entry_destroy(this, policy);
+       }
+       enumerator->destroy(enumerator);
+       this->policies->destroy(this->policies);
+       this->sas->destroy(this->sas);
+       this->mutex->destroy(this->mutex);
+       free(this);
+}
+
+/*
+ * Described in header.
+ *
+ * Reads its settings from the lib->ns namespace, tries to write the
+ * xfrm_acq_expires setting to /proc/sys/net/core/xfrm_acq_expires (skipped
+ * silently if the file can't be opened), creates the XFRM netlink socket and,
+ * unless lib->ns is "starter", an XFRM event socket that is registered with
+ * lib->watcher. Returns NULL if one of the sockets can't be set up.
+ */
+kernel_netlink_ipsec_t *kernel_netlink_ipsec_create()
+{
+       private_kernel_netlink_ipsec_t *this;
+       bool register_for_events = TRUE;
+       FILE *f;
+
+       INIT(this,
+               .public = {
+                       .interface = {
+                               .get_features = _get_features,
+                               .get_spi = _get_spi,
+                               .get_cpi = _get_cpi,
+                               .add_sa  = _add_sa,
+                               .update_sa = _update_sa,
+                               .query_sa = _query_sa,
+                               .del_sa = _del_sa,
+                               .flush_sas = _flush_sas,
+                               .add_policy = _add_policy,
+                               .query_policy = _query_policy,
+                               .del_policy = _del_policy,
+                               .flush_policies = _flush_policies,
+                               .bypass_socket = _bypass_socket,
+                               .enable_udp_decap = _enable_udp_decap,
+                               .destroy = _destroy,
+                       },
+               },
+               .policies = hashtable_create((hashtable_hash_t)policy_hash,
+                                                                        (hashtable_equals_t)policy_equals, 32),
+               .sas = hashtable_create((hashtable_hash_t)ipsec_sa_hash,
+                                                               (hashtable_equals_t)ipsec_sa_equals, 32),
+               .bypass = array_create(sizeof(bypass_t), 0),
+               .mutex = mutex_create(MUTEX_TYPE_DEFAULT),
+               .policy_update = lib->settings->get_bool(lib->settings,
+                                       "%s.plugins.kernel-netlink.policy_update", FALSE, lib->ns),
+               .install_routes = lib->settings->get_bool(lib->settings,
+                                                       "%s.install_routes", TRUE, lib->ns),
+               .proto_port_transport = lib->settings->get_bool(lib->settings,
+                                               "%s.plugins.kernel-netlink.set_proto_port_transport_sa",
+                                               FALSE, lib->ns),
+       );
+
+       if (streq(lib->ns, "starter"))
+       {       /* starter has no threads, so we do not register for kernel events */
+               register_for_events = FALSE;
+       }
+
+       f = fopen("/proc/sys/net/core/xfrm_acq_expires", "w");
+       if (f)
+       {       /* best effort, the result of the write is not checked */
+               fprintf(f, "%u", lib->settings->get_int(lib->settings,
+                                                               "%s.plugins.kernel-netlink.xfrm_acq_expires",
+                                                               DEFAULT_ACQUIRE_LIFETIME, lib->ns));
+               fclose(f);
+       }
+
+       this->socket_xfrm = netlink_socket_create(NETLINK_XFRM, xfrm_msg_names,
+                               lib->settings->get_bool(lib->settings,
+                                       "%s.plugins.kernel-netlink.parallel_xfrm", FALSE, lib->ns));
+       if (!this->socket_xfrm)
+       {       /* destroy() copes with the partially initialized instance */
+               destroy(this);
+               return NULL;
+       }
+
+       if (register_for_events)
+       {
+               struct sockaddr_nl addr;
+
+               memset(&addr, 0, sizeof(addr));
+               addr.nl_family = AF_NETLINK;
+
+               /* create and bind XFRM socket for ACQUIRE, EXPIRE, MIGRATE & MAPPING */
+               this->socket_xfrm_events = socket(AF_NETLINK, SOCK_RAW, NETLINK_XFRM);
+               if (this->socket_xfrm_events <= 0)
+               {       /* NOTE(review): socket() returns -1 on failure, so 0 would
+                        * be a valid descriptor; it is treated as an error here and
+                        * destroy() only closes descriptors > 0 */
+                       DBG1(DBG_KNL, "unable to create XFRM event socket");
+                       destroy(this);
+                       return NULL;
+               }
+               addr.nl_groups = XFRMNLGRP(ACQUIRE) | XFRMNLGRP(EXPIRE) |
+                                                XFRMNLGRP(MIGRATE) | XFRMNLGRP(MAPPING);
+               if (bind(this->socket_xfrm_events, (struct sockaddr*)&addr, sizeof(addr)))
+               {
+                       DBG1(DBG_KNL, "unable to bind XFRM event socket");
+                       destroy(this);
+                       return NULL;
+               }
+               lib->watcher->add(lib->watcher, this->socket_xfrm_events, WATCHER_READ,
+                                                 (watcher_cb_t)receive_events, this);
+       }
+
+       return &this->public;
+}
diff --git a/src/libcharon/plugins/kernel_netlink/kernel_netlink_ipsec.h b/src/libcharon/plugins/kernel_netlink/kernel_netlink_ipsec.h
new file mode 100644 (file)
index 0000000..3a45cce
--- /dev/null
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2008 Tobias Brunner
+ * Hochschule fuer Technik Rapperswil
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+/**
+ * @defgroup kernel_netlink_ipsec_i kernel_netlink_ipsec
+ * @{ @ingroup kernel_netlink
+ */
+
+#ifndef KERNEL_NETLINK_IPSEC_H_
+#define KERNEL_NETLINK_IPSEC_H_
+
+#include <kernel/kernel_ipsec.h>
+
+typedef struct kernel_netlink_ipsec_t kernel_netlink_ipsec_t;
+
+/**
+ * Implementation of the kernel ipsec interface using Netlink.
+ *
+ * The public part only consists of the generic kernel_ipsec_t interface,
+ * instances are created with kernel_netlink_ipsec_create().
+ */
+struct kernel_netlink_ipsec_t {
+
+       /**
+        * Implements kernel_ipsec_t interface
+        */
+       kernel_ipsec_t interface;
+};
+
+/**
+ * Create a netlink kernel ipsec interface instance.
+ *
+ * @return                     kernel_netlink_ipsec_t instance, NULL if the
+ *                             required netlink sockets could not be set up
+ */
+kernel_netlink_ipsec_t *kernel_netlink_ipsec_create();
+
+#endif /** KERNEL_NETLINK_IPSEC_H_ @}*/
diff --git a/src/libcharon/plugins/kernel_netlink/kernel_netlink_net.c b/src/libcharon/plugins/kernel_netlink/kernel_netlink_net.c
new file mode 100644 (file)
index 0000000..4e5e02d
--- /dev/null
@@ -0,0 +1,2685 @@
+/*
+ * Copyright (C) 2008-2014 Tobias Brunner
+ * Copyright (C) 2005-2008 Martin Willi
+ * Hochschule fuer Technik Rapperswil
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+/*
+ * Copyright (C) 2010 secunet Security Networks AG
+ * Copyright (C) 2010 Thomas Egerer
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <sys/socket.h>
+#include <sys/utsname.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <unistd.h>
+#include <errno.h>
+#include <net/if.h>
+#ifdef HAVE_LINUX_FIB_RULES_H
+#include <linux/fib_rules.h>
+#endif
+
+#include "kernel_netlink_net.h"
+#include "kernel_netlink_shared.h"
+
+#include <hydra.h>
+#include <utils/debug.h>
+#include <threading/mutex.h>
+#include <threading/rwlock.h>
+#include <threading/rwlock_condvar.h>
+#include <threading/spinlock.h>
+#include <collections/hashtable.h>
+#include <collections/linked_list.h>
+#include <processing/jobs/callback_job.h>
+
+/** delay before firing roam events (ms) */
+#define ROAM_DELAY 100
+
+/** delay before reinstalling routes (ms) */
+#define ROUTE_DELAY 100
+
+/** maximum recursion when searching for addresses in get_route() */
+#define MAX_ROUTE_RECURSION 2
+
+/** fallback value used if ROUTING_TABLE was not defined before this point */
+#ifndef ROUTING_TABLE
+#define ROUTING_TABLE 0
+#endif
+
+/** fallback value used if ROUTING_TABLE_PRIO was not defined before this point */
+#ifndef ROUTING_TABLE_PRIO
+#define ROUTING_TABLE_PRIO 0
+#endif
+
+/**
+ * Names of the rtnetlink message types RTM_NEWLINK (16) to RTM_GETRULE (34).
+ *
+ * The list has to provide one string for every value in that range, in
+ * ascending order.  The values 23, 27 and 31 are not assigned to any of the
+ * message types named here, so each of them gets a numeric placeholder;
+ * without them the names that follow would be attributed to the wrong type.
+ */
+ENUM(rt_msg_names, RTM_NEWLINK, RTM_GETRULE,
+       "RTM_NEWLINK",
+       "RTM_DELLINK",
+       "RTM_GETLINK",
+       "RTM_SETLINK",
+       "RTM_NEWADDR",
+       "RTM_DELADDR",
+       "RTM_GETADDR",
+       "23",
+       "RTM_NEWROUTE",
+       "RTM_DELROUTE",
+       "RTM_GETROUTE",
+       "27",
+       "RTM_NEWNEIGH",
+       "RTM_DELNEIGH",
+       "RTM_GETNEIGH",
+       "31",
+       "RTM_NEWRULE",
+       "RTM_DELRULE",
+       "RTM_GETRULE",
+);
+
+typedef struct addr_entry_t addr_entry_t;
+
+/**
+ * IP address in an iface_entry_t
+ *
+ * The entry owns its host_t, which is destroyed by addr_entry_destroy().
+ */
+struct addr_entry_t {
+
+       /** the ip address */
+       host_t *ip;
+
+       /** address flags (tested against IFA_F_* bits when comparing addresses) */
+       u_char flags;
+
+       /** scope of the address */
+       u_char scope;
+
+       /** number of times this IP is used, if virtual (i.e. managed by us);
+        *  a non-zero value is what marks an entry as virtual */
+       u_int refcount;
+
+       /** TRUE once it is installed, if virtual */
+       bool installed;
+};
+
+/**
+ * release an addr_entry_t together with the host_t it holds
+ */
+static void addr_entry_destroy(addr_entry_t *this)
+{
+       host_t *ip = this->ip;
+
+       ip->destroy(ip);
+       free(this);
+}
+
+typedef struct iface_entry_t iface_entry_t;
+
+/**
+ * A network interface on this system, containing addr_entry_t's
+ *
+ * The entry owns its address list, see iface_entry_destroy().
+ */
+struct iface_entry_t {
+
+       /** interface index */
+       int ifindex;
+
+       /** name of the interface (always NUL-terminated, at most IFNAMSIZ-1 chars) */
+       char ifname[IFNAMSIZ];
+
+       /** interface flags, as in netdevice(7) SIOCGIFFLAGS */
+       u_int flags;
+
+       /** list of addresses on this interface (addr_entry_t) */
+       linked_list_t *addrs;
+
+       /** TRUE if usable by config */
+       bool usable;
+};
+
+/**
+ * free an interface entry, destroying every addr_entry_t in its address list
+ */
+static void iface_entry_destroy(iface_entry_t *this)
+{
+       linked_list_t *addrs = this->addrs;
+
+       addrs->destroy_function(addrs, (void*)addr_entry_destroy);
+       free(this);
+}
+
+/**
+ * match callback: TRUE if the entry has the interface index ifindex points to
+ */
+static bool iface_entry_by_index(iface_entry_t *this, int *ifindex)
+{
+       int wanted = *ifindex;
+
+       return wanted == this->ifindex;
+}
+
+/**
+ * match callback: TRUE if the entry carries the given interface name
+ */
+static bool iface_entry_by_name(iface_entry_t *this, char *ifname)
+{
+       char *own = this->ifname;
+
+       return streq(own, ifname);
+}
+
+/**
+ * TRUE if the IFF_UP flag is set on the interface
+ */
+static inline bool iface_entry_up(iface_entry_t *iface)
+{
+       u_int up = iface->flags & IFF_UP;
+
+       return up == IFF_UP;
+}
+
+/**
+ * TRUE if the interface is marked usable and is also up
+ */
+static inline bool iface_entry_up_and_usable(iface_entry_t *iface)
+{
+       if (!iface->usable)
+       {
+               return FALSE;
+       }
+       return iface_entry_up(iface);
+}
+
+typedef struct addr_map_entry_t addr_map_entry_t;
+
+/**
+ * Entry that maps an IP address to an interface entry
+ *
+ * None of the pointers are owned by the map entry; removing an entry only
+ * frees the entry itself (see addr_map_entry_remove()).
+ */
+struct addr_map_entry_t {
+       /** The IP address (same host_t as addr->ip when added via
+        *  addr_map_entry_add()) */
+       host_t *ip;
+
+       /** The address entry for this IP address */
+       addr_entry_t *addr;
+
+       /** The interface this address is installed on */
+       iface_entry_t *iface;
+};
+
+/**
+ * Hash callback for addr_map_entry_t: only the raw IP address is hashed, so
+ * entries for the same address end up in the same bucket regardless of the
+ * interface
+ */
+static u_int addr_map_entry_hash(addr_map_entry_t *this)
+{
+       host_t *ip = this->ip;
+
+       return chunk_hash(ip->get_address(ip));
+}
+
+/**
+ * Equality callback for addr_map_entry_t: entries are equal if they refer to
+ * the same interface index and the same IP address
+ */
+static bool addr_map_entry_equals(addr_map_entry_t *a, addr_map_entry_t *b)
+{
+       if (a->iface->ifindex != b->iface->ifindex)
+       {
+               return FALSE;
+       }
+       return a->ip->ip_equals(a->ip, b->ip);
+}
+
+/**
+ * Matcher for get_match: accepts the stored entry b if its interface is up
+ * and usable and its address equals the one in the lookup entry a
+ */
+static bool addr_map_entry_match_up_and_usable(addr_map_entry_t *a,
+                                                                                          addr_map_entry_t *b)
+{
+       if (!iface_entry_up_and_usable(b->iface))
+       {
+               return FALSE;
+       }
+       return a->ip->ip_equals(a->ip, b->ip);
+}
+
+/**
+ * Matcher for get_match: accepts the stored entry b if its interface is up
+ * (usable or not) and its address equals the one in the lookup entry a
+ */
+static bool addr_map_entry_match_up(addr_map_entry_t *a, addr_map_entry_t *b)
+{
+       if (!iface_entry_up(b->iface))
+       {
+               return FALSE;
+       }
+       return a->ip->ip_equals(a->ip, b->ip);
+}
+
+/**
+ * Matcher for get_match: accepts any stored entry whose address equals the
+ * one in the lookup entry, whatever the state of its interface
+ */
+static bool addr_map_entry_match(addr_map_entry_t *a, addr_map_entry_t *b)
+{
+       host_t *wanted = a->ip;
+
+       return wanted->ip_equals(wanted, b->ip);
+}
+
+typedef struct route_entry_t route_entry_t;
+
+/**
+ * Installed routing entry
+ *
+ * All pointer members and the dst_net chunk are owned by the entry and are
+ * released by route_entry_destroy().
+ */
+struct route_entry_t {
+       /** Name of the interface the route is bound to */
+       char *if_name;
+
+       /** Source ip of the route */
+       host_t *src_ip;
+
+       /** Gateway for this route (NULL if there is none) */
+       host_t *gateway;
+
+       /** Destination net */
+       chunk_t dst_net;
+
+       /** Destination net prefixlen */
+       u_int8_t prefixlen;
+};
+
+/**
+ * Clone a route_entry_t object.
+ *
+ * Every member is deep-copied, the result has to be released with
+ * route_entry_destroy().  Unset (NULL) if_name, src_ip and gateway members
+ * are carried over as NULL, matching what route_entry_destroy() and
+ * route_entry_equals() already tolerate.
+ *
+ * @param this         entry to copy
+ * @return                     newly allocated copy
+ */
+static route_entry_t *route_entry_clone(route_entry_t *this)
+{
+       route_entry_t *route;
+
+       INIT(route,
+               /* strdup() must not be called with NULL */
+               .if_name = this->if_name ? strdup(this->if_name) : NULL,
+               .src_ip = this->src_ip ? this->src_ip->clone(this->src_ip) : NULL,
+               .gateway = this->gateway ? this->gateway->clone(this->gateway) : NULL,
+               .dst_net = chunk_clone(this->dst_net),
+               .prefixlen = this->prefixlen,
+       );
+       return route;
+}
+
+/**
+ * Release a route_entry_t and everything it owns
+ */
+static void route_entry_destroy(route_entry_t *this)
+{
+       chunk_free(&this->dst_net);
+       DESTROY_IF(this->gateway);
+       DESTROY_IF(this->src_ip);
+       free(this->if_name);
+       free(this);
+}
+
+/**
+ * Hash callback for route_entry_t, based on destination net and prefix length
+ */
+static u_int route_entry_hash(route_entry_t *this)
+{
+       u_int net_hash = chunk_hash(this->dst_net);
+
+       return chunk_hash_inc(chunk_from_thing(this->prefixlen), net_hash);
+}
+
+/**
+ * Equality callback for route_entry_t
+ *
+ * Two entries are equal if interface name, source address, destination net,
+ * prefix length and gateway all match.  Entries without an interface name
+ * never compare equal; gateways match if both are unset or both are equal.
+ */
+static bool route_entry_equals(route_entry_t *a, route_entry_t *b)
+{
+       if (!a->if_name || !b->if_name || !streq(a->if_name, b->if_name))
+       {
+               return FALSE;
+       }
+       if (!a->src_ip->ip_equals(a->src_ip, b->src_ip))
+       {
+               return FALSE;
+       }
+       if (!chunk_equals(a->dst_net, b->dst_net) || a->prefixlen != b->prefixlen)
+       {
+               return FALSE;
+       }
+       if (!a->gateway && !b->gateway)
+       {
+               return TRUE;
+       }
+       if (a->gateway && b->gateway)
+       {
+               return a->gateway->ip_equals(a->gateway, b->gateway);
+       }
+       return FALSE;
+}
+
+typedef struct net_change_t net_change_t;
+
+/**
+ * Queued network changes
+ *
+ * Entries are keyed by interface name (see net_change_hash() and
+ * net_change_equals()).
+ */
+struct net_change_t {
+       /** Name of the interface that got activated (or an IP appeared on),
+        *  owned by this object */
+       char *if_name;
+};
+
+/**
+ * Release a net_change_t including the interface name it owns
+ */
+static void net_change_destroy(net_change_t *this)
+{
+       char *name = this->if_name;
+
+       free(name);
+       free(this);
+}
+
+/**
+ * Hash callback for net_change_t, hashes the interface name
+ */
+static u_int net_change_hash(net_change_t *this)
+{
+       chunk_t name;
+
+       name = chunk_create(this->if_name, strlen(this->if_name));
+       return chunk_hash(name);
+}
+
+/**
+ * Equality callback for net_change_t, compares the interface names
+ */
+static bool net_change_equals(net_change_t *a, net_change_t *b)
+{
+       char *first = a->if_name, *second = b->if_name;
+
+       return streq(first, second);
+}
+
+typedef struct private_kernel_netlink_net_t private_kernel_netlink_net_t;
+
+/**
+ * Private variables and functions of kernel_netlink_net class.
+ */
+struct private_kernel_netlink_net_t {
+       /**
+        * Public part of the kernel_netlink_net_t object.
+        */
+       kernel_netlink_net_t public;
+
+       /**
+        * lock to access various lists and maps (the interface list and the
+        * address/virtual IP maps are accessed with this lock held)
+        */
+       rwlock_t *lock;
+
+       /**
+        * condition variable to signal virtual IP add/removal
+        */
+       rwlock_condvar_t *condvar;
+
+       /**
+        * Cached list of interfaces and their addresses (iface_entry_t)
+        */
+       linked_list_t *ifaces;
+
+       /**
+        * Map for IP addresses to iface_entry_t objects (addr_map_entry_t)
+        */
+       hashtable_t *addrs;
+
+       /**
+        * Map for virtual IP addresses to iface_entry_t objects (addr_map_entry_t)
+        */
+       hashtable_t *vips;
+
+       /**
+        * netlink rt socket (routing)
+        */
+       netlink_socket_t *socket;
+
+       /**
+        * Netlink rt socket to receive address change events
+        */
+       int socket_events;
+
+       /**
+        * earliest time of the next roam event
+        */
+       timeval_t next_roam;
+
+       /**
+        * roam event due to address change
+        */
+       bool roam_address;
+
+       /**
+        * lock to check and update roam event time (also held while
+        * roam_address is read or modified)
+        */
+       spinlock_t *roam_lock;
+
+       /**
+        * routing table to install routes
+        */
+       int routing_table;
+
+       /**
+        * priority of used routing table
+        */
+       int routing_table_prio;
+
+       /**
+        * installed routes (route_entry_t)
+        */
+       hashtable_t *routes;
+
+       /**
+        * mutex for routes
+        */
+       mutex_t *routes_lock;
+
+       /**
+        * interface changes which may trigger route reinstallation (net_change_t)
+        */
+       hashtable_t *net_changes;
+
+       /**
+        * mutex for route reinstallation triggers
+        */
+       mutex_t *net_changes_lock;
+
+       /**
+        * time of last route reinstallation
+        */
+       timeval_t last_route_reinstall;
+
+       /**
+        * whether to react to RTM_NEWROUTE or RTM_DELROUTE events
+        */
+       bool process_route;
+
+       /**
+        * whether to trigger roam events
+        */
+       bool roam_events;
+
+       /**
+        * whether to actually install virtual IPs
+        */
+       bool install_virtual_ip;
+
+       /**
+        * the name of the interface virtual IP addresses are installed on
+        */
+       char *install_virtual_ip_on;
+
+       /**
+        * whether preferred source addresses can be specified for IPv6 routes
+        */
+       bool rta_prefsrc_for_ipv6;
+
+       /**
+        * whether marks can be used in route lookups
+        */
+       bool rta_mark;
+
+       /**
+        * the mark excluded from the routing rule used for virtual IPs
+        */
+       mark_t routing_mark;
+
+       /**
+        * whether to prefer temporary IPv6 addresses over public ones
+        */
+       bool prefer_temporary_addrs;
+
+       /**
+        * list with routing tables to be excluded from route lookup
+        */
+       linked_list_t *rt_exclude;
+
+       /**
+        * MTU to set on installed routes
+        */
+       u_int32_t mtu;
+
+       /**
+        * MSS to set on installed routes
+        */
+       u_int32_t mss;
+};
+
+/**
+ * Forward declaration, needed because reinstall_routes() calls this function
+ * before its definition.
+ *
+ * reinstall_routes() passes RTM_NEWROUTE as nlmsg_type, NLM_F_CREATE |
+ * NLM_F_EXCL as flags and the members of a route_entry_t for the remaining
+ * arguments.
+ */
+static status_t manage_srcroute(private_kernel_netlink_net_t *this,
+                                                               int nlmsg_type, int flags, chunk_t dst_net,
+                                                               u_int8_t prefixlen, host_t *gateway,
+                                                               host_t *src_ip, char *if_name);
+
+/**
+ * Drop and destroy all queued network changes.
+ */
+static void net_changes_clear(private_kernel_netlink_net_t *this)
+{
+       hashtable_t *queued = this->net_changes;
+       enumerator_t *changes;
+       net_change_t *current;
+
+       changes = queued->create_enumerator(queued);
+       while (changes->enumerate(changes, NULL, (void**)&current))
+       {
+               queued->remove_at(queued, changes);
+               net_change_destroy(current);
+       }
+       changes->destroy(changes);
+}
+
+/**
+ * Job callback that acts upon the queued network changes.
+ *
+ * Every installed route is reinstalled if a change is queued for its outgoing
+ * interface, or for the interface its source address is found on.  The queue
+ * is emptied afterwards and the job is not requeued.
+ */
+static job_requeue_t reinstall_routes(private_kernel_netlink_net_t *this)
+{
+       hashtable_t *queued = this->net_changes;
+       enumerator_t *routes;
+       route_entry_t *installed;
+
+       this->net_changes_lock->lock(this->net_changes_lock);
+       this->routes_lock->lock(this->routes_lock);
+
+       routes = this->routes->create_enumerator(this->routes);
+       while (routes->enumerate(routes, NULL, (void**)&installed))
+       {
+               net_change_t *hit, key = {
+                       .if_name = installed->if_name,
+               };
+
+               /* is a change queued for the outgoing interface? */
+               hit = queued->get(queued, &key);
+               if (!hit &&
+                       this->public.interface.get_interface(&this->public.interface,
+                                                                       installed->src_ip, &key.if_name))
+               {       /* src_ip might be on a different interface, check that too */
+                       if (!streq(key.if_name, installed->if_name))
+                       {
+                               hit = queued->get(queued, &key);
+                       }
+                       free(key.if_name);
+               }
+               if (!hit)
+               {
+                       continue;
+               }
+               manage_srcroute(this, RTM_NEWROUTE, NLM_F_CREATE | NLM_F_EXCL,
+                                               installed->dst_net, installed->prefixlen,
+                                               installed->gateway, installed->src_ip,
+                                               installed->if_name);
+       }
+       routes->destroy(routes);
+       this->routes_lock->unlock(this->routes_lock);
+
+       net_changes_clear(this);
+       this->net_changes_lock->unlock(this->net_changes_lock);
+       return JOB_REQUEUE_NONE;
+}
+
+/**
+ * Queue a route reinstallation triggered by a change on the given interface.
+ *
+ * To avoid reinstalling routes too often, the actual work is deferred by
+ * ROUTE_DELAY ms and calls made during that delay share a single job.
+ * Ownership of if_name passes to this function, it is freed eventually.
+ */
+static void queue_route_reinstall(private_kernel_netlink_net_t *this,
+                                                                 char *if_name)
+{
+       hashtable_t *queue = this->net_changes;
+       net_change_t *change, *replaced;
+       timeval_t current;
+       job_t *job;
+
+       INIT(change,
+               .if_name = if_name
+       );
+
+       this->net_changes_lock->lock(this->net_changes_lock);
+       replaced = queue->put(queue, change, change);
+       if (replaced)
+       {       /* an older entry for the same interface was queued already */
+               net_change_destroy(replaced);
+       }
+       time_monotonic(&current);
+       if (timercmp(&current, &this->last_route_reinstall, >))
+       {
+               timeval_add_ms(&current, ROUTE_DELAY);
+               this->last_route_reinstall = current;
+
+               job = (job_t*)callback_job_create((callback_job_cb_t)reinstall_routes,
+                                                                                 this, NULL, NULL);
+               lib->scheduler->schedule_job_ms(lib->scheduler, job, ROUTE_DELAY);
+       }
+       this->net_changes_lock->unlock(this->net_changes_lock);
+}
+
+/**
+ * check whether the given virtual IP is installed, or not known anymore
+ *
+ * the matching entry is returned in *entry.  if no entry is found for the IP
+ * (the virtual IP disappeared), *entry is set to NULL and TRUE is returned.
+ *
+ * this->lock must be held when calling this function
+ */
+static bool is_vip_installed_or_gone(private_kernel_netlink_net_t *this,
+                                                                        host_t *ip, addr_map_entry_t **entry)
+{
+       addr_map_entry_t *vip, key = {
+               .ip = ip,
+       };
+
+       vip = this->vips->get_match(this->vips, &key,
+                                                               (void*)addr_map_entry_match);
+       *entry = vip;
+       return vip ? vip->addr->installed : TRUE;
+}
+
+/**
+ * check whether the virtual IP map contains an entry for the given IP
+ *
+ * this->lock must be held when calling this function
+ */
+static bool is_known_vip(private_kernel_netlink_net_t *this, host_t *ip)
+{
+       addr_map_entry_t *known, key = {
+               .ip = ip,
+       };
+
+       known = this->vips->get_match(this->vips, &key,
+                                                                 (void*)addr_map_entry_match);
+       return known != NULL;
+}
+
+/**
+ * Insert an entry for the given address/interface pair into an address map,
+ * freeing an entry this replaces, if any
+ */
+static void addr_map_entry_add(hashtable_t *map, addr_entry_t *addr,
+                                                          iface_entry_t *iface)
+{
+       addr_map_entry_t *fresh, *replaced;
+
+       INIT(fresh,
+               .ip = addr->ip,
+               .addr = addr,
+               .iface = iface,
+       );
+       replaced = map->put(map, fresh, fresh);
+       free(replaced);
+}
+
+/**
+ * Take the entry for the given address/interface pair out of an address map
+ * and free it
+ */
+static void addr_map_entry_remove(hashtable_t *map, addr_entry_t *addr,
+                                                                 iface_entry_t *iface)
+{
+       addr_map_entry_t key = {
+               .ip = addr->ip,
+               .addr = addr,
+               .iface = iface,
+       };
+       addr_map_entry_t *removed;
+
+       removed = map->remove(map, &key);
+       free(removed);
+}
+
+/**
+ * Classify a unicast IP address by scope.  Note that this differs from the
+ * values found in rtm_scope/ifa_scope.
+ *
+ * The returned values are those of RFC 6724 (which refers to RFC 4291):
+ * 2 for link-local (including loopback), 5 for site-local, 14 for global.
+ */
+static u_char get_scope(host_t *ip)
+{
+       chunk_t raw = ip->get_address(ip);
+
+       if (raw.len == 4)
+       {       /* IPv4 is mapped as defined in RFC 6724, 3.2: loopback (127/8) is
+                * treated like the IPv6 loopback address, 169.254/16 is link-local */
+               if (raw.ptr[0] == 127 ||
+                       (raw.ptr[0] == 169 && raw.ptr[1] == 254))
+               {
+                       return 2;
+               }
+       }
+       else if (raw.len == 16)
+       {
+               struct in6_addr *v6 = (struct in6_addr*)raw.ptr;
+
+               /* loopback counts as link-local according to RFC 4291, 2.5.3 */
+               if (IN6_IS_ADDR_LOOPBACK(v6) || IN6_IS_ADDR_LINKLOCAL(v6))
+               {
+                       return 2;
+               }
+               /* site-local is deprecated according to RFC 4291, 2.5.7 */
+               if (IN6_IS_ADDR_SITELOCAL(v6))
+               {
+                       return 5;
+               }
+       }
+       /* everything else is global */
+       return 14;
+}
+
+/**
+ * Returns the length of the common prefix in bits up to the length of a's
+ * prefix, defined by RFC 6724 as the portion of the address not including the
+ * interface ID, which is 64-bit for most unicast addresses (see RFC 4291).
+ *
+ * At most the first 8 bytes are compared.  If one of the addresses is
+ * shorter than that (e.g. an IPv4 address), only the bytes available in both
+ * addresses are compared, so no memory beyond either address is read.
+ *
+ * @param a                    first address
+ * @param b                    second address
+ * @return                     number of leading bits the addresses share (0-64)
+ */
+static u_char common_prefix(host_t *a, host_t *b)
+{
+       chunk_t aa, ba;
+       u_char byte, bits = 0, match;
+       size_t len = 8;
+
+       aa = a->get_address(a);
+       ba = b->get_address(b);
+       if (aa.len < len)
+       {
+               len = aa.len;
+       }
+       if (ba.len < len)
+       {
+               len = ba.len;
+       }
+       for (byte = 0; byte < len; byte++)
+       {
+               if (aa.ptr[byte] != ba.ptr[byte])
+               {
+                       /* count the leading bits that are equal in the first byte that
+                        * differs: start at 8 and subtract one for every bit position
+                        * up to and including the most significant differing bit */
+                       match = aa.ptr[byte] ^ ba.ptr[byte];
+                       for (bits = 8; match; match >>= 1)
+                       {
+                               bits--;
+                       }
+                       break;
+               }
+       }
+       return byte * 8 + bits;
+}
+
+/**
+ * Decide which of two source addresses is the better choice to reach a
+ * destination.  Returns TRUE if b should be preferred over a, FALSE if a
+ * (the address found earlier) should be kept.
+ * For IPv6 this approximately follows RFC 6724.
+ */
+static bool is_address_better(private_kernel_netlink_net_t *this,
+                                                         addr_entry_t *a, addr_entry_t *b, host_t *d)
+{
+       u_char scope_a, scope_b, scope_d, prefix_a, prefix_b;
+       int a_deprecated, b_deprecated, a_temporary, b_temporary;
+
+       /* rule 2: prefer appropriate scope */
+       if (d)
+       {
+               scope_a = get_scope(a->ip);
+               scope_b = get_scope(b->ip);
+               scope_d = get_scope(d);
+               if (scope_a < scope_b)
+               {
+                       return scope_a < scope_d;
+               }
+               if (scope_b < scope_a)
+               {
+                       return scope_b >= scope_d;
+               }
+       }
+       /* nothing more to compare for IPv4, keep the address found earlier */
+       if (a->ip->get_family(a->ip) == AF_INET)
+       {
+               return FALSE;
+       }
+       /* rule 3: avoid deprecated addresses (RFC 4862) */
+       a_deprecated = a->flags & IFA_F_DEPRECATED;
+       b_deprecated = b->flags & IFA_F_DEPRECATED;
+       if (a_deprecated != b_deprecated)
+       {
+               return a_deprecated;
+       }
+       /* rule 4 can't be applied, we don't know whether an address is a home
+        * or a care-of address.
+        * rule 5 is irrelevant, only addresses of one interface are compared.
+        * rule 6 would need an (optionally configurable) policy table to match
+        * configurable labels.
+        */
+       /* rule 7: prefer temporary addresses (WE REVERSE THIS BY DEFAULT!) */
+       a_temporary = a->flags & IFA_F_TEMPORARY;
+       b_temporary = b->flags & IFA_F_TEMPORARY;
+       if (a_temporary != b_temporary)
+       {
+               return this->prefer_temporary_addrs ? b_temporary : a_temporary;
+       }
+       /* rule 8: use longest matching prefix, which needs a destination */
+       if (!d)
+       {
+               return FALSE;
+       }
+       prefix_a = common_prefix(a->ip, d);
+       prefix_b = common_prefix(b->ip, d);
+       /* on a tie the address found earlier is kept */
+       return prefix_b > prefix_a;
+}
+
+/**
+ * Pick a non-virtual IP address of the given family on an interface.
+ *
+ * If a candidate is given and found on the interface, it is returned.
+ * Otherwise the best address according to is_address_better() is used.
+ * Interfaces that are unknown or not usable yield NULL.
+ * The returned host is a clone that the caller has to destroy.
+ *
+ * this->lock must be held when calling this function.
+ */
+static host_t *get_interface_address(private_kernel_netlink_net_t *this,
+                                                                        int ifindex, int family, host_t *dest,
+                                                                        host_t *candidate)
+{
+       iface_entry_t *iface;
+       enumerator_t *enumerator;
+       addr_entry_t *current, *chosen = NULL;
+
+       if (this->ifaces->find_first(this->ifaces, (void*)iface_entry_by_index,
+                                                                (void**)&iface, &ifindex) != SUCCESS ||
+               !iface->usable)
+       {       /* unknown interface, or one excluded by config */
+               return NULL;
+       }
+       enumerator = iface->addrs->create_enumerator(iface->addrs);
+       while (enumerator->enumerate(enumerator, &current))
+       {
+               if (current->refcount ||
+                       current->ip->get_family(current->ip) != family)
+               {       /* skip virtual IPs and addresses of the wrong family */
+                       continue;
+               }
+               if (candidate && candidate->ip_equals(candidate, current->ip))
+               {       /* the candidate always wins */
+                       chosen = current;
+                       break;
+               }
+               if (!chosen || is_address_better(this, chosen, current, dest))
+               {
+                       chosen = current;
+               }
+       }
+       enumerator->destroy(enumerator);
+
+       return chosen ? chosen->ip->clone(chosen->ip) : NULL;
+}
+
+/**
+ * job callback that raises the delayed roam event
+ *
+ * the pending address-change flag is fetched and reset under roam_lock, the
+ * event itself is raised after releasing the lock.
+ */
+static job_requeue_t roam_event(private_kernel_netlink_net_t *this)
+{
+       spinlock_t *guard = this->roam_lock;
+       bool address_changed;
+
+       guard->lock(guard);
+       address_changed = this->roam_address;
+       this->roam_address = FALSE;
+       guard->unlock(guard);
+
+       hydra->kernel_interface->roam(hydra->kernel_interface, address_changed);
+       return JOB_REQUEUE_NONE;
+}
+
+/**
+ * schedule a roaming event. to avoid creating too many events, the event is
+ * delayed by ROAM_DELAY ms and calls made before the previously scheduled
+ * event is due only record the address flag.
+ */
+static void fire_roam_event(private_kernel_netlink_net_t *this, bool address)
+{
+       timeval_t current;
+       job_t *job;
+       bool due;
+
+       if (!this->roam_events)
+       {
+               return;
+       }
+
+       time_monotonic(&current);
+       this->roam_lock->lock(this->roam_lock);
+       this->roam_address |= address;
+       due = timercmp(&current, &this->next_roam, >);
+       if (due)
+       {
+               timeval_add_ms(&current, ROAM_DELAY);
+               this->next_roam = current;
+       }
+       this->roam_lock->unlock(this->roam_lock);
+
+       if (!due)
+       {       /* an event is pending already, it will pick up the flag */
+               return;
+       }
+       job = (job_t*)callback_job_create((callback_job_cb_t)roam_event,
+                                                                         this, NULL, NULL);
+       lib->scheduler->schedule_job_ms(lib->scheduler, job, ROAM_DELAY);
+}
+
+/**
+ * check if the interface with the given index is known, up and usable
+ *
+ * this->lock must be locked when calling this function
+ */
+static bool is_interface_up_and_usable(private_kernel_netlink_net_t *this,
+                                                                          int index)
+{
+       iface_entry_t *found;
+
+       if (this->ifaces->find_first(this->ifaces, (void*)iface_entry_by_index,
+                                                                (void**)&found, &index) != SUCCESS)
+       {       /* no such interface */
+               return FALSE;
+       }
+       return iface_entry_up_and_usable(found);
+}
+
+/**
+ * remove the given addr_entry_t from the hashtable it is stored in
+ *
+ * virtual IPs (refcount set) are removed from the vips map and waiters on
+ * the condvar are notified, all other addresses are removed from addrs.
+ *
+ * this->lock must be locked when calling this function
+ */
+static void addr_entry_unregister(addr_entry_t *addr, iface_entry_t *iface,
+                                                                 private_kernel_netlink_net_t *this)
+{
+       if (!addr->refcount)
+       {
+               addr_map_entry_remove(this->addrs, addr, iface);
+               return;
+       }
+       addr_map_entry_remove(this->vips, addr, iface);
+       this->condvar->broadcast(this->condvar);
+}
+
+/**
+ * process RTM_NEWLINK/RTM_DELLINK from kernel
+ *
+ * RTM_NEWLINK creates the cached iface_entry_t for the interface index if
+ * there is none yet and updates its name and flags.  RTM_DELLINK removes the
+ * cached entry, unregisters its addresses from the address maps and destroys
+ * it.
+ *
+ * @param this         private instance
+ * @param hdr          netlink message carrying a struct ifinfomsg
+ * @param event                if TRUE, state changes of usable interfaces are
+ *                                     logged, a route reinstallation is queued if the
+ *                                     interface got activated and a roam event is fired;
+ *                                     if FALSE, only the cache is updated
+ */
+static void process_link(private_kernel_netlink_net_t *this,
+                                                struct nlmsghdr *hdr, bool event)
+{
+       struct ifinfomsg* msg = NLMSG_DATA(hdr);
+       struct rtattr *rta = IFLA_RTA(msg);
+       size_t rtasize = IFLA_PAYLOAD (hdr);
+       enumerator_t *enumerator;
+       iface_entry_t *current, *entry = NULL;
+       char *name = NULL;
+       bool update = FALSE, update_routes = FALSE;
+
+       /* extract the interface name, it points into the message itself */
+       while (RTA_OK(rta, rtasize))
+       {
+               switch (rta->rta_type)
+               {
+                       case IFLA_IFNAME:
+                               name = RTA_DATA(rta);
+                               break;
+               }
+               rta = RTA_NEXT(rta, rtasize);
+       }
+       if (!name)
+       {
+               name = "(unknown)";
+       }
+
+       this->lock->write_lock(this->lock);
+       switch (hdr->nlmsg_type)
+       {
+               case RTM_NEWLINK:
+               {
+                       if (this->ifaces->find_first(this->ifaces,
+                                                                       (void*)iface_entry_by_index, (void**)&entry,
+                                                                       &msg->ifi_index) != SUCCESS)
+                       {       /* interface not known yet, create a new cache entry */
+                               INIT(entry,
+                                       .ifindex = msg->ifi_index,
+                                       .addrs = linked_list_create(),
+                                       .usable = hydra->kernel_interface->is_interface_usable(
+                                                                                               hydra->kernel_interface, name),
+                               );
+                               this->ifaces->insert_last(this->ifaces, entry);
+                       }
+                       /* strncpy() does not terminate the string if name is too long */
+                       strncpy(entry->ifname, name, IFNAMSIZ);
+                       entry->ifname[IFNAMSIZ-1] = '\0';
+                       if (event && entry->usable)
+                       {       /* compare cached and new flags to detect IFF_UP transitions */
+                               if (!(entry->flags & IFF_UP) && (msg->ifi_flags & IFF_UP))
+                               {
+                                       update = update_routes = TRUE;
+                                       DBG1(DBG_KNL, "interface %s activated", name);
+                               }
+                               if ((entry->flags & IFF_UP) && !(msg->ifi_flags & IFF_UP))
+                               {
+                                       update = TRUE;
+                                       DBG1(DBG_KNL, "interface %s deactivated", name);
+                               }
+                       }
+                       entry->flags = msg->ifi_flags;
+                       break;
+               }
+               case RTM_DELLINK:
+               {
+                       enumerator = this->ifaces->create_enumerator(this->ifaces);
+                       while (enumerator->enumerate(enumerator, &current))
+                       {
+                               if (current->ifindex == msg->ifi_index)
+                               {
+                                       if (event && current->usable)
+                                       {
+                                               update = TRUE;
+                                               DBG1(DBG_KNL, "interface %s deleted", current->ifname);
+                                       }
+                                       /* TODO: move virtual IPs installed on this interface to
+                                        * another interface? */
+                                       this->ifaces->remove_at(this->ifaces, enumerator);
+                                       /* drop the map entries before the addresses are freed */
+                                       current->addrs->invoke_function(current->addrs,
+                                                               (void*)addr_entry_unregister, current, this);
+                                       iface_entry_destroy(current);
+                                       break;
+                               }
+                       }
+                       enumerator->destroy(enumerator);
+                       break;
+               }
+       }
+       this->lock->unlock(this->lock);
+
+       /* notifications are triggered only after this->lock has been released */
+       if (update_routes && event)
+       {       /* queue_route_reinstall() takes ownership of the copied name */
+               queue_route_reinstall(this, strdup(name));
+       }
+
+       if (update && event)
+       {
+               fire_roam_event(this, TRUE);
+       }
+}
+
+/**
+ * Process an RTM_NEWADDR/RTM_DELADDR message from the kernel.
+ *
+ * The address is taken from IFA_LOCAL if present, from IFA_ADDRESS otherwise.
+ * If no host object can be created from it, the message is ignored before the
+ * lock is taken. Otherwise the interface is looked up by ifa_index and, while
+ * holding the write lock:
+ *  - an address found in this->vips is marked as installed (RTM_NEWADDR) or
+ *    removed and destroyed (any other type); waiting threads are woken via
+ *    the condvar and the function returns without any roam event
+ *  - a known address in this->addrs is removed on RTM_DELADDR
+ *  - an unknown address is added to the interface on RTM_NEWADDR
+ *
+ * After releasing the lock, a route reinstall is queued and a roam event is
+ * fired only if event is TRUE, the interface is up (IFF_UP) and usable, and
+ * an address was actually added/removed.
+ *
+ * @param this         kernel netlink net instance
+ * @param hdr          netlink message header of the address message
+ * @param event        TRUE to log changes, queue route reinstalls and fire roam
+ *                     events (receive_events() passes TRUE)
+ */
+static void process_addr(private_kernel_netlink_net_t *this,
+                                                struct nlmsghdr *hdr, bool event)
+{
+       struct ifaddrmsg* msg = NLMSG_DATA(hdr);
+       struct rtattr *rta = IFA_RTA(msg);
+       size_t rtasize = IFA_PAYLOAD (hdr);
+       host_t *host = NULL;
+       iface_entry_t *iface;
+       chunk_t local = chunk_empty, address = chunk_empty;
+       char *route_ifname = NULL;
+       bool update = FALSE, found = FALSE, changed = FALSE;
+
+       while (RTA_OK(rta, rtasize))
+       {
+               switch (rta->rta_type)
+               {
+                       case IFA_LOCAL:
+                               local.ptr = RTA_DATA(rta);
+                               local.len = RTA_PAYLOAD(rta);
+                               break;
+                       case IFA_ADDRESS:
+                               address.ptr = RTA_DATA(rta);
+                               address.len = RTA_PAYLOAD(rta);
+                               break;
+               }
+               rta = RTA_NEXT(rta, rtasize);
+       }
+
+       /* For PPP interfaces, we need the IFA_LOCAL address,
+        * IFA_ADDRESS is the peer's address. But IFA_LOCAL is
+        * not included in all cases (IPv6?), so fallback to IFA_ADDRESS. */
+       if (local.ptr)
+       {
+               host = host_create_from_chunk(msg->ifa_family, local, 0);
+       }
+       else if (address.ptr)
+       {
+               host = host_create_from_chunk(msg->ifa_family, address, 0);
+       }
+
+       if (host == NULL)
+       {       /* bad family? (or neither attribute was present) */
+               return;
+       }
+
+       this->lock->write_lock(this->lock); /* the interface list and the address
+                                            * maps are modified below */
+       if (this->ifaces->find_first(this->ifaces, (void*)iface_entry_by_index,
+                                                                (void**)&iface, &msg->ifa_index) == SUCCESS)
+       {
+               addr_map_entry_t *entry, lookup = {
+                       .ip = host,
+                       .iface = iface,
+               };
+               addr_entry_t *addr;
+
+               entry = this->vips->get(this->vips, &lookup); /* virtual IPs first */
+               if (entry)
+               {
+                       if (hdr->nlmsg_type == RTM_NEWADDR)
+                       {       /* mark as installed and signal waiting threads */
+                               entry->addr->installed = TRUE;
+                       }
+                       else
+                       {       /* the address was already marked as uninstalled */
+                               addr = entry->addr;
+                               iface->addrs->remove(iface->addrs, addr, NULL);
+                               addr_map_entry_remove(this->vips, addr, iface);
+                               addr_entry_destroy(addr);
+                       }
+                       /* no roam events etc. for virtual IPs */
+                       this->condvar->broadcast(this->condvar);
+                       this->lock->unlock(this->lock);
+                       host->destroy(host);
+                       return;
+               }
+               entry = this->addrs->get(this->addrs, &lookup); /* regular addresses */
+               if (entry)
+               {
+                       if (hdr->nlmsg_type == RTM_DELADDR)
+                       {
+                               found = TRUE;
+                               addr = entry->addr;
+                               iface->addrs->remove(iface->addrs, addr, NULL);
+                               if (iface->usable)
+                               {
+                                       changed = TRUE;
+                                       DBG1(DBG_KNL, "%H disappeared from %s", host,
+                                                iface->ifname);
+                               }
+                               addr_map_entry_remove(this->addrs, addr, iface);
+                               addr_entry_destroy(addr);
+                       }
+               }
+               else
+               {
+                       if (hdr->nlmsg_type == RTM_NEWADDR)
+                       {
+                               found = TRUE;
+                               changed = TRUE;
+                               route_ifname = strdup(iface->ifname); /* copy, it is used after
+                                                                      * the lock is released */
+                               INIT(addr,
+                                       .ip = host->clone(host),
+                                       .flags = msg->ifa_flags,
+                                       .scope = msg->ifa_scope,
+                               );
+                               iface->addrs->insert_last(iface->addrs, addr);
+                               addr_map_entry_add(this->addrs, addr, iface);
+                               if (event && iface->usable)
+                               {
+                                       DBG1(DBG_KNL, "%H appeared on %s", host, iface->ifname);
+                               }
+                       }
+               }
+               if (found && (iface->flags & IFF_UP))
+               {       /* only changes on interfaces that are up trigger updates */
+                       update = TRUE;
+               }
+               if (!iface->usable)
+               {       /* ignore events for interfaces excluded by config */
+                       update = changed = FALSE;
+               }
+       }
+       this->lock->unlock(this->lock);
+
+       if (update && event && route_ifname)
+       {
+               queue_route_reinstall(this, route_ifname); /* NOTE(review): presumably
+                                                           * takes ownership of the name,
+                                                           * it is not freed on this path
+                                                           * - confirm */
+       }
+       else
+       {
+               free(route_ifname);
+       }
+       host->destroy(host);
+
+       /* send an update to all IKE_SAs */
+       if (update && event && changed)
+       {
+               fire_roam_event(this, TRUE);
+       }
+}
+
+/**
+ * Handle an RTM_NEWROUTE/RTM_DELROUTE message from the kernel.
+ *
+ * A roam event (without address change) is fired unless the route can be
+ * ignored, which is the case for routes in our own or the local routing table,
+ * cloned routes, routes via interfaces that are down or ignored, and routes
+ * for which no source address is found or whose source is one of our virtual
+ * IPs.
+ *
+ * @param this         kernel netlink net instance
+ * @param hdr          netlink message header of the route message
+ */
+static void process_route(private_kernel_netlink_net_t *this, struct nlmsghdr *hdr)
+{
+       struct rtmsg *rtm = NLMSG_DATA(hdr);
+       struct rtattr *attr = RTM_RTA(rtm);
+       size_t remaining = RTM_PAYLOAD(hdr);
+       u_int32_t oif = 0;
+       host_t *src = NULL;
+       bool ignore;
+
+       if (rtm->rtm_table && (rtm->rtm_table == this->routing_table ||
+                                                  rtm->rtm_table == RT_TABLE_LOCAL))
+       {       /* added by us or part of the local routing table (local addrs) */
+               return;
+       }
+       if (rtm->rtm_flags & RTM_F_CLONED)
+       {       /* cached routes, these seem to be created a lot for IPv6 */
+               return;
+       }
+
+       for (; RTA_OK(attr, remaining); attr = RTA_NEXT(attr, remaining))
+       {
+               if (attr->rta_type == RTA_PREFSRC)
+               {       /* the last preferred source address wins */
+                       DESTROY_IF(src);
+                       src = host_create_from_chunk(rtm->rtm_family,
+                                               chunk_create(RTA_DATA(attr), RTA_PAYLOAD(attr)), 0);
+               }
+               else if (attr->rta_type == RTA_OIF &&
+                                RTA_PAYLOAD(attr) == sizeof(oif))
+               {
+                       oif = *(u_int32_t*)RTA_DATA(attr);
+               }
+       }
+
+       this->lock->read_lock(this->lock);
+       /* route changes for interfaces that are ignored or down are irrelevant */
+       ignore = oif && !is_interface_up_and_usable(this, oif);
+       if (!ignore)
+       {
+               if (!src && oif)
+               {       /* no preferred source, use an address of the interface */
+                       src = get_interface_address(this, oif, rtm->rtm_family,
+                                                                               NULL, NULL);
+               }
+               /* routes without source or added for virtual IPs are irrelevant */
+               ignore = !src || is_known_vip(this, src);
+       }
+       this->lock->unlock(this->lock);
+
+       if (!ignore)
+       {
+               fire_roam_event(this, FALSE);
+       }
+       DESTROY_IF(src);
+}
+
+/**
+ * Watcher callback: read pending messages from the event socket and dispatch
+ * address, link and route notifications to the respective handlers.
+ *
+ * Messages not sent by the kernel (nl_pid != 0) are discarded. Route messages
+ * are only handled if this->process_route is set.
+ *
+ * @param this         kernel netlink net instance
+ * @param fd           not used, data is read from this->socket_events
+ * @param event        not used
+ * @return                     always TRUE
+ */
+static bool receive_events(private_kernel_netlink_net_t *this, int fd,
+                                                  watcher_event_t event)
+{
+       char buf[1536];
+       struct nlmsghdr *msg;
+       struct sockaddr_nl from;
+       socklen_t from_len = sizeof(from);
+       int received;
+
+       received = recvfrom(this->socket_events, buf, sizeof(buf), MSG_DONTWAIT,
+                                               (struct sockaddr*)&from, &from_len);
+       if (received < 0)
+       {
+               /* EINTR (interrupted) and EAGAIN (no data ready) are not errors,
+                * the caller just tries again */
+               if (errno != EINTR && errno != EAGAIN)
+               {
+                       DBG1(DBG_KNL, "unable to receive from rt event socket");
+                       sleep(1);
+               }
+               return TRUE;
+       }
+       if (from.nl_pid != 0)
+       {       /* not from kernel. not interested, try another one */
+               return TRUE;
+       }
+
+       for (msg = (struct nlmsghdr*)buf; NLMSG_OK(msg, received);
+                msg = NLMSG_NEXT(msg, received))
+       {
+               /* looks good so far, dispatch netlink message by its type */
+               if (msg->nlmsg_type == RTM_NEWADDR ||
+                       msg->nlmsg_type == RTM_DELADDR)
+               {
+                       process_addr(this, msg, TRUE);
+               }
+               else if (msg->nlmsg_type == RTM_NEWLINK ||
+                                msg->nlmsg_type == RTM_DELLINK)
+               {
+                       process_link(this, msg, TRUE);
+               }
+               else if (msg->nlmsg_type == RTM_NEWROUTE ||
+                                msg->nlmsg_type == RTM_DELROUTE)
+               {
+                       if (this->process_route)
+                       {
+                               process_route(this, msg);
+                       }
+               }
+       }
+       return TRUE;
+}
+
+/**
+ * Context data of the address enumerator, passed to the interface and address
+ * filters and released by address_enumerator_destroy().
+ */
+typedef struct {
+       private_kernel_netlink_net_t* this; /**< instance whose lock is released
+                                            *   when the context is destroyed */
+       /** which addresses to enumerate (ADDR_TYPE_* flags tested by the filters) */
+       kernel_address_type_t which;
+} address_enumerator_t;
+
+/**
+ * Destructor for the address enumerator context: releases the lock taken by
+ * create_address_enumerator() and frees the context.
+ */
+static void address_enumerator_destroy(address_enumerator_t *data)
+{
+       private_kernel_netlink_net_t *net = data->this;
+
+       net->lock->unlock(net->lock);
+       free(data);
+}
+
+/**
+ * Filter callback for the addresses of an interface.
+ *
+ * An address with a non-zero refcount counts as virtual (added by us), any
+ * other as regular. The address passes if its type was requested in
+ * data->which and its scope is below RT_SCOPE_LINK.
+ *
+ * @param data         enumerator context
+ * @param in           address entry to check
+ * @param out          receives the IP of the entry if it passes
+ * @return                     TRUE if the address passes the filter
+ */
+static bool filter_addresses(address_enumerator_t *data,
+                                                        addr_entry_t** in, host_t** out)
+{
+       addr_entry_t *addr = *in;
+       kernel_address_type_t type;
+
+       type = addr->refcount ? ADDR_TYPE_VIRTUAL : ADDR_TYPE_REGULAR;
+       if (!(data->which & type) || addr->scope >= RT_SCOPE_LINK)
+       {       /* type not requested, or address has an unusable scope */
+               return FALSE;
+       }
+       *out = addr->ip;
+       return TRUE;
+}
+
+/**
+ * Inner enumerator constructor: enumerates the addresses of the given
+ * interface, filtered by filter_addresses().
+ *
+ * @param iface                interface whose addresses are enumerated
+ * @param data         enumerator context handed to the filter
+ * @return                     filtered address enumerator
+ */
+static enumerator_t *create_iface_enumerator(iface_entry_t *iface,
+                                                                                        address_enumerator_t *data)
+{
+       enumerator_t *addrs;
+
+       addrs = iface->addrs->create_enumerator(iface->addrs);
+       return enumerator_create_filter(addrs, (void*)filter_addresses, data,
+                                                                       NULL);
+}
+
+/**
+ * Filter callback for interfaces.
+ *
+ * Interfaces that are excluded by config, loopback devices and interfaces
+ * that are not up are skipped, unless the corresponding ADDR_TYPE_IGNORED,
+ * ADDR_TYPE_LOOPBACK or ADDR_TYPE_DOWN flag is set in data->which.
+ *
+ * @param data         enumerator context
+ * @param in           interface entry to check
+ * @param out          receives the interface entry if it passes
+ * @return                     TRUE if the interface passes the filter
+ */
+static bool filter_interfaces(address_enumerator_t *data, iface_entry_t** in,
+                                                         iface_entry_t** out)
+{
+       iface_entry_t *iface = *in;
+       kernel_address_type_t which = data->which;
+
+       if ((!iface->usable && !(which & ADDR_TYPE_IGNORED)) ||
+               ((iface->flags & IFF_LOOPBACK) && !(which & ADDR_TYPE_LOOPBACK)) ||
+               (!(iface->flags & IFF_UP) && !(which & ADDR_TYPE_DOWN)))
+       {       /* excluded by config, loopback or down, and not requested */
+               return FALSE;
+       }
+       *out = iface;
+       return TRUE;
+}
+
+/**
+ * Create an enumerator over the cached addresses (host_t*) that match the
+ * requested address types.
+ *
+ * The read lock is acquired here and stays held until the returned enumerator
+ * is destroyed, see address_enumerator_destroy().
+ */
+METHOD(kernel_net_t, create_address_enumerator, enumerator_t*,
+       private_kernel_netlink_net_t *this, kernel_address_type_t which)
+{
+       address_enumerator_t *ctx;
+       enumerator_t *ifaces;
+
+       INIT(ctx,
+               .this = this,
+               .which = which,
+       );
+
+       this->lock->read_lock(this->lock);
+       /* outer enumerator: interfaces accepted by filter_interfaces() */
+       ifaces = enumerator_create_filter(
+                               this->ifaces->create_enumerator(this->ifaces),
+                               (void*)filter_interfaces, ctx, NULL);
+       /* inner enumerator: filtered addresses of each such interface */
+       return enumerator_create_nested(ifaces, (void*)create_iface_enumerator,
+                               ctx, (void*)address_enumerator_destroy);
+}
+
+/**
+ * Check if the given IP is installed on an interface that is up and usable,
+ * either as regular address or as virtual IP, and optionally return the name
+ * of that interface.
+ *
+ * If name is not NULL and the address is found, *name is set to an allocated
+ * copy of the interface name. Addresses on interfaces that are up but not
+ * usable are not reported (FALSE is returned).
+ */
+METHOD(kernel_net_t, get_interface_name, bool,
+       private_kernel_netlink_net_t *this, host_t* ip, char **name)
+{
+       addr_map_entry_t *match, key = {
+               .ip = ip,
+       };
+       bool found = FALSE;
+
+       if (ip->is_anyaddr(ip))
+       {
+               return FALSE;
+       }
+
+       this->lock->read_lock(this->lock);
+       match = this->addrs->get_match(this->addrs, &key,
+                                               (void*)addr_map_entry_match_up_and_usable);
+       if (match)
+       {       /* regular address on an up and usable interface */
+               found = TRUE;
+               if (name)
+               {
+                       *name = strdup(match->iface->ifname);
+                       DBG2(DBG_KNL, "%H is on interface %s", ip, *name);
+               }
+       }
+       else
+       {
+               match = this->vips->get_match(this->vips, &key,
+                                               (void*)addr_map_entry_match_up_and_usable);
+               if (match)
+               {       /* virtual IP installed by us */
+                       found = TRUE;
+                       if (name)
+                       {
+                               *name = strdup(match->iface->ifname);
+                               DBG2(DBG_KNL, "virtual IP %H is on interface %s", ip, *name);
+                       }
+               }
+               else if (!this->addrs->get_match(this->addrs, &key,
+                                                                       (void*)addr_map_entry_match_up))
+               {       /* not even installed on an ignored interface */
+                       DBG2(DBG_KNL, "%H is not a local address or the interface is down",
+                                ip);
+               }
+       }
+       this->lock->unlock(this->lock);
+
+       return found;
+}
+
+/**
+ * Look up the index of the cached interface with the given name.
+ *
+ * @param this         kernel netlink net instance
+ * @param name         interface name to look for
+ * @return                     interface index, 0 if no such interface is known
+ */
+static int get_interface_index(private_kernel_netlink_net_t *this, char* name)
+{
+       iface_entry_t *match;
+       int index = 0;
+
+       DBG2(DBG_KNL, "getting iface index for %s", name);
+
+       this->lock->read_lock(this->lock);
+       if (this->ifaces->find_first(this->ifaces, (void*)iface_entry_by_name,
+                                                                (void**)&match, name) == SUCCESS)
+       {
+               index = match->ifindex;
+       }
+       this->lock->unlock(this->lock);
+
+       if (!index)
+       {
+               DBG1(DBG_KNL, "unable to get interface index for %s", name);
+       }
+       return index;
+}
+
+/**
+ * Check if an address or network (addr with prefix network bits) lies within
+ * a subnet (net with net_len network bits).
+ *
+ * @param addr         address to check
+ * @param prefix       number of network bits of addr
+ * @param net          network address of the subnet
+ * @param net_len      number of network bits of the subnet
+ * @return                     TRUE if the first net_len bits of addr and net are equal,
+ *                                     FALSE if not, if the lengths of the addresses differ,
+ *                                     net_len exceeds the address length or prefix < net_len
+ */
+static bool addr_in_subnet(chunk_t addr, int prefix, chunk_t net, int net_len)
+{
+       static const u_char mask[] = { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe };
+       int whole, bits, i;
+
+       if (net_len == 0)
+       {       /* any address matches a /0 network */
+               return TRUE;
+       }
+       if (addr.len != net.len || net_len > 8 * net.len || prefix < net_len)
+       {
+               return FALSE;
+       }
+       whole = net_len / 8;
+       bits = net_len % 8;
+
+       /* completely covered bytes have to be equal, in network order */
+       for (i = 0; i < whole; i++)
+       {
+               if (addr.ptr[i] != net.ptr[i])
+               {
+                       return FALSE;
+               }
+       }
+       if (bits)
+       {       /* compare the remaining high-order bits of the next byte */
+               return ((addr.ptr[whole] ^ net.ptr[whole]) & mask[bits]) == 0;
+       }
+       return TRUE;
+}
+
+/**
+ * Store information about a route retrieved via RTNETLINK
+ *
+ * The chunks are filled by parse_route() and point into the Netlink message
+ * the route was parsed from; only src_host is released by rt_entry_destroy().
+ */
+typedef struct {
+       chunk_t gtw; /**< gateway (RTA_GATEWAY), if any */
+       chunk_t src; /**< preferred source address (RTA_PREFSRC), if any */
+       chunk_t dst; /**< destination network (RTA_DST), if any */
+       host_t *src_host; /**< source address as host object, may be NULL */
+       u_int8_t dst_len; /**< number of network bits of dst (rtm_dst_len) */
+       u_int32_t table; /**< routing table (rtm_table or RTA_TABLE) */
+       u_int32_t oif; /**< index of the outgoing interface (RTA_OIF), 0 if none */
+       u_int32_t priority; /**< route priority (RTA_PRIORITY), 0 if none */
+} rt_entry_t;
+
+/**
+ * Destroy a route entry together with its source host object, if one is set.
+ */
+static void rt_entry_destroy(rt_entry_t *this)
+{
+       host_t *src = this->src_host;
+
+       DESTROY_IF(src);
+       free(this);
+}
+
+/**
+ * Check if the route received with RTM_NEWROUTE is usable based on its type.
+ *
+ * @param hdr          netlink message header of the route message
+ * @return                     FALSE for blackhole, unreachable, prohibit and throw
+ *                                     routes, TRUE for all other types
+ */
+static bool route_usable(struct nlmsghdr *hdr)
+{
+       struct rtmsg *rtm = NLMSG_DATA(hdr);
+
+       if (rtm->rtm_type == RTN_BLACKHOLE ||
+               rtm->rtm_type == RTN_UNREACHABLE ||
+               rtm->rtm_type == RTN_PROHIBIT ||
+               rtm->rtm_type == RTN_THROW)
+       {
+               return FALSE;
+       }
+       return TRUE;
+}
+
+/**
+ * Parse route received with RTM_NEWROUTE. The given rt_entry_t object will be
+ * reused if not NULL.
+ *
+ * Returned chunks point to internal data of the Netlink message.
+ */
+static rt_entry_t *parse_route(struct nlmsghdr *hdr, rt_entry_t *route)
+{
+       struct rtattr *rta;
+       struct rtmsg *msg;
+       size_t rtasize;
+
+       msg = NLMSG_DATA(hdr);
+       rta = RTM_RTA(msg);
+       rtasize = RTM_PAYLOAD(hdr);
+
+       if (route)
+       {
+               route->gtw = chunk_empty;
+               route->src = chunk_empty;
+               route->dst = chunk_empty;
+               route->dst_len = msg->rtm_dst_len;
+               route->table = msg->rtm_table;
+               route->oif = 0;
+               route->priority = 0;
+       }
+       else
+       {
+               INIT(route,
+                       .dst_len = msg->rtm_dst_len,
+                       .table = msg->rtm_table,
+               );
+       }
+
+       while (RTA_OK(rta, rtasize))
+       {
+               switch (rta->rta_type)
+               {
+                       case RTA_PREFSRC:
+                               route->src = chunk_create(RTA_DATA(rta), RTA_PAYLOAD(rta));
+                               break;
+                       case RTA_GATEWAY:
+                               route->gtw = chunk_create(RTA_DATA(rta), RTA_PAYLOAD(rta));
+                               break;
+                       case RTA_DST:
+                               route->dst = chunk_create(RTA_DATA(rta), RTA_PAYLOAD(rta));
+                               break;
+                       case RTA_OIF:
+                               if (RTA_PAYLOAD(rta) == sizeof(route->oif))
+                               {
+                                       route->oif = *(u_int32_t*)RTA_DATA(rta);
+                               }
+                               break;
+                       case RTA_PRIORITY:
+                               if (RTA_PAYLOAD(rta) == sizeof(route->priority))
+                               {
+                                       route->priority = *(u_int32_t*)RTA_DATA(rta);
+                               }
+                               break;
+#ifdef HAVE_RTA_TABLE
+                       case RTA_TABLE:
+                               if (RTA_PAYLOAD(rta) == sizeof(route->table))
+                               {
+                                       route->table = *(u_int32_t*)RTA_DATA(rta);
+                               }
+                               break;
+#endif /* HAVE_RTA_TABLE*/
+               }
+               rta = RTA_NEXT(rta, rtasize);
+       }
+       return route;
+}
+
+/**
+ * Get a route: If "nexthop", the nexthop is returned. source addr otherwise.
+ */
+static host_t *get_route(private_kernel_netlink_net_t *this, host_t *dest,
+                                                int prefix, bool nexthop, host_t *candidate,
+                                                u_int recursion)
+{
+       netlink_buf_t request;
+       struct nlmsghdr *hdr, *out, *current;
+       struct rtmsg *msg;
+       chunk_t chunk;
+       size_t len;
+       linked_list_t *routes;
+       rt_entry_t *route = NULL, *best = NULL;
+       enumerator_t *enumerator;
+       host_t *addr = NULL;
+       bool match_net;
+       int family;
+
+       if (recursion > MAX_ROUTE_RECURSION)
+       {
+               return NULL;
+       }
+       chunk = dest->get_address(dest);
+       len = chunk.len * 8;
+       prefix = prefix < 0 ? len : min(prefix, len);
+       match_net = prefix != len;
+
+       memset(&request, 0, sizeof(request));
+
+       family = dest->get_family(dest);
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST;
+       hdr->nlmsg_type = RTM_GETROUTE;
+       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
+
+       msg = NLMSG_DATA(hdr);
+       msg->rtm_family = family;
+       if (!match_net && this->rta_mark && this->routing_mark.value)
+       {
+               /* if our routing rule excludes packets with a certain mark we can
+                * get the preferred route without having to dump all routes */
+               chunk = chunk_from_thing(this->routing_mark.value);
+               netlink_add_attribute(hdr, RTA_MARK, chunk, sizeof(request));
+       }
+       else if (family == AF_INET || this->rta_prefsrc_for_ipv6 ||
+                        this->routing_table || match_net)
+       {       /* kernels prior to 3.0 do not support RTA_PREFSRC for IPv6 routes.
+                * as we want to ignore routes with virtual IPs we cannot use DUMP
+                * if these routes are not installed in a separate table */
+               hdr->nlmsg_flags |= NLM_F_DUMP;
+       }
+       if (candidate)
+       {
+               chunk = candidate->get_address(candidate);
+               netlink_add_attribute(hdr, RTA_PREFSRC, chunk, sizeof(request));
+       }
+       if (!match_net)
+       {
+               chunk = dest->get_address(dest);
+               netlink_add_attribute(hdr, RTA_DST, chunk, sizeof(request));
+       }
+
+       if (this->socket->send(this->socket, hdr, &out, &len) != SUCCESS)
+       {
+               DBG2(DBG_KNL, "getting %s to reach %H/%d failed",
+                        nexthop ? "nexthop" : "address", dest, prefix);
+               return NULL;
+       }
+       routes = linked_list_create();
+       this->lock->read_lock(this->lock);
+
+       for (current = out; NLMSG_OK(current, len);
+                current = NLMSG_NEXT(current, len))
+       {
+               switch (current->nlmsg_type)
+               {
+                       case NLMSG_DONE:
+                               break;
+                       case RTM_NEWROUTE:
+                       {
+                               rt_entry_t *other;
+                               uintptr_t table;
+
+                               if (!route_usable(current))
+                               {
+                                       continue;
+                               }
+                               route = parse_route(current, route);
+
+                               table = (uintptr_t)route->table;
+                               if (this->rt_exclude->find_first(this->rt_exclude, NULL,
+                                                                                                (void**)&table) == SUCCESS)
+                               {       /* route is from an excluded routing table */
+                                       continue;
+                               }
+                               if (this->routing_table != 0 &&
+                                       route->table == this->routing_table)
+                               {       /* route is from our own ipsec routing table */
+                                       continue;
+                               }
+                               if (route->oif && !is_interface_up_and_usable(this, route->oif))
+                               {       /* interface is down */
+                                       continue;
+                               }
+                               if (!addr_in_subnet(chunk, prefix, route->dst, route->dst_len))
+                               {       /* route destination does not contain dest */
+                                       continue;
+                               }
+                               if (route->src.ptr)
+                               {       /* verify source address, if any */
+                                       host_t *src = host_create_from_chunk(msg->rtm_family,
+                                                                                                                route->src, 0);
+                                       if (src && is_known_vip(this, src))
+                                       {       /* ignore routes installed by us */
+                                               src->destroy(src);
+                                               continue;
+                                       }
+                                       route->src_host = src;
+                               }
+                               /* insert route, sorted by priority and network prefix */
+                               enumerator = routes->create_enumerator(routes);
+                               while (enumerator->enumerate(enumerator, &other))
+                               {
+                                       if (route->priority < other->priority)
+                                       {
+                                               break;
+                                       }
+                                       if (route->priority == other->priority &&
+                                               route->dst_len > other->dst_len)
+                                       {
+                                               break;
+                                       }
+                               }
+                               routes->insert_before(routes, enumerator, route);
+                               enumerator->destroy(enumerator);
+                               route = NULL;
+                               continue;
+                       }
+                       default:
+                               continue;
+               }
+               break;
+       }
+       if (route)
+       {
+               rt_entry_destroy(route);
+       }
+
+       /* now we have a list of routes matching dest, sorted by net prefix.
+        * we will look for source addresses for these routes and select the one
+        * with the preferred source address, if possible */
+       enumerator = routes->create_enumerator(routes);
+       while (enumerator->enumerate(enumerator, &route))
+       {
+               if (route->src_host)
+               {       /* got a source address with the route, if no preferred source
+                        * is given or it matches we are done, as this is the best route */
+                       if (!candidate || candidate->ip_equals(candidate, route->src_host))
+                       {
+                               best = route;
+                               break;
+                       }
+                       else if (route->oif)
+                       {       /* no match yet, maybe it is assigned to the same interface */
+                               host_t *src = get_interface_address(this, route->oif,
+                                                                                       msg->rtm_family, dest, candidate);
+                               if (src && src->ip_equals(src, candidate))
+                               {
+                                       route->src_host->destroy(route->src_host);
+                                       route->src_host = src;
+                                       best = route;
+                                       break;
+                               }
+                               DESTROY_IF(src);
+                       }
+                       /* no luck yet with the source address. if this is the best (first)
+                        * route we store it as fallback in case we don't find a route with
+                        * the preferred source */
+                       best = best ?: route;
+                       continue;
+               }
+               if (route->oif)
+               {       /* no src, but an interface - get address from it */
+                       route->src_host = get_interface_address(this, route->oif,
+                                                                                       msg->rtm_family, dest, candidate);
+                       if (route->src_host)
+                       {       /* we handle this address the same as the one above */
+                               if (!candidate ||
+                                        candidate->ip_equals(candidate, route->src_host))
+                               {
+                                       best = route;
+                                       break;
+                               }
+                               best = best ?: route;
+                               continue;
+                       }
+               }
+               if (route->gtw.ptr)
+               {       /* no src, no iface, but a gateway - lookup src to reach gtw */
+                       host_t *gtw;
+
+                       gtw = host_create_from_chunk(msg->rtm_family, route->gtw, 0);
+                       if (gtw && !gtw->ip_equals(gtw, dest))
+                       {
+                               route->src_host = get_route(this, gtw, -1, FALSE, candidate,
+                                                                                       recursion + 1);
+                       }
+                       DESTROY_IF(gtw);
+                       if (route->src_host)
+                       {       /* more of the same */
+                               if (!candidate ||
+                                        candidate->ip_equals(candidate, route->src_host))
+                               {
+                                       best = route;
+                                       break;
+                               }
+                               best = best ?: route;
+                       }
+               }
+       }
+       enumerator->destroy(enumerator);
+
+       if (nexthop)
+       {       /* nexthop lookup, return gateway if any */
+               if (best || routes->get_first(routes, (void**)&best) == SUCCESS)
+               {
+                       addr = host_create_from_chunk(msg->rtm_family, best->gtw, 0);
+               }
+               if (!addr && !match_net)
+               {       /* fallback to destination address */
+                       addr = dest->clone(dest);
+               }
+       }
+       else
+       {
+               if (best)
+               {
+                       addr = best->src_host->clone(best->src_host);
+               }
+       }
+       this->lock->unlock(this->lock);
+       routes->destroy_function(routes, (void*)rt_entry_destroy);
+       free(out);
+
+       if (addr)
+       {
+               DBG2(DBG_KNL, "using %H as %s to reach %H/%d", addr,
+                        nexthop ? "nexthop" : "address", dest, prefix);
+       }
+       else if (!recursion)
+       {
+               DBG2(DBG_KNL, "no %s found to reach %H/%d",
+                        nexthop ? "nexthop" : "address", dest, prefix);
+       }
+       return addr;
+}
+
+/**
+ * Implements kernel_net_t.get_source_addr via get_route().
+ *
+ * The nexthop flag is FALSE, so get_route() returns a clone of the selected
+ * route's source address instead of a gateway.  No prefix is given (-1), src
+ * is passed as the preferred candidate and the recursion counter starts at 0.
+ */
+METHOD(kernel_net_t, get_source_addr, host_t*,
+       private_kernel_netlink_net_t *this, host_t *dest, host_t *src)
+{
+       return get_route(this, dest, -1, FALSE, src, 0);
+}
+
+/**
+ * Implements kernel_net_t.get_nexthop via get_route().
+ *
+ * The nexthop flag is TRUE, so get_route() returns the gateway of the
+ * selected route (or, in some cases, a clone of dest as fallback).  The
+ * caller's prefix is forwarded, src is passed as the preferred candidate and
+ * the recursion counter starts at 0.
+ */
+METHOD(kernel_net_t, get_nexthop, host_t*,
+       private_kernel_netlink_net_t *this, host_t *dest, int prefix, host_t *src)
+{
+       return get_route(this, dest, prefix, TRUE, src, 0);
+}
+
+/**
+ * Add or remove an IP address on an interface.
+ *
+ * Builds an ifaddrmsg request of the given nlmsg_type (the type decides
+ * whether the address is set or unset), attaches the address as IFA_LOCAL and
+ * sends it, waiting for the kernel's acknowledgement.
+ *
+ * A negative prefix selects the full length of the address in bits.
+ */
+static status_t manage_ipaddr(private_kernel_netlink_net_t *this, int nlmsg_type,
+                              int flags, int if_index, host_t *ip, int prefix)
+{
+       netlink_buf_t req;
+       struct nlmsghdr *nlh = &req.hdr;
+       struct ifaddrmsg *ifa;
+       chunk_t address;
+       int family;
+
+       memset(&req, 0, sizeof(req));
+
+       address = ip->get_address(ip);
+       family = ip->get_family(ip);
+
+       nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags;
+       nlh->nlmsg_type = nlmsg_type;
+       nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg));
+
+       ifa = NLMSG_DATA(nlh);
+       ifa->ifa_family = family;
+       ifa->ifa_flags = 0;
+       if (prefix < 0)
+       {       /* no prefix given, use the full address length */
+               ifa->ifa_prefixlen = address.len * 8;
+       }
+       else
+       {
+               ifa->ifa_prefixlen = prefix;
+       }
+       ifa->ifa_scope = RT_SCOPE_UNIVERSE;
+       ifa->ifa_index = if_index;
+
+       netlink_add_attribute(nlh, IFA_LOCAL, address, sizeof(req));
+
+       if (family == AF_INET6 && this->rta_prefsrc_for_ipv6)
+       {       /* with source routes available the virtual IP is deprecated right
+                * away (preferred lifetime 0) but stays valid forever, so only our
+                * own route selects it and default IPv6 address selection does not */
+               struct ifa_cacheinfo cache = {
+                       .ifa_valid = 0xFFFFFFFF,
+                       .ifa_prefered = 0,
+               };
+               netlink_add_attribute(nlh, IFA_CACHEINFO, chunk_from_thing(cache),
+                                     sizeof(req));
+       }
+       return this->socket->send_ack(this->socket, nlh);
+}
+
+/**
+ * Implements kernel_net_t.add_ip: install a virtual IP on an interface.
+ *
+ * Returns SUCCESS without doing anything if virtual IP installation is
+ * disabled, if the address is already known as regular address, or if it is
+ * already installed as virtual IP (in which case its refcount is increased).
+ * Otherwise the address is cached, an RTM_NEWADDR request is sent and the
+ * call blocks on the condvar until the address is reported as installed or
+ * gone.  Returns FAILED if no interface is available, the request fails or
+ * the entry disappears while waiting.
+ */
+METHOD(kernel_net_t, add_ip, status_t,
+       private_kernel_netlink_net_t *this, host_t *virtual_ip, int prefix,
+       char *iface_name)
+{
+       addr_map_entry_t *entry, lookup = {
+               .ip = virtual_ip,
+       };
+       iface_entry_t *iface = NULL;
+
+       if (!this->install_virtual_ip)
+       {       /* disabled by config */
+               return SUCCESS;
+       }
+
+       this->lock->write_lock(this->lock);
+       /* the virtual IP might actually be installed as regular IP, in which case
+        * we don't track it as virtual IP */
+       entry = this->addrs->get_match(this->addrs, &lookup,
+                                                                 (void*)addr_map_entry_match);
+       if (!entry)
+       {       /* otherwise it might already be installed as virtual IP */
+               entry = this->vips->get_match(this->vips, &lookup,
+                                                                        (void*)addr_map_entry_match);
+               if (entry)
+               {       /* the vip we found can be in one of three states: 1) installed and
+                        * ready, 2) just added by another thread, but not yet confirmed to
+                        * be installed by the kernel, 3) just deleted, but not yet gone.
+                        * Then while we wait below, several things could happen (as we
+                        * release the lock).  For instance, the interface could disappear,
+                        * or the IP is finally deleted, and it reappears on a different
+                        * interface. All these cases are handled by the call below. */
+                       while (!is_vip_installed_or_gone(this, virtual_ip, &entry))
+                       {
+                               this->condvar->wait(this->condvar, this->lock);
+                       }
+                       if (entry)
+                       {       /* still installed, share it with the existing users */
+                               entry->addr->refcount++;
+                       }
+               }
+       }
+       if (entry)
+       {
+               DBG2(DBG_KNL, "virtual IP %H is already installed on %s", virtual_ip,
+                        entry->iface->ifname);
+               this->lock->unlock(this->lock);
+               return SUCCESS;
+       }
+       /* try to find the target interface, either by config or via src ip */
+       if (!this->install_virtual_ip_on ||
+                this->ifaces->find_first(this->ifaces, (void*)iface_entry_by_name,
+                                               (void**)&iface, this->install_virtual_ip_on) != SUCCESS)
+       {
+               if (this->ifaces->find_first(this->ifaces, (void*)iface_entry_by_name,
+                                                                        (void**)&iface, iface_name) != SUCCESS)
+               {       /* if we don't find the requested interface we just use the first */
+                       this->ifaces->get_first(this->ifaces, (void**)&iface);
+               }
+       }
+       if (iface)
+       {
+               addr_entry_t *addr;
+               char *ifname;
+               int ifi;
+
+               /* cache the address before talking to the kernel so that other
+                * threads calling add_ip for the same IP find it in the vips map */
+               INIT(addr,
+                       .ip = virtual_ip->clone(virtual_ip),
+                       .refcount = 1,
+                       .scope = RT_SCOPE_UNIVERSE,
+               );
+               iface->addrs->insert_last(iface->addrs, addr);
+               addr_map_entry_add(this->vips, addr, iface);
+               /* copy the index, iface must not be accessed once the lock is gone */
+               ifi = iface->ifindex;
+               this->lock->unlock(this->lock);
+               /* the netlink request is sent without holding the lock */
+               if (manage_ipaddr(this, RTM_NEWADDR, NLM_F_CREATE | NLM_F_EXCL,
+                                                 ifi, virtual_ip, prefix) == SUCCESS)
+               {
+                       this->lock->write_lock(this->lock);
+                       while (!is_vip_installed_or_gone(this, virtual_ip, &entry))
+                       {       /* wait until address appears */
+                               this->condvar->wait(this->condvar, this->lock);
+                       }
+                       if (entry)
+                       {       /* we fail if the interface got deleted in the meantime */
+                               ifname = strdup(entry->iface->ifname);
+                               this->lock->unlock(this->lock);
+                               DBG2(DBG_KNL, "virtual IP %H installed on %s",
+                                        virtual_ip, ifname);
+                               /* during IKEv1 reauthentication, children get moved from
+                                * the old to the new SA before the virtual IP is available.
+                                * This kills the route for our virtual IP, reinstall. */
+                               /* NOTE(review): ifname is not freed here, presumably
+                                * queue_route_reinstall() takes ownership - confirm */
+                               queue_route_reinstall(this, ifname);
+                               return SUCCESS;
+                       }
+                       this->lock->unlock(this->lock);
+               }
+               /* NOTE(review): the entry cached above is not removed on this path */
+               DBG1(DBG_KNL, "adding virtual IP %H failed", virtual_ip);
+               return FAILED;
+       }
+       this->lock->unlock(this->lock);
+       DBG1(DBG_KNL, "no interface available, unable to install virtual IP %H",
+                virtual_ip);
+       return FAILED;
+}
+
+/**
+ * Implements kernel_net_t.del_ip: remove a virtual IP installed by add_ip.
+ *
+ * Returns SUCCESS without doing anything if virtual IP installation is
+ * disabled or if the address is only known as regular address, and FAILED if
+ * the address is not cached at all.  If other users still reference the
+ * virtual IP only the refcount is decreased.  For the last user an
+ * RTM_DELADDR request is sent and its status returned; if wait is TRUE and
+ * the request succeeded, the call blocks until the address is no longer a
+ * known virtual IP.
+ */
+METHOD(kernel_net_t, del_ip, status_t,
+       private_kernel_netlink_net_t *this, host_t *virtual_ip, int prefix,
+       bool wait)
+{
+       addr_map_entry_t *entry, lookup = {
+               .ip = virtual_ip,
+       };
+
+       if (!this->install_virtual_ip)
+       {       /* disabled by config */
+               return SUCCESS;
+       }
+
+       DBG2(DBG_KNL, "deleting virtual IP %H", virtual_ip);
+
+       this->lock->write_lock(this->lock);
+       entry = this->vips->get_match(this->vips, &lookup,
+                                                                (void*)addr_map_entry_match);
+       if (!entry)
+       {       /* we didn't install this IP as virtual IP */
+               entry = this->addrs->get_match(this->addrs, &lookup,
+                                                                         (void*)addr_map_entry_match);
+               if (entry)
+               {
+                       DBG2(DBG_KNL, "not deleting existing IP %H on %s", virtual_ip,
+                                entry->iface->ifname);
+                       this->lock->unlock(this->lock);
+                       return SUCCESS;
+               }
+               DBG2(DBG_KNL, "virtual IP %H not cached, unable to delete", virtual_ip);
+               this->lock->unlock(this->lock);
+               return FAILED;
+       }
+       if (entry->addr->refcount == 1)
+       {       /* last user, actually remove the address from the interface */
+               status_t status;
+               int ifi;
+
+               /* we set this flag so that threads calling add_ip will block and wait
+                * until the entry is gone, also so we can wait below */
+               entry->addr->installed = FALSE;
+               /* copy the index, entry must not be accessed once the lock is gone */
+               ifi = entry->iface->ifindex;
+               this->lock->unlock(this->lock);
+               /* NOTE(review): the cache entry is not removed here, presumably the
+                * kernel event handler does that once the address is gone - confirm */
+               status = manage_ipaddr(this, RTM_DELADDR, 0, ifi, virtual_ip, prefix);
+               if (status == SUCCESS && wait)
+               {       /* wait until the address is really gone */
+                       this->lock->write_lock(this->lock);
+                       while (is_known_vip(this, virtual_ip))
+                       {
+                               this->condvar->wait(this->condvar, this->lock);
+                       }
+                       this->lock->unlock(this->lock);
+               }
+               return status;
+       }
+       else
+       {       /* still referenced by others, just drop our reference */
+               entry->addr->refcount--;
+       }
+       DBG2(DBG_KNL, "virtual IP %H used by other SAs, not deleting",
+                virtual_ip);
+       this->lock->unlock(this->lock);
+       return SUCCESS;
+}
+
+/**
+ * Manages source routes in the routing table.
+ * By setting the appropriate nlmsg_type, the route gets added or removed.
+ *
+ * A route with prefix length 0 in table 0 is not installed as such, because
+ * it would overwrite the default route.  It is split into two /1 routes
+ * instead.  Both halves are always attempted; if any of them fails, the
+ * status of the first failing request is returned.
+ *
+ * @param nlmsg_type   netlink message type selecting add or remove
+ * @param flags        additional NLM_F_* flags for the request
+ * @param dst_net      destination network
+ * @param prefixlen    prefix length of the destination network
+ * @param gateway      optional gateway, only used if its address family
+ *                     matches that of src_ip
+ * @param src_ip       preferred source address (RTA_PREFSRC), also
+ *                     determines the address family of the route
+ * @param if_name      name of the outgoing interface
+ * @return             status of the acknowledged netlink request(s)
+ */
+static status_t manage_srcroute(private_kernel_netlink_net_t *this,
+                                                               int nlmsg_type, int flags, chunk_t dst_net,
+                                                               u_int8_t prefixlen, host_t *gateway,
+                                                               host_t *src_ip, char *if_name)
+{
+       netlink_buf_t request;
+       struct nlmsghdr *hdr;
+       struct rtmsg *msg;
+       struct rtattr *rta;
+       int ifindex;
+       chunk_t chunk;
+
+       /* if route is 0.0.0.0/0, we can't install it, as it would
+        * overwrite the default route. Instead, we add two routes:
+        * 0.0.0.0/1 and 128.0.0.0/1 */
+       if (this->routing_table == 0 && prefixlen == 0)
+       {
+               chunk_t half_net;
+               u_int8_t half_prefixlen;
+               status_t lower, upper;
+
+               half_net = chunk_alloca(dst_net.len);
+               memset(half_net.ptr, 0, half_net.len);
+               half_prefixlen = 1;
+
+               lower = manage_srcroute(this, nlmsg_type, flags, half_net,
+                                       half_prefixlen, gateway, src_ip, if_name);
+               half_net.ptr[0] |= 0x80;
+               upper = manage_srcroute(this, nlmsg_type, flags, half_net,
+                                       half_prefixlen, gateway, src_ip, if_name);
+               /* don't let a successful second half hide a failure of the first */
+               return lower != SUCCESS ? lower : upper;
+       }
+
+       memset(&request, 0, sizeof(request));
+
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags;
+       hdr->nlmsg_type = nlmsg_type;
+       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
+
+       msg = NLMSG_DATA(hdr);
+       msg->rtm_family = src_ip->get_family(src_ip);
+       msg->rtm_dst_len = prefixlen;
+       msg->rtm_table = this->routing_table;
+       msg->rtm_protocol = RTPROT_STATIC;
+       msg->rtm_type = RTN_UNICAST;
+       msg->rtm_scope = RT_SCOPE_UNIVERSE;
+
+       netlink_add_attribute(hdr, RTA_DST, dst_net, sizeof(request));
+       chunk = src_ip->get_address(src_ip);
+       netlink_add_attribute(hdr, RTA_PREFSRC, chunk, sizeof(request));
+       if (gateway && gateway->get_family(gateway) == src_ip->get_family(src_ip))
+       {       /* a gateway of a different address family is silently ignored */
+               chunk = gateway->get_address(gateway);
+               netlink_add_attribute(hdr, RTA_GATEWAY, chunk, sizeof(request));
+       }
+       ifindex = get_interface_index(this, if_name);
+       chunk.ptr = (char*)&ifindex;
+       chunk.len = sizeof(ifindex);
+       netlink_add_attribute(hdr, RTA_OIF, chunk, sizeof(request));
+
+       if (this->mtu || this->mss)
+       {       /* RTA_METRICS carries nested attributes, built by hand in a buffer
+                * sized for both of them */
+               chunk = chunk_alloca(RTA_LENGTH((sizeof(struct rtattr) +
+                                                                                sizeof(u_int32_t)) * 2));
+               chunk.len = 0;
+               rta = (struct rtattr*)chunk.ptr;
+               if (this->mtu)
+               {
+                       rta->rta_type = RTAX_MTU;
+                       rta->rta_len = RTA_LENGTH(sizeof(u_int32_t));
+                       memcpy(RTA_DATA(rta), &this->mtu, sizeof(u_int32_t));
+                       chunk.len = rta->rta_len;
+               }
+               if (this->mss)
+               {       /* appended after the (aligned) MTU attribute, if there is one */
+                       rta = (struct rtattr*)(chunk.ptr + RTA_ALIGN(chunk.len));
+                       rta->rta_type = RTAX_ADVMSS;
+                       rta->rta_len = RTA_LENGTH(sizeof(u_int32_t));
+                       memcpy(RTA_DATA(rta), &this->mss, sizeof(u_int32_t));
+                       chunk.len = RTA_ALIGN(chunk.len) + rta->rta_len;
+               }
+               netlink_add_attribute(hdr, RTA_METRICS, chunk, sizeof(request));
+       }
+
+       return this->socket->send_ack(this->socket, hdr);
+}
+
+/**
+ * Implements kernel_net_t.add_route: install a source route and cache it.
+ *
+ * Returns ALREADY_DONE if an equal route is cached already.  Otherwise the
+ * route is installed in the kernel and, only if that succeeded, a clone of
+ * it is stored in the routes table.
+ */
+METHOD(kernel_net_t, add_route, status_t,
+       private_kernel_netlink_net_t *this, chunk_t dst_net, u_int8_t prefixlen,
+       host_t *gateway, host_t *src_ip, char *if_name)
+{
+       status_t status = ALREADY_DONE;
+       route_entry_t *cached, key = {
+               .dst_net = dst_net,
+               .prefixlen = prefixlen,
+               .gateway = gateway,
+               .src_ip = src_ip,
+               .if_name = if_name,
+       };
+
+       this->routes_lock->lock(this->routes_lock);
+       if (!this->routes->get(this->routes, &key))
+       {       /* not known yet, install it and remember it on success */
+               status = manage_srcroute(this, RTM_NEWROUTE,
+                                        NLM_F_CREATE | NLM_F_EXCL, dst_net,
+                                        prefixlen, gateway, src_ip, if_name);
+               if (status == SUCCESS)
+               {
+                       cached = route_entry_clone(&key);
+                       this->routes->put(this->routes, cached, cached);
+               }
+       }
+       this->routes_lock->unlock(this->routes_lock);
+       return status;
+}
+
+/**
+ * Implements kernel_net_t.del_route: remove a cached source route.
+ *
+ * Returns NOT_FOUND if no equal route is cached.  Otherwise the cache entry
+ * is dropped first and the status of the kernel removal is returned.
+ */
+METHOD(kernel_net_t, del_route, status_t,
+       private_kernel_netlink_net_t *this, chunk_t dst_net, u_int8_t prefixlen,
+       host_t *gateway, host_t *src_ip, char *if_name)
+{
+       status_t status = NOT_FOUND;
+       route_entry_t *cached, key = {
+               .dst_net = dst_net,
+               .prefixlen = prefixlen,
+               .gateway = gateway,
+               .src_ip = src_ip,
+               .if_name = if_name,
+       };
+
+       this->routes_lock->lock(this->routes_lock);
+       cached = this->routes->get(this->routes, &key);
+       if (cached)
+       {       /* forget the route, then remove it using the caller's values */
+               this->routes->remove(this->routes, cached);
+               route_entry_destroy(cached);
+               status = manage_srcroute(this, RTM_DELROUTE, 0, dst_net, prefixlen,
+                                        gateway, src_ip, if_name);
+       }
+       this->routes_lock->unlock(this->routes_lock);
+       return status;
+}
+
+/**
+ * Initialize the list of local interfaces and addresses.
+ *
+ * Dumps all links and then all addresses via netlink, feeding each
+ * RTM_NEWLINK/RTM_NEWADDR message to process_link()/process_addr(), and
+ * finally logs the usable interfaces with their addresses.
+ *
+ * @return             SUCCESS, FAILED if one of the dump requests fails
+ */
+static status_t init_address_list(private_kernel_netlink_net_t *this)
+{
+       netlink_buf_t req;
+       struct nlmsghdr *query, *reply, *cur;
+       struct rtgenmsg *gen;
+       size_t len;
+       enumerator_t *ifaces, *addrs;
+       iface_entry_t *iface;
+       addr_entry_t *addr;
+
+       DBG2(DBG_KNL, "known interfaces and IP addresses:");
+
+       memset(&req, 0, sizeof(req));
+
+       query = &req.hdr;
+       query->nlmsg_len = NLMSG_LENGTH(sizeof(struct rtgenmsg));
+       query->nlmsg_flags = NLM_F_REQUEST | NLM_F_MATCH | NLM_F_ROOT;
+       gen = NLMSG_DATA(query);
+       gen->rtgen_family = AF_UNSPEC;
+
+       /* first pass: dump all links */
+       query->nlmsg_type = RTM_GETLINK;
+       if (this->socket->send(this->socket, query, &reply, &len) != SUCCESS)
+       {
+               return FAILED;
+       }
+       for (cur = reply; NLMSG_OK(cur, len); cur = NLMSG_NEXT(cur, len))
+       {
+               if (cur->nlmsg_type == NLMSG_DONE)
+               {
+                       break;
+               }
+               if (cur->nlmsg_type == RTM_NEWLINK)
+               {
+                       process_link(this, cur, FALSE);
+               }
+       }
+       free(reply);
+
+       /* second pass: dump all interface addresses with the same request */
+       query->nlmsg_type = RTM_GETADDR;
+       if (this->socket->send(this->socket, query, &reply, &len) != SUCCESS)
+       {
+               return FAILED;
+       }
+       for (cur = reply; NLMSG_OK(cur, len); cur = NLMSG_NEXT(cur, len))
+       {
+               if (cur->nlmsg_type == NLMSG_DONE)
+               {
+                       break;
+               }
+               if (cur->nlmsg_type == RTM_NEWADDR)
+               {
+                       process_addr(this, cur, FALSE);
+               }
+       }
+       free(reply);
+
+       /* log what we collected, limited to usable interfaces */
+       this->lock->read_lock(this->lock);
+       ifaces = this->ifaces->create_enumerator(this->ifaces);
+       while (ifaces->enumerate(ifaces, &iface))
+       {
+               if (!iface_entry_up_and_usable(iface))
+               {
+                       continue;
+               }
+               DBG2(DBG_KNL, "  %s", iface->ifname);
+               addrs = iface->addrs->create_enumerator(iface->addrs);
+               while (addrs->enumerate(addrs, (void**)&addr))
+               {
+                       DBG2(DBG_KNL, "    %H", addr->ip);
+               }
+               addrs->destroy(addrs);
+       }
+       ifaces->destroy(ifaces);
+       this->lock->unlock(this->lock);
+       return SUCCESS;
+}
+
+/**
+ * create or delete a rule to use our routing table
+ */
+static status_t manage_rule(private_kernel_netlink_net_t *this, int nlmsg_type,
+                                                       int family, u_int32_t table, u_int32_t prio)
+{
+       netlink_buf_t request;
+       struct nlmsghdr *hdr;
+       struct rtmsg *msg;
+       chunk_t chunk;
+       char *fwmark;
+
+       memset(&request, 0, sizeof(request));
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+       hdr->nlmsg_type = nlmsg_type;
+       if (nlmsg_type == RTM_NEWRULE)
+       {
+               hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL;
+       }
+       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
+
+       msg = NLMSG_DATA(hdr);
+       msg->rtm_table = table;
+       msg->rtm_family = family;
+       msg->rtm_protocol = RTPROT_BOOT;
+       msg->rtm_scope = RT_SCOPE_UNIVERSE;
+       msg->rtm_type = RTN_UNICAST;
+
+       chunk = chunk_from_thing(prio);
+       netlink_add_attribute(hdr, RTA_PRIORITY, chunk, sizeof(request));
+
+       fwmark = lib->settings->get_str(lib->settings,
+                                                       "%s.plugins.kernel-netlink.fwmark", NULL, lib->ns);
+       if (fwmark)
+       {
+#ifdef HAVE_LINUX_FIB_RULES_H
+               mark_t mark;
+
+               if (fwmark[0] == '!')
+               {
+                       msg->rtm_flags |= FIB_RULE_INVERT;
+                       fwmark++;
+               }
+               if (mark_from_string(fwmark, &mark))
+               {
+                       chunk = chunk_from_thing(mark.value);
+                       netlink_add_attribute(hdr, FRA_FWMARK, chunk, sizeof(request));
+                       chunk = chunk_from_thing(mark.mask);
+                       netlink_add_attribute(hdr, FRA_FWMASK, chunk, sizeof(request));
+                       if (msg->rtm_flags & FIB_RULE_INVERT)
+                       {
+                               this->routing_mark = mark;
+                       }
+               }
+#else
+               DBG1(DBG_KNL, "setting firewall mark on routing rule is not supported");
+#endif
+       }
+       return this->socket->send_ack(this->socket, hdr);
+}
+
+/**
+ * Detect kernel features, currently based on the release string only.
+ *
+ * For 2.x.y kernels only rta_mark may get enabled (2.6.36 and newer).  Any
+ * other release that parses as two or three numbers enables both
+ * rta_prefsrc_for_ipv6 and rta_mark.  Nothing is changed if uname() fails or
+ * the release string can't be parsed.
+ */
+static void check_kernel_features(private_kernel_netlink_net_t *this)
+{
+       struct utsname uts;
+       int major, minor, patch, parsed;
+
+       if (uname(&uts) != 0)
+       {
+               return;
+       }
+       parsed = sscanf(uts.release, "%d.%d.%d", &major, &minor, &patch);
+
+       if (parsed == 3 && major == 2)
+       {
+               if (minor == 6 && patch >= 36)
+               {
+                       this->rta_mark = TRUE;
+               }
+               DBG2(DBG_KNL, "detected Linux %d.%d.%d, no support for "
+                        "RTA_PREFSRC for IPv6 routes", major, minor, patch);
+               return;
+       }
+       if (parsed == 2 || parsed == 3)
+       {       /* only 3.x+ uses two part version numbers */
+               this->rta_prefsrc_for_ipv6 = TRUE;
+               this->rta_mark = TRUE;
+       }
+}
+
+/**
+ * Destroy an address to interface map.
+ *
+ * Frees every map entry itself (not the objects an entry refers to) and then
+ * the hashtable.
+ */
+static void addr_map_destroy(hashtable_t *map)
+{
+       addr_map_entry_t *entry;
+       enumerator_t *entries = map->create_enumerator(map);
+
+       while (entries->enumerate(entries, NULL, (void**)&entry))
+       {
+               free(entry);
+       }
+       entries->destroy(entries);
+       map->destroy(map);
+}
+
+/**
+ * Implements kernel_net_t.destroy: undo kernel state and free the instance.
+ *
+ * Also used by kernel_netlink_net_create() to clean up a partially set up
+ * instance.  Netlink requests (rules, routes) are sent before this->socket
+ * is destroyed.
+ */
+METHOD(kernel_net_t, destroy, void,
+       private_kernel_netlink_net_t *this)
+{
+       enumerator_t *enumerator;
+       route_entry_t *route;
+
+       if (this->routing_table)
+       {       /* remove the rules referring to our routing table; results are
+                * ignored */
+               manage_rule(this, RTM_DELRULE, AF_INET, this->routing_table,
+                                       this->routing_table_prio);
+               manage_rule(this, RTM_DELRULE, AF_INET6, this->routing_table,
+                                       this->routing_table_prio);
+       }
+       /* NOTE(review): a descriptor of 0 is treated as "no event socket" */
+       if (this->socket_events > 0)
+       {
+               lib->watcher->remove(lib->watcher, this->socket_events);
+               close(this->socket_events);
+       }
+       /* uninstall all routes still cached and free their entries */
+       enumerator = this->routes->create_enumerator(this->routes);
+       while (enumerator->enumerate(enumerator, NULL, (void**)&route))
+       {
+               manage_srcroute(this, RTM_DELROUTE, 0, route->dst_net, route->prefixlen,
+                                               route->gateway, route->src_ip, route->if_name);
+               route_entry_destroy(route);
+       }
+       enumerator->destroy(enumerator);
+       this->routes->destroy(this->routes);
+       this->routes_lock->destroy(this->routes_lock);
+       DESTROY_IF(this->socket);
+
+       net_changes_clear(this);
+       this->net_changes->destroy(this->net_changes);
+       this->net_changes_lock->destroy(this->net_changes_lock);
+
+       /* only the map entries are freed here, the interface entries are
+        * destroyed with the ifaces list below */
+       addr_map_destroy(this->addrs);
+       addr_map_destroy(this->vips);
+
+       this->ifaces->destroy_function(this->ifaces, (void*)iface_entry_destroy);
+       this->rt_exclude->destroy(this->rt_exclude);
+       this->roam_lock->destroy(this->roam_lock);
+       this->condvar->destroy(this->condvar);
+       this->lock->destroy(this->lock);
+       free(this);
+}
+
+/*
+ * Described in header.
+ *
+ * Reads the configuration, detects kernel features, optionally registers a
+ * netlink socket for kernel events, collects the local interfaces and
+ * addresses and installs the rules for our routing table.  Returns NULL if
+ * the event socket can't be created or bound, or if the interface list can't
+ * be retrieved; failing to create a routing rule is only logged.
+ */
+kernel_netlink_net_t *kernel_netlink_net_create()
+{
+       private_kernel_netlink_net_t *this;
+       enumerator_t *enumerator;
+       bool register_for_events = TRUE;
+       char *exclude;
+
+       INIT(this,
+               .public = {
+                       .interface = {
+                               .get_interface = _get_interface_name,
+                               .create_address_enumerator = _create_address_enumerator,
+                               .get_source_addr = _get_source_addr,
+                               .get_nexthop = _get_nexthop,
+                               .add_ip = _add_ip,
+                               .del_ip = _del_ip,
+                               .add_route = _add_route,
+                               .del_route = _del_route,
+                               .destroy = _destroy,
+                       },
+               },
+               .socket = netlink_socket_create(NETLINK_ROUTE, rt_msg_names,
+                       lib->settings->get_bool(lib->settings,
+                               "%s.plugins.kernel-netlink.parallel_route", FALSE, lib->ns)),
+               .rt_exclude = linked_list_create(),
+               .routes = hashtable_create((hashtable_hash_t)route_entry_hash,
+                                                                  (hashtable_equals_t)route_entry_equals, 16),
+               .net_changes = hashtable_create(
+                                                                  (hashtable_hash_t)net_change_hash,
+                                                                  (hashtable_equals_t)net_change_equals, 16),
+               .addrs = hashtable_create(
+                                                               (hashtable_hash_t)addr_map_entry_hash,
+                                                               (hashtable_equals_t)addr_map_entry_equals, 16),
+               .vips = hashtable_create((hashtable_hash_t)addr_map_entry_hash,
+                                                                (hashtable_equals_t)addr_map_entry_equals, 16),
+               .routes_lock = mutex_create(MUTEX_TYPE_DEFAULT),
+               .net_changes_lock = mutex_create(MUTEX_TYPE_DEFAULT),
+               .ifaces = linked_list_create(),
+               .lock = rwlock_create(RWLOCK_TYPE_DEFAULT),
+               .condvar = rwlock_condvar_create(),
+               .roam_lock = spinlock_create(),
+               .routing_table = lib->settings->get_int(lib->settings,
+                                               "%s.routing_table", ROUTING_TABLE, lib->ns),
+               .routing_table_prio = lib->settings->get_int(lib->settings,
+                                               "%s.routing_table_prio", ROUTING_TABLE_PRIO, lib->ns),
+               .process_route = lib->settings->get_bool(lib->settings,
+                                               "%s.process_route", TRUE, lib->ns),
+               .install_virtual_ip = lib->settings->get_bool(lib->settings,
+                                               "%s.install_virtual_ip", TRUE, lib->ns),
+               .install_virtual_ip_on = lib->settings->get_str(lib->settings,
+                                               "%s.install_virtual_ip_on", NULL, lib->ns),
+               .prefer_temporary_addrs = lib->settings->get_bool(lib->settings,
+                                               "%s.prefer_temporary_addrs", FALSE, lib->ns),
+               .roam_events = lib->settings->get_bool(lib->settings,
+                                               "%s.plugins.kernel-netlink.roam_events", TRUE, lib->ns),
+               .mtu = lib->settings->get_int(lib->settings,
+                                               "%s.plugins.kernel-netlink.mtu", 0, lib->ns),
+               .mss = lib->settings->get_int(lib->settings,
+                                               "%s.plugins.kernel-netlink.mss", 0, lib->ns),
+       );
+       timerclear(&this->last_route_reinstall);
+       timerclear(&this->next_roam);
+
+       check_kernel_features(this);
+
+       if (streq(lib->ns, "starter"))
+       {       /* starter has no threads, so we do not register for kernel events */
+               register_for_events = FALSE;
+       }
+
+       /* space separated list of routing table IDs to ignore */
+       exclude = lib->settings->get_str(lib->settings,
+                                                                        "%s.ignore_routing_tables", NULL, lib->ns);
+       if (exclude)
+       {
+               char *token, *end;
+               uintptr_t table;
+
+               enumerator = enumerator_create_token(exclude, " ", " ");
+               while (enumerator->enumerate(enumerator, &token))
+               {
+                       errno = 0;
+                       table = strtoul(token, &end, 10);
+
+                       /* strtoul() returns 0 without setting errno if the token does
+                        * not start with a number, so also require consumed digits to
+                        * not exclude table 0 by accident */
+                       if (errno == 0 && end != token)
+                       {
+                               this->rt_exclude->insert_last(this->rt_exclude, (void*)table);
+                       }
+               }
+               enumerator->destroy(enumerator);
+       }
+
+       if (register_for_events)
+       {
+               struct sockaddr_nl addr;
+
+               memset(&addr, 0, sizeof(addr));
+               addr.nl_family = AF_NETLINK;
+
+               /* create and bind RT socket for events (address/interface/route changes) */
+               this->socket_events = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+               if (this->socket_events < 0)
+               {
+                       DBG1(DBG_KNL, "unable to create RT event socket");
+                       destroy(this);
+                       return NULL;
+               }
+               addr.nl_groups = RTMGRP_IPV4_IFADDR | RTMGRP_IPV6_IFADDR |
+                                                RTMGRP_IPV4_ROUTE | RTMGRP_IPV6_ROUTE | RTMGRP_LINK;
+               if (bind(this->socket_events, (struct sockaddr*)&addr, sizeof(addr)))
+               {
+                       DBG1(DBG_KNL, "unable to bind RT event socket");
+                       destroy(this);
+                       return NULL;
+               }
+
+               lib->watcher->add(lib->watcher, this->socket_events, WATCHER_READ,
+                                                 (watcher_cb_t)receive_events, this);
+       }
+
+       if (init_address_list(this) != SUCCESS)
+       {
+               DBG1(DBG_KNL, "unable to get interface list");
+               destroy(this);
+               return NULL;
+       }
+
+       if (this->routing_table)
+       {       /* failures are logged only, the instance is still returned */
+               if (manage_rule(this, RTM_NEWRULE, AF_INET, this->routing_table,
+                                               this->routing_table_prio) != SUCCESS)
+               {
+                       DBG1(DBG_KNL, "unable to create IPv4 routing table rule");
+               }
+               if (manage_rule(this, RTM_NEWRULE, AF_INET6, this->routing_table,
+                                               this->routing_table_prio) != SUCCESS)
+               {
+                       DBG1(DBG_KNL, "unable to create IPv6 routing table rule");
+               }
+       }
+
+       return &this->public;
+}
diff --git a/src/libcharon/plugins/kernel_netlink/kernel_netlink_net.h b/src/libcharon/plugins/kernel_netlink/kernel_netlink_net.h
new file mode 100644 (file)
index 0000000..ff9831d
--- /dev/null
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2008 Tobias Brunner
+ * Hochschule fuer Technik Rapperswil
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+/**
+ * @defgroup kernel_netlink_net_i kernel_netlink_net
+ * @{ @ingroup kernel_netlink
+ */
+
+#ifndef KERNEL_NETLINK_NET_H_
+#define KERNEL_NETLINK_NET_H_
+
+#include <kernel/kernel_net.h>
+
+typedef struct kernel_netlink_net_t kernel_netlink_net_t;
+
+/**
+ * Implementation of the kernel network interface using Netlink.
+ *
+ * The public type only consists of the generic kernel_net_t interface;
+ * instances are obtained from kernel_netlink_net_create().
+ */
+struct kernel_netlink_net_t {
+
+       /**
+        * Implements kernel_net_t interface
+        */
+       kernel_net_t interface;
+};
+
+/**
+ * Create a netlink kernel network interface instance.
+ *
+ * The kernel-netlink plugin passes this constructor to kernel_net_register
+ * in its feature table for the "kernel-net" feature.
+ *
+ * @return                     kernel_netlink_net_t instance
+ */
+kernel_netlink_net_t *kernel_netlink_net_create();
+
+#endif /** KERNEL_NETLINK_NET_H_ @}*/
diff --git a/src/libcharon/plugins/kernel_netlink/kernel_netlink_plugin.c b/src/libcharon/plugins/kernel_netlink/kernel_netlink_plugin.c
new file mode 100644 (file)
index 0000000..8d5a0d5
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2008 Tobias Brunner
+ * Hochschule fuer Technik Rapperswil
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+
+#include "kernel_netlink_plugin.h"
+
+#include "kernel_netlink_ipsec.h"
+#include "kernel_netlink_net.h"
+
+#include <hydra.h>
+
+typedef struct private_kernel_netlink_plugin_t private_kernel_netlink_plugin_t;
+
+/**
+ * private data of kernel netlink plugin
+ *
+ * The plugin keeps no state of its own, the object only carries the public
+ * interface.
+ */
+struct private_kernel_netlink_plugin_t {
+       /**
+        * implements plugin interface
+        */
+       kernel_netlink_plugin_t public;
+};
+
+/**
+ * Implementation of plugin_t.get_name, returns the constant string
+ * "kernel-netlink" (a string literal, not allocated).
+ */
+METHOD(plugin_t, get_name, char*,
+       private_kernel_netlink_plugin_t *this)
+{
+       return "kernel-netlink";
+}
+
+/**
+ * Implementation of plugin_t.get_features.
+ *
+ * Hands out a static feature table via *features and returns its number of
+ * entries.  The table holds two PLUGIN_CALLBACK entries, pairing
+ * kernel_ipsec_register with kernel_netlink_ipsec_create and
+ * kernel_net_register with kernel_netlink_net_create, each followed by the
+ * CUSTOM feature it provides ("kernel-ipsec" and "kernel-net").
+ */
+METHOD(plugin_t, get_features, int,
+       private_kernel_netlink_plugin_t *this, plugin_feature_t *features[])
+{
+       static plugin_feature_t f[] = {
+               PLUGIN_CALLBACK(kernel_ipsec_register, kernel_netlink_ipsec_create),
+                       PLUGIN_PROVIDE(CUSTOM, "kernel-ipsec"),
+               PLUGIN_CALLBACK(kernel_net_register, kernel_netlink_net_create),
+                       PLUGIN_PROVIDE(CUSTOM, "kernel-net"),
+       };
+       *features = f;
+       return countof(f);
+}
+
+/**
+ * Implementation of plugin_t.destroy, releases the plugin object itself.
+ * There is no further state to clean up.
+ */
+METHOD(plugin_t, destroy, void,
+       private_kernel_netlink_plugin_t *this)
+{
+       free(this);
+}
+
+/*
+ * Plugin constructor.
+ *
+ * Requests that CAP_NET_ADMIN is kept; if that is refused only a message is
+ * logged and the plugin is created anyway (see the comment below for the
+ * reason).  Returns the plugin_t interface of the new instance, whose
+ * methods are get_name, get_features and destroy from this file.
+ */
+plugin_t *kernel_netlink_plugin_create()
+{
+       private_kernel_netlink_plugin_t *this;
+
+       if (!lib->caps->keep(lib->caps, CAP_NET_ADMIN))
+       {       /* required to bind/use XFRM sockets / create/modify routing tables, but
+                * not if only the read-only parts of kernel-netlink-net are used, so
+                * we don't fail here */
+               DBG1(DBG_KNL, "kernel-netlink plugin might require CAP_NET_ADMIN "
+                        "capability");
+       }
+
+       INIT(this,
+               .public = {
+                       .plugin = {
+                               .get_name = _get_name,
+                               .get_features = _get_features,
+                               .destroy = _destroy,
+                       },
+               },
+       );
+
+       return &this->public.plugin;
+}
diff --git a/src/libcharon/plugins/kernel_netlink/kernel_netlink_plugin.h b/src/libcharon/plugins/kernel_netlink/kernel_netlink_plugin.h
new file mode 100644 (file)
index 0000000..74c9ae2
--- /dev/null
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2008 Tobias Brunner
+ * Hochschule fuer Technik Rapperswil
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+/**
+ * @defgroup kernel_netlink kernel_netlink
+ * @ingroup cplugins
+ *
+ * @defgroup kernel_netlink_plugin kernel_netlink_plugin
+ * @{ @ingroup kernel_netlink
+ */
+
+#ifndef KERNEL_NETLINK_PLUGIN_H_
+#define KERNEL_NETLINK_PLUGIN_H_
+
+#include <plugins/plugin.h>
+
+typedef struct kernel_netlink_plugin_t kernel_netlink_plugin_t;
+
+/**
+ * netlink kernel interface plugin
+ *
+ * The public type adds nothing to the generic plugin_t interface.
+ */
+struct kernel_netlink_plugin_t {
+
+       /**
+        * implements plugin interface
+        */
+       plugin_t plugin;
+};
+
+#endif /** KERNEL_NETLINK_PLUGIN_H_ @}*/
diff --git a/src/libcharon/plugins/kernel_netlink/kernel_netlink_shared.c b/src/libcharon/plugins/kernel_netlink/kernel_netlink_shared.c
new file mode 100644 (file)
index 0000000..f7ce992
--- /dev/null
@@ -0,0 +1,655 @@
+/*
+ * Copyright (C) 2014 Martin Willi
+ * Copyright (C) 2014 revosec AG
+ * Copyright (C) 2008 Tobias Brunner
+ * Hochschule fuer Technik Rapperswil
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include <sys/socket.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/xfrm.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "kernel_netlink_shared.h"
+
+#include <utils/debug.h>
+#include <threading/mutex.h>
+#include <threading/condvar.h>
+#include <collections/array.h>
+#include <collections/hashtable.h>
+
+typedef struct private_netlink_socket_t private_netlink_socket_t;
+
+/**
+ * Private variables and functions of netlink_socket_t class.
+ */
+struct private_netlink_socket_t {
+
+       /**
+        * public part of the netlink_socket_t object.
+        */
+       netlink_socket_t public;
+
+       /**
+        * mutex protecting access to entries; created as recursive mutex, it is
+        * also held while a request is written and while waiting for its response
+        */
+       mutex_t *mutex;
+
+       /**
+        * Netlink request entries currently active, uintptr_t seq => entry_t
+        */
+       hashtable_t *entries;
+
+       /**
+        * Current sequence number for Netlink requests, initialized to 200 and
+        * advanced with ref_get() for every request
+        */
+       refcount_t seq;
+
+       /**
+        * netlink socket, -1 if it could not be created
+        */
+       int socket;
+
+       /**
+        * Netlink protocol
+        */
+       int protocol;
+
+       /**
+        * Enum names for Netlink messages, if NULL sent and received messages
+        * are not logged
+        */
+       enum_name_t *names;
+
+       /**
+        * Timeout for Netlink replies, in ms, 0 to wait without timeout
+        */
+       u_int timeout;
+
+       /**
+        * Number of times to repeat timed out queries
+        */
+       u_int retries;
+
+       /**
+        * Buffer size for received Netlink messages; if not configured this is
+        * the page size, but at most 8192 bytes
+        */
+       u_int buflen;
+
+       /**
+        * Use parallel netlink queries, i.e. register the socket with the watcher
+        * and let waiting threads get signaled via their entry
+        */
+       bool parallel;
+
+       /**
+        * Ignore errors potentially resulting from a retransmission
+        */
+       bool ignore_retransmit_errors;
+};
+
+/**
+ * #definable hook to simulate request message loss
+ *
+ * If NETLINK_MSG_LOSS_HOOK is defined to the name of a function, that
+ * function is declared here and consulted by write_msg() for every request:
+ * if it returns TRUE the request is reported as sent without being written
+ * to the socket.  Without the define the hook is the constant FALSE.
+ */
+#ifdef NETLINK_MSG_LOSS_HOOK
+bool NETLINK_MSG_LOSS_HOOK(struct nlmsghdr *msg);
+#define msg_loss_hook(msg) NETLINK_MSG_LOSS_HOOK(msg)
+#else
+#define msg_loss_hook(msg) FALSE
+#endif
+
+/**
+ * Request entry the answer for a waiting thread is collected in
+ *
+ * Entries are stored in the socket's entries table under the sequence number
+ * of the request for as long as the requesting thread waits.
+ */
+typedef struct {
+       /** Condition variable the requesting thread is waiting on, signaled
+        * once the response is complete */
+       condvar_t *condvar;
+       /** Array of hdrs in a multi-message response, as struct nlmsghdr*,
+        * each one a separately allocated copy of a received message */
+       array_t *hdrs;
+       /** All response messages received? Set when NLMSG_DONE or a message
+        * without NLM_F_MULTI has been queued */
+       bool complete;
+} entry_t;
+
+/**
+ * Clean up a thread waiting entry
+ *
+ * Destroys the condvar, frees all message copies still stored in hdrs
+ * together with the array, and finally the entry itself.
+ */
+static void destroy_entry(entry_t *entry)
+{
+       entry->condvar->destroy(entry->condvar);
+       array_destroy_function(entry->hdrs, (void*)free, NULL);
+       free(entry);
+}
+
+/**
+ * Write a Netlink message to socket
+ *
+ * The destination address only has the address family set.  A call that is
+ * interrupted (EINTR) is repeated.
+ *
+ * @param this         socket to write to
+ * @param msg          message to send, nlmsg_len bytes are written
+ * @return                     TRUE if the complete message was written, or the message
+ *                                     loss hook claimed it, FALSE on errors
+ */
+static bool write_msg(private_netlink_socket_t *this, struct nlmsghdr *msg)
+{
+       struct sockaddr_nl addr = {
+               .nl_family = AF_NETLINK,
+       };
+       ssize_t len;
+
+       if (msg_loss_hook(msg))
+       {       /* simulated loss, pretend the message has been sent */
+               return TRUE;
+       }
+
+       while (TRUE)
+       {
+               len = sendto(this->socket, msg, msg->nlmsg_len, 0,
+                                        (struct sockaddr*)&addr, sizeof(addr));
+               if (len < 0)
+               {       /* errno is only valid if the call itself failed */
+                       if (errno == EINTR)
+                       {
+                               continue;
+                       }
+                       DBG1(DBG_KNL, "netlink write error: %s", strerror(errno));
+                       return FALSE;
+               }
+               if ((size_t)len != msg->nlmsg_len)
+               {       /* don't consult a stale errno for a short write */
+                       DBG1(DBG_KNL, "netlink write error: wrote %u of %u bytes",
+                                (u_int)len, (u_int)msg->nlmsg_len);
+                       return FALSE;
+               }
+               return TRUE;
+       }
+}
+
+/**
+ * Read a single Netlink message from socket, return 0 on error, -1 on timeout
+ *
+ * If block is TRUE, select() is used to wait for data for up to
+ * this->timeout ms (without limit if the timeout is 0); any select() result
+ * <= 0 is reported as timeout.  If block is FALSE the socket is read with
+ * MSG_DONTWAIT.
+ *
+ * @param this         socket to read from
+ * @param buf          buffer to receive the message
+ * @param buflen       size of buf
+ * @param block                TRUE to wait for data, FALSE to return immediately
+ * @return                     number of bytes received, 0 on error or if the message
+ *                                     does not fit into buf, -1 on timeout
+ */
+static ssize_t read_msg(private_netlink_socket_t *this,
+                                               char *buf, size_t buflen, bool block)
+{
+       ssize_t len;
+
+       if (block)
+       {
+               fd_set set;
+               timeval_t tv = {};
+
+               FD_ZERO(&set);
+               FD_SET(this->socket, &set);
+               timeval_add_ms(&tv, this->timeout);
+
+               if (select(this->socket + 1, &set, NULL, NULL,
+                                  this->timeout ? &tv : NULL) <= 0)
+               {
+                       return -1;
+               }
+       }
+       len = recv(this->socket, buf, buflen, MSG_TRUNC|(block ? 0 : MSG_DONTWAIT));
+       if (len < 0)
+       {       /* has to be checked before the size comparison: compared to the
+                * unsigned buflen a negative len is converted to a huge value and
+                * every error would get reported as oversized response */
+               if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR)
+               {
+                       DBG1(DBG_KNL, "netlink read error: %s", strerror(errno));
+               }
+               return 0;
+       }
+       if ((size_t)len > buflen)
+       {       /* due to MSG_TRUNC we get the real length of a truncated message */
+               DBG1(DBG_KNL, "netlink response exceeds buffer size");
+               return 0;
+       }
+       return len;
+}
+
+/**
+ * Queue received response message
+ *
+ * Looks up the entry waiting for the sequence number of the message and
+ * appends a copy of the message to it.  The entry is marked complete and its
+ * waiting thread signaled if the message is NLMSG_DONE or not part of a
+ * multi-part response.  The mutex is acquired here; as it is recursive this
+ * also works if the caller already holds it.
+ *
+ * @param this         socket the message was received on
+ * @param buf          received message, nlmsg_len bytes are copied
+ * @return                     TRUE if an entry was waiting for this message, FALSE if
+ *                                     the sequence number is unknown
+ */
+static bool queue(private_netlink_socket_t *this, struct nlmsghdr *buf)
+{
+       struct nlmsghdr *hdr;
+       entry_t *entry;
+       uintptr_t seq;
+
+       seq = (uintptr_t)buf->nlmsg_seq;
+
+       this->mutex->lock(this->mutex);
+       entry = this->entries->get(this->entries, (void*)seq);
+       if (entry)
+       {
+               hdr = malloc(buf->nlmsg_len);
+               memcpy(hdr, buf, buf->nlmsg_len);
+               array_insert(entry->hdrs, ARRAY_TAIL, hdr);
+               if (hdr->nlmsg_type == NLMSG_DONE || !(hdr->nlmsg_flags & NLM_F_MULTI))
+               {
+                       entry->complete = TRUE;
+                       entry->condvar->signal(entry->condvar);
+               }
+       }
+       else
+       {       /* %u expects an u_int, seq is wider on 64-bit platforms */
+               DBG1(DBG_KNL, "received unknown netlink seq %u, ignored",
+                        (u_int)seq);
+       }
+       this->mutex->unlock(this->mutex);
+
+       return entry != NULL;
+}
+
+/**
+ * Read and queue response message, optionally blocking, returns TRUE on timeout
+ *
+ * Reads once into a stack buffer of this->buflen bytes and queues every
+ * complete message found in it.  Processing of the buffer stops at the first
+ * message nobody is waiting for.  Read errors are not reported to the caller,
+ * FALSE is returned for them as well.
+ *
+ * NOTE(review): buf is a plain char array that is accessed as struct
+ * nlmsghdr, while netlink_buf_t in the header enforces an alignment for the
+ * same purpose - confirm this is fine on all supported platforms.
+ */
+static bool read_and_queue(private_netlink_socket_t *this, bool block)
+{
+       struct nlmsghdr *hdr;
+       char buf[this->buflen];
+       ssize_t len;
+
+       len = read_msg(this, buf, sizeof(buf), block);
+       if (len == -1)
+       {
+               return TRUE;
+       }
+       if (len)
+       {
+               hdr = (struct nlmsghdr*)buf;
+               while (NLMSG_OK(hdr, len))
+               {
+                       if (!queue(this, hdr))
+                       {
+                               break;
+                       }
+                       hdr = NLMSG_NEXT(hdr, len);
+               }
+       }
+       return FALSE;
+}
+
+/**
+ * Watcher callback registered for parallel sockets: on WATCHER_READ a
+ * non-blocking read is done and the received messages are queued.  Always
+ * returns TRUE (presumably to stay registered - confirm with watcher_cb_t).
+ */
+CALLBACK(watch, bool,
+       private_netlink_socket_t *this, int fd, watcher_event_t event)
+{
+       if (event == WATCHER_READ)
+       {
+               read_and_queue(this, FALSE);
+       }
+       return TRUE;
+}
+
+/**
+ * Send a netlink request, try once
+ *
+ * Sets sequence number and PID of the request, writes it and waits until the
+ * response registered under that sequence number is complete or a timeout
+ * occurs.  The mutex is held from before the write until the entry has been
+ * removed again (it is released while waiting on the condvar), so the entry
+ * is in place before queue() is able to look it up.
+ *
+ * @param this         socket to use
+ * @param in           request, nlmsg_seq and nlmsg_pid get overwritten
+ * @param seq          sequence number to use for the request
+ * @param out          on SUCCESS, receives the response messages joined into one
+ *                                     buffer via chunk_cat()
+ * @param out_len      on SUCCESS, receives the length of that buffer
+ * @return                     SUCCESS, FAILED if writing the request failed, OUT_OF_RES
+ *                                     if the response was not complete before the timeout
+ *
+ * NOTE(review): the messages are joined using their exact nlmsg_len, no
+ * padding is added between them; presumably users iterating the result with
+ * NLMSG_NEXT() expect aligned messages - confirm the lengths always are.
+ */
+static status_t send_once(private_netlink_socket_t *this, struct nlmsghdr *in,
+                                                 uintptr_t seq, struct nlmsghdr **out, size_t *out_len)
+{
+       struct nlmsghdr *hdr;
+       chunk_t result = {};
+       entry_t *entry;
+
+       in->nlmsg_seq = seq;
+       in->nlmsg_pid = getpid();
+
+       if (this->names)
+       {
+               DBG3(DBG_KNL, "sending %N %u: %b", this->names, in->nlmsg_type,
+                        (u_int)seq, in, in->nlmsg_len);
+       }
+
+       this->mutex->lock(this->mutex);
+       if (!write_msg(this, in))
+       {
+               this->mutex->unlock(this->mutex);
+               return FAILED;
+       }
+
+       INIT(entry,
+               .condvar = condvar_create(CONDVAR_TYPE_DEFAULT),
+               .hdrs = array_create(0, 0),
+       );
+       this->entries->put(this->entries, (void*)seq, entry);
+
+       while (!entry->complete)
+       {
+               if (this->parallel &&
+                       lib->watcher->get_state(lib->watcher) == WATCHER_RUNNING)
+               {
+                       if (this->timeout)
+                       {
+                               if (entry->condvar->timed_wait(entry->condvar, this->mutex,
+                                                                                          this->timeout))
+                               {
+                                       break;
+                               }
+                       }
+                       else
+                       {
+                               entry->condvar->wait(entry->condvar, this->mutex);
+                       }
+               }
+               else
+               {       /* During (de-)initialization, no watcher thread is active.
+                        * collect responses ourselves. The same is done for sockets
+                        * that are not used in parallel mode. A TRUE result means the
+                        * blocking read timed out. */
+                       if (read_and_queue(this, TRUE))
+                       {
+                               break;
+                       }
+               }
+       }
+       this->entries->remove(this->entries, (void*)seq);
+
+       this->mutex->unlock(this->mutex);
+
+       if (!entry->complete)
+       {       /* timeout, messages collected so far are discarded with the entry */
+               destroy_entry(entry);
+               return OUT_OF_RES;
+       }
+
+       while (array_remove(entry->hdrs, ARRAY_HEAD, &hdr))
+       {
+               if (this->names)
+               {
+                       DBG3(DBG_KNL, "received %N %u: %b", this->names, hdr->nlmsg_type,
+                                hdr->nlmsg_seq, hdr, hdr->nlmsg_len);
+               }
+               result = chunk_cat("mm", result,
+                                                  chunk_create((char*)hdr, hdr->nlmsg_len));
+       }
+       destroy_entry(entry);
+
+       *out_len = result.len;
+       *out = (struct nlmsghdr*)result.ptr;
+
+       return SUCCESS;
+}
+
+/**
+ * Clear errors a retransmitted request might provoke if the original request
+ * was actually processed: -EEXIST for requests that create something and
+ * -ENOENT for requests that delete something.
+ *
+ * Message types are only compared within their own protocol, hence the
+ * request is classified per protocol first.  Errors of all other requests
+ * and protocols are left untouched.
+ */
+static void ignore_retransmit_error(private_netlink_socket_t *this,
+                                                                       struct nlmsgerr *err, int type)
+{
+       bool creates = FALSE, deletes = FALSE;
+
+       if (this->protocol == NETLINK_XFRM)
+       {
+               creates = type == XFRM_MSG_NEWPOLICY || type == XFRM_MSG_NEWSA;
+               deletes = type == XFRM_MSG_DELPOLICY || type == XFRM_MSG_DELSA;
+       }
+       else if (this->protocol == NETLINK_ROUTE)
+       {
+               creates = type == RTM_NEWADDR || type == RTM_NEWLINK ||
+                                 type == RTM_NEWNEIGH || type == RTM_NEWROUTE ||
+                                 type == RTM_NEWRULE;
+               deletes = type == RTM_DELADDR || type == RTM_DELLINK ||
+                                 type == RTM_DELNEIGH || type == RTM_DELROUTE ||
+                                 type == RTM_DELRULE;
+       }
+
+       if ((creates && err->error == -EEXIST) ||
+               (deletes && err->error == -ENOENT))
+       {
+               err->error = 0;
+       }
+}
+
+/**
+ * Implementation of netlink_socket_t.send.
+ *
+ * A single sequence number is allocated and used for all attempts.  An
+ * attempt that timed out (OUT_OF_RES from send_once()) is repeated until
+ * this->retries retransmits have been sent, any other failure is returned
+ * right away.  If the response starts with an NLMSG_ERROR of -EBUSY it is
+ * freed and the request sent again without counting as a retry: try is
+ * decremented before the loop increments it again (try is unsigned, so this
+ * works for 0 too).  If enabled, errors that might be caused by a
+ * retransmit are cleared in the first message of responses to retransmitted
+ * requests.
+ *
+ * On SUCCESS *out and *out_len describe the buffer returned by send_once();
+ * OUT_OF_RES is returned if all attempts timed out.
+ */
+METHOD(netlink_socket_t, netlink_send, status_t,
+       private_netlink_socket_t *this, struct nlmsghdr *in, struct nlmsghdr **out,
+       size_t *out_len)
+{
+       uintptr_t seq;
+       u_int try;
+
+       seq = ref_get(&this->seq);
+
+       for (try = 0; try <= this->retries; ++try)
+       {
+               struct nlmsghdr *hdr;
+               status_t status;
+               size_t len;
+
+               if (try > 0)
+               {
+                       DBG1(DBG_KNL, "retransmitting Netlink request (%u/%u)",
+                                try, this->retries);
+               }
+               status = send_once(this, in, seq, &hdr, &len);
+               switch (status)
+               {
+                       case SUCCESS:
+                               break;
+                       case OUT_OF_RES:
+                               continue;
+                       default:
+                               return status;
+               }
+               if (hdr->nlmsg_type == NLMSG_ERROR)
+               {
+                       struct nlmsgerr* err;
+
+                       err = NLMSG_DATA(hdr);
+                       if (err->error == -EBUSY)
+                       {
+                               free(hdr);
+                               try--;
+                               continue;
+                       }
+                       if (this->ignore_retransmit_errors && try > 0)
+                       {
+                               ignore_retransmit_error(this, err, in->nlmsg_type);
+                       }
+               }
+               *out = hdr;
+               *out_len = len;
+               return SUCCESS;
+       }
+       DBG1(DBG_KNL, "Netlink request timed out after %u retransmits",
+                this->retries);
+       return OUT_OF_RES;
+}
+
+/**
+ * Implementation of netlink_socket_t.send_ack.
+ *
+ * Sends the request and searches the response for the first NLMSG_ERROR
+ * message, whose error code determines the result: SUCCESS for 0,
+ * ALREADY_DONE for EEXIST, NOT_FOUND for ESRCH and FAILED (logged) for
+ * everything else.  FAILED is also returned if sending failed or if the
+ * response ends, or NLMSG_DONE is seen, before any NLMSG_ERROR message.
+ * The response buffer is freed in all cases.
+ */
+METHOD(netlink_socket_t, netlink_send_ack, status_t,
+       private_netlink_socket_t *this, struct nlmsghdr *in)
+{
+       struct nlmsghdr *response, *cur;
+       struct nlmsgerr *ack;
+       status_t result;
+       size_t remaining;
+
+       if (netlink_send(this, in, &response, &remaining) != SUCCESS)
+       {
+               return FAILED;
+       }
+       for (cur = response; NLMSG_OK(cur, remaining);
+                cur = NLMSG_NEXT(cur, remaining))
+       {
+               if (cur->nlmsg_type == NLMSG_DONE)
+               {
+                       break;
+               }
+               if (cur->nlmsg_type != NLMSG_ERROR)
+               {
+                       continue;
+               }
+               ack = NLMSG_DATA(cur);
+               if (!ack->error)
+               {
+                       result = SUCCESS;
+               }
+               else if (-ack->error == EEXIST)
+               {       /* do not report existing routes */
+                       result = ALREADY_DONE;
+               }
+               else if (-ack->error == ESRCH)
+               {       /* do not report missing entries */
+                       result = NOT_FOUND;
+               }
+               else
+               {       /* log before the buffer the error points into is released */
+                       DBG1(DBG_KNL, "received netlink error: %s (%d)",
+                                strerror(-ack->error), -ack->error);
+                       result = FAILED;
+               }
+               free(response);
+               return result;
+       }
+       DBG1(DBG_KNL, "netlink request not acknowledged");
+       free(response);
+       return FAILED;
+}
+
+/**
+ * Implementation of netlink_socket_t.destroy.
+ *
+ * Also used by netlink_socket_create() to clean up after errors, so a socket
+ * of -1 is tolerated.  The socket is only removed from the watcher if it is
+ * used in parallel mode, as only then it gets added to it.  The entries
+ * table is destroyed without releasing any entries it might still contain.
+ */
+METHOD(netlink_socket_t, destroy, void,
+       private_netlink_socket_t *this)
+{
+       if (this->socket != -1)
+       {
+               if (this->parallel)
+               {
+                       lib->watcher->remove(lib->watcher, this->socket);
+               }
+               close(this->socket);
+       }
+       this->entries->destroy(this->entries);
+       this->mutex->destroy(this->mutex);
+       free(this);
+}
+
+/**
+ * Described in header.
+ *
+ * Buffer size, timeout, number of retries and the handling of retransmit
+ * errors are read from the %s.plugins.kernel-netlink section of the
+ * settings (with lib->ns as argument), all defaulting to 0/FALSE.  An unset
+ * buffer size is replaced by the page size (4096 if it can't be determined),
+ * limited to 8192 bytes.  The socket is bound to an address that only has
+ * the address family set and, in parallel mode, registered with the watcher
+ * for read events.  NULL is returned if creating or binding the socket fails.
+ */
+netlink_socket_t *netlink_socket_create(int protocol, enum_name_t *names,
+                                                                               bool parallel)
+{
+       private_netlink_socket_t *this;
+       struct sockaddr_nl addr = {
+               .nl_family = AF_NETLINK,
+       };
+
+       INIT(this,
+               .public = {
+                       .send = _netlink_send,
+                       .send_ack = _netlink_send_ack,
+                       .destroy = _destroy,
+               },
+               .seq = 200,
+               .mutex = mutex_create(MUTEX_TYPE_RECURSIVE),
+               .socket = socket(AF_NETLINK, SOCK_RAW, protocol),
+               .entries = hashtable_create(hashtable_hash_ptr, hashtable_equals_ptr, 4),
+               .protocol = protocol,
+               .names = names,
+               .buflen = lib->settings->get_int(lib->settings,
+                                                       "%s.plugins.kernel-netlink.buflen", 0, lib->ns),
+               .timeout = lib->settings->get_int(lib->settings,
+                                                       "%s.plugins.kernel-netlink.timeout", 0, lib->ns),
+               .retries = lib->settings->get_int(lib->settings,
+                                                       "%s.plugins.kernel-netlink.retries", 0, lib->ns),
+               .ignore_retransmit_errors = lib->settings->get_bool(lib->settings,
+                                                       "%s.plugins.kernel-netlink.ignore_retransmit_errors",
+                                                       FALSE, lib->ns),
+               .parallel = parallel,
+       );
+
+       if (!this->buflen)
+       {
+               long pagesize = sysconf(_SC_PAGESIZE);
+               if (pagesize == -1)
+               {
+                       pagesize = 4096;
+               }
+               /* base this on NLMSG_GOODSIZE */
+               this->buflen = min(pagesize, 8192);
+       }
+       if (this->socket == -1)
+       {
+               DBG1(DBG_KNL, "unable to create netlink socket");
+               destroy(this);
+               return NULL;
+       }
+       if (bind(this->socket, (struct sockaddr*)&addr, sizeof(addr)))
+       {
+               DBG1(DBG_KNL, "unable to bind netlink socket");
+               destroy(this);
+               return NULL;
+       }
+       if (this->parallel)
+       {
+               lib->watcher->add(lib->watcher, this->socket, WATCHER_READ, watch, this);
+       }
+
+       return &this->public;
+}
+
+/**
+ * Described in header.
+ *
+ * The attribute is placed at the aligned end of the current message.  If it
+ * does not fit into buflen the message is not modified and only an error is
+ * logged.
+ */
+void netlink_add_attribute(struct nlmsghdr *hdr, int rta_type, chunk_t data,
+                                                 size_t buflen)
+{
+       size_t offset = NLMSG_ALIGN(hdr->nlmsg_len);
+       struct rtattr *attr;
+
+       if (offset + RTA_LENGTH(data.len) > buflen)
+       {
+               DBG1(DBG_KNL, "unable to add attribute, buffer too small");
+               return;
+       }
+
+       attr = (struct rtattr*)((char*)hdr + offset);
+       attr->rta_len = RTA_LENGTH(data.len);
+       attr->rta_type = rta_type;
+       memcpy(RTA_DATA(attr), data.ptr, data.len);
+       /* the message now ends right after the attribute's payload */
+       hdr->nlmsg_len = offset + attr->rta_len;
+}
+
+/**
+ * Described in header.
+ *
+ * Appends an attribute header of the given type at the aligned end of the
+ * message and accounts for len bytes of payload, which the caller has to
+ * fill in via the returned pointer.  Nothing is changed if the attribute
+ * does not fit into the buffer, or if buflen or len is negative.
+ */
+void* netlink_reserve(struct nlmsghdr *hdr, int buflen, int type, int len)
+{
+       struct rtattr *rta;
+
+       /* negative values would get converted to huge unsigned values in the
+        * size comparison, so reject them explicitly */
+       if (buflen < 0 || len < 0 ||
+               NLMSG_ALIGN(hdr->nlmsg_len) + RTA_LENGTH(len) > (size_t)buflen)
+       {
+               DBG1(DBG_KNL, "unable to add attribute, buffer too small");
+               return NULL;
+       }
+
+       /* use char* like netlink_add_attribute(), arithmetic on void* is a
+        * compiler extension */
+       rta = (struct rtattr*)(((char*)hdr) + NLMSG_ALIGN(hdr->nlmsg_len));
+       rta->rta_type = type;
+       rta->rta_len = RTA_LENGTH(len);
+       hdr->nlmsg_len = NLMSG_ALIGN(hdr->nlmsg_len) + rta->rta_len;
+
+       return RTA_DATA(rta);
+}
diff --git a/src/libcharon/plugins/kernel_netlink/kernel_netlink_shared.h b/src/libcharon/plugins/kernel_netlink/kernel_netlink_shared.h
new file mode 100644 (file)
index 0000000..b034326
--- /dev/null
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2008 Tobias Brunner
+ * Hochschule fuer Technik Rapperswil
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#ifndef KERNEL_NETLINK_SHARED_H_
+#define KERNEL_NETLINK_SHARED_H_
+
+#include <library.h>
+
+#include <linux/rtnetlink.h>
+
+/**
+ * Default buffer size.
+ *
+ * 1024 bytes are currently sufficient for all operations.
+ */
+#ifndef KERNEL_NETLINK_BUFSIZE
+#define KERNEL_NETLINK_BUFSIZE 1024
+#endif
+
+/**
+ * General purpose netlink buffer.
+ *
+ * Some platforms require an enforced alignment to four bytes (e.g. ARM).
+ *
+ * The union allows accessing the start of the buffer as Netlink message
+ * header, the buffer is KERNEL_NETLINK_BUFSIZE bytes in total.
+ */
+typedef union {
+       struct nlmsghdr hdr;
+       u_char bytes[KERNEL_NETLINK_BUFSIZE];
+} netlink_buf_t __attribute__((aligned(RTA_ALIGNTO)));
+
+typedef struct netlink_socket_t netlink_socket_t;
+
+/**
+ * Wrapper around a netlink socket.
+ */
+struct netlink_socket_t {
+
+       /**
+        * Send a netlink message and wait for a reply.
+        *
+        * Sequence number and PID of the message are set by the implementation.
+        * The reply may consist of several messages in one allocated buffer,
+        * which has to be released with free().
+        *
+        * @param       in              netlink message to send
+        * @param       out     received netlink message
+        * @param       out_len length of the received message
+        * @return                      SUCCESS if a reply was received, OUT_OF_RES if the
+        *                                      request timed out, other values on failure
+        */
+       status_t (*send)(netlink_socket_t *this, struct nlmsghdr *in,
+                                        struct nlmsghdr **out, size_t *out_len);
+
+       /**
+        * Send a netlink message and wait for its acknowledge.
+        *
+        * @param       in              netlink message to send
+        * @return                      SUCCESS if acknowledged without error, ALREADY_DONE
+        *                                      if the kernel reported EEXIST, NOT_FOUND if it
+        *                                      reported ESRCH, FAILED otherwise
+        */
+       status_t (*send_ack)(netlink_socket_t *this, struct nlmsghdr *in);
+
+       /**
+        * Destroy the socket.
+        */
+       void (*destroy)(netlink_socket_t *this);
+};
+
+/**
+ * Create a netlink_socket_t object.
+ *
+ * @param protocol     protocol type (e.g. NETLINK_XFRM or NETLINK_ROUTE)
+ * @param names                optional enum names for Netlink messages
+ * @param parallel     support parallel queries on this Netlink socket
+ * @return                     socket wrapper, NULL if the socket can't be created or
+ *                                     bound
+ */
+netlink_socket_t *netlink_socket_create(int protocol, enum_name_t *names,
+                                                                               bool parallel);
+
+/**
+ * Creates an rtattr and adds it to the given netlink message.
+ *
+ * If the attribute does not fit into the buffer the message is left
+ * unchanged and an error is logged, the caller is not notified otherwise.
+ *
+ * @param hdr                  netlink message
+ * @param rta_type             type of the rtattr
+ * @param data                 data to add to the rtattr
+ * @param buflen               length of the netlink message buffer
+ */
+void netlink_add_attribute(struct nlmsghdr *hdr, int rta_type, chunk_t data,
+                                                  size_t buflen);
+
+/**
+ * Reserve space in a netlink message for given size and type, returning buffer.
+ *
+ * Only the attribute header is written and the message length adjusted, the
+ * attribute data has to be written by the caller to the returned buffer.
+ *
+ * @param hdr                  netlink message
+ * @param buflen               size of full netlink buffer
+ * @param type                 RTA type
+ * @param len                  length of RTA data
+ * @return                             buffer to len bytes of attribute data, NULL on error
+ */
+void* netlink_reserve(struct nlmsghdr *hdr, int buflen, int type, int len);
+
+#endif /* KERNEL_NETLINK_SHARED_H_ */
diff --git a/src/libcharon/plugins/kernel_netlink/suites/test_socket.c b/src/libcharon/plugins/kernel_netlink/suites/test_socket.c
new file mode 100644 (file)
index 0000000..3e8facd
--- /dev/null
@@ -0,0 +1,302 @@
+/*
+ * Copyright (C) 2014 Martin Willi
+ * Copyright (C) 2014 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include <test_suite.h>
+
+#include <threading/thread.h>
+
+#include "../kernel_netlink_shared.h"
+
+/**
+ * Netlink message drop configuration
+ *
+ * 0 disables dropping. With a value n > 0, netlink_msg_loss() reports one
+ * out of every n consecutive messages as lost, so 1 drops all of them.
+ */
+static int drop_interval = 0;
+
+/**
+ * Netlink message drop hook
+ *
+ * Decides from drop_interval and a call counter, which is only advanced
+ * while dropping is enabled, whether a message is reported as lost.
+ *
+ * @param hdr          message in question, not inspected
+ * @return                     TRUE if the message is to be dropped
+ */
+bool netlink_msg_loss(struct nlmsghdr *hdr)
+{
+       static refcount_t calls;
+
+       if (!drop_interval)
+       {       /* dropping disabled, counter stays untouched */
+               return FALSE;
+       }
+       return ref_get(&calls) % drop_interval == drop_interval - 1;
+}
+
+/**
+ * Send a single RTM_GETROUTE request for 127.0.0.1 and expect an
+ * RTM_NEWROUTE message as reply. The loop index decides whether the socket
+ * is created with support for parallel queries.
+ */
+START_TEST(test_echo)
+{
+       netlink_buf_t request = {
+               .hdr = {
+                       .nlmsg_type = RTM_GETROUTE,
+                       .nlmsg_flags = NLM_F_REQUEST,
+                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
+               },
+       };
+       char loopback[] = { 127, 0, 0, 1 };
+       struct rtmsg *route = NLMSG_DATA(&request.hdr);
+       bool parallel = _i != 0;
+       struct nlmsghdr *out;
+       netlink_socket_t *s;
+       size_t len;
+
+       /* IPv4 route lookup with the loopback address as destination */
+       route->rtm_family = AF_INET;
+       netlink_add_attribute(&request.hdr, RTA_DST,
+                                                 chunk_from_thing(loopback), sizeof(request));
+
+       s = netlink_socket_create(NETLINK_ROUTE, NULL, parallel);
+
+       ck_assert(s->send(s, &request.hdr, &out, &len) == SUCCESS);
+       ck_assert_int_eq(out->nlmsg_type, RTM_NEWROUTE);
+       free(out);
+       s->destroy(s);
+}
+END_TEST
+
+/**
+ * Send an RTM_GETLINK request with NLM_F_MATCH | NLM_F_ROOT and verify that
+ * the reply is a sequence of RTM_NEWLINK messages ended by NLMSG_DONE.
+ */
+START_TEST(test_echo_dump)
+{
+       netlink_buf_t request = {
+               .hdr = {
+                       .nlmsg_type = RTM_GETLINK,
+                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_MATCH | NLM_F_ROOT,
+                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
+               },
+       };
+       struct rtgenmsg *gen = NLMSG_DATA(&request.hdr);
+       struct nlmsghdr *out, *current;
+       netlink_socket_t *s;
+       size_t len;
+
+       gen->rtgen_family = AF_UNSPEC;
+       s = netlink_socket_create(NETLINK_ROUTE, NULL, _i != 0);
+
+       ck_assert(s->send(s, &request.hdr, &out, &len) == SUCCESS);
+       /* walk the reply, len is passed to NLMSG_NEXT() on every step */
+       for (current = out; ; current = NLMSG_NEXT(current, len))
+       {
+               ck_assert(NLMSG_OK(current, len));
+               if (current->nlmsg_type == NLMSG_DONE)
+               {
+                       break;
+               }
+               ck_assert_int_eq(current->nlmsg_type, RTM_NEWLINK);
+       }
+       free(out);
+       s->destroy(s);
+}
+END_TEST
+
+/**
+ * Thread body used by test_stress: send ten RTM_GETROUTE requests for
+ * 127.0.0.1 over the given socket and expect an RTM_NEWROUTE reply to each.
+ *
+ * @param s            netlink socket to send on, test_stress hands the same
+ *                                     socket to all of its threads
+ * @return                     always NULL
+ */
+CALLBACK(stress, void*,
+       netlink_socket_t *s)
+{
+       struct nlmsghdr *out;
+       struct rtmsg *msg;
+       char dst[] = {
+               127,0,0,1
+       };
+       size_t len;
+       int i;
+       netlink_buf_t request = {
+               .hdr = {
+                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
+                       .nlmsg_flags = NLM_F_REQUEST,
+                       .nlmsg_type = RTM_GETROUTE,
+               },
+       };
+
+       /* build the request once: netlink_add_attribute() adds to the message,
+        * so calling it in the loop would put one more RTA_DST attribute into
+        * the request with every iteration */
+       msg = NLMSG_DATA(&request.hdr);
+       msg->rtm_family = AF_INET;
+       netlink_add_attribute(&request.hdr, RTA_DST,
+                                                 chunk_from_thing(dst), sizeof(request));
+
+       for (i = 0; i < 10; i++)
+       {
+               ck_assert(s->send(s, &request.hdr, &out, &len) == SUCCESS);
+               ck_assert_int_eq(out->nlmsg_type, RTM_NEWROUTE);
+               free(out);
+       }
+       return NULL;
+}
+
+/**
+ * Thread body used by test_stress_dump: send the same RTM_GETLINK request
+ * ten times and verify each reply is a sequence of RTM_NEWLINK messages
+ * ended by NLMSG_DONE.
+ *
+ * @param s            netlink socket to send on
+ * @return                     always NULL
+ */
+CALLBACK(stress_dump, void*,
+       netlink_socket_t *s)
+{
+       netlink_buf_t request = {
+               .hdr = {
+                       .nlmsg_type = RTM_GETLINK,
+                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_MATCH | NLM_F_ROOT,
+                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
+               },
+       };
+       struct rtgenmsg *gen = NLMSG_DATA(&request.hdr);
+       struct nlmsghdr *out, *current;
+       size_t len;
+       int run;
+
+       gen->rtgen_family = AF_UNSPEC;
+
+       for (run = 0; run < 10; run++)
+       {
+               ck_assert(s->send(s, &request.hdr, &out, &len) == SUCCESS);
+               /* walk the reply, len is passed to NLMSG_NEXT() on every step */
+               for (current = out; ; current = NLMSG_NEXT(current, len))
+               {
+                       ck_assert(NLMSG_OK(current, len));
+                       if (current->nlmsg_type == NLMSG_DONE)
+                       {
+                               break;
+                       }
+                       ck_assert_int_eq(current->nlmsg_type, RTM_NEWLINK);
+               }
+               free(out);
+       }
+       return NULL;
+}
+
+/**
+ * Run the stress callback in ten threads that all use one netlink socket.
+ */
+START_TEST(test_stress)
+{
+       thread_t *workers[10];
+       netlink_socket_t *s;
+       bool parallel = _i != 0;
+       int n;
+
+       s = netlink_socket_create(NETLINK_ROUTE, NULL, parallel);
+       /* all workers are started before the first one is joined */
+       for (n = 0; n < countof(workers); n++)
+       {
+               workers[n] = thread_create(stress, s);
+       }
+       for (n = 0; n < countof(workers); n++)
+       {
+               thread_t *worker = workers[n];
+
+               worker->join(worker);
+       }
+       s->destroy(s);
+}
+END_TEST
+
+/**
+ * Run the stress_dump callback in ten threads that all use one netlink
+ * socket.
+ */
+START_TEST(test_stress_dump)
+{
+       thread_t *workers[10];
+       netlink_socket_t *s;
+       bool parallel = _i != 0;
+       int n;
+
+       s = netlink_socket_create(NETLINK_ROUTE, NULL, parallel);
+       /* all workers are started before the first one is joined */
+       for (n = 0; n < countof(workers); n++)
+       {
+               workers[n] = thread_create(stress_dump, s);
+       }
+       for (n = 0; n < countof(workers); n++)
+       {
+               thread_t *worker = workers[n];
+
+               worker->join(worker);
+       }
+       s->destroy(s);
+}
+END_TEST
+
+/**
+ * With drop_interval set to 2 and a single configured retry, an RTM_GETLINK
+ * request is expected to succeed.
+ */
+START_TEST(test_retransmit_success)
+{
+       netlink_buf_t request = {
+               .hdr = {
+                       .nlmsg_type = RTM_GETLINK,
+                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_MATCH | NLM_F_ROOT,
+                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
+               },
+       };
+       struct rtgenmsg *gen = NLMSG_DATA(&request.hdr);
+       struct nlmsghdr *out;
+       netlink_socket_t *s;
+       size_t len;
+
+       gen->rtgen_family = AF_UNSPEC;
+
+       /* enable message loss and configure retransmission before the socket
+        * gets created */
+       drop_interval = 2;
+       lib->settings->set_int(lib->settings,
+                                                       "%s.plugins.kernel-netlink.timeout", 100, lib->ns);
+       lib->settings->set_int(lib->settings,
+                                                       "%s.plugins.kernel-netlink.retries", 1, lib->ns);
+
+       s = netlink_socket_create(NETLINK_ROUTE, NULL, _i != 0);
+
+       ck_assert(s->send(s, &request.hdr, &out, &len) == SUCCESS);
+       free(out);
+       s->destroy(s);
+
+       /* disable message loss again for the tests that follow */
+       drop_interval = 0;
+}
+END_TEST
+
+/**
+ * With drop_interval set to 1, i.e. every message reported as lost, an
+ * RTM_GETLINK request is expected to fail with OUT_OF_RES even though three
+ * retries are configured.
+ */
+START_TEST(test_retransmit_fail)
+{
+       netlink_buf_t request = {
+               .hdr = {
+                       .nlmsg_type = RTM_GETLINK,
+                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_MATCH | NLM_F_ROOT,
+                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
+               },
+       };
+       struct rtgenmsg *gen = NLMSG_DATA(&request.hdr);
+       struct nlmsghdr *out;
+       netlink_socket_t *s;
+       size_t len;
+
+       gen->rtgen_family = AF_UNSPEC;
+
+       /* enable message loss and configure retransmission before the socket
+        * gets created */
+       drop_interval = 1;
+       lib->settings->set_int(lib->settings,
+                                                       "%s.plugins.kernel-netlink.timeout", 50, lib->ns);
+       lib->settings->set_int(lib->settings,
+                                                       "%s.plugins.kernel-netlink.retries", 3, lib->ns);
+
+       s = netlink_socket_create(NETLINK_ROUTE, NULL, _i != 0);
+
+       /* out is not freed here, as it was in the original test */
+       ck_assert(s->send(s, &request.hdr, &out, &len) == OUT_OF_RES);
+       s->destroy(s);
+
+       /* disable message loss again for the tests that follow */
+       drop_interval = 0;
+}
+END_TEST
+
+/**
+ * Build the "netlink socket" suite from the echo, stress and retransmit test
+ * cases. Every test is registered as loop test with start 0 and end 2.
+ *
+ * @return                     the assembled suite
+ */
+Suite *socket_suite_create()
+{
+       Suite *suite = suite_create("netlink socket");
+       TCase *group;
+
+       /* single request/response exchanges */
+       group = tcase_create("echo");
+       tcase_add_loop_test(group, test_echo, 0, 2);
+       tcase_add_loop_test(group, test_echo_dump, 0, 2);
+       suite_add_tcase(suite, group);
+
+       /* several threads using one socket */
+       group = tcase_create("stress");
+       tcase_add_loop_test(group, test_stress, 0, 2);
+       tcase_add_loop_test(group, test_stress_dump, 0, 2);
+       suite_add_tcase(suite, group);
+
+       /* behavior with messages reported as lost */
+       group = tcase_create("retransmit");
+       tcase_add_loop_test(group, test_retransmit_success, 0, 2);
+       tcase_add_loop_test(group, test_retransmit_fail, 0, 2);
+       suite_add_tcase(suite, group);
+
+       return suite;
+}
diff --git a/src/libcharon/plugins/kernel_netlink/tests.c b/src/libcharon/plugins/kernel_netlink/tests.c
new file mode 100644 (file)
index 0000000..52985b4
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2014 Martin Willi
+ * Copyright (C) 2014 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include <test_runner.h>
+
+#include <hydra.h>
+
+/* declare test suite constructors: tests.h lists the suites as TEST_SUITE()
+ * entries, with this definition each entry expands to a declaration */
+#define TEST_SUITE(x) test_suite_t* x();
+#include "tests.h"
+#undef TEST_SUITE
+
+/* the same list included a second time, now expanded to table entries; the
+ * table passed to test_runner_run() ends with a NULL suite */
+static test_configuration_t tests[] = {
+#define TEST_SUITE(x) \
+       { .suite = x, },
+#include "tests.h"
+       { .suite = NULL, }
+};
+
+/**
+ * Callback handed to test_runner_run().
+ *
+ * @param init         TRUE: let the processor use 8 threads,
+ *                                     FALSE: set it to 0 threads and cancel it
+ * @return                     always TRUE
+ */
+static bool test_runner_init(bool init)
+{
+       if (!init)
+       {
+               lib->processor->set_threads(lib->processor, 0);
+               lib->processor->cancel(lib->processor);
+               return TRUE;
+       }
+       lib->processor->set_threads(lib->processor, 8);
+       return TRUE;
+}
+
+/**
+ * Run the suites listed in tests[] under the name "kernel-netlink" and use
+ * the runner's result as exit status. Command line arguments are not used.
+ */
+int main(int argc, char *argv[])
+{
+       int result;
+
+       result = test_runner_run("kernel-netlink", tests, test_runner_init);
+       return result;
+}
diff --git a/src/libcharon/plugins/kernel_netlink/tests.h b/src/libcharon/plugins/kernel_netlink/tests.h
new file mode 100644 (file)
index 0000000..2b6715a
--- /dev/null
@@ -0,0 +1,16 @@
+/*
+ * Copyright (C) 2014 Martin Willi
+ * Copyright (C) 2014 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+/* one TEST_SUITE() entry per suite constructor; TEST_SUITE() itself has to
+ * be defined by the file including this list */
+TEST_SUITE(socket_suite_create)
diff --git a/src/libcharon/plugins/kernel_pfkey/Makefile.am b/src/libcharon/plugins/kernel_pfkey/Makefile.am
new file mode 100644 (file)
index 0000000..f645528
--- /dev/null
@@ -0,0 +1,19 @@
+# Include paths for this plugin; ${linux_headers} presumably points to the
+# kernel headers chosen at configure time (TODO confirm in configure.ac).
+AM_CPPFLAGS = \
+       -I${linux_headers} \
+       -I$(top_srcdir)/src/libstrongswan \
+       -I$(top_srcdir)/src/libhydra
+
+AM_CFLAGS = \
+       $(PLUGIN_CFLAGS)
+
+# MONOLITHIC: build the library without installing it (noinst_ prefix),
+# otherwise install it via the plugin_ prefix.
+if MONOLITHIC
+noinst_LTLIBRARIES = libstrongswan-kernel-pfkey.la
+else
+plugin_LTLIBRARIES = libstrongswan-kernel-pfkey.la
+endif
+
+libstrongswan_kernel_pfkey_la_SOURCES = \
+       kernel_pfkey_plugin.h kernel_pfkey_plugin.c \
+       kernel_pfkey_ipsec.h kernel_pfkey_ipsec.c
+
+# libtool flags: create a loadable module without version information
+libstrongswan_kernel_pfkey_la_LDFLAGS = -module -avoid-version
diff --git a/src/libcharon/plugins/kernel_pfkey/kernel_pfkey_ipsec.c b/src/libcharon/plugins/kernel_pfkey/kernel_pfkey_ipsec.c
new file mode 100644 (file)
index 0000000..a2fccd1
--- /dev/null
@@ -0,0 +1,3102 @@
+/*
+ * Copyright (C) 2008-2015 Tobias Brunner
+ * Copyright (C) 2008 Andreas Steffen
+ * Hochschule fuer Technik Rapperswil
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+/*
+ * Copyright (C) 2014 Nanoteq Pty Ltd
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#ifdef __FreeBSD__
+#include <limits.h> /* for LONG_MAX */
+#endif
+
+#ifdef HAVE_NET_PFKEYV2_H
+#include <net/pfkeyv2.h>
+#else
+#include <linux/pfkeyv2.h>
+#endif
+
+#ifdef SADB_X_EXT_NAT_T_TYPE
+#define HAVE_NATT
+#endif
+
+#ifdef HAVE_NETIPSEC_IPSEC_H
+#include <netipsec/ipsec.h>
+#elif defined(HAVE_NETINET6_IPSEC_H)
+#include <netinet6/ipsec.h>
+#else
+#include <linux/ipsec.h>
+#endif
+
+#ifdef HAVE_NATT
+#ifdef HAVE_LINUX_UDP_H
+#include <linux/udp.h>
+#else
+#include <netinet/udp.h>
+#endif /*HAVE_LINUX_UDP_H*/
+#endif /*HAVE_NATT*/
+
+#include <unistd.h>
+#include <time.h>
+#include <errno.h>
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#endif
+
+#include "kernel_pfkey_ipsec.h"
+
+#include <hydra.h>
+#include <utils/debug.h>
+#include <networking/host.h>
+#include <collections/linked_list.h>
+#include <collections/hashtable.h>
+#include <threading/mutex.h>
+
+/** non linux specific: where IPPROTO_COMP is missing but IPPROTO_IPCOMP
+ * exists, make the former an alias for the latter; if neither is defined,
+ * IPPROTO_COMP stays undefined */
+#ifndef IPPROTO_COMP
+#ifdef IPPROTO_IPCOMP
+#define IPPROTO_COMP IPPROTO_IPCOMP
+#endif
+#endif
+
+/* headers without the SADB_X_AALG_SHA2_*HMAC identifiers: map them to the
+ * SADB_X_AALG_SHA2_* names; only the 256 variant is tested, the other two
+ * are assumed to be missing together with it */
+#ifndef SADB_X_AALG_SHA2_256HMAC
+#define SADB_X_AALG_SHA2_256HMAC SADB_X_AALG_SHA2_256
+#define SADB_X_AALG_SHA2_384HMAC SADB_X_AALG_SHA2_384
+#define SADB_X_AALG_SHA2_512HMAC SADB_X_AALG_SHA2_512
+#endif