kernel-netlink: Support configuring XFRM policy hashing thresholds
[strongswan.git] / src / libcharon / plugins / kernel_netlink / kernel_netlink_ipsec.c
index 6d971bd..6b06c26 100644 (file)
@@ -1,11 +1,11 @@
 /*
- * Copyright (C) 2006-2009 Tobias Brunner
+ * Copyright (C) 2006-2016 Tobias Brunner
  * Copyright (C) 2005-2009 Martin Willi
- * Copyright (C) 2008 Andreas Steffen
+ * Copyright (C) 2008-2016 Andreas Steffen
  * Copyright (C) 2006-2007 Fabian Hartmann, Noah Heusser
  * Copyright (C) 2006 Daniel Roethlisberger
  * Copyright (C) 2005 Jan Hutter
- * Hochschule fuer Technik Rapperswil
+ * HSR Hochschule fuer Technik Rapperswil
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
@@ -18,6 +18,7 @@
  * for more details.
  */
 
+#define _GNU_SOURCE
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <stdint.h>
 #include <linux/rtnetlink.h>
 #include <linux/xfrm.h>
 #include <linux/udp.h>
+#include <net/if.h>
 #include <unistd.h>
 #include <time.h>
 #include <errno.h>
 #include <string.h>
 #include <fcntl.h>
+#include <dlfcn.h>
 
 #include "kernel_netlink_ipsec.h"
 #include "kernel_netlink_shared.h"
 
-#include <hydra.h>
 #include <daemon.h>
-#include <threading/thread.h>
+#include <utils/debug.h>
 #include <threading/mutex.h>
-#include <utils/hashtable.h>
-#include <processing/jobs/callback_job.h>
+#include <threading/condvar.h>
+#include <collections/array.h>
+#include <collections/hashtable.h>
+#include <collections/linked_list.h>
 
-/** required for Linux 2.6.26 kernel and later */
+/** Required for Linux 2.6.26 kernel and later */
 #ifndef XFRM_STATE_AF_UNSPEC
-#define XFRM_STATE_AF_UNSPEC   32
+#define XFRM_STATE_AF_UNSPEC 32
 #endif
 
-/** from linux/in.h */
+/** From linux/in.h */
 #ifndef IP_XFRM_POLICY
 #define IP_XFRM_POLICY 17
 #endif
 
-/* missing on uclibc */
+/** Missing on uclibc */
 #ifndef IPV6_XFRM_POLICY
 #define IPV6_XFRM_POLICY 34
 #endif /*IPV6_XFRM_POLICY*/
 
-/** default priority of installed policies */
-#define PRIO_LOW 3000
-#define PRIO_HIGH 2000
+/* from linux/udp.h */
+#ifndef UDP_ENCAP
+#define UDP_ENCAP 100
+#endif
+
+#ifndef UDP_ENCAP_ESPINUDP
+#define UDP_ENCAP_ESPINUDP 2
+#endif
+
+/* this is not defined on some platforms */
+#ifndef SOL_UDP
+#define SOL_UDP IPPROTO_UDP
+#endif
+
+/** Base priority for installed policies */
+#define PRIO_BASE 100000
+
+/** Default lifetime of an acquire XFRM state (in seconds) */
+#define DEFAULT_ACQUIRE_LIFETIME 165
 
 /**
- * map the limit for bytes and packets to XFRM_INF per default
+ * Map the limit for bytes and packets to XFRM_INF by default
  */
 #define XFRM_LIMIT(x) ((x) == 0 ? XFRM_INF : (x))
 
 #define XFRMNLGRP(x) (1<<(XFRMNLGRP_##x-1))
 
 /**
- * returns a pointer to the first rtattr following the nlmsghdr *nlh and the
+ * Returns a pointer to the first rtattr following the nlmsghdr *nlh and the
  * 'usual' netlink data x like 'struct xfrm_usersa_info'
  */
-#define XFRM_RTA(nlh, x) ((struct rtattr*)(NLMSG_DATA(nlh) + NLMSG_ALIGN(sizeof(x))))
-/**
- * returns a pointer to the next rtattr following rta.
- * !!! do not use this to parse messages. use RTA_NEXT and RTA_OK instead !!!
- */
-#define XFRM_RTA_NEXT(rta) ((struct rtattr*)(((char*)(rta)) + RTA_ALIGN((rta)->rta_len)))
+#define XFRM_RTA(nlh, x) ((struct rtattr*)(NLMSG_DATA(nlh) + \
+                                                                                  NLMSG_ALIGN(sizeof(x))))
 /**
- * returns the total size of attached rta data
+ * Returns the total size of attached rta data
  * (after 'usual' netlink data x like 'struct xfrm_usersa_info')
  */
 #define XFRM_PAYLOAD(nlh, x) NLMSG_PAYLOAD(nlh, sizeof(x))
@@ -130,7 +146,7 @@ ENUM(xfrm_msg_names, XFRM_MSG_NEWSA, XFRM_MSG_MAPPING,
        "XFRM_MSG_MAPPING"
 );
 
-ENUM(xfrm_attr_type_names, XFRMA_UNSPEC, XFRMA_KMADDRESS,
+ENUM(xfrm_attr_type_names, XFRMA_UNSPEC, XFRMA_REPLAY_ESN_VAL,
        "XFRMA_UNSPEC",
        "XFRMA_ALG_AUTH",
        "XFRMA_ALG_CRYPT",
@@ -150,11 +166,13 @@ ENUM(xfrm_attr_type_names, XFRMA_UNSPEC, XFRMA_KMADDRESS,
        "XFRMA_POLICY_TYPE",
        "XFRMA_MIGRATE",
        "XFRMA_ALG_AEAD",
-       "XFRMA_KMADDRESS"
+       "XFRMA_KMADDRESS",
+       "XFRMA_ALG_AUTH_TRUNC",
+       "XFRMA_MARK",
+       "XFRMA_TFCPAD",
+       "XFRMA_REPLAY_ESN_VAL",
 );
 
-#define END_OF_LIST -1
-
 /**
  * Algorithms for encryption
  */
@@ -164,7 +182,7 @@ static kernel_algorithm_t encryption_algs[] = {
        {ENCR_3DES,                                     "des3_ede"                      },
 /*     {ENCR_RC5,                                      "***"                           }, */
 /*     {ENCR_IDEA,                                     "***"                           }, */
-       {ENCR_CAST,                                     "cast128"                       },
+       {ENCR_CAST,                                     "cast5"                         },
        {ENCR_BLOWFISH,                         "blowfish"                      },
 /*     {ENCR_3IDEA,                            "***"                           }, */
 /*     {ENCR_DES_IV32,                         "***"                           }, */
@@ -183,7 +201,9 @@ static kernel_algorithm_t encryption_algs[] = {
 /*     {ENCR_CAMELLIA_CCM_ICV8,        "***"                           }, */
 /*     {ENCR_CAMELLIA_CCM_ICV12,       "***"                           }, */
 /*     {ENCR_CAMELLIA_CCM_ICV16,       "***"                           }, */
-       {END_OF_LIST,                           NULL                            }
+       {ENCR_SERPENT_CBC,                      "serpent"                       },
+       {ENCR_TWOFISH_CBC,                      "twofish"                       },
+       {ENCR_CHACHA20_POLY1305,        "rfc7539esp(chacha20,poly1305)"},
 };
 
 /**
@@ -191,7 +211,9 @@ static kernel_algorithm_t encryption_algs[] = {
  */
 static kernel_algorithm_t integrity_algs[] = {
        {AUTH_HMAC_MD5_96,                      "md5"                           },
+       {AUTH_HMAC_MD5_128,                     "hmac(md5)"                     },
        {AUTH_HMAC_SHA1_96,                     "sha1"                          },
+       {AUTH_HMAC_SHA1_160,            "hmac(sha1)"            },
        {AUTH_HMAC_SHA2_256_96,         "sha256"                        },
        {AUTH_HMAC_SHA2_256_128,        "hmac(sha256)"          },
        {AUTH_HMAC_SHA2_384_192,        "hmac(sha384)"          },
@@ -199,7 +221,6 @@ static kernel_algorithm_t integrity_algs[] = {
 /*     {AUTH_DES_MAC,                          "***"                           }, */
 /*     {AUTH_KPDK_MD5,                         "***"                           }, */
        {AUTH_AES_XCBC_96,                      "xcbc(aes)"                     },
-       {END_OF_LIST,                           NULL                            }
 };
 
 /**
@@ -210,29 +231,122 @@ static kernel_algorithm_t compression_algs[] = {
        {IPCOMP_DEFLATE,                        "deflate"                       },
        {IPCOMP_LZS,                            "lzs"                           },
        {IPCOMP_LZJH,                           "lzjh"                          },
-       {END_OF_LIST,                           NULL                            }
 };
 
 /**
  * Look up a kernel algorithm name and its key size
  */
-static char* lookup_algorithm(kernel_algorithm_t *list, int ikev2)
+static char* lookup_algorithm(transform_type_t type, int ikev2)
 {
-       while (list->ikev2 != END_OF_LIST)
+       kernel_algorithm_t *list;
+       int i, count;
+       char *name;
+
+       switch (type)
+       {
+               case ENCRYPTION_ALGORITHM:
+                       list = encryption_algs;
+                       count = countof(encryption_algs);
+                       break;
+               case INTEGRITY_ALGORITHM:
+                       list = integrity_algs;
+                       count = countof(integrity_algs);
+                       break;
+               case COMPRESSION_ALGORITHM:
+                       list = compression_algs;
+                       count = countof(compression_algs);
+                       break;
+               default:
+                       return NULL;
+       }
+       for (i = 0; i < count; i++)
        {
-               if (list->ikev2 == ikev2)
+               if (list[i].ikev2 == ikev2)
                {
-                       return list->name;
+                       return list[i].name;
                }
-               list++;
+       }
+       if (charon->kernel->lookup_algorithm(charon->kernel, ikev2, type, NULL,
+                                                                                &name))
+       {
+               return name;
        }
        return NULL;
 }
 
+typedef struct private_kernel_netlink_ipsec_t private_kernel_netlink_ipsec_t;
+
+/**
+ * Private variables and functions of kernel_netlink_ipsec class.
+ */
+struct private_kernel_netlink_ipsec_t {
+       /**
+        * Public part of the kernel_netlink_ipsec_t object
+        */
+       kernel_netlink_ipsec_t public;
+
+       /**
+        * Mutex to lock access to installed policies
+        */
+       mutex_t *mutex;
+
+       /**
+        * Condvar to synchronize access to individual policies
+        */
+       condvar_t *condvar;
+
+       /**
+        * Hash table of installed policies (policy_entry_t)
+        */
+       hashtable_t *policies;
+
+       /**
+        * Hash table of IPsec SAs using policies (ipsec_sa_t)
+        */
+       hashtable_t *sas;
+
+       /**
+        * Netlink xfrm socket (IPsec)
+        */
+       netlink_socket_t *socket_xfrm;
+
+       /**
+        * Netlink xfrm socket to receive acquire and expire events
+        */
+       int socket_xfrm_events;
+
+       /**
+        * Whether to install routes along policies
+        */
+       bool install_routes;
+
+       /**
+        * Whether to set protocol and ports on selector installed with transport
+        * mode IPsec SAs
+        */
+       bool proto_port_transport;
+
+       /**
+        * Whether to always use UPDATE to install policies
+        */
+       bool policy_update;
+
+       /**
+        * Installed port based IKE bypass policies, as bypass_t
+        */
+       array_t *bypass;
+
+       /**
+        * Custom priority calculation function
+        */
+       uint32_t (*get_priority)(kernel_ipsec_policy_id_t *id,
+                                                        kernel_ipsec_manage_policy_t *data);
+};
+
 typedef struct route_entry_t route_entry_t;
 
 /**
- * installed routing entry
+ * Installed routing entry
  */
 struct route_entry_t {
        /** Name of the interface the route is bound to */
@@ -241,18 +355,18 @@ struct route_entry_t {
        /** Source ip of the route */
        host_t *src_ip;
 
-       /** gateway for this route */
+       /** Gateway for this route */
        host_t *gateway;
 
        /** Destination net */
        chunk_t dst_net;
 
        /** Destination net prefixlen */
-       u_int8_t prefixlen;
+       uint8_t prefixlen;
 };
 
 /**
- * destroy an route_entry_t object
+ * Destroy a route_entry_t object
  */
 static void route_entry_destroy(route_entry_t *this)
 {
@@ -263,127 +377,325 @@ static void route_entry_destroy(route_entry_t *this)
        free(this);
 }
 
-typedef struct policy_entry_t policy_entry_t;
+/**
+ * Compare two route_entry_t objects
+ */
+static bool route_entry_equals(route_entry_t *a, route_entry_t *b)
+{
+       return a->if_name && b->if_name && streq(a->if_name, b->if_name) &&
+                  a->src_ip->ip_equals(a->src_ip, b->src_ip) &&
+                  a->gateway->ip_equals(a->gateway, b->gateway) &&
+                  chunk_equals(a->dst_net, b->dst_net) && a->prefixlen == b->prefixlen;
+}
+
+typedef struct ipsec_sa_t ipsec_sa_t;
 
 /**
- * installed kernel policy.
+ * IPsec SA assigned to a policy.
  */
-struct policy_entry_t {
+struct ipsec_sa_t {
+       /** Source address of this SA */
+       host_t *src;
 
-       /** direction of this policy: in, out, forward */
-       u_int8_t direction;
+       /** Destination address of this SA */
+       host_t *dst;
 
-       /** parameters of installed policy */
-       struct xfrm_selector sel;
+       /** Optional mark */
+       mark_t mark;
 
-       /** optional mark */
-       u_int32_t mark;
+       /** Description of this SA */
+       ipsec_sa_cfg_t cfg;
 
-       /** associated route installed for this policy */
-       route_entry_t *route;
+       /** Reference count for this SA */
+       refcount_t refcount;
+};
+
+/**
+ * Hash function for ipsec_sa_t objects
+ */
+static u_int ipsec_sa_hash(ipsec_sa_t *sa)
+{
+       return chunk_hash_inc(sa->src->get_address(sa->src),
+                                                 chunk_hash_inc(sa->dst->get_address(sa->dst),
+                                                 chunk_hash_inc(chunk_from_thing(sa->mark),
+                                                 chunk_hash(chunk_from_thing(sa->cfg)))));
+}
+
+/**
+ * Equality function for ipsec_sa_t objects
+ */
+static bool ipsec_sa_equals(ipsec_sa_t *sa, ipsec_sa_t *other_sa)
+{
+       return sa->src->ip_equals(sa->src, other_sa->src) &&
+                  sa->dst->ip_equals(sa->dst, other_sa->dst) &&
+                  sa->mark.value == other_sa->mark.value &&
+                  sa->mark.mask == other_sa->mark.mask &&
+                  ipsec_sa_cfg_equals(&sa->cfg, &other_sa->cfg);
+}
+
+/**
+ * Allocate or reference an IPsec SA object
+ */
+static ipsec_sa_t *ipsec_sa_create(private_kernel_netlink_ipsec_t *this,
+                                                                  host_t *src, host_t *dst, mark_t mark,
+                                                                  ipsec_sa_cfg_t *cfg)
+{
+       ipsec_sa_t *sa, *found;
+       INIT(sa,
+               .src = src,
+               .dst = dst,
+               .mark = mark,
+               .cfg = *cfg,
+       );
+       found = this->sas->get(this->sas, sa);
+       if (!found)
+       {
+               sa->src = src->clone(src);
+               sa->dst = dst->clone(dst);
+               this->sas->put(this->sas, sa, sa);
+       }
+       else
+       {
+               free(sa);
+               sa = found;
+       }
+       ref_get(&sa->refcount);
+       return sa;
+}
+
+/**
+ * Release and destroy an IPsec SA object
+ */
+static void ipsec_sa_destroy(private_kernel_netlink_ipsec_t *this,
+                                                        ipsec_sa_t *sa)
+{
+       if (ref_put(&sa->refcount))
+       {
+               this->sas->remove(this->sas, sa);
+               DESTROY_IF(sa->src);
+               DESTROY_IF(sa->dst);
+               free(sa);
+       }
+}
+
+typedef struct policy_sa_t policy_sa_t;
+typedef struct policy_sa_out_t policy_sa_out_t;
+
+/**
+ * Mapping between a policy and an IPsec SA.
+ */
+struct policy_sa_t {
+       /** Priority assigned to the policy when installed with this SA */
+       uint32_t priority;
+
+       /** Automatic priority assigned to the policy when installed with this SA */
+       uint32_t auto_priority;
+
+       /** Type of the policy */
+       policy_type_t type;
 
-       /** by how many CHILD_SA's this policy is used */
-       u_int refcount;
+       /** Assigned SA */
+       ipsec_sa_t *sa;
 };
 
 /**
- * Hash function for policy_entry_t objects
+ * For outbound policies we also cache the traffic selectors in order to install
+ * the route.
  */
-static u_int policy_hash(policy_entry_t *key)
+struct policy_sa_out_t {
+       /** Generic interface */
+       policy_sa_t generic;
+
+       /** Source traffic selector of this policy */
+       traffic_selector_t *src_ts;
+
+       /** Destination traffic selector of this policy */
+       traffic_selector_t *dst_ts;
+};
+
+/**
+ * Create a policy_sa(_out)_t object
+ */
+static policy_sa_t *policy_sa_create(private_kernel_netlink_ipsec_t *this,
+       policy_dir_t dir, policy_type_t type, host_t *src, host_t *dst,
+       traffic_selector_t *src_ts, traffic_selector_t *dst_ts, mark_t mark,
+       ipsec_sa_cfg_t *cfg)
 {
-       chunk_t chunk = chunk_create((void*)&key->sel,
-                                                       sizeof(struct xfrm_selector) + sizeof(u_int32_t));
-       return chunk_hash(chunk);
+       policy_sa_t *policy;
+
+       if (dir == POLICY_OUT)
+       {
+               policy_sa_out_t *out;
+               INIT(out,
+                       .src_ts = src_ts->clone(src_ts),
+                       .dst_ts = dst_ts->clone(dst_ts),
+               );
+               policy = &out->generic;
+       }
+       else
+       {
+               INIT(policy, .priority = 0);
+       }
+       policy->type = type;
+       policy->sa = ipsec_sa_create(this, src, dst, mark, cfg);
+       return policy;
 }
 
 /**
- * Equality function for policy_entry_t objects
+ * Destroy a policy_sa(_out)_t object
  */
-static bool policy_equals(policy_entry_t *key, policy_entry_t *other_key)
+static void policy_sa_destroy(policy_sa_t *policy, policy_dir_t *dir,
+                                                         private_kernel_netlink_ipsec_t *this)
 {
-       return memeq(&key->sel, &other_key->sel,
-                                sizeof(struct xfrm_selector) + sizeof(u_int32_t)) &&
-                  key->direction == other_key->direction;
+       if (*dir == POLICY_OUT)
+       {
+               policy_sa_out_t *out = (policy_sa_out_t*)policy;
+               out->src_ts->destroy(out->src_ts);
+               out->dst_ts->destroy(out->dst_ts);
+       }
+       ipsec_sa_destroy(this, policy->sa);
+       free(policy);
 }
 
-typedef struct private_kernel_netlink_ipsec_t private_kernel_netlink_ipsec_t;
+typedef struct policy_entry_t policy_entry_t;
 
 /**
- * Private variables and functions of kernel_netlink class.
+ * Installed kernel policy.
  */
-struct private_kernel_netlink_ipsec_t {
-       /**
-        * Public part of the kernel_netlink_t object.
-        */
-       kernel_netlink_ipsec_t public;
+struct policy_entry_t {
 
-       /**
-        * mutex to lock access to various lists
-        */
-       mutex_t *mutex;
+       /** Direction of this policy: in, out, forward */
+       uint8_t direction;
 
-       /**
-        * Hash table of installed policies (policy_entry_t)
-        */
-       hashtable_t *policies;
+       /** Parameters of installed policy */
+       struct xfrm_selector sel;
 
-       /**
-        * job receiving netlink events
-        */
-       callback_job_t *job;
+       /** Optional mark */
+       uint32_t mark;
 
-       /**
-        * Netlink xfrm socket (IPsec)
-        */
-       netlink_socket_t *socket_xfrm;
+       /** Associated route installed for this policy */
+       route_entry_t *route;
 
-       /**
-        * netlink xfrm socket to receive acquire and expire events
-        */
-       int socket_xfrm_events;
+       /** List of SAs this policy is used by, ordered by priority */
+       linked_list_t *used_by;
 
-       /**
-        * whether to install routes along policies
-        */
-       bool install_routes;
+       /** reqid for this policy */
+       uint32_t reqid;
+
+       /** Number of threads waiting to work on this policy */
+       int waiting;
+
+       /** TRUE if a thread is working on this policy */
+       bool working;
 };
 
 /**
- * convert a IKEv2 specific protocol identifier to the kernel one
+ * Destroy a policy_entry_t object
  */
-static u_int8_t proto_ike2kernel(protocol_id_t proto)
+static void policy_entry_destroy(private_kernel_netlink_ipsec_t *this,
+                                                                policy_entry_t *policy)
 {
-       switch (proto)
+       if (policy->route)
        {
-               case PROTO_ESP:
-                       return IPPROTO_ESP;
-               case PROTO_AH:
-                       return IPPROTO_AH;
-               default:
-                       return proto;
+               route_entry_destroy(policy->route);
        }
+       if (policy->used_by)
+       {
+               policy->used_by->invoke_function(policy->used_by,
+                                                                               (linked_list_invoke_t)policy_sa_destroy,
+                                                                                &policy->direction, this);
+               policy->used_by->destroy(policy->used_by);
+       }
+       free(policy);
 }
 
 /**
- * reverse of ike2kernel
+ * Hash function for policy_entry_t objects
  */
-static protocol_id_t proto_kernel2ike(u_int8_t proto)
+static u_int policy_hash(policy_entry_t *key)
 {
-       switch (proto)
+       chunk_t chunk = chunk_from_thing(key->sel);
+       return chunk_hash_inc(chunk, chunk_hash(chunk_from_thing(key->mark)));
+}
+
+/**
+ * Equality function for policy_entry_t objects
+ */
+static bool policy_equals(policy_entry_t *key, policy_entry_t *other_key)
+{
+       return memeq(&key->sel, &other_key->sel, sizeof(struct xfrm_selector)) &&
+                  key->mark == other_key->mark &&
+                  key->direction == other_key->direction;
+}
+
+/**
+ * Determine number of set bits in 16 bit port mask
+ */
+static inline uint32_t port_mask_bits(uint16_t port_mask)
+{
+       uint32_t bits;
+       uint16_t bit_mask = 0x8000;
+
+       port_mask = ntohs(port_mask);
+
+       for (bits = 0; bits < 16; bits++)
        {
-               case IPPROTO_ESP:
-                       return PROTO_ESP;
-               case IPPROTO_AH:
-                       return PROTO_AH;
-               default:
-                       return proto;
+               if (!(port_mask & bit_mask))
+               {
+                       break;
+               }
+               bit_mask >>= 1;
+       }
+       return bits;
+}
+
+/**
+ * Calculate the priority of a policy
+ *
+ * bits 0-0:  restriction to network interface (0..1)   1 bit
+ * bits 1-6:  src + dst port mask bits (2 * 0..16)      6 bits
+ * bits 7-7:  restriction to protocol (0..1)            1 bit
+ * bits 8-16: src + dst network mask bits (2 * 0..128)  9 bits
+ *                                                     17 bits
+ *
+ * smallest value: 000000000 0 000000 0:      0, lowest priority = 100'000
+ * largest value : 100000000 1 100000 1: 65'729, highest priority = 34'271
+ */
+static uint32_t get_priority(policy_entry_t *policy, policy_priority_t prio,
+                                                        char *interface)
+{
+       uint32_t priority = PRIO_BASE, sport_mask_bits, dport_mask_bits;
+
+       switch (prio)
+       {
+               case POLICY_PRIORITY_FALLBACK:
+                       priority += PRIO_BASE;
+                       /* fall-through to next case */
+               case POLICY_PRIORITY_ROUTED:
+                       priority += PRIO_BASE;
+                       /* fall-through to next case */
+               case POLICY_PRIORITY_DEFAULT:
+                       priority += PRIO_BASE;
+                       /* fall-through to next case */
+               case POLICY_PRIORITY_PASS:
+                       break;
        }
+       sport_mask_bits = port_mask_bits(policy->sel.sport_mask);
+       dport_mask_bits = port_mask_bits(policy->sel.dport_mask);
+
+       /* calculate priority */
+       priority -= (policy->sel.prefixlen_s + policy->sel.prefixlen_d) * 256;
+       priority -=  policy->sel.proto ? 128 : 0;
+       priority -= (sport_mask_bits + dport_mask_bits) * 2;
+       priority -= (interface != NULL);
+
+       return priority;
 }
 
 /**
- * convert the general ipsec mode to the one defined in xfrm.h
+ * Convert the general ipsec mode to the one defined in xfrm.h
  */
-static u_int8_t mode2kernel(ipsec_mode_t mode)
+static uint8_t mode2kernel(ipsec_mode_t mode)
 {
        switch (mode)
        {
@@ -399,7 +711,7 @@ static u_int8_t mode2kernel(ipsec_mode_t mode)
 }
 
 /**
- * convert a host_t to a struct xfrm_address
+ * Convert a host_t to a struct xfrm_address
  */
 static void host2xfrm(host_t *host, xfrm_address_t *xfrm)
 {
@@ -408,9 +720,9 @@ static void host2xfrm(host_t *host, xfrm_address_t *xfrm)
 }
 
 /**
- * convert a struct xfrm_address to a host_t
+ * Convert a struct xfrm_address to a host_t
  */
-static host_t* xfrm2host(int family, xfrm_address_t *xfrm, u_int16_t port)
+static host_t* xfrm2host(int family, xfrm_address_t *xfrm, uint16_t port)
 {
        chunk_t chunk;
 
@@ -429,10 +741,10 @@ static host_t* xfrm2host(int family, xfrm_address_t *xfrm, u_int16_t port)
 }
 
 /**
- * convert a traffic selector address range to subnet and its mask.
+ * Convert a traffic selector address range to subnet and its mask.
  */
 static void ts2subnet(traffic_selector_t* ts,
-                                         xfrm_address_t *net, u_int8_t *mask)
+                                         xfrm_address_t *net, uint8_t *mask)
 {
        host_t *net_host;
        chunk_t net_chunk;
@@ -444,20 +756,18 @@ static void ts2subnet(traffic_selector_t* ts,
 }
 
 /**
- * convert a traffic selector port range to port/portmask
+ * Convert a traffic selector port range to port/portmask
  */
 static void ts2ports(traffic_selector_t* ts,
-                                        u_int16_t *port, u_int16_t *mask)
+                                        uint16_t *port, uint16_t *mask)
 {
-       /* linux does not seem to accept complex portmasks. Only
-        * any or a specific port is allowed. We set to any, if we have
-        * a port range, or to a specific, if we have one port only.
-        */
-       u_int16_t from, to;
+       uint16_t from, to, bitmask;
+       int bit;
 
        from = ts->get_from_port(ts);
        to = ts->get_to_port(ts);
 
+       /* Quick check for a single port */
        if (from == to)
        {
                *port = htons(from);
@@ -465,18 +775,34 @@ static void ts2ports(traffic_selector_t* ts,
        }
        else
        {
-               *port = 0;
+               /* Compute the port mask for port ranges */
                *mask = 0;
+
+               for (bit = 15; bit >= 0; bit--)
+               {
+                       bitmask = 1 << bit;
+
+                       if ((bitmask & from) != (bitmask & to))
+                       {
+                               *port = htons(from & *mask);
+                               *mask = htons(*mask);
+                               return;
+                       }
+                       *mask |= bitmask;
+               }
        }
+       return;
 }
 
 /**
- * convert a pair of traffic_selectors to a xfrm_selector
+ * Convert a pair of traffic_selectors to an xfrm_selector
  */
 static struct xfrm_selector ts2selector(traffic_selector_t *src,
-                                                                               traffic_selector_t *dst)
+                                                                               traffic_selector_t *dst,
+                                                                               char *interface)
 {
        struct xfrm_selector sel;
+       uint16_t port;
 
        memset(&sel, 0, sizeof(sel));
        sel.family = (src->get_type(src) == TS_IPV4_ADDR_RANGE) ? AF_INET : AF_INET6;
@@ -486,20 +812,31 @@ static struct xfrm_selector ts2selector(traffic_selector_t *src,
        ts2subnet(src, &sel.saddr, &sel.prefixlen_s);
        ts2ports(dst, &sel.dport, &sel.dport_mask);
        ts2ports(src, &sel.sport, &sel.sport_mask);
-       sel.ifindex = 0;
+       if ((sel.proto == IPPROTO_ICMP || sel.proto == IPPROTO_ICMPV6) &&
+               (sel.dport || sel.sport))
+       {
+               /* the kernel expects the ICMP type and code in the source and
+                * destination port fields, respectively. */
+               port = ntohs(max(sel.dport, sel.sport));
+               sel.sport = htons(traffic_selector_icmp_type(port));
+               sel.sport_mask = sel.sport ? ~0 : 0;
+               sel.dport = htons(traffic_selector_icmp_code(port));
+               sel.dport_mask = sel.dport ? ~0 : 0;
+       }
+       sel.ifindex = interface ? if_nametoindex(interface) : 0;
        sel.user = 0;
 
        return sel;
 }
 
 /**
- * convert a xfrm_selector to a src|dst traffic_selector
+ * Convert an xfrm_selector to a src|dst traffic_selector
  */
 static traffic_selector_t* selector2ts(struct xfrm_selector *sel, bool src)
 {
        u_char *addr;
-       u_int8_t prefixlen;
-       u_int16_t port = 0;
+       uint8_t prefixlen;
+       uint16_t port = 0;
        host_t *host = NULL;
 
        if (src)
@@ -508,7 +845,7 @@ static traffic_selector_t* selector2ts(struct xfrm_selector *sel, bool src)
                prefixlen = sel->prefixlen_s;
                if (sel->sport_mask)
                {
-                       port = htons(sel->sport);
+                       port = ntohs(sel->sport);
                }
        }
        else
@@ -517,10 +854,15 @@ static traffic_selector_t* selector2ts(struct xfrm_selector *sel, bool src)
                prefixlen = sel->prefixlen_d;
                if (sel->dport_mask)
                {
-                       port = htons(sel->dport);
+                       port = ntohs(sel->dport);
                }
        }
-
+       if (sel->proto == IPPROTO_ICMP || sel->proto == IPPROTO_ICMPV6)
+       {       /* convert ICMP[v6] message type and code as supplied by the kernel in
+                * source and destination ports (both in network order) */
+               port = (sel->sport >> 8) | (sel->dport & 0xff00);
+               port = ntohs(port);
+       }
        /* The Linux 2.6 kernel does not set the selector's family field,
         * so as a kludge we additionally test the prefix length.
         */
@@ -536,24 +878,25 @@ static traffic_selector_t* selector2ts(struct xfrm_selector *sel, bool src)
        if (host)
        {
                return traffic_selector_create_from_subnet(host, prefixlen,
-                                                                                                  sel->proto, port);
+                                                                                       sel->proto, port, port ?: 65535);
        }
        return NULL;
 }
 
 /**
- * process a XFRM_MSG_ACQUIRE from kernel
+ * Process a XFRM_MSG_ACQUIRE from kernel
  */
-static void process_acquire(private_kernel_netlink_ipsec_t *this, struct nlmsghdr *hdr)
+static void process_acquire(private_kernel_netlink_ipsec_t *this,
+                                                       struct nlmsghdr *hdr)
 {
-       u_int32_t reqid = 0;
-       int proto = 0;
-       traffic_selector_t *src_ts, *dst_ts;
        struct xfrm_user_acquire *acquire;
        struct rtattr *rta;
        size_t rtasize;
+       traffic_selector_t *src_ts, *dst_ts;
+       uint32_t reqid = 0;
+       int proto = 0;
 
-       acquire = (struct xfrm_user_acquire*)NLMSG_DATA(hdr);
+       acquire = NLMSG_DATA(hdr);
        rta = XFRM_RTA(hdr, struct xfrm_user_acquire);
        rtasize = XFRM_PAYLOAD(hdr, struct xfrm_user_acquire);
 
@@ -566,7 +909,6 @@ static void process_acquire(private_kernel_netlink_ipsec_t *this, struct nlmsghd
                if (rta->rta_type == XFRMA_TMPL)
                {
                        struct xfrm_user_tmpl* tmpl;
-
                        tmpl = (struct xfrm_user_tmpl*)RTA_DATA(rta);
                        reqid = tmpl->reqid;
                        proto = tmpl->id.proto;
@@ -586,53 +928,55 @@ static void process_acquire(private_kernel_netlink_ipsec_t *this, struct nlmsghd
        src_ts = selector2ts(&acquire->sel, TRUE);
        dst_ts = selector2ts(&acquire->sel, FALSE);
 
-       charon->kernel_interface->acquire(charon->kernel_interface, reqid, src_ts,
-                                                                         dst_ts);
+       charon->kernel->acquire(charon->kernel, reqid, src_ts, dst_ts);
 }
 
 /**
- * process a XFRM_MSG_EXPIRE from kernel
+ * Process a XFRM_MSG_EXPIRE from kernel
  */
-static void process_expire(private_kernel_netlink_ipsec_t *this, struct nlmsghdr *hdr)
+static void process_expire(private_kernel_netlink_ipsec_t *this,
+                                                  struct nlmsghdr *hdr)
 {
-       protocol_id_t protocol;
-       u_int32_t spi, reqid;
        struct xfrm_user_expire *expire;
+       uint32_t spi;
+       uint8_t protocol;
+       host_t *dst;
 
-       expire = (struct xfrm_user_expire*)NLMSG_DATA(hdr);
-       protocol = proto_kernel2ike(expire->state.id.proto);
+       expire = NLMSG_DATA(hdr);
+       protocol = expire->state.id.proto;
        spi = expire->state.id.spi;
-       reqid = expire->state.reqid;
 
        DBG2(DBG_KNL, "received a XFRM_MSG_EXPIRE");
 
-       if (protocol != PROTO_ESP && protocol != PROTO_AH)
+       if (protocol == IPPROTO_ESP || protocol == IPPROTO_AH)
        {
-               DBG2(DBG_KNL, "ignoring XFRM_MSG_EXPIRE for SA with SPI %.8x and "
-                                         "reqid {%u} which is not a CHILD_SA", ntohl(spi), reqid);
-               return;
+               dst = xfrm2host(expire->state.family, &expire->state.id.daddr, 0);
+               if (dst)
+               {
+                       charon->kernel->expire(charon->kernel, protocol, spi, dst,
+                                                                  expire->hard != 0);
+                       dst->destroy(dst);
+               }
        }
-
-       charon->kernel_interface->expire(charon->kernel_interface, reqid, protocol,
-                                                                        spi, expire->hard != 0);
 }
 
 /**
- * process a XFRM_MSG_MIGRATE from kernel
+ * Process a XFRM_MSG_MIGRATE from kernel
  */
-static void process_migrate(private_kernel_netlink_ipsec_t *this, struct nlmsghdr *hdr)
+static void process_migrate(private_kernel_netlink_ipsec_t *this,
+                                                       struct nlmsghdr *hdr)
 {
+       struct xfrm_userpolicy_id *policy_id;
+       struct rtattr *rta;
+       size_t rtasize;
        traffic_selector_t *src_ts, *dst_ts;
        host_t *local = NULL, *remote = NULL;
        host_t *old_src = NULL, *old_dst = NULL;
        host_t *new_src = NULL, *new_dst = NULL;
-       struct xfrm_userpolicy_id *policy_id;
-       struct rtattr *rta;
-       size_t rtasize;
-       u_int32_t reqid = 0;
+       uint32_t reqid = 0;
        policy_dir_t dir;
 
-       policy_id = (struct xfrm_userpolicy_id*)NLMSG_DATA(hdr);
+       policy_id = NLMSG_DATA(hdr);
        rta     = XFRM_RTA(hdr, struct xfrm_userpolicy_id);
        rtasize = XFRM_PAYLOAD(hdr, struct xfrm_userpolicy_id);
 
@@ -659,18 +1003,15 @@ static void process_migrate(private_kernel_netlink_ipsec_t *this, struct nlmsghd
                else if (rta->rta_type == XFRMA_MIGRATE)
                {
                        struct xfrm_user_migrate *migrate;
-                       protocol_id_t proto;
 
                        migrate = (struct xfrm_user_migrate*)RTA_DATA(rta);
                        old_src = xfrm2host(migrate->old_family, &migrate->old_saddr, 0);
                        old_dst = xfrm2host(migrate->old_family, &migrate->old_daddr, 0);
                        new_src = xfrm2host(migrate->new_family, &migrate->new_saddr, 0);
                        new_dst = xfrm2host(migrate->new_family, &migrate->new_daddr, 0);
-                       proto = proto_kernel2ike(migrate->proto);
                        reqid = migrate->reqid;
-                       DBG2(DBG_KNL, "  migrate %N %H...%H to %H...%H, reqid {%u}",
-                                                        protocol_id_names, proto, old_src, old_dst,
-                                                        new_src, new_dst, reqid);
+                       DBG2(DBG_KNL, "  migrate %H...%H to %H...%H, reqid {%u}",
+                                                  old_src, old_dst, new_src, new_dst, reqid);
                        DESTROY_IF(old_src);
                        DESTROY_IF(old_dst);
                        DESTROY_IF(new_src);
@@ -681,8 +1022,8 @@ static void process_migrate(private_kernel_netlink_ipsec_t *this, struct nlmsghd
 
        if (src_ts && dst_ts && local && remote)
        {
-               charon->kernel_interface->migrate(charon->kernel_interface, reqid,
-                                                                                 src_ts, dst_ts, dir, local, remote);
+               charon->kernel->migrate(charon->kernel, reqid, src_ts, dst_ts, dir,
+                                                               local, remote);
        }
        else
        {
@@ -694,29 +1035,35 @@ static void process_migrate(private_kernel_netlink_ipsec_t *this, struct nlmsghd
 }
 
 /**
- * process a XFRM_MSG_MAPPING from kernel
+ * Process a XFRM_MSG_MAPPING from kernel
  */
 static void process_mapping(private_kernel_netlink_ipsec_t *this,
                                                        struct nlmsghdr *hdr)
 {
-       u_int32_t spi, reqid;
        struct xfrm_user_mapping *mapping;
-       host_t *host;
+       uint32_t spi;
 
-       mapping = (struct xfrm_user_mapping*)NLMSG_DATA(hdr);
+       mapping = NLMSG_DATA(hdr);
        spi = mapping->id.spi;
-       reqid = mapping->reqid;
 
        DBG2(DBG_KNL, "received a XFRM_MSG_MAPPING");
 
-       if (proto_kernel2ike(mapping->id.proto) == PROTO_ESP)
+       if (mapping->id.proto == IPPROTO_ESP)
        {
-               host = xfrm2host(mapping->id.family, &mapping->new_saddr,
-                                                mapping->new_sport);
-               if (host)
+               host_t *dst, *new;
+
+               dst = xfrm2host(mapping->id.family, &mapping->id.daddr, 0);
+               if (dst)
                {
-                       charon->kernel_interface->mapping(charon->kernel_interface, reqid,
-                                                                                         spi, host);
+                       new = xfrm2host(mapping->id.family, &mapping->new_saddr,
+                                                       mapping->new_sport);
+                       if (new)
+                       {
+                               charon->kernel->mapping(charon->kernel, IPPROTO_ESP, spi, dst,
+                                                                               new);
+                               new->destroy(new);
+                       }
+                       dst->destroy(dst);
                }
        }
 }
@@ -724,40 +1071,38 @@ static void process_mapping(private_kernel_netlink_ipsec_t *this,
 /**
  * Receives events from kernel
  */
-static job_requeue_t receive_events(private_kernel_netlink_ipsec_t *this)
+static bool receive_events(private_kernel_netlink_ipsec_t *this, int fd,
+                                                  watcher_event_t event)
 {
        char response[1024];
        struct nlmsghdr *hdr = (struct nlmsghdr*)response;
        struct sockaddr_nl addr;
        socklen_t addr_len = sizeof(addr);
        int len;
-       bool oldstate;
-
-       oldstate = thread_cancelability(TRUE);
-       len = recvfrom(this->socket_xfrm_events, response, sizeof(response), 0,
-                                  (struct sockaddr*)&addr, &addr_len);
-       thread_cancelability(oldstate);
 
+       len = recvfrom(this->socket_xfrm_events, response, sizeof(response),
+                                  MSG_DONTWAIT, (struct sockaddr*)&addr, &addr_len);
        if (len < 0)
        {
                switch (errno)
                {
                        case EINTR:
                                /* interrupted, try again */
-                               return JOB_REQUEUE_DIRECT;
+                               return TRUE;
                        case EAGAIN:
                                /* no data ready, select again */
-                               return JOB_REQUEUE_DIRECT;
+                               return TRUE;
                        default:
-                               DBG1(DBG_KNL, "unable to receive from xfrm event socket");
+                               DBG1(DBG_KNL, "unable to receive from XFRM event socket: %s "
+                                        "(%d)", strerror(errno), errno);
                                sleep(1);
-                               return JOB_REQUEUE_FAIR;
+                               return TRUE;
                }
        }
 
        if (addr.nl_pid != 0)
        {       /* not from kernel. not interested, try another one */
-               return JOB_REQUEUE_DIRECT;
+               return TRUE;
        }
 
        while (NLMSG_OK(hdr, len))
@@ -777,40 +1122,46 @@ static job_requeue_t receive_events(private_kernel_netlink_ipsec_t *this)
                                process_mapping(this, hdr);
                                break;
                        default:
-                               DBG1(DBG_KNL, "received unknown event from xfrm event socket: %d", hdr->nlmsg_type);
+                               DBG1(DBG_KNL, "received unknown event from XFRM event "
+                                        "socket: %d", hdr->nlmsg_type);
                                break;
                }
                hdr = NLMSG_NEXT(hdr, len);
        }
-       return JOB_REQUEUE_DIRECT;
+       return TRUE;
+}
+
+METHOD(kernel_ipsec_t, get_features, kernel_feature_t,
+       private_kernel_netlink_ipsec_t *this)
+{
+       return KERNEL_ESP_V3_TFC;
 }
 
 /**
  * Get an SPI for a specific protocol from the kernel.
  */
 static status_t get_spi_internal(private_kernel_netlink_ipsec_t *this,
-               host_t *src, host_t *dst, u_int8_t proto, u_int32_t min, u_int32_t max,
-               u_int32_t reqid, u_int32_t *spi)
+       host_t *src, host_t *dst, uint8_t proto, uint32_t min, uint32_t max,
+       uint32_t *spi)
 {
        netlink_buf_t request;
        struct nlmsghdr *hdr, *out;
        struct xfrm_userspi_info *userspi;
-       u_int32_t received_spi = 0;
+       uint32_t received_spi = 0;
        size_t len;
 
        memset(&request, 0, sizeof(request));
 
-       hdr = (struct nlmsghdr*)request;
+       hdr = &request.hdr;
        hdr->nlmsg_flags = NLM_F_REQUEST;
        hdr->nlmsg_type = XFRM_MSG_ALLOCSPI;
        hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_userspi_info));
 
-       userspi = (struct xfrm_userspi_info*)NLMSG_DATA(hdr);
+       userspi = NLMSG_DATA(hdr);
        host2xfrm(src, &userspi->info.saddr);
        host2xfrm(dst, &userspi->info.id.daddr);
        userspi->info.id.proto = proto;
        userspi->info.mode = XFRM_MODE_TUNNEL;
-       userspi->info.reqid = reqid;
        userspi->info.family = src->get_family(src);
        userspi->min = min;
        userspi->max = max;
@@ -831,7 +1182,6 @@ static status_t get_spi_internal(private_kernel_netlink_ipsec_t *this,
                                case NLMSG_ERROR:
                                {
                                        struct nlmsgerr *err = NLMSG_DATA(hdr);
-
                                        DBG1(DBG_KNL, "allocating SPI failed: %s (%d)",
                                                 strerror(-err->error), -err->error);
                                        break;
@@ -858,94 +1208,131 @@ static status_t get_spi_internal(private_kernel_netlink_ipsec_t *this,
 
 METHOD(kernel_ipsec_t, get_spi, status_t,
        private_kernel_netlink_ipsec_t *this, host_t *src, host_t *dst,
-       protocol_id_t protocol, u_int32_t reqid, u_int32_t *spi)
+       uint8_t protocol, uint32_t *spi)
 {
-       DBG2(DBG_KNL, "getting SPI for reqid {%u}", reqid);
-
-       if (get_spi_internal(this, src, dst, proto_ike2kernel(protocol),
-                       0xc0000000, 0xcFFFFFFF, reqid, spi) != SUCCESS)
+       if (get_spi_internal(this, src, dst, protocol,
+                                                0xc0000000, 0xcFFFFFFF, spi) != SUCCESS)
        {
-               DBG1(DBG_KNL, "unable to get SPI for reqid {%u}", reqid);
+               DBG1(DBG_KNL, "unable to get SPI");
                return FAILED;
        }
 
-       DBG2(DBG_KNL, "got SPI %.8x for reqid {%u}", ntohl(*spi), reqid);
-
+       DBG2(DBG_KNL, "got SPI %.8x", ntohl(*spi));
        return SUCCESS;
 }
 
 METHOD(kernel_ipsec_t, get_cpi, status_t,
        private_kernel_netlink_ipsec_t *this, host_t *src, host_t *dst,
-       u_int32_t reqid, u_int16_t *cpi)
+       uint16_t *cpi)
 {
-       u_int32_t received_spi = 0;
-
-       DBG2(DBG_KNL, "getting CPI for reqid {%u}", reqid);
+       uint32_t received_spi = 0;
 
-       if (get_spi_internal(this, src, dst,
-                       IPPROTO_COMP, 0x100, 0xEFFF, reqid, &received_spi) != SUCCESS)
+       if (get_spi_internal(this, src, dst, IPPROTO_COMP,
+                                                0x100, 0xEFFF, &received_spi) != SUCCESS)
        {
-               DBG1(DBG_KNL, "unable to get CPI for reqid {%u}", reqid);
+               DBG1(DBG_KNL, "unable to get CPI");
                return FAILED;
        }
 
-       *cpi = htons((u_int16_t)ntohl(received_spi));
-
-       DBG2(DBG_KNL, "got CPI %.4x for reqid {%u}", ntohs(*cpi), reqid);
+       *cpi = htons((uint16_t)ntohl(received_spi));
 
+       DBG2(DBG_KNL, "got CPI %.4x", ntohs(*cpi));
        return SUCCESS;
 }
 
-METHOD(kernel_ipsec_t, add_sa, status_t,
-       private_kernel_netlink_ipsec_t *this, host_t *src, host_t *dst,
-       u_int32_t spi, protocol_id_t protocol, u_int32_t reqid, mark_t mark,
-       lifetime_cfg_t *lifetime, u_int16_t enc_alg, chunk_t enc_key,
-       u_int16_t int_alg, chunk_t int_key,     ipsec_mode_t mode, u_int16_t ipcomp,
-       u_int16_t cpi, bool encap, bool inbound,
-       traffic_selector_t* src_ts, traffic_selector_t* dst_ts)
+/**
+ * Format the mark for debug messages
+ */
+static void format_mark(char *buf, int buflen, mark_t mark)
 {
-       netlink_buf_t request;
-       char *alg_name;
-       struct nlmsghdr *hdr;
-       struct xfrm_usersa_info *sa;
-       u_int16_t icv_size = 64;
-
-       /* if IPComp is used, we install an additional IPComp SA. if the cpi is 0
-        * we are in the recursive call below */
-       if (ipcomp != IPCOMP_NONE && cpi != 0)
+       if (mark.value)
+       {
+               snprintf(buf, buflen, " (mark %u/0x%08x)", mark.value, mark.mask);
+       }
+}
+
+/**
+ * Add a XFRM mark to message if required
+ */
+static bool add_mark(struct nlmsghdr *hdr, int buflen, mark_t mark)
+{
+       if (mark.value)
+       {
+               struct xfrm_mark *xmrk;
+
+               xmrk = netlink_reserve(hdr, buflen, XFRMA_MARK, sizeof(*xmrk));
+               if (!xmrk)
+               {
+                       return FALSE;
+               }
+               xmrk->v = mark.value;
+               xmrk->m = mark.mask;
+       }
+       return TRUE;
+}
+
+METHOD(kernel_ipsec_t, add_sa, status_t,
+       private_kernel_netlink_ipsec_t *this, kernel_ipsec_sa_id_t *id,
+       kernel_ipsec_add_sa_t *data)
+{
+       netlink_buf_t request;
+       char *alg_name, markstr[32] = "";
+       struct nlmsghdr *hdr;
+       struct xfrm_usersa_info *sa;
+       uint16_t icv_size = 64, ipcomp = data->ipcomp;
+       ipsec_mode_t mode = data->mode, original_mode = data->mode;
+       traffic_selector_t *first_src_ts, *first_dst_ts;
+       status_t status = FAILED;
+
+       /* if IPComp is used, we install an additional IPComp SA. if the cpi is 0
+        * we are in the recursive call below */
+       if (ipcomp != IPCOMP_NONE && data->cpi != 0)
        {
                lifetime_cfg_t lft = {{0,0,0},{0,0,0},{0,0,0}};
-               add_sa(this, src, dst, htonl(ntohs(cpi)), IPPROTO_COMP, reqid, mark,
-                          &lft, ENCR_UNDEFINED, chunk_empty, AUTH_UNDEFINED, chunk_empty,
-                          mode, ipcomp, 0, FALSE, inbound, NULL, NULL);
+               kernel_ipsec_sa_id_t ipcomp_id = {
+                       .src = id->src,
+                       .dst = id->dst,
+                       .spi = htonl(ntohs(data->cpi)),
+                       .proto = IPPROTO_COMP,
+                       .mark = id->mark,
+               };
+               kernel_ipsec_add_sa_t ipcomp_sa = {
+                       .reqid = data->reqid,
+                       .mode = data->mode,
+                       .src_ts = data->src_ts,
+                       .dst_ts = data->dst_ts,
+                       .lifetime = &lft,
+                       .enc_alg = ENCR_UNDEFINED,
+                       .int_alg = AUTH_UNDEFINED,
+                       .tfc = data->tfc,
+                       .ipcomp = data->ipcomp,
+                       .initiator = data->initiator,
+                       .inbound = data->inbound,
+                       .update = data->update,
+               };
+               add_sa(this, &ipcomp_id, &ipcomp_sa);
                ipcomp = IPCOMP_NONE;
                /* use transport mode ESP SA, IPComp uses tunnel mode */
                mode = MODE_TRANSPORT;
        }
 
        memset(&request, 0, sizeof(request));
+       format_mark(markstr, sizeof(markstr), id->mark);
 
-       if (mark.value)
-       {
-               DBG2(DBG_KNL, "adding SAD entry with SPI %.8x and reqid {%u}  "
-                                         "(mark %u/0x%8x)", ntohl(spi), reqid, mark.value, mark.mask);
-       }
-       else
-       {
-               DBG2(DBG_KNL, "adding SAD entry with SPI %.8x and reqid {%u}",
-                                          ntohl(spi), reqid);
-       }
-       hdr = (struct nlmsghdr*)request;
+       DBG2(DBG_KNL, "adding SAD entry with SPI %.8x and reqid {%u}%s",
+                ntohl(id->spi), data->reqid, markstr);
+
+       hdr = &request.hdr;
        hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
-       hdr->nlmsg_type = inbound ? XFRM_MSG_UPDSA : XFRM_MSG_NEWSA;
+       hdr->nlmsg_type = data->update ? XFRM_MSG_UPDSA : XFRM_MSG_NEWSA;
        hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_usersa_info));
 
-       sa = (struct xfrm_usersa_info*)NLMSG_DATA(hdr);
-       host2xfrm(src, &sa->saddr);
-       host2xfrm(dst, &sa->id.daddr);
-       sa->id.spi = spi;
-       sa->id.proto = proto_ike2kernel(protocol);
-       sa->family = src->get_family(src);
+       sa = NLMSG_DATA(hdr);
+       host2xfrm(id->src, &sa->saddr);
+       host2xfrm(id->dst, &sa->id.daddr);
+       sa->id.spi = id->spi;
+       sa->id.proto = id->proto;
+       sa->family = id->src->get_family(id->src);
        sa->mode = mode2kernel(mode);
        switch (mode)
        {
@@ -953,30 +1340,46 @@ METHOD(kernel_ipsec_t, add_sa, status_t,
                        sa->flags |= XFRM_STATE_AF_UNSPEC;
                        break;
                case MODE_BEET:
-                       if(src_ts && dst_ts)
+               case MODE_TRANSPORT:
+                       if (original_mode == MODE_TUNNEL)
+                       {       /* don't install selectors for switched SAs: because only one
+                                * selector can be installed, other traffic would get dropped */
+                               break;
+                       }
+                       if (data->src_ts->get_first(data->src_ts,
+                                                                               (void**)&first_src_ts) == SUCCESS &&
+                               data->dst_ts->get_first(data->dst_ts,
+                                                                               (void**)&first_dst_ts) == SUCCESS)
                        {
-                               sa->sel = ts2selector(src_ts, dst_ts);
+                               sa->sel = ts2selector(first_src_ts, first_dst_ts,
+                                                                         data->interface);
+                               if (!this->proto_port_transport)
+                               {
+                                       /* don't install proto/port on SA. This would break
+                                        * potential secondary SAs for the same address using a
+                                        * different proto/port. */
+                                       sa->sel.proto = 0;
+                                       sa->sel.dport = sa->sel.dport_mask = 0;
+                                       sa->sel.sport = sa->sel.sport_mask = 0;
+                               }
                        }
                        break;
                default:
                        break;
        }
 
-       sa->replay_window = (protocol == IPPROTO_COMP) ? 0 : 32;
-       sa->reqid = reqid;
-       sa->lft.soft_byte_limit = XFRM_LIMIT(lifetime->bytes.rekey);
-       sa->lft.hard_byte_limit = XFRM_LIMIT(lifetime->bytes.life);
-       sa->lft.soft_packet_limit = XFRM_LIMIT(lifetime->packets.rekey);
-       sa->lft.hard_packet_limit = XFRM_LIMIT(lifetime->packets.life);
+       sa->reqid = data->reqid;
+       sa->lft.soft_byte_limit = XFRM_LIMIT(data->lifetime->bytes.rekey);
+       sa->lft.hard_byte_limit = XFRM_LIMIT(data->lifetime->bytes.life);
+       sa->lft.soft_packet_limit = XFRM_LIMIT(data->lifetime->packets.rekey);
+       sa->lft.hard_packet_limit = XFRM_LIMIT(data->lifetime->packets.life);
        /* we use lifetimes since added, not since used */
-       sa->lft.soft_add_expires_seconds = lifetime->time.rekey;
-       sa->lft.hard_add_expires_seconds = lifetime->time.life;
+       sa->lft.soft_add_expires_seconds = data->lifetime->time.rekey;
+       sa->lft.hard_add_expires_seconds = data->lifetime->time.life;
        sa->lft.soft_use_expires_seconds = 0;
        sa->lft.hard_use_expires_seconds = 0;
 
-       struct rtattr *rthdr = XFRM_RTA(hdr, struct xfrm_usersa_info);
-
-       switch (enc_alg)
+       switch (data->enc_alg)
        {
                case ENCR_UNDEFINED:
                        /* no encryption */
@@ -985,6 +1388,7 @@ METHOD(kernel_ipsec_t, add_sa, status_t,
                case ENCR_AES_GCM_ICV16:
                case ENCR_NULL_AUTH_AES_GMAC:
                case ENCR_CAMELLIA_CCM_ICV16:
+               case ENCR_CHACHA20_POLY1305:
                        icv_size += 32;
                        /* FALL */
                case ENCR_AES_CCM_ICV12:
@@ -998,216 +1402,256 @@ METHOD(kernel_ipsec_t, add_sa, status_t,
                {
                        struct xfrm_algo_aead *algo;
 
-                       alg_name = lookup_algorithm(encryption_algs, enc_alg);
+                       alg_name = lookup_algorithm(ENCRYPTION_ALGORITHM, data->enc_alg);
                        if (alg_name == NULL)
                        {
                                DBG1(DBG_KNL, "algorithm %N not supported by kernel!",
-                                        encryption_algorithm_names, enc_alg);
-                               return FAILED;
+                                                encryption_algorithm_names, data->enc_alg);
+                                       goto failed;
                        }
                        DBG2(DBG_KNL, "  using encryption algorithm %N with key size %d",
-                                encryption_algorithm_names, enc_alg, enc_key.len * 8);
+                                encryption_algorithm_names, data->enc_alg,
+                                data->enc_key.len * 8);
 
-                       rthdr->rta_type = XFRMA_ALG_AEAD;
-                       rthdr->rta_len = RTA_LENGTH(sizeof(struct xfrm_algo_aead) + enc_key.len);
-                       hdr->nlmsg_len += rthdr->rta_len;
-                       if (hdr->nlmsg_len > sizeof(request))
+                       algo = netlink_reserve(hdr, sizeof(request), XFRMA_ALG_AEAD,
+                                                                  sizeof(*algo) + data->enc_key.len);
+                       if (!algo)
                        {
-                               return FAILED;
+                               goto failed;
                        }
-
-                       algo = (struct xfrm_algo_aead*)RTA_DATA(rthdr);
-                       algo->alg_key_len = enc_key.len * 8;
+                       algo->alg_key_len = data->enc_key.len * 8;
                        algo->alg_icv_len = icv_size;
-                       strcpy(algo->alg_name, alg_name);
-                       memcpy(algo->alg_key, enc_key.ptr, enc_key.len);
-
-                       rthdr = XFRM_RTA_NEXT(rthdr);
+                       strncpy(algo->alg_name, alg_name, sizeof(algo->alg_name));
+                       algo->alg_name[sizeof(algo->alg_name) - 1] = '\0';
+                       memcpy(algo->alg_key, data->enc_key.ptr, data->enc_key.len);
                        break;
                }
                default:
                {
                        struct xfrm_algo *algo;
 
-                       alg_name = lookup_algorithm(encryption_algs, enc_alg);
+                       alg_name = lookup_algorithm(ENCRYPTION_ALGORITHM, data->enc_alg);
                        if (alg_name == NULL)
                        {
                                DBG1(DBG_KNL, "algorithm %N not supported by kernel!",
-                                        encryption_algorithm_names, enc_alg);
-                               return FAILED;
+                                        encryption_algorithm_names, data->enc_alg);
+                               goto failed;
                        }
                        DBG2(DBG_KNL, "  using encryption algorithm %N with key size %d",
-                                encryption_algorithm_names, enc_alg, enc_key.len * 8);
+                                encryption_algorithm_names, data->enc_alg,
+                                data->enc_key.len * 8);
 
-                       rthdr->rta_type = XFRMA_ALG_CRYPT;
-                       rthdr->rta_len = RTA_LENGTH(sizeof(struct xfrm_algo) + enc_key.len);
-                       hdr->nlmsg_len += rthdr->rta_len;
-                       if (hdr->nlmsg_len > sizeof(request))
+                       algo = netlink_reserve(hdr, sizeof(request), XFRMA_ALG_CRYPT,
+                                                                  sizeof(*algo) + data->enc_key.len);
+                       if (!algo)
                        {
-                               return FAILED;
+                               goto failed;
                        }
-
-                       algo = (struct xfrm_algo*)RTA_DATA(rthdr);
-                       algo->alg_key_len = enc_key.len * 8;
-                       strcpy(algo->alg_name, alg_name);
-                       memcpy(algo->alg_key, enc_key.ptr, enc_key.len);
-
-                       rthdr = XFRM_RTA_NEXT(rthdr);
+                       algo->alg_key_len = data->enc_key.len * 8;
+                       strncpy(algo->alg_name, alg_name, sizeof(algo->alg_name));
+                       algo->alg_name[sizeof(algo->alg_name) - 1] = '\0';
+                       memcpy(algo->alg_key, data->enc_key.ptr, data->enc_key.len);
                }
        }
 
-       if (int_alg  != AUTH_UNDEFINED)
+       if (data->int_alg != AUTH_UNDEFINED)
        {
-               alg_name = lookup_algorithm(integrity_algs, int_alg);
+               u_int trunc_len = 0;
+
+               alg_name = lookup_algorithm(INTEGRITY_ALGORITHM, data->int_alg);
                if (alg_name == NULL)
                {
                        DBG1(DBG_KNL, "algorithm %N not supported by kernel!",
-                                integrity_algorithm_names, int_alg);
-                       return FAILED;
+                                integrity_algorithm_names, data->int_alg);
+                       goto failed;
                }
                DBG2(DBG_KNL, "  using integrity algorithm %N with key size %d",
-                        integrity_algorithm_names, int_alg, int_key.len * 8);
+                        integrity_algorithm_names, data->int_alg, data->int_key.len * 8);
+
+               switch (data->int_alg)
+               {
+                       case AUTH_HMAC_MD5_128:
+                       case AUTH_HMAC_SHA2_256_128:
+                               trunc_len = 128;
+                               break;
+                       case AUTH_HMAC_SHA1_160:
+                               trunc_len = 160;
+                               break;
+                       default:
+                               break;
+               }
 
-               if (int_alg == AUTH_HMAC_SHA2_256_128)
+               if (trunc_len)
                {
                        struct xfrm_algo_auth* algo;
 
                        /* the kernel uses SHA256 with 96 bit truncation by default,
-                        * use specified truncation size supported by newer kernels */
-                       rthdr->rta_type = XFRMA_ALG_AUTH_TRUNC;
-                       rthdr->rta_len = RTA_LENGTH(sizeof(struct xfrm_algo_auth) + int_key.len);
-
-                       hdr->nlmsg_len += rthdr->rta_len;
-                       if (hdr->nlmsg_len > sizeof(request))
+                        * use specified truncation size supported by newer kernels.
+                        * also use this for untruncated MD5 and SHA1. */
+                       algo = netlink_reserve(hdr, sizeof(request), XFRMA_ALG_AUTH_TRUNC,
+                                                                  sizeof(*algo) + data->int_key.len);
+                       if (!algo)
                        {
-                               return FAILED;
+                               goto failed;
                        }
-
-                       algo = (struct xfrm_algo_auth*)RTA_DATA(rthdr);
-                       algo->alg_key_len = int_key.len * 8;
-                       algo->alg_trunc_len = 128;
-                       strcpy(algo->alg_name, alg_name);
-                       memcpy(algo->alg_key, int_key.ptr, int_key.len);
+                       algo->alg_key_len = data->int_key.len * 8;
+                       algo->alg_trunc_len = trunc_len;
+                       strncpy(algo->alg_name, alg_name, sizeof(algo->alg_name));
+                       algo->alg_name[sizeof(algo->alg_name) - 1] = '\0';
+                       memcpy(algo->alg_key, data->int_key.ptr, data->int_key.len);
                }
                else
                {
                        struct xfrm_algo* algo;
 
-                       rthdr->rta_type = XFRMA_ALG_AUTH;
-                       rthdr->rta_len = RTA_LENGTH(sizeof(struct xfrm_algo) + int_key.len);
-
-                       hdr->nlmsg_len += rthdr->rta_len;
-                       if (hdr->nlmsg_len > sizeof(request))
+                       algo = netlink_reserve(hdr, sizeof(request), XFRMA_ALG_AUTH,
+                                                                  sizeof(*algo) + data->int_key.len);
+                       if (!algo)
                        {
-                               return FAILED;
+                               goto failed;
                        }
-
-                       algo = (struct xfrm_algo*)RTA_DATA(rthdr);
-                       algo->alg_key_len = int_key.len * 8;
-                       strcpy(algo->alg_name, alg_name);
-                       memcpy(algo->alg_key, int_key.ptr, int_key.len);
+                       algo->alg_key_len = data->int_key.len * 8;
+                       strncpy(algo->alg_name, alg_name, sizeof(algo->alg_name));
+                       algo->alg_name[sizeof(algo->alg_name) - 1] = '\0';
+                       memcpy(algo->alg_key, data->int_key.ptr, data->int_key.len);
                }
-               rthdr = XFRM_RTA_NEXT(rthdr);
        }
 
        if (ipcomp != IPCOMP_NONE)
        {
-               rthdr->rta_type = XFRMA_ALG_COMP;
-               alg_name = lookup_algorithm(compression_algs, ipcomp);
+               struct xfrm_algo* algo;
+
+               alg_name = lookup_algorithm(COMPRESSION_ALGORITHM, ipcomp);
                if (alg_name == NULL)
                {
                        DBG1(DBG_KNL, "algorithm %N not supported by kernel!",
                                 ipcomp_transform_names, ipcomp);
-                       return FAILED;
+                       goto failed;
                }
                DBG2(DBG_KNL, "  using compression algorithm %N",
                         ipcomp_transform_names, ipcomp);
 
-               rthdr->rta_len = RTA_LENGTH(sizeof(struct xfrm_algo));
-               hdr->nlmsg_len += rthdr->rta_len;
-               if (hdr->nlmsg_len > sizeof(request))
+               algo = netlink_reserve(hdr, sizeof(request), XFRMA_ALG_COMP,
+                                                          sizeof(*algo));
+               if (!algo)
                {
-                       return FAILED;
+                       goto failed;
                }
-
-               struct xfrm_algo* algo = (struct xfrm_algo*)RTA_DATA(rthdr);
                algo->alg_key_len = 0;
-               strcpy(algo->alg_name, alg_name);
-
-               rthdr = XFRM_RTA_NEXT(rthdr);
+               strncpy(algo->alg_name, alg_name, sizeof(algo->alg_name));
+               algo->alg_name[sizeof(algo->alg_name) - 1] = '\0';
        }
 
-       if (encap)
+       if (data->encap)
        {
                struct xfrm_encap_tmpl *tmpl;
 
-               rthdr->rta_type = XFRMA_ENCAP;
-               rthdr->rta_len = RTA_LENGTH(sizeof(struct xfrm_encap_tmpl));
-
-               hdr->nlmsg_len += rthdr->rta_len;
-               if (hdr->nlmsg_len > sizeof(request))
+               tmpl = netlink_reserve(hdr, sizeof(request), XFRMA_ENCAP, sizeof(*tmpl));
+               if (!tmpl)
                {
-                       return FAILED;
+                       goto failed;
                }
-
-               tmpl = (struct xfrm_encap_tmpl*)RTA_DATA(rthdr);
                tmpl->encap_type = UDP_ENCAP_ESPINUDP;
-               tmpl->encap_sport = htons(src->get_port(src));
-               tmpl->encap_dport = htons(dst->get_port(dst));
+               tmpl->encap_sport = htons(id->src->get_port(id->src));
+               tmpl->encap_dport = htons(id->dst->get_port(id->dst));
                memset(&tmpl->encap_oa, 0, sizeof (xfrm_address_t));
                /* encap_oa could probably be derived from the
-                * traffic selectors [rfc4306, p39]. In the netlink kernel implementation
-                * pluto does the same as we do here but it uses encap_oa in the
-                * pfkey implementation. BUT as /usr/src/linux/net/key/af_key.c indicates
-                * the kernel ignores it anyway
+                * traffic selectors [rfc4306, p39]. In the netlink kernel
+                * implementation pluto does the same as we do here but it uses
+                * encap_oa in the pfkey implementation.
+                * BUT as /usr/src/linux/net/key/af_key.c indicates the kernel ignores
+                * it anyway
                 *   -> does that mean that NAT-T encap doesn't work in transport mode?
                 * No. The reason the kernel ignores NAT-OA is that it recomputes
-                * (or, rather, just ignores) the checksum. If packets pass
-                * the IPsec checks it marks them "checksum ok" so OA isn't needed. */
-               rthdr = XFRM_RTA_NEXT(rthdr);
+                * (or, rather, just ignores) the checksum. If packets pass the IPsec
+                * checks it marks them "checksum ok" so OA isn't needed. */
        }
 
-       if (mark.value)
+       if (!add_mark(hdr, sizeof(request), id->mark))
        {
-               struct xfrm_mark *mrk;
+               goto failed;
+       }
 
-               rthdr->rta_type = XFRMA_MARK;
-               rthdr->rta_len = RTA_LENGTH(sizeof(struct xfrm_mark));
+       if (data->tfc && id->proto == IPPROTO_ESP && mode == MODE_TUNNEL)
+       {       /* the kernel supports TFC padding only for tunnel mode ESP SAs */
+               uint32_t *tfcpad;
 
-               hdr->nlmsg_len += rthdr->rta_len;
-               if (hdr->nlmsg_len > sizeof(request))
+               tfcpad = netlink_reserve(hdr, sizeof(request), XFRMA_TFCPAD,
+                                                                sizeof(*tfcpad));
+               if (!tfcpad)
                {
-                       return FAILED;
+                       goto failed;
                }
-
-               mrk = (struct xfrm_mark*)RTA_DATA(rthdr);
-               mrk->v = mark.value;
-               mrk->m = mark.mask;
-               rthdr = XFRM_RTA_NEXT(rthdr);
+               *tfcpad = data->tfc;
        }
 
-       if (this->socket_xfrm->send_ack(this->socket_xfrm, hdr) != SUCCESS)
+       if (id->proto != IPPROTO_COMP)
        {
-               if (mark.value)
+               /* generally, we don't need a replay window for outbound SAs, however,
+                * when using ESN the kernel rejects the attribute if it is 0 */
+               if (!data->inbound && data->replay_window)
+               {
+                       data->replay_window = data->esn ? 1 : 0;
+               }
+               if (data->replay_window != 0 && (data->esn || data->replay_window > 32))
                {
-                       DBG1(DBG_KNL, "unable to add SAD entry with SPI %.8x  "
-                                                 "(mark %u/0x%8x)", ntohl(spi), mark.value, mark.mask);
+                       /* for ESN or larger replay windows we need the new
+                        * XFRMA_REPLAY_ESN_VAL attribute to configure a bitmap */
+                       struct xfrm_replay_state_esn *replay;
+                       uint32_t bmp_size;
+
+                       bmp_size = round_up(data->replay_window, sizeof(uint32_t) * 8) / 8;
+                       replay = netlink_reserve(hdr, sizeof(request), XFRMA_REPLAY_ESN_VAL,
+                                                                        sizeof(*replay) + bmp_size);
+                       if (!replay)
+                       {
+                               goto failed;
+                       }
+                       /* bmp_len contains number of __u32's */
+                       replay->bmp_len = bmp_size / sizeof(uint32_t);
+                       replay->replay_window = data->replay_window;
+                       DBG2(DBG_KNL, "  using replay window of %u packets",
+                                data->replay_window);
+
+                       if (data->esn)
+                       {
+                               DBG2(DBG_KNL, "  using extended sequence numbers (ESN)");
+                               sa->flags |= XFRM_STATE_ESN;
+                       }
                }
                else
                {
-                       DBG1(DBG_KNL, "unable to add SAD entry with SPI %.8x", ntohl(spi));
+                       DBG2(DBG_KNL, "  using replay window of %u packets",
+                                data->replay_window);
+                       sa->replay_window = data->replay_window;
                }
-               return FAILED;
        }
-       return SUCCESS;
+
+       if (this->socket_xfrm->send_ack(this->socket_xfrm, hdr) != SUCCESS)
+       {
+               DBG1(DBG_KNL, "unable to add SAD entry with SPI %.8x%s", ntohl(id->spi),
+                        markstr);
+               goto failed;
+       }
+
+       status = SUCCESS;
+
+failed:
+       memwipe(&request, sizeof(request));
+       return status;
 }
 
 /**
- * Get the replay state (i.e. sequence numbers) of an SA.
+ * Get the ESN replay state (i.e. sequence numbers) of an SA.
+ *
+ * Stores allocated copies of the state structures we get from the kernel.
  */
-static status_t get_replay_state(private_kernel_netlink_ipsec_t *this,
-                                                 u_int32_t spi, protocol_id_t protocol, host_t *dst,
-                                                 struct xfrm_replay_state *replay)
+static void get_replay_state(private_kernel_netlink_ipsec_t *this,
+                                                        kernel_ipsec_sa_id_t *sa,
+                                                        struct xfrm_replay_state_esn **replay_esn,
+                                                        uint32_t *replay_esn_len,
+                                                        struct xfrm_replay_state **replay,
+                                                        struct xfrm_lifetime_cur **lifetime)
 {
        netlink_buf_t request;
        struct nlmsghdr *hdr, *out = NULL;
@@ -1218,20 +1662,26 @@ static status_t get_replay_state(private_kernel_netlink_ipsec_t *this,
 
        memset(&request, 0, sizeof(request));
 
-       DBG2(DBG_KNL, "querying replay state from SAD entry with SPI %.8x", ntohl(spi));
+       DBG2(DBG_KNL, "querying replay state from SAD entry with SPI %.8x",
+                ntohl(sa->spi));
 
-       hdr = (struct nlmsghdr*)request;
+       hdr = &request.hdr;
        hdr->nlmsg_flags = NLM_F_REQUEST;
        hdr->nlmsg_type = XFRM_MSG_GETAE;
        hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_aevent_id));
 
-       aevent_id = (struct xfrm_aevent_id*)NLMSG_DATA(hdr);
+       aevent_id = NLMSG_DATA(hdr);
        aevent_id->flags = XFRM_AE_RVAL;
 
-       host2xfrm(dst, &aevent_id->sa_id.daddr);
-       aevent_id->sa_id.spi = spi;
-       aevent_id->sa_id.proto = proto_ike2kernel(protocol);
-       aevent_id->sa_id.family = dst->get_family(dst);
+       host2xfrm(sa->dst, &aevent_id->sa_id.daddr);
+       aevent_id->sa_id.spi = sa->spi;
+       aevent_id->sa_id.proto = sa->proto;
+       aevent_id->sa_id.family = sa->dst->get_family(sa->dst);
+
+       if (!add_mark(hdr, sizeof(request), sa->mark))
+       {
+               return;
+       }
 
        if (this->socket_xfrm->send(this->socket_xfrm, hdr, &out, &len) == SUCCESS)
        {
@@ -1248,8 +1698,8 @@ static status_t get_replay_state(private_kernel_netlink_ipsec_t *this,
                                case NLMSG_ERROR:
                                {
                                        struct nlmsgerr *err = NLMSG_DATA(hdr);
-                                       DBG1(DBG_KNL, "querying replay state from SAD entry failed: %s (%d)",
-                                                strerror(-err->error), -err->error);
+                                       DBG1(DBG_KNL, "querying replay state from SAD entry "
+                                                "failed: %s (%d)", strerror(-err->error), -err->error);
                                        break;
                                }
                                default:
@@ -1262,82 +1712,73 @@ static status_t get_replay_state(private_kernel_netlink_ipsec_t *this,
                }
        }
 
-       if (out_aevent == NULL)
-       {
-               DBG1(DBG_KNL, "unable to query replay state from SAD entry with SPI %.8x",
-                                         ntohl(spi));
-               free(out);
-               return FAILED;
-       }
-
-       rta = XFRM_RTA(out, struct xfrm_aevent_id);
-       rtasize = XFRM_PAYLOAD(out, struct xfrm_aevent_id);
-       while(RTA_OK(rta, rtasize))
+       if (out_aevent)
        {
-               if (rta->rta_type == XFRMA_REPLAY_VAL &&
-                       RTA_PAYLOAD(rta) == sizeof(struct xfrm_replay_state))
+               rta = XFRM_RTA(out, struct xfrm_aevent_id);
+               rtasize = XFRM_PAYLOAD(out, struct xfrm_aevent_id);
+               while (RTA_OK(rta, rtasize))
                {
-                       memcpy(replay, RTA_DATA(rta), RTA_PAYLOAD(rta));
-                       free(out);
-                       return SUCCESS;
+                       if (rta->rta_type == XFRMA_LTIME_VAL &&
+                               RTA_PAYLOAD(rta) == sizeof(**lifetime))
+                       {
+                               free(*lifetime);
+                               *lifetime = malloc(RTA_PAYLOAD(rta));
+                               memcpy(*lifetime, RTA_DATA(rta), RTA_PAYLOAD(rta));
+                       }
+                       if (rta->rta_type == XFRMA_REPLAY_VAL &&
+                               RTA_PAYLOAD(rta) == sizeof(**replay))
+                       {
+                               free(*replay);
+                               *replay = malloc(RTA_PAYLOAD(rta));
+                               memcpy(*replay, RTA_DATA(rta), RTA_PAYLOAD(rta));
+                       }
+                       if (rta->rta_type == XFRMA_REPLAY_ESN_VAL &&
+                               RTA_PAYLOAD(rta) >= sizeof(**replay_esn))
+                       {
+                               free(*replay_esn);
+                               *replay_esn = malloc(RTA_PAYLOAD(rta));
+                               *replay_esn_len = RTA_PAYLOAD(rta);
+                               memcpy(*replay_esn, RTA_DATA(rta), RTA_PAYLOAD(rta));
+                       }
+                       rta = RTA_NEXT(rta, rtasize);
                }
-               rta = RTA_NEXT(rta, rtasize);
        }
-
-       DBG1(DBG_KNL, "unable to query replay state from SAD entry with SPI %.8x",
-                                 ntohl(spi));
        free(out);
-       return FAILED;
 }
 
 METHOD(kernel_ipsec_t, query_sa, status_t,
-       private_kernel_netlink_ipsec_t *this, host_t *src, host_t *dst,
-       u_int32_t spi, protocol_id_t protocol, mark_t mark, u_int64_t *bytes)
+       private_kernel_netlink_ipsec_t *this, kernel_ipsec_sa_id_t *id,
+       kernel_ipsec_query_sa_t *data, uint64_t *bytes, uint64_t *packets,
+       time_t *time)
 {
        netlink_buf_t request;
        struct nlmsghdr *out = NULL, *hdr;
        struct xfrm_usersa_id *sa_id;
        struct xfrm_usersa_info *sa = NULL;
+       status_t status = FAILED;
        size_t len;
+       char markstr[32] = "";
 
        memset(&request, 0, sizeof(request));
+       format_mark(markstr, sizeof(markstr), id->mark);
 
-       if (mark.value)
-       {
-               DBG2(DBG_KNL, "querying SAD entry with SPI %.8x  (mark %u/0x%8x)",
-                                          ntohl(spi), mark.value, mark.mask);
-       }
-       else
-       {
-               DBG2(DBG_KNL, "querying SAD entry with SPI %.8x", ntohl(spi));
-       }
-       hdr = (struct nlmsghdr*)request;
+       DBG2(DBG_KNL, "querying SAD entry with SPI %.8x%s", ntohl(id->spi),
+                markstr);
+
+       hdr = &request.hdr;
        hdr->nlmsg_flags = NLM_F_REQUEST;
        hdr->nlmsg_type = XFRM_MSG_GETSA;
        hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_usersa_id));
 
-       sa_id = (struct xfrm_usersa_id*)NLMSG_DATA(hdr);
-       host2xfrm(dst, &sa_id->daddr);
-       sa_id->spi = spi;
-       sa_id->proto = proto_ike2kernel(protocol);
-       sa_id->family = dst->get_family(dst);
+       sa_id = NLMSG_DATA(hdr);
+       host2xfrm(id->dst, &sa_id->daddr);
+       sa_id->spi = id->spi;
+       sa_id->proto = id->proto;
+       sa_id->family = id->dst->get_family(id->dst);
 
-       if (mark.value)
+       if (!add_mark(hdr, sizeof(request), id->mark))
        {
-               struct xfrm_mark *mrk;
-               struct rtattr *rthdr = XFRM_RTA(hdr, struct xfrm_usersa_id);
-
-               rthdr->rta_type = XFRMA_MARK;
-               rthdr->rta_len = RTA_LENGTH(sizeof(struct xfrm_mark));
-               hdr->nlmsg_len += rthdr->rta_len;
-               if (hdr->nlmsg_len > sizeof(request))
-               {
-                       return FAILED;
-               }
-
-               mrk = (struct xfrm_mark*)RTA_DATA(rthdr);
-               mrk->v = mark.value;
-               mrk->m = mark.mask;
+               return FAILED;
        }
 
        if (this->socket_xfrm->send(this->socket_xfrm, hdr, &out, &len) == SUCCESS)
@@ -1349,26 +1790,16 @@ METHOD(kernel_ipsec_t, query_sa, status_t,
                        {
                                case XFRM_MSG_NEWSA:
                                {
-                                       sa = (struct xfrm_usersa_info*)NLMSG_DATA(hdr);
+                                       sa = NLMSG_DATA(hdr);
                                        break;
                                }
                                case NLMSG_ERROR:
                                {
                                        struct nlmsgerr *err = NLMSG_DATA(hdr);
 
-                                       if (mark.value)
-                                       {
-                                               DBG1(DBG_KNL, "querying SAD entry with SPI %.8x  "
-                                                                         "(mark %u/0x%8x) failed: %s (%d)",
-                                                                          ntohl(spi), mark.value, mark.mask,
-                                                                          strerror(-err->error), -err->error);
-                                       }
-                                       else
-                                       {
-                                               DBG1(DBG_KNL, "querying SAD entry with SPI %.8x "
-                                                                         "failed: %s (%d)", ntohl(spi),
-                                                                          strerror(-err->error), -err->error);
-                                       }
+                                       DBG1(DBG_KNL, "querying SAD entry with SPI %.8x%s failed: "
+                                                "%s (%d)", ntohl(id->spi), markstr,
+                                                strerror(-err->error), -err->error);
                                        break;
                                }
                                default:
@@ -1383,102 +1814,97 @@ METHOD(kernel_ipsec_t, query_sa, status_t,
 
        if (sa == NULL)
        {
-               DBG2(DBG_KNL, "unable to query SAD entry with SPI %.8x", ntohl(spi));
-               free(out);
-               return FAILED;
+               DBG2(DBG_KNL, "unable to query SAD entry with SPI %.8x%s",
+                        ntohl(id->spi), markstr);
        }
-       *bytes = sa->curlft.bytes;
-
+       else
+       {
+               if (bytes)
+               {
+                       *bytes = sa->curlft.bytes;
+               }
+               if (packets)
+               {
+                       *packets = sa->curlft.packets;
+               }
+               if (time)
+               {       /* curlft contains a "use" time, but that contains a timestamp
+                        * of the first use, not the last. Last use time must be queried
+                        * on the policy on Linux */
+                       *time = 0;
+               }
+               status = SUCCESS;
+       }
+       memwipe(out, len);
        free(out);
-       return SUCCESS;
+       return status;
 }
 
 METHOD(kernel_ipsec_t, del_sa, status_t,
-       private_kernel_netlink_ipsec_t *this, host_t *src, host_t *dst,
-       u_int32_t spi, protocol_id_t protocol, u_int16_t cpi, mark_t mark)
+       private_kernel_netlink_ipsec_t *this, kernel_ipsec_sa_id_t *id,
+       kernel_ipsec_del_sa_t *data)
 {
        netlink_buf_t request;
        struct nlmsghdr *hdr;
        struct xfrm_usersa_id *sa_id;
+       char markstr[32] = "";
 
        /* if IPComp was used, we first delete the additional IPComp SA */
-       if (cpi)
+       if (data->cpi)
        {
-               del_sa(this, src, dst, htonl(ntohs(cpi)), IPPROTO_COMP, 0, mark);
+               kernel_ipsec_sa_id_t ipcomp_id = {
+                       .src = id->src,
+                       .dst = id->dst,
+                       .spi = htonl(ntohs(data->cpi)),
+                       .proto = IPPROTO_COMP,
+                       .mark = id->mark,
+               };
+               kernel_ipsec_del_sa_t ipcomp = {};
+               del_sa(this, &ipcomp_id, &ipcomp);
        }
 
        memset(&request, 0, sizeof(request));
+       format_mark(markstr, sizeof(markstr), id->mark);
 
-       if (mark.value)
-       {
-               DBG2(DBG_KNL, "deleting SAD entry with SPI %.8x  (mark %u/0x%8x)",
-                                          ntohl(spi), mark.value, mark.mask);
-       }
-       else
-       {
-               DBG2(DBG_KNL, "deleting SAD entry with SPI %.8x", ntohl(spi));
-       }
-       hdr = (struct nlmsghdr*)request;
+       DBG2(DBG_KNL, "deleting SAD entry with SPI %.8x%s", ntohl(id->spi),
+                markstr);
+
+       hdr = &request.hdr;
        hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
        hdr->nlmsg_type = XFRM_MSG_DELSA;
        hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_usersa_id));
 
-       sa_id = (struct xfrm_usersa_id*)NLMSG_DATA(hdr);
-       host2xfrm(dst, &sa_id->daddr);
-       sa_id->spi = spi;
-       sa_id->proto = proto_ike2kernel(protocol);
-       sa_id->family = dst->get_family(dst);
+       sa_id = NLMSG_DATA(hdr);
+       host2xfrm(id->dst, &sa_id->daddr);
+       sa_id->spi = id->spi;
+       sa_id->proto = id->proto;
+       sa_id->family = id->dst->get_family(id->dst);
 
-       if (mark.value)
-       {
-               struct xfrm_mark *mrk;
-               struct rtattr *rthdr = XFRM_RTA(hdr, struct xfrm_usersa_id);
-
-               rthdr->rta_type = XFRMA_MARK;
-               rthdr->rta_len = RTA_LENGTH(sizeof(struct xfrm_mark));
-               hdr->nlmsg_len += rthdr->rta_len;
-               if (hdr->nlmsg_len > sizeof(request))
-               {
-                       return FAILED;
-               }
-
-               mrk = (struct xfrm_mark*)RTA_DATA(rthdr);
-               mrk->v = mark.value;
-               mrk->m = mark.mask;
-       }
-
-       if (this->socket_xfrm->send_ack(this->socket_xfrm, hdr) != SUCCESS)
+       if (!add_mark(hdr, sizeof(request), id->mark))
        {
-               if (mark.value)
-               {
-                       DBG1(DBG_KNL, "unable to delete SAD entry with SPI %.8x  "
-                                                 "(mark %u/0x%8x)", ntohl(spi), mark.value, mark.mask);
-               }
-               else
-               {
-                       DBG1(DBG_KNL, "unable to delete SAD entry with SPI %.8x", ntohl(spi));
-               }
                return FAILED;
        }
-       if (mark.value)
-       {
-               DBG2(DBG_KNL, "deleted SAD entry with SPI %.8x  (mark %u/0x%8x)",
-                                          ntohl(spi), mark.value, mark.mask);
-       }
-       else
+
+       switch (this->socket_xfrm->send_ack(this->socket_xfrm, hdr))
        {
-               DBG2(DBG_KNL, "deleted SAD entry with SPI %.8x", ntohl(spi));
+               case SUCCESS:
+                       DBG2(DBG_KNL, "deleted SAD entry with SPI %.8x%s",
+                                ntohl(id->spi), markstr);
+                       return SUCCESS;
+               case NOT_FOUND:
+                       return NOT_FOUND;
+               default:
+                       DBG1(DBG_KNL, "unable to delete SAD entry with SPI %.8x%s",
+                                ntohl(id->spi), markstr);
+                       return FAILED;
        }
-       return SUCCESS;
 }
 
 METHOD(kernel_ipsec_t, update_sa, status_t,
-       private_kernel_netlink_ipsec_t *this, u_int32_t spi, protocol_id_t protocol,
-       u_int16_t cpi, host_t *src, host_t *dst, host_t *new_src, host_t *new_dst,
-       bool old_encap, bool new_encap, mark_t mark)
+       private_kernel_netlink_ipsec_t *this, kernel_ipsec_sa_id_t *id,
+       kernel_ipsec_update_sa_t *data)
 {
        netlink_buf_t request;
-       u_char *pos;
        struct nlmsghdr *hdr, *out = NULL;
        struct xfrm_usersa_id *sa_id;
        struct xfrm_usersa_info *out_sa = NULL, *sa;
@@ -1486,31 +1912,53 @@ METHOD(kernel_ipsec_t, update_sa, status_t,
        struct rtattr *rta;
        size_t rtasize;
        struct xfrm_encap_tmpl* tmpl = NULL;
-       bool got_replay_state = FALSE;
-       struct xfrm_replay_state replay;
+       struct xfrm_replay_state *replay = NULL;
+       struct xfrm_replay_state_esn *replay_esn = NULL;
+       struct xfrm_lifetime_cur *lifetime = NULL;
+       uint32_t replay_esn_len = 0;
+       kernel_ipsec_del_sa_t del = { 0 };
+       status_t status = FAILED;
+       char markstr[32] = "";
 
        /* if IPComp is used, we first update the IPComp SA */
-       if (cpi)
-       {
-               update_sa(this, htonl(ntohs(cpi)), IPPROTO_COMP, 0,
-                                 src, dst, new_src, new_dst, FALSE, FALSE, mark);
+       if (data->cpi)
+       {
+               kernel_ipsec_sa_id_t ipcomp_id = {
+                       .src = id->src,
+                       .dst = id->dst,
+                       .spi = htonl(ntohs(data->cpi)),
+                       .proto = IPPROTO_COMP,
+                       .mark = id->mark,
+               };
+               kernel_ipsec_update_sa_t ipcomp = {
+                       .new_src = data->new_src,
+                       .new_dst = data->new_dst,
+               };
+               update_sa(this, &ipcomp_id, &ipcomp);
        }
 
        memset(&request, 0, sizeof(request));
+       format_mark(markstr, sizeof(markstr), id->mark);
 
-       DBG2(DBG_KNL, "querying SAD entry with SPI %.8x for update", ntohl(spi));
+       DBG2(DBG_KNL, "querying SAD entry with SPI %.8x%s for update",
+                ntohl(id->spi), markstr);
 
        /* query the existing SA first */
-       hdr = (struct nlmsghdr*)request;
+       hdr = &request.hdr;
        hdr->nlmsg_flags = NLM_F_REQUEST;
        hdr->nlmsg_type = XFRM_MSG_GETSA;
        hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_usersa_id));
 
-       sa_id = (struct xfrm_usersa_id*)NLMSG_DATA(hdr);
-       host2xfrm(dst, &sa_id->daddr);
-       sa_id->spi = spi;
-       sa_id->proto = proto_ike2kernel(protocol);
-       sa_id->family = dst->get_family(dst);
+       sa_id = NLMSG_DATA(hdr);
+       host2xfrm(id->dst, &sa_id->daddr);
+       sa_id->spi = id->spi;
+       sa_id->proto = id->proto;
+       sa_id->family = id->dst->get_family(id->dst);
+
+       if (!add_mark(hdr, sizeof(request), id->mark))
+       {
+               return FAILED;
+       }
 
        if (this->socket_xfrm->send(this->socket_xfrm, hdr, &out, &len) == SUCCESS)
        {
@@ -1542,387 +1990,594 @@ METHOD(kernel_ipsec_t, update_sa, status_t,
        }
        if (out_sa == NULL)
        {
-               DBG1(DBG_KNL, "unable to update SAD entry with SPI %.8x", ntohl(spi));
-               free(out);
-               return FAILED;
+               DBG1(DBG_KNL, "unable to update SAD entry with SPI %.8x%s",
+                        ntohl(id->spi), markstr);
+               goto failed;
        }
 
-       /* try to get the replay state */
-       if (get_replay_state(this, spi, protocol, dst, &replay) == SUCCESS)
-       {
-               got_replay_state = TRUE;
-       }
+       get_replay_state(this, id, &replay_esn, &replay_esn_len, &replay,
+                                        &lifetime);
 
        /* delete the old SA (without affecting the IPComp SA) */
-       if (del_sa(this, src, dst, spi, protocol, 0, mark) != SUCCESS)
+       if (del_sa(this, id, &del) != SUCCESS)
        {
-               DBG1(DBG_KNL, "unable to delete old SAD entry with SPI %.8x", ntohl(spi));
-               free(out);
-               return FAILED;
+               DBG1(DBG_KNL, "unable to delete old SAD entry with SPI %.8x%s",
+                        ntohl(id->spi), markstr);
+               goto failed;
        }
 
-       DBG2(DBG_KNL, "updating SAD entry with SPI %.8x from %#H..%#H to %#H..%#H",
-                ntohl(spi), src, dst, new_src, new_dst);
+       DBG2(DBG_KNL, "updating SAD entry with SPI %.8x%s from %#H..%#H to "
+                "%#H..%#H", ntohl(id->spi), markstr, id->src, id->dst, data->new_src,
+                data->new_dst);
        /* copy over the SA from out to request */
-       hdr = (struct nlmsghdr*)request;
-       memcpy(hdr, out, min(out->nlmsg_len, sizeof(request)));
+       hdr = &request.hdr;
        hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
        hdr->nlmsg_type = XFRM_MSG_NEWSA;
        hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_usersa_info));
        sa = NLMSG_DATA(hdr);
-       sa->family = new_dst->get_family(new_dst);
+       memcpy(sa, NLMSG_DATA(out), sizeof(struct xfrm_usersa_info));
+       sa->family = data->new_dst->get_family(data->new_dst);
 
-       if (!src->ip_equals(src, new_src))
+       if (!id->src->ip_equals(id->src, data->new_src))
        {
-               host2xfrm(new_src, &sa->saddr);
+               host2xfrm(data->new_src, &sa->saddr);
        }
-       if (!dst->ip_equals(dst, new_dst))
+       if (!id->dst->ip_equals(id->dst, data->new_dst))
        {
-               host2xfrm(new_dst, &sa->id.daddr);
+               host2xfrm(data->new_dst, &sa->id.daddr);
        }
 
        rta = XFRM_RTA(out, struct xfrm_usersa_info);
        rtasize = XFRM_PAYLOAD(out, struct xfrm_usersa_info);
-       pos = (u_char*)XFRM_RTA(hdr, struct xfrm_usersa_info);
-       while(RTA_OK(rta, rtasize))
+       while (RTA_OK(rta, rtasize))
        {
                /* copy all attributes, but not XFRMA_ENCAP if we are disabling it */
-               if (rta->rta_type != XFRMA_ENCAP || new_encap)
+               if (rta->rta_type != XFRMA_ENCAP || data->new_encap)
                {
                        if (rta->rta_type == XFRMA_ENCAP)
                        {       /* update encap tmpl */
-                               tmpl = (struct xfrm_encap_tmpl*)RTA_DATA(rta);
-                               tmpl->encap_sport = ntohs(new_src->get_port(new_src));
-                               tmpl->encap_dport = ntohs(new_dst->get_port(new_dst));
+                               tmpl = RTA_DATA(rta);
+                               tmpl->encap_sport = ntohs(data->new_src->get_port(data->new_src));
+                               tmpl->encap_dport = ntohs(data->new_dst->get_port(data->new_dst));
                        }
-                       memcpy(pos, rta, rta->rta_len);
-                       pos += RTA_ALIGN(rta->rta_len);
-                       hdr->nlmsg_len += RTA_ALIGN(rta->rta_len);
+                       netlink_add_attribute(hdr, rta->rta_type,
+                                                                 chunk_create(RTA_DATA(rta), RTA_PAYLOAD(rta)),
+                                                                 sizeof(request));
                }
                rta = RTA_NEXT(rta, rtasize);
        }
 
-       rta = (struct rtattr*)pos;
-       if (tmpl == NULL && new_encap)
+       if (tmpl == NULL && data->new_encap)
        {       /* add tmpl if we are enabling it */
-               rta->rta_type = XFRMA_ENCAP;
-               rta->rta_len = RTA_LENGTH(sizeof(struct xfrm_encap_tmpl));
-
-               hdr->nlmsg_len += rta->rta_len;
-               if (hdr->nlmsg_len > sizeof(request))
+               tmpl = netlink_reserve(hdr, sizeof(request), XFRMA_ENCAP, sizeof(*tmpl));
+               if (!tmpl)
                {
-                       return FAILED;
+                       goto failed;
                }
-
-               tmpl = (struct xfrm_encap_tmpl*)RTA_DATA(rta);
                tmpl->encap_type = UDP_ENCAP_ESPINUDP;
-               tmpl->encap_sport = ntohs(new_src->get_port(new_src));
-               tmpl->encap_dport = ntohs(new_dst->get_port(new_dst));
+               tmpl->encap_sport = ntohs(data->new_src->get_port(data->new_src));
+               tmpl->encap_dport = ntohs(data->new_dst->get_port(data->new_dst));
                memset(&tmpl->encap_oa, 0, sizeof (xfrm_address_t));
-
-               rta = XFRM_RTA_NEXT(rta);
        }
 
-       if (got_replay_state)
-       {       /* copy the replay data if available */
-               rta->rta_type = XFRMA_REPLAY_VAL;
-               rta->rta_len = RTA_LENGTH(sizeof(struct xfrm_replay_state));
+       if (replay_esn)
+       {
+               struct xfrm_replay_state_esn *state;
 
-               hdr->nlmsg_len += rta->rta_len;
-               if (hdr->nlmsg_len > sizeof(request))
+               state = netlink_reserve(hdr, sizeof(request), XFRMA_REPLAY_ESN_VAL,
+                                                               replay_esn_len);
+               if (!state)
                {
-                       return FAILED;
+                       goto failed;
                }
-               memcpy(RTA_DATA(rta), &replay, sizeof(replay));
-
-               rta = XFRM_RTA_NEXT(rta);
+               memcpy(state, replay_esn, replay_esn_len);
        }
-
-       if (this->socket_xfrm->send_ack(this->socket_xfrm, hdr) != SUCCESS)
+       else if (replay)
        {
-               DBG1(DBG_KNL, "unable to update SAD entry with SPI %.8x", ntohl(spi));
-               free(out);
-               return FAILED;
-       }
-       free(out);
-
-       return SUCCESS;
-}
-
-METHOD(kernel_ipsec_t, add_policy, status_t,
-       private_kernel_netlink_ipsec_t *this, host_t *src, host_t *dst,
-       traffic_selector_t *src_ts, traffic_selector_t *dst_ts,
-       policy_dir_t direction, u_int32_t spi, protocol_id_t protocol,
-       u_int32_t reqid, mark_t mark, ipsec_mode_t mode, u_int16_t ipcomp,
-       u_int16_t cpi,  bool routed)
-{
-       policy_entry_t *current, *policy;
-       bool found = FALSE;
-       netlink_buf_t request;
-       struct xfrm_userpolicy_info *policy_info;
-       struct nlmsghdr *hdr;
-
-       /* create a policy */
-       policy = malloc_thing(policy_entry_t);
-       memset(policy, 0, sizeof(policy_entry_t));
-       policy->sel = ts2selector(src_ts, dst_ts);
-       policy->mark = mark.value & mark.mask;
-       policy->direction = direction;
+               struct xfrm_replay_state *state;
 
-       /* find the policy, which matches EXACTLY */
-       this->mutex->lock(this->mutex);
-       current = this->policies->get(this->policies, policy);
-       if (current)
-       {
-               /* use existing policy */
-               current->refcount++;
-               if (mark.value)
-               {
-                       DBG2(DBG_KNL, "policy %R === %R %N  (mark %u/0x%8x) "
-                                                 "already exists, increasing refcount",
-                                                  src_ts, dst_ts, policy_dir_names, direction,
-                                                  mark.value, mark.mask);
-               }
-               else
+               state = netlink_reserve(hdr, sizeof(request), XFRMA_REPLAY_VAL,
+                                                               sizeof(*state));
+               if (!state)
                {
-                       DBG2(DBG_KNL, "policy %R === %R %N "
-                                                 "already exists, increasing refcount",
-                                                  src_ts, dst_ts, policy_dir_names, direction);
+                       goto failed;
                }
-               free(policy);
-               policy = current;
-               found = TRUE;
+               memcpy(state, replay, sizeof(*state));
        }
        else
-       {       /* apply the new one, if we have no such policy */
-               this->policies->put(this->policies, policy, policy);
-               policy->refcount = 1;
+       {
+               DBG1(DBG_KNL, "unable to copy replay state from old SAD entry with "
+                        "SPI %.8x%s", ntohl(id->spi), markstr);
        }
-
-       if (mark.value)
+       if (lifetime)
        {
-               DBG2(DBG_KNL, "adding policy %R === %R %N  (mark %u/0x%8x)",
-                                          src_ts, dst_ts, policy_dir_names, direction,
-                                          mark.value, mark.mask);
+               struct xfrm_lifetime_cur *state;
+
+               state = netlink_reserve(hdr, sizeof(request), XFRMA_LTIME_VAL,
+                                                               sizeof(*state));
+               if (!state)
+               {
+                       goto failed;
+               }
+               memcpy(state, lifetime, sizeof(*state));
        }
        else
        {
-               DBG2(DBG_KNL, "adding policy %R === %R %N",
-                                          src_ts, dst_ts, policy_dir_names, direction);
+               DBG1(DBG_KNL, "unable to copy usage stats from old SAD entry with "
+                        "SPI %.8x%s", ntohl(id->spi), markstr);
        }
 
-       memset(&request, 0, sizeof(request));
-       hdr = (struct nlmsghdr*)request;
-       hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
-       hdr->nlmsg_type = found ? XFRM_MSG_UPDPOLICY : XFRM_MSG_NEWPOLICY;
-       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_info));
+       if (this->socket_xfrm->send_ack(this->socket_xfrm, hdr) != SUCCESS)
+       {
+               DBG1(DBG_KNL, "unable to update SAD entry with SPI %.8x%s",
+                        ntohl(id->spi), markstr);
+               goto failed;
+       }
 
-       policy_info = (struct xfrm_userpolicy_info*)NLMSG_DATA(hdr);
-       policy_info->sel = policy->sel;
-       policy_info->dir = policy->direction;
-       /* calculate priority based on source selector size, small size = high prio */
-       policy_info->priority = routed ? PRIO_LOW : PRIO_HIGH;
-       policy_info->priority -= policy->sel.prefixlen_s * 10;
-       policy_info->priority -= policy->sel.proto ? 2 : 0;
-       policy_info->priority -= policy->sel.sport_mask ? 1 : 0;
-       policy_info->action = XFRM_POLICY_ALLOW;
-       policy_info->share = XFRM_SHARE_ANY;
-       this->mutex->unlock(this->mutex);
+       status = SUCCESS;
+failed:
+       free(replay);
+       free(replay_esn);
+       free(lifetime);
+       memwipe(out, len);
+       memwipe(&request, sizeof(request));
+       free(out);
 
-       /* policies don't expire */
-       policy_info->lft.soft_byte_limit = XFRM_INF;
-       policy_info->lft.soft_packet_limit = XFRM_INF;
-       policy_info->lft.hard_byte_limit = XFRM_INF;
-       policy_info->lft.hard_packet_limit = XFRM_INF;
-       policy_info->lft.soft_add_expires_seconds = 0;
-       policy_info->lft.hard_add_expires_seconds = 0;
-       policy_info->lft.soft_use_expires_seconds = 0;
-       policy_info->lft.hard_use_expires_seconds = 0;
+       return status;
+}
+
+METHOD(kernel_ipsec_t, flush_sas, status_t,
+       private_kernel_netlink_ipsec_t *this)
+{       /* Flush all SAD entries by sending one XFRM_MSG_FLUSHSA request per
+         * protocol (AH, ESP, IPComp). Returns FAILED as soon as one of the
+         * requests fails, SUCCESS otherwise. */
+       netlink_buf_t request;
+       struct nlmsghdr *hdr;
+       struct xfrm_usersa_flush *flush;
+       struct {
+               uint8_t proto;
+               char *name;
+       } protos[] = {
+               { IPPROTO_AH, "AH" },
+               { IPPROTO_ESP, "ESP" },
+               { IPPROTO_COMP, "IPComp" },
+       };
+       int i;
 
-       struct rtattr *rthdr = XFRM_RTA(hdr, struct xfrm_userpolicy_info);
-       rthdr->rta_type = XFRMA_TMPL;
-       rthdr->rta_len = RTA_LENGTH(sizeof(struct xfrm_user_tmpl));
+       memset(&request, 0, sizeof(request));
 
-       hdr->nlmsg_len += rthdr->rta_len;
-       if (hdr->nlmsg_len > sizeof(request))
-       {
-               return FAILED;
-       }
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+       hdr->nlmsg_type = XFRM_MSG_FLUSHSA;
+       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_usersa_flush));
 
-       struct xfrm_user_tmpl *tmpl = (struct xfrm_user_tmpl*)RTA_DATA(rthdr);
+       flush = NLMSG_DATA(hdr);
 
-       if (ipcomp != IPCOMP_NONE)
+       for (i = 0; i < countof(protos); i++)
        {
-               tmpl->reqid = reqid;
-               tmpl->id.proto = IPPROTO_COMP;
-               tmpl->aalgos = tmpl->ealgos = tmpl->calgos = ~0;
-               tmpl->mode = mode2kernel(mode);
-               tmpl->optional = direction != POLICY_OUT;
-               tmpl->family = src->get_family(src);
+               DBG2(DBG_KNL, "flushing all %s SAD entries", protos[i].name);
 
-               host2xfrm(src, &tmpl->saddr);
-               host2xfrm(dst, &tmpl->id.daddr);
+               flush->proto = protos[i].proto; /* the only field set per round */
 
-               /* add an additional xfrm_user_tmpl */
-               rthdr->rta_len += RTA_LENGTH(sizeof(struct xfrm_user_tmpl));
-               hdr->nlmsg_len += RTA_LENGTH(sizeof(struct xfrm_user_tmpl));
-               if (hdr->nlmsg_len > sizeof(request))
+               if (this->socket_xfrm->send_ack(this->socket_xfrm, hdr) != SUCCESS)
                {
+                       DBG1(DBG_KNL, "unable to flush %s SAD entries", protos[i].name);
                        return FAILED;
                }
+       }
+       return SUCCESS;
+}
 
-               tmpl++;
-
-               /* use transport mode for ESP if we have a tunnel mode IPcomp SA */
-               mode = MODE_TRANSPORT;
+/**
+ * Finish a change to a policy: clear its working flag, wake up waiting
+ * threads (if any are waiting for this policy) and unlock the mutex.
+ *
+ * The mutex is unlocked here unconditionally, so the caller is expected to
+ * hold it when calling this function.
+ *
+ * @param this         private kernel interface object
+ * @param policy       policy whose change has been completed
+ */
+static void policy_change_done(private_kernel_netlink_ipsec_t *this,
+                                                          policy_entry_t *policy)
+{
+       policy->working = FALSE;
+       if (policy->waiting)
+       {       /* only broadcast if a thread actually waits for this policy */
+               this->condvar->broadcast(this->condvar);
+       }
+       this->mutex->unlock(this->mutex);
+}
+
+/**
+ * Install a route for the given policy.
+ *
+ * Whether routes are enabled or required at all is not checked in this
+ * function. The route covers the policy's destination selector and uses an
+ * address found via the outbound source traffic selector as source address;
+ * if no such address is found nothing is installed.
+ *
+ * If an equal route is already cached on the policy nothing is done, a
+ * different cached route is uninstalled first. The new route is only cached
+ * in policy->route if add_route() returns SUCCESS, otherwise it is destroyed.
+ *
+ * @param this         private kernel interface object
+ * @param policy       policy to install the route for
+ * @param mapping      policy SA, accessed as policy_sa_out_t
+ * @param ipsec        IPsec SA providing the src/dst addresses
+ */
+static void install_route(private_kernel_netlink_ipsec_t *this,
+       policy_entry_t *policy, policy_sa_t *mapping, ipsec_sa_t *ipsec)
+{
+       policy_sa_out_t *out = (policy_sa_out_t*)mapping;
+       route_entry_t *route;
+       host_t *iface;
+
+       INIT(route,
+               .prefixlen = policy->sel.prefixlen_d,
+       );
+
+       if (charon->kernel->get_address_by_ts(charon->kernel, out->src_ts,
+                                                                                 &route->src_ip, NULL) == SUCCESS)
+       {
+               if (!ipsec->dst->is_anyaddr(ipsec->dst))
+               {
+                       route->gateway = charon->kernel->get_nexthop(charon->kernel,
+                                                                                               ipsec->dst, -1, ipsec->src,
+                                                                                               &route->if_name);
+               }
+               else
+               {       /* for shunt policies */
+                       iface = xfrm2host(policy->sel.family, &policy->sel.daddr, 0);
+                       route->gateway = charon->kernel->get_nexthop(charon->kernel,
+                                                                                               iface, policy->sel.prefixlen_d,
+                                                                                               route->src_ip, &route->if_name);
+                       iface->destroy(iface);
+               }
+               route->dst_net = chunk_alloc(policy->sel.family == AF_INET ? 4 : 16);
+               memcpy(route->dst_net.ptr, &policy->sel.daddr, route->dst_net.len);
+
+               /* determine the interface to install the route on if get_nexthop()
+                * did not provide one. If we have a local address, use it. Otherwise
+                * (for shunt policies) use the route's source address. If no
+                * interface is found, no route is installed. */
+               if (!route->if_name)
+               {
+                       iface = ipsec->src;
+                       if (iface->is_anyaddr(iface))
+                       {
+                               iface = route->src_ip;
+                       }
+                       if (!charon->kernel->get_interface(charon->kernel, iface,
+                                                                                          &route->if_name))
+                       {
+                               route_entry_destroy(route);
+                               return;
+                       }
+               }
+               if (policy->route)
+               {
+                       route_entry_t *old = policy->route;
+                       if (route_entry_equals(old, route))
+                       {
+                               route_entry_destroy(route);
+                               return;
+                       }
+                       /* uninstall previously installed route, a failure to do so is
+                        * only logged and the cached entry is dropped anyway */
+                       if (charon->kernel->del_route(charon->kernel, old->dst_net,
+                                                                                 old->prefixlen, old->gateway,
+                                                                                 old->src_ip, old->if_name) != SUCCESS)
+                       {
+                               DBG1(DBG_KNL, "error uninstalling route installed with policy "
+                                        "%R === %R %N", out->src_ts, out->dst_ts, policy_dir_names,
+                                        policy->direction);
+                       }
+                       route_entry_destroy(old);
+                       policy->route = NULL;
+               }
+
+               DBG2(DBG_KNL, "installing route: %R via %H src %H dev %s", out->dst_ts,
+                        route->gateway, route->src_ip, route->if_name);
+               switch (charon->kernel->add_route(charon->kernel, route->dst_net,
+                                                                                 route->prefixlen, route->gateway,
+                                                                                 route->src_ip, route->if_name))
+               {
+                       default:
+                               DBG1(DBG_KNL, "unable to install source route for %H",
+                                        route->src_ip);
+                               /* FALL */
+                       case ALREADY_DONE:
+                               /* route exists, do not uninstall */
+                               route_entry_destroy(route);
+                               break;
+                       case SUCCESS:
+                               /* cache the installed route */
+                               policy->route = route;
+                               break;
+               }
        }
        else
        {
-               /* when using IPcomp, only the IPcomp SA uses tmp src/dst addresses */
-               host2xfrm(src, &tmpl->saddr);
-               host2xfrm(dst, &tmpl->id.daddr);
+               free(route);    /* no source address, nothing to install */
        }
+}
+
+/**
+ * Add or update a policy in the kernel.
+ *
+ * Builds an XFRM_MSG_NEWPOLICY or XFRM_MSG_UPDPOLICY request from the policy
+ * and the given mapping and sends it. If adding a new policy fails with
+ * ALREADY_DONE, the same request is sent again as an update. After the
+ * request succeeded, a route may be installed for outbound policies (see the
+ * conditions in the function body).
+ *
+ * Note: The mutex has to be locked when entering this function
+ * and is unlocked here in any case. It is also released while the request is
+ * sent to the kernel and reacquired afterwards.
+ *
+ * @param this         private kernel interface object
+ * @param policy       policy to install
+ * @param mapping      policy SA providing type, priority and the IPsec SA
+ * @param update       TRUE to send an update instead of adding a new policy
+ * @return                     SUCCESS, or FAILED if building or sending the request failed
+ */
+static status_t add_policy_internal(private_kernel_netlink_ipsec_t *this,
+       policy_entry_t *policy, policy_sa_t *mapping, bool update)
+{
+       netlink_buf_t request;
+       policy_entry_t clone;
+       ipsec_sa_t *ipsec = mapping->sa;
+       struct xfrm_userpolicy_info *policy_info;
+       struct nlmsghdr *hdr;
+       status_t status;
+       int i;
 
-       tmpl->reqid = reqid;
-       tmpl->id.proto = proto_ike2kernel(protocol);
-       tmpl->aalgos = tmpl->ealgos = tmpl->calgos = ~0;
-       tmpl->mode = mode2kernel(mode);
-       tmpl->family = src->get_family(src);
-       rthdr = XFRM_RTA_NEXT(rthdr);
+       /* clone the policy so we are able to check it out again later
+        * NOTE(review): clone is not read again within this function - confirm
+        * whether the copy is still needed */
+       memcpy(&clone, policy, sizeof(policy_entry_t));
 
-       if (mark.value)
-       {
-               struct xfrm_mark *mrk;
+       memset(&request, 0, sizeof(request));
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+       hdr->nlmsg_type = update ? XFRM_MSG_UPDPOLICY : XFRM_MSG_NEWPOLICY;
+       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_info));
 
-               rthdr->rta_type = XFRMA_MARK;
-               rthdr->rta_len = RTA_LENGTH(sizeof(struct xfrm_mark));
+       policy_info = NLMSG_DATA(hdr);
+       policy_info->sel = policy->sel;
+       policy_info->dir = policy->direction;
 
-               hdr->nlmsg_len += rthdr->rta_len;
-               if (hdr->nlmsg_len > sizeof(request))
+       /* use the priority stored in the mapping, it is not calculated here */
+       policy_info->priority = mapping->priority;
+       policy_info->action = mapping->type != POLICY_DROP ? XFRM_POLICY_ALLOW
+                                                                                                          : XFRM_POLICY_BLOCK;
+       policy_info->share = XFRM_SHARE_ANY;
+
+       /* policies don't expire */
+       policy_info->lft.soft_byte_limit = XFRM_INF;
+       policy_info->lft.soft_packet_limit = XFRM_INF;
+       policy_info->lft.hard_byte_limit = XFRM_INF;
+       policy_info->lft.hard_packet_limit = XFRM_INF;
+       policy_info->lft.soft_add_expires_seconds = 0;
+       policy_info->lft.hard_add_expires_seconds = 0;
+       policy_info->lft.soft_use_expires_seconds = 0;
+       policy_info->lft.hard_use_expires_seconds = 0;
+
+       if (mapping->type == POLICY_IPSEC && ipsec->cfg.reqid)
+       {
+               struct xfrm_user_tmpl *tmpl;
+               struct {
+                       uint8_t proto;
+                       bool use;
+               } protos[] = {
+                       { IPPROTO_COMP, ipsec->cfg.ipcomp.transform != IPCOMP_NONE },
+                       { IPPROTO_ESP, ipsec->cfg.esp.use },
+                       { IPPROTO_AH, ipsec->cfg.ah.use },
+               };
+               ipsec_mode_t proto_mode = ipsec->cfg.mode;
+               int count = 0;
+
+               for (i = 0; i < countof(protos); i++)
                {
+                       if (protos[i].use)
+                       {
+                               count++;
+                       }
+               }
+               tmpl = netlink_reserve(hdr, sizeof(request), XFRMA_TMPL,
+                                                          count * sizeof(*tmpl));
+               if (!tmpl)
+               {
+                       policy_change_done(this, policy);
                        return FAILED;
                }
 
-               mrk = (struct xfrm_mark*)RTA_DATA(rthdr);
-               mrk->v = mark.value;
-               mrk->m = mark.mask;
+               for (i = 0; i < countof(protos); i++)
+               {
+                       if (!protos[i].use)
+                       {
+                               continue;
+                       }
+                       tmpl->reqid = ipsec->cfg.reqid;
+                       tmpl->id.proto = protos[i].proto;
+                       tmpl->aalgos = tmpl->ealgos = tmpl->calgos = ~0;
+                       tmpl->mode = mode2kernel(proto_mode);
+                       tmpl->optional = protos[i].proto == IPPROTO_COMP &&
+                                                        policy->direction != POLICY_OUT;
+                       tmpl->family = ipsec->src->get_family(ipsec->src);
+
+                       if (proto_mode == MODE_TUNNEL || proto_mode == MODE_BEET)
+                       {       /* only for tunnel and BEET mode */
+                               host2xfrm(ipsec->src, &tmpl->saddr);
+                               host2xfrm(ipsec->dst, &tmpl->id.daddr);
+                       }
+
+                       tmpl++;
+
+                       /* all templates after the first use transport mode */
+                       proto_mode = MODE_TRANSPORT;
+               }
        }
 
-       if (this->socket_xfrm->send_ack(this->socket_xfrm, hdr) != SUCCESS)
+       if (!add_mark(hdr, sizeof(request), ipsec->mark))
        {
-               DBG1(DBG_KNL, "unable to add policy %R === %R %N", src_ts, dst_ts,
-                                          policy_dir_names, direction);
+               policy_change_done(this, policy);
                return FAILED;
        }
+       this->mutex->unlock(this->mutex);       /* not held during send_ack() */
+
+       status = this->socket_xfrm->send_ack(this->socket_xfrm, hdr);
+       if (status == ALREADY_DONE && !update)
+       {
+               DBG1(DBG_KNL, "policy already exists, try to update it");
+               hdr->nlmsg_type = XFRM_MSG_UPDPOLICY;
+               status = this->socket_xfrm->send_ack(this->socket_xfrm, hdr);
+       }
 
+       this->mutex->lock(this->mutex);
+       if (status != SUCCESS)
+       {
+               policy_change_done(this, policy);
+               return FAILED;
+       }
        /* install a route, if:
-        * - we are NOT updating a policy
-        * - this is a forward policy (to just get one for each child)
-        * - we are in tunnel/BEET mode
+        * - this is an outbound policy (to just get one for each child)
         * - routing is not disabled via strongswan.conf
+        * - the selector is not for a specific protocol/port
+        * - we are in tunnel/BEET mode or install a bypass policy
         */
-       if (policy->route == NULL && direction == POLICY_FWD &&
-               mode != MODE_TRANSPORT && this->install_routes)
+       if (policy->direction == POLICY_OUT && this->install_routes &&
+               !policy->sel.proto && !policy->sel.dport && !policy->sel.sport)
+       {
+               if (mapping->type == POLICY_PASS ||
+                  (mapping->type == POLICY_IPSEC && ipsec->cfg.mode != MODE_TRANSPORT))
+               {
+                       install_route(this, policy, mapping, ipsec);
+               }
+       }
+       policy_change_done(this, policy);
+       return SUCCESS;
+}
+
+METHOD(kernel_ipsec_t, add_policy, status_t,
+       private_kernel_netlink_ipsec_t *this, kernel_ipsec_policy_id_t *id,
+       kernel_ipsec_manage_policy_t *data)
+{
+       policy_entry_t *policy, *current;
+       policy_sa_t *assigned_sa, *current_sa;
+       enumerator_t *enumerator;
+       bool found = FALSE, update = TRUE;
+       char markstr[32] = "";
+       uint32_t cur_priority = 0;
+       int use_count;
+
+       /* create a policy */
+       INIT(policy,
+               .sel = ts2selector(id->src_ts, id->dst_ts, id->interface),
+               .mark = id->mark.value & id->mark.mask,
+               .direction = id->dir,
+               .reqid = data->sa->reqid,
+       );
+       format_mark(markstr, sizeof(markstr), id->mark);
+
+       /* find the policy, which matches EXACTLY */
+       this->mutex->lock(this->mutex);
+       current = this->policies->get(this->policies, policy);
+       if (current)
        {
-               route_entry_t *route = malloc_thing(route_entry_t);
+               if (current->reqid && data->sa->reqid &&
+                       current->reqid != data->sa->reqid)
+               {
+                       DBG1(DBG_CFG, "unable to install policy %R === %R %N%s for reqid "
+                                "%u, the same policy for reqid %u exists",
+                                id->src_ts, id->dst_ts, policy_dir_names, id->dir, markstr,
+                                data->sa->reqid, current->reqid);
+                       policy_entry_destroy(this, policy);
+                       this->mutex->unlock(this->mutex);
+                       return INVALID_STATE;
+               }
+               /* use existing policy */
+               DBG2(DBG_KNL, "policy %R === %R %N%s already exists, increasing "
+                        "refcount", id->src_ts, id->dst_ts, policy_dir_names, id->dir,
+                        markstr);
+               policy_entry_destroy(this, policy);
+               policy = current;
+               found = TRUE;
 
-               if (charon->kernel_interface->get_address_by_ts(charon->kernel_interface,
-                               dst_ts, &route->src_ip) == SUCCESS)
+               policy->waiting++;
+               while (policy->working)
                {
-                       /* get the nexthop to src (src as we are in POLICY_FWD).*/
-                       route->gateway = charon->kernel_interface->get_nexthop(
-                                                                                               charon->kernel_interface, src);
-                       /* install route via outgoing interface */
-                       route->if_name = charon->kernel_interface->get_interface(
-                                                                                               charon->kernel_interface, dst);
-                       route->dst_net = chunk_alloc(policy->sel.family == AF_INET ? 4 : 16);
-                       memcpy(route->dst_net.ptr, &policy->sel.saddr, route->dst_net.len);
-                       route->prefixlen = policy->sel.prefixlen_s;
-
-                       if (route->if_name)
+                       this->condvar->wait(this->condvar, this->mutex);
+               }
+               policy->waiting--;
+               policy->working = TRUE;
+       }
+       else
+       {       /* use the new one, if we have no such policy */
+               policy->used_by = linked_list_create();
+               this->policies->put(this->policies, policy, policy);
+       }
+
+       /* cache the assigned IPsec SA */
+       assigned_sa = policy_sa_create(this, id->dir, data->type, data->src,
+                                               data->dst, id->src_ts, id->dst_ts, id->mark, data->sa);
+       assigned_sa->auto_priority = get_priority(policy, data->prio, id->interface);
+       assigned_sa->priority = this->get_priority ? this->get_priority(id, data)
+                                                                                          : data->manual_prio;
+       assigned_sa->priority = assigned_sa->priority ?: assigned_sa->auto_priority;
+
+       /* insert the SA according to its priority */
+       enumerator = policy->used_by->create_enumerator(policy->used_by);
+       while (enumerator->enumerate(enumerator, (void**)&current_sa))
+       {
+               if (current_sa->priority > assigned_sa->priority)
+               {
+                       break;
+               }
+               if (current_sa->priority == assigned_sa->priority)
+               {
+                       /* in case of equal manual prios order SAs by automatic priority */
+                       if (current_sa->auto_priority > assigned_sa->auto_priority)
                        {
-                               switch (charon->kernel_interface->add_route(
-                                                                       charon->kernel_interface, route->dst_net,
-                                                                       route->prefixlen, route->gateway,
-                                                                       route->src_ip, route->if_name))
-                               {
-                                       default:
-                                               DBG1(DBG_KNL, "unable to install source route for %H",
-                                                        route->src_ip);
-                                               /* FALL */
-                                       case ALREADY_DONE:
-                                               /* route exists, do not uninstall */
-                                               route_entry_destroy(route);
-                                               break;
-                                       case SUCCESS:
-                                               /* cache the installed route */
-                                               policy->route = route;
-                                               break;
-                               }
+                               break;
                        }
-                       else
+                       /* prefer SAs with a reqid over those without */
+                       if (current_sa->auto_priority == assigned_sa->auto_priority &&
+                               (!current_sa->sa->cfg.reqid || assigned_sa->sa->cfg.reqid))
                        {
-                               route_entry_destroy(route);
+                               break;
                        }
                }
-               else
+               if (update)
                {
-                       free(route);
+                       cur_priority = current_sa->priority;
+                       update = FALSE;
                }
        }
+       policy->used_by->insert_before(policy->used_by, enumerator, assigned_sa);
+       enumerator->destroy(enumerator);
+
+       use_count = policy->used_by->get_count(policy->used_by);
+       if (!update)
+       {       /* we don't update the policy if the priority is lower than that of
+                * the currently installed one */
+               policy_change_done(this, policy);
+               DBG2(DBG_KNL, "not updating policy %R === %R %N%s [priority %u,"
+                        "refcount %d]", id->src_ts, id->dst_ts, policy_dir_names,
+                        id->dir, markstr, cur_priority, use_count);
+               return SUCCESS;
+       }
+       policy->reqid = assigned_sa->sa->cfg.reqid;
+
+       if (this->policy_update)
+       {
+               found = TRUE;
+       }
+
+       DBG2(DBG_KNL, "%s policy %R === %R %N%s [priority %u, refcount %d]",
+                found ? "updating" : "adding", id->src_ts, id->dst_ts,
+                policy_dir_names, id->dir, markstr, assigned_sa->priority, use_count);
+
+       if (add_policy_internal(this, policy, assigned_sa, found) != SUCCESS)
+       {
+               DBG1(DBG_KNL, "unable to %s policy %R === %R %N%s",
+                        found ? "update" : "add", id->src_ts, id->dst_ts,
+                        policy_dir_names, id->dir, markstr);
+               return FAILED;
+       }
        return SUCCESS;
 }
 
 METHOD(kernel_ipsec_t, query_policy, status_t,
-       private_kernel_netlink_ipsec_t *this, traffic_selector_t *src_ts,
-       traffic_selector_t *dst_ts, policy_dir_t direction, mark_t mark,
-       u_int32_t *use_time)
+       private_kernel_netlink_ipsec_t *this, kernel_ipsec_policy_id_t *id,
+       kernel_ipsec_query_policy_t *data, time_t *use_time)
 {
        netlink_buf_t request;
        struct nlmsghdr *out = NULL, *hdr;
        struct xfrm_userpolicy_id *policy_id;
        struct xfrm_userpolicy_info *policy = NULL;
        size_t len;
+       char markstr[32] = "";
 
        memset(&request, 0, sizeof(request));
+       format_mark(markstr, sizeof(markstr), id->mark);
 
-       if (mark.value)
-       {
-               DBG2(DBG_KNL, "querying policy %R === %R %N  (mark %u/0x%8x)",
-                                          src_ts, dst_ts, policy_dir_names, direction,
-                                          mark.value, mark.mask);
-       }
-       else
-       {
-               DBG2(DBG_KNL, "querying policy %R === %R %N", src_ts, dst_ts,
-                                          policy_dir_names, direction);
-       }
-       hdr = (struct nlmsghdr*)request;
+       DBG2(DBG_KNL, "querying policy %R === %R %N%s", id->src_ts, id->dst_ts,
+                policy_dir_names, id->dir, markstr);
+
+       hdr = &request.hdr;
        hdr->nlmsg_flags = NLM_F_REQUEST;
        hdr->nlmsg_type = XFRM_MSG_GETPOLICY;
        hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_id));
 
-       policy_id = (struct xfrm_userpolicy_id*)NLMSG_DATA(hdr);
-       policy_id->sel = ts2selector(src_ts, dst_ts);
-       policy_id->dir = direction;
+       policy_id = NLMSG_DATA(hdr);
+       policy_id->sel = ts2selector(id->src_ts, id->dst_ts, id->interface);
+       policy_id->dir = id->dir;
 
-       if (mark.value)
+       if (!add_mark(hdr, sizeof(request), id->mark))
        {
-               struct xfrm_mark *mrk;
-               struct rtattr *rthdr = XFRM_RTA(hdr, struct xfrm_userpolicy_id);
-
-               rthdr->rta_type = XFRMA_MARK;
-               rthdr->rta_len = RTA_LENGTH(sizeof(struct xfrm_mark));
-
-               hdr->nlmsg_len += rthdr->rta_len;
-               if (hdr->nlmsg_len > sizeof(request))
-               {
-                       return FAILED;
-               }
-
-               mrk = (struct xfrm_mark*)RTA_DATA(rthdr);
-               mrk->v = mark.value;
-               mrk->m = mark.mask;
+               return FAILED;
        }
 
        if (this->socket_xfrm->send(this->socket_xfrm, hdr, &out, &len) == SUCCESS)
@@ -1934,7 +2589,7 @@ METHOD(kernel_ipsec_t, query_policy, status_t,
                        {
                                case XFRM_MSG_NEWPOLICY:
                                {
-                                       policy = (struct xfrm_userpolicy_info*)NLMSG_DATA(hdr);
+                                       policy = NLMSG_DATA(hdr);
                                        break;
                                }
                                case NLMSG_ERROR:
@@ -1956,8 +2611,8 @@ METHOD(kernel_ipsec_t, query_policy, status_t,
 
        if (policy == NULL)
        {
-               DBG2(DBG_KNL, "unable to query policy %R === %R %N", src_ts, dst_ts,
-                                          policy_dir_names, direction);
+               DBG2(DBG_KNL, "unable to query policy %R === %R %N%s", id->src_ts,
+                        id->dst_ts, policy_dir_names, id->dir, markstr);
                free(out);
                return FAILED;
        }
@@ -1977,132 +2632,193 @@ METHOD(kernel_ipsec_t, query_policy, status_t,
 }
 
 METHOD(kernel_ipsec_t, del_policy, status_t,
-       private_kernel_netlink_ipsec_t *this, traffic_selector_t *src_ts,
-       traffic_selector_t *dst_ts, policy_dir_t direction,     mark_t mark,
-       bool unrouted)
+       private_kernel_netlink_ipsec_t *this, kernel_ipsec_policy_id_t *id,
+       kernel_ipsec_manage_policy_t *data)
 {
-       policy_entry_t *current, policy, *to_delete = NULL;
-       route_entry_t *route;
+       policy_entry_t *current, policy;
+       enumerator_t *enumerator;
+       policy_sa_t *mapping;
        netlink_buf_t request;
        struct nlmsghdr *hdr;
        struct xfrm_userpolicy_id *policy_id;
-
-       if (mark.value)
-       {
-               DBG2(DBG_KNL, "deleting policy %R === %R %N  (mark %u/0x%8x)",
-                                          src_ts, dst_ts, policy_dir_names, direction,
-                                          mark.value, mark.mask);
-       }
-       else
-       {
-               DBG2(DBG_KNL, "deleting policy %R === %R %N",
-                                          src_ts, dst_ts, policy_dir_names, direction);
-       }
+       bool is_installed = TRUE;
+       uint32_t priority, auto_priority, cur_priority;
+       ipsec_sa_t assigned_sa = {
+               .src = data->src,
+               .dst = data->dst,
+               .mark = id->mark,
+               .cfg = *data->sa,
+       };
+       char markstr[32] = "";
+       int use_count;
+       status_t status = SUCCESS;
+
+       format_mark(markstr, sizeof(markstr), id->mark);
+
+       DBG2(DBG_KNL, "deleting policy %R === %R %N%s", id->src_ts, id->dst_ts,
+                policy_dir_names, id->dir, markstr);
 
        /* create a policy */
        memset(&policy, 0, sizeof(policy_entry_t));
-       policy.sel = ts2selector(src_ts, dst_ts);
-       policy.mark = mark.value & mark.mask;
-       policy.direction = direction;
+       policy.sel = ts2selector(id->src_ts, id->dst_ts, id->interface);
+       policy.mark = id->mark.value & id->mark.mask;
+       policy.direction = id->dir;
 
        /* find the policy */
        this->mutex->lock(this->mutex);
        current = this->policies->get(this->policies, &policy);
-       if (current)
+       if (!current)
        {
-               to_delete = current;
-               if (--to_delete->refcount > 0)
-               {
-                       /* is used by more SAs, keep in kernel */
-                       DBG2(DBG_KNL, "policy still used by another CHILD_SA, not removed");
-                       this->mutex->unlock(this->mutex);
-                       return SUCCESS;
-               }
-               /* remove if last reference */
-               this->policies->remove(this->policies, to_delete);
+               DBG1(DBG_KNL, "deleting policy %R === %R %N%s failed, not found",
+                        id->src_ts, id->dst_ts, policy_dir_names, id->dir, markstr);
+               this->mutex->unlock(this->mutex);
+               return NOT_FOUND;
        }
-       this->mutex->unlock(this->mutex);
-       if (!to_delete)
+       current->waiting++;
+       while (current->working)
        {
-               if (mark.value)
+               this->condvar->wait(this->condvar, this->mutex);
+       }
+       current->working = TRUE;
+       current->waiting--;
+
+       /* remove mapping to SA by reqid and priority */
+       auto_priority = get_priority(current, data->prio,id->interface);
+       priority = this->get_priority ? this->get_priority(id, data)
+                                                                 : data->manual_prio;
+       priority = priority ?: auto_priority;
+
+       enumerator = current->used_by->create_enumerator(current->used_by);
+       while (enumerator->enumerate(enumerator, (void**)&mapping))
+       {
+               if (priority == mapping->priority &&
+                       auto_priority == mapping->auto_priority &&
+                       data->type == mapping->type &&
+                       ipsec_sa_equals(mapping->sa, &assigned_sa))
                {
-                       DBG1(DBG_KNL, "deleting policy %R === %R %N  (mark %u/0x%8x) "
-                                                 "failed, not found", src_ts, dst_ts, policy_dir_names,
-                                                  direction, mark.value, mark.mask);
+                       current->used_by->remove_at(current->used_by, enumerator);
+                       policy_sa_destroy(mapping, &id->dir, this);
+                       break;
                }
-               else
+               if (is_installed)
                {
-                       DBG1(DBG_KNL, "deleting policy %R === %R %N failed, not found",
-                                                  src_ts, dst_ts, policy_dir_names, direction);
+                       cur_priority = mapping->priority;
+                       is_installed = FALSE;
                }
-               return NOT_FOUND;
+       }
+       enumerator->destroy(enumerator);
+
+       use_count = current->used_by->get_count(current->used_by);
+       if (use_count > 0)
+       {       /* policy is used by more SAs, keep in kernel */
+               DBG2(DBG_KNL, "policy still used by another CHILD_SA, not removed");
+               if (!is_installed)
+               {       /* no need to update as the policy was not installed for this SA */
+                       policy_change_done(this, current);
+                       DBG2(DBG_KNL, "not updating policy %R === %R %N%s [priority %u, "
+                                "refcount %d]", id->src_ts, id->dst_ts, policy_dir_names,
+                                id->dir, markstr, cur_priority, use_count);
+                       return SUCCESS;
+               }
+               current->used_by->get_first(current->used_by, (void**)&mapping);
+               current->reqid = mapping->sa->cfg.reqid;
+
+               DBG2(DBG_KNL, "updating policy %R === %R %N%s [priority %u, "
+                        "refcount %d]", id->src_ts, id->dst_ts, policy_dir_names, id->dir,
+                        markstr, mapping->priority, use_count);
+
+               if (add_policy_internal(this, current, mapping, TRUE) != SUCCESS)
+               {
+                       DBG1(DBG_KNL, "unable to update policy %R === %R %N%s",
+                                id->src_ts, id->dst_ts, policy_dir_names, id->dir, markstr);
+                       return FAILED;
+               }
+               return SUCCESS;
        }
 
        memset(&request, 0, sizeof(request));
 
-       hdr = (struct nlmsghdr*)request;
+       hdr = &request.hdr;
        hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
        hdr->nlmsg_type = XFRM_MSG_DELPOLICY;
        hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_id));
 
-       policy_id = (struct xfrm_userpolicy_id*)NLMSG_DATA(hdr);
-       policy_id->sel = to_delete->sel;
-       policy_id->dir = direction;
+       policy_id = NLMSG_DATA(hdr);
+       policy_id->sel = current->sel;
+       policy_id->dir = id->dir;
 
-       if (mark.value)
+       if (!add_mark(hdr, sizeof(request), id->mark))
        {
-               struct xfrm_mark *mrk;
-               struct rtattr *rthdr = XFRM_RTA(hdr, struct xfrm_userpolicy_id);
+               policy_change_done(this, current);
+               return FAILED;
+       }
 
-               rthdr->rta_type = XFRMA_MARK;
-               rthdr->rta_len = RTA_LENGTH(sizeof(struct xfrm_mark));
-               hdr->nlmsg_len += rthdr->rta_len;
-               if (hdr->nlmsg_len > sizeof(request))
+       if (current->route)
+       {
+               route_entry_t *route = current->route;
+               if (charon->kernel->del_route(charon->kernel, route->dst_net,
+                                                                         route->prefixlen, route->gateway,
+                                                                         route->src_ip, route->if_name) != SUCCESS)
                {
-                       return FAILED;
+                       DBG1(DBG_KNL, "error uninstalling route installed with policy "
+                                "%R === %R %N%s", id->src_ts, id->dst_ts, policy_dir_names,
+                                id->dir, markstr);
                }
-
-               mrk = (struct xfrm_mark*)RTA_DATA(rthdr);
-               mrk->v = mark.value;
-               mrk->m = mark.mask;
        }
-
-       route = to_delete->route;
-       free(to_delete);
+       this->mutex->unlock(this->mutex);
 
        if (this->socket_xfrm->send_ack(this->socket_xfrm, hdr) != SUCCESS)
        {
-               if (mark.value)
-               {
-                       DBG1(DBG_KNL, "unable to delete policy %R === %R %N  "
-                          "(mark %u/0x%8x)", src_ts, dst_ts, policy_dir_names,
-                                                  direction, mark.value, mark.mask);
-               }
-               else
-               {
-                       DBG1(DBG_KNL, "unable to delete policy %R === %R %N",
-                                                  src_ts, dst_ts, policy_dir_names, direction);
-               }
-               return FAILED;
+               DBG1(DBG_KNL, "unable to delete policy %R === %R %N%s", id->src_ts,
+                        id->dst_ts, policy_dir_names, id->dir, markstr);
+               status = FAILED;
        }
 
-       if (route)
+       this->mutex->lock(this->mutex);
+       if (!current->waiting)
+       {       /* only if no other thread still needs the policy */
+               this->policies->remove(this->policies, current);
+               policy_entry_destroy(this, current);
+               this->mutex->unlock(this->mutex);
+       }
+       else
        {
-               if (charon->kernel_interface->del_route(charon->kernel_interface,
-                               route->dst_net, route->prefixlen, route->gateway,
-                               route->src_ip, route->if_name) != SUCCESS)
-               {
-                       DBG1(DBG_KNL, "error uninstalling route installed with "
-                                                 "policy %R === %R %N", src_ts, dst_ts,
-                                                  policy_dir_names, direction);
-               }
-               route_entry_destroy(route);
+               policy_change_done(this, current);
+       }
+       return status;
+}
+
+METHOD(kernel_ipsec_t, flush_policies, status_t,
+       private_kernel_netlink_ipsec_t *this)
+{      /* Ask the kernel to flush its policy database (SPD) with a single
+        * XFRM_MSG_FLUSHPOLICY request. Only the netlink message is sent here;
+        * the entries cached in this->policies are not touched by this function.
+        * Returns FAILED if send_ack() does not report SUCCESS, SUCCESS otherwise. */
+       netlink_buf_t request;
+       struct nlmsghdr *hdr;
+
+       memset(&request, 0, sizeof(request));
+
+       DBG2(DBG_KNL, "flushing all policies from SPD");
+
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+       hdr->nlmsg_type = XFRM_MSG_FLUSHPOLICY;
+       hdr->nlmsg_len = NLMSG_LENGTH(0); /* header only, no data associated */
+
+       /* by adding an rtattr of type XFRMA_POLICY_TYPE we could restrict this
+        * to main or sub policies (default is main) */
+
+       if (this->socket_xfrm->send_ack(this->socket_xfrm, hdr) != SUCCESS)
+       {
+               DBG1(DBG_KNL, "unable to flush SPD entries");
+               return FAILED;
+       }
        return SUCCESS;
 }
 
-METHOD(kernel_ipsec_t, bypass_socket, bool,
-       private_kernel_netlink_ipsec_t *this, int fd, int family)
+/**
+ * Bypass socket using a per-socket policy
+ */
+static bool add_socket_bypass(private_kernel_netlink_ipsec_t *this,
+                                                         int fd, int family)
 {
        struct xfrm_userpolicy_info policy;
        u_int sol, ipsec_policy;
@@ -2128,15 +2844,176 @@ METHOD(kernel_ipsec_t, bypass_socket, bool,
        policy.dir = XFRM_POLICY_OUT;
        if (setsockopt(fd, sol, ipsec_policy, &policy, sizeof(policy)) < 0)
        {
-               DBG1(DBG_KNL, "unable to set IPSEC_POLICY on socket: %s",
-                        strerror(errno));
+               DBG1(DBG_KNL, "unable to set IPSEC_POLICY on socket: %s (%d)",
+                        strerror(errno), errno);
                return FALSE;
        }
        policy.dir = XFRM_POLICY_IN;
        if (setsockopt(fd, sol, ipsec_policy, &policy, sizeof(policy)) < 0)
        {
-               DBG1(DBG_KNL, "unable to set IPSEC_POLICY on socket: %s",
-                        strerror(errno));
+               DBG1(DBG_KNL, "unable to set IPSEC_POLICY on socket: %s (%d)",
+                        strerror(errno), errno);
+               return FALSE;
+       }
+       return TRUE;
+}
+
+/**
+ * Port based IKE bypass policy
+ *
+ * Describes one pair of IN/OUT bypass policies as installed by
+ * add_port_bypass(). The same values are passed to manage_bypass() again
+ * when the policies are deleted.
+ */
+typedef struct {
+       /** address family, copied to the policy selector's family */
+       int family;
+       /** layer 4 protocol, copied to the policy selector's proto */
+       int proto;
+       /** port number, network order (matched as dport inbound, sport otherwise) */
+       uint16_t port;
+} bypass_t;
+
+/**
+ * Add a bypass policy to, or remove one from, the kernel
+ *
+ * Builds a single XFRM policy message whose selector carries only the
+ * family, protocol and one port of the given bypass; all other selector
+ * fields (addresses, prefix lengths) stay zero from the initial memset.
+ *
+ * @param this         kernel interface, its socket_xfrm is used to send
+ * @param type         XFRM_MSG_NEWPOLICY to install; any other value is
+ *                                     laid out as an XFRM_MSG_DELPOLICY request
+ * @param dir          policy direction, decides which port is matched
+ * @param bypass       family/protocol/port to match
+ * @return                     TRUE if send_ack() reported SUCCESS
+ */
+static bool manage_bypass(private_kernel_netlink_ipsec_t *this,
+                                                 int type, policy_dir_t dir, bypass_t *bypass)
+{
+       netlink_buf_t request;
+       struct xfrm_selector *sel;
+       struct nlmsghdr *hdr;
+
+       memset(&request, 0, sizeof(request));
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+       hdr->nlmsg_type = type;
+
+       if (type == XFRM_MSG_NEWPOLICY)
+       {       /* install: ALLOW policy with fixed priority 32 and all byte/packet
+                * lifetime limits set to XFRM_INF */
+               struct xfrm_userpolicy_info *policy;
+
+               hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_info));
+
+               policy = NLMSG_DATA(hdr);
+               policy->dir = dir;
+               policy->priority = 32;
+               policy->action = XFRM_POLICY_ALLOW;
+               policy->share = XFRM_SHARE_ANY;
+
+               policy->lft.soft_byte_limit = XFRM_INF;
+               policy->lft.soft_packet_limit = XFRM_INF;
+               policy->lft.hard_byte_limit = XFRM_INF;
+               policy->lft.hard_packet_limit = XFRM_INF;
+
+               sel = &policy->sel;
+       }
+       else /* XFRM_MSG_DELPOLICY */
+       {       /* delete: the policy is identified by selector and direction only */
+               struct xfrm_userpolicy_id *policy;
+
+               hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_id));
+
+               policy = NLMSG_DATA(hdr);
+               policy->dir = dir;
+
+               sel = &policy->sel;
+       }
+
+       sel->family = bypass->family;
+       sel->proto = bypass->proto;
+       if (dir == POLICY_IN)
+       {       /* inbound: match the destination port, all 16 bits */
+               sel->dport = bypass->port;
+               sel->dport_mask = 0xffff;
+       }
+       else
+       {       /* any other direction: match the source port, all 16 bits */
+               sel->sport = bypass->port;
+               sel->sport_mask = 0xffff;
+       }
+       return this->socket_xfrm->send_ack(this->socket_xfrm, hdr) == SUCCESS;
+}
+
+/**
+ * Bypass socket using a port-based bypass policy
+ *
+ * Looks up the address the socket is bound to with getsockname() and its
+ * protocol, installs an inbound and an outbound bypass policy for that port
+ * and records the bypass in this->bypass (destroy() hands those entries to
+ * remove_port_bypass()).
+ *
+ * NOTE(review): the port is read from the sockaddr member selected by the
+ * family argument, not by the family getsockname() returned - this assumes
+ * callers pass the socket's real family; confirm.
+ *
+ * @param this         kernel interface
+ * @param fd           socket to bypass
+ * @param family       AF_INET or AF_INET6, anything else fails
+ * @return                     TRUE if both policies were installed
+ */
+static bool add_port_bypass(private_kernel_netlink_ipsec_t *this,
+                                                       int fd, int family)
+{
+       union {
+               struct sockaddr sa;
+               struct sockaddr_in in;
+               struct sockaddr_in6 in6;
+       } saddr;
+       socklen_t len;
+       bypass_t bypass = {
+               .family = family,
+       };
+
+       len = sizeof(saddr);
+       if (getsockname(fd, &saddr.sa, &len) != 0)
+       {
+               return FALSE;
+       }
+#ifdef SO_PROTOCOL /* since 2.6.32 */
+       len = sizeof(bypass.proto);
+       if (getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &bypass.proto, &len) != 0)
+#endif
+       {       /* assume UDP if SO_PROTOCOL not supported; without the macro at
+                * compile time this block is not guarded by any condition */
+               bypass.proto = IPPROTO_UDP;
+       }
+       switch (family)
+       {
+               case AF_INET:
+                       bypass.port = saddr.in.sin_port;
+                       break;
+               case AF_INET6:
+                       bypass.port = saddr.in6.sin6_port;
+                       break;
+               default:
+                       return FALSE;
+       }
+
+       if (!manage_bypass(this, XFRM_MSG_NEWPOLICY, POLICY_IN, &bypass))
+       {
+               return FALSE;
+       }
+       if (!manage_bypass(this, XFRM_MSG_NEWPOLICY, POLICY_OUT, &bypass))
+       {       /* roll back the inbound policy so no half-installed pair remains */
+               manage_bypass(this, XFRM_MSG_DELPOLICY, POLICY_IN, &bypass);
+               return FALSE;
+       }
+       array_insert(this->bypass, ARRAY_TAIL, &bypass); /* presumably copied by
+                * value, the array is created with sizeof(bypass_t) - confirm */
+
+       return TRUE;
+}
+
+/**
+ * Remove installed port based bypass policy
+ *
+ * Deletes the outbound and then the inbound policy described by the entry;
+ * the results of both deletions are ignored. The parameter order follows
+ * the array_callback_t this function is cast to in destroy(); idx is not
+ * used.
+ */
+static void remove_port_bypass(bypass_t *bypass, int idx,
+                                                          private_kernel_netlink_ipsec_t *this)
+{
+       manage_bypass(this, XFRM_MSG_DELPOLICY, POLICY_OUT, bypass);
+       manage_bypass(this, XFRM_MSG_DELPOLICY, POLICY_IN, bypass);
+}
+
+METHOD(kernel_ipsec_t, bypass_socket, bool,
+       private_kernel_netlink_ipsec_t *this, int fd, int family)
+{      /* Exempt the given socket from IPsec processing. The setting
+        * <ns>.plugins.kernel-netlink.port_bypass (default FALSE, read on every
+        * call) selects port-based bypass policies; otherwise a per-socket
+        * policy is used. Returns whatever the selected helper returns. */
+       if (lib->settings->get_bool(lib->settings,
+                                       "%s.plugins.kernel-netlink.port_bypass", FALSE, lib->ns))
+       {
+               return add_port_bypass(this, fd, family);
+       }
+       return add_socket_bypass(this, fd, family);
+}
+
+METHOD(kernel_ipsec_t, enable_udp_decap, bool,
+       private_kernel_netlink_ipsec_t *this, int fd, int family, uint16_t port)
+{
+       int type = UDP_ENCAP_ESPINUDP;
+
+       if (setsockopt(fd, SOL_UDP, UDP_ENCAP, &type, sizeof(type)) < 0)
+       {
+               DBG1(DBG_KNL, "unable to set UDP_ENCAP: %s", strerror(errno));
                return FALSE;
        }
        return TRUE;
@@ -2148,96 +3025,232 @@ METHOD(kernel_ipsec_t, destroy, void,
        enumerator_t *enumerator;
        policy_entry_t *policy;
 
-       if (this->job)
-       {
-               this->job->cancel(this->job);
-       }
+       array_destroy_function(this->bypass,
+                                                  (array_callback_t)remove_port_bypass, this);
        if (this->socket_xfrm_events > 0)
        {
+               lib->watcher->remove(lib->watcher, this->socket_xfrm_events);
                close(this->socket_xfrm_events);
        }
        DESTROY_IF(this->socket_xfrm);
        enumerator = this->policies->create_enumerator(this->policies);
        while (enumerator->enumerate(enumerator, &policy, &policy))
        {
-               free(policy);
+               policy_entry_destroy(this, policy);
        }
        enumerator->destroy(enumerator);
        this->policies->destroy(this->policies);
+       this->sas->destroy(this->sas);
+       this->condvar->destroy(this->condvar);
        this->mutex->destroy(this->mutex);
        free(this);
 }
 
+/**
+ * Get the currently configured SPD hashing thresholds for an address family
+ *
+ * Sends an XFRM_MSG_GETSPDINFO request and searches the attributes of the
+ * XFRM_MSG_NEWSPDINFO reply for one of the given type whose payload has
+ * exactly the size of struct xfrmu_spdhthresh.
+ *
+ * @param this         kernel interface
+ * @param type         attribute type to look for (the constructor's callers use
+ *                                     XFRMA_SPD_IPV4_HTHRESH and XFRMA_SPD_IPV6_HTHRESH)
+ * @param lbits                receives thresh->lbits, written only on success
+ * @param rbits                receives thresh->rbits, written only on success
+ * @return                     TRUE if a matching attribute was found
+ */
+static bool get_spd_hash_thresh(private_kernel_netlink_ipsec_t *this,
+                                                               int type, uint8_t *lbits, uint8_t *rbits)
+{
+       netlink_buf_t request;
+       struct nlmsghdr *hdr, *out;
+       struct xfrmu_spdhthresh *thresh;
+       struct rtattr *rta;
+       size_t len, rtasize;
+       bool success = FALSE;
+
+       memset(&request, 0, sizeof(request));
+
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST;
+       hdr->nlmsg_type = XFRM_MSG_GETSPDINFO;
+       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(uint32_t)); /* zeroed uint32_t body */
+
+       if (this->socket_xfrm->send(this->socket_xfrm, hdr, &out, &len) == SUCCESS)
+       {       /* out is only assigned (and later freed) on this path */
+               hdr = out;
+               while (NLMSG_OK(hdr, len))
+               {
+                       switch (hdr->nlmsg_type)
+                       {
+                               case XFRM_MSG_NEWSPDINFO:
+                               {       /* attributes follow a uint32_t sized fixed part */
+                                       rta = XFRM_RTA(hdr, uint32_t);
+                                       rtasize = XFRM_PAYLOAD(hdr, uint32_t);
+                                       while (RTA_OK(rta, rtasize))
+                                       {
+                                               if (rta->rta_type == type &&
+                                                       RTA_PAYLOAD(rta) == sizeof(*thresh))
+                                               {       /* requested attribute with the expected size */
+                                                       thresh = RTA_DATA(rta);
+                                                       *lbits = thresh->lbits;
+                                                       *rbits = thresh->rbits;
+                                                       success = TRUE;
+                                                       break;
+                                               }
+                                               rta = RTA_NEXT(rta, rtasize);
+                                       }
+                                       break;
+                               }
+                               case NLMSG_ERROR:
+                               {
+                                       struct nlmsgerr *err = NLMSG_DATA(hdr);
+                                       DBG1(DBG_KNL, "getting SPD hash threshold failed: %s (%d)",
+                                                strerror(-err->error), -err->error);
+                                       break;
+                               }
+                               default: /* any other message type: skip to the next one */
+                                       hdr = NLMSG_NEXT(hdr, len);
+                                       continue;
+                               case NLMSG_DONE:
+                                       break;
+                       }
+                       break; /* the first SPDINFO, ERROR or DONE message ends the loop */
+               }
+               free(out);
+       }
+       return success;
+}
+
+/**
+ * Configure SPD hashing threshold for an address family
+ *
+ * Reads the desired values from the settings
+ * <ns>.plugins.kernel-netlink.spdh_thresh.<key>.lbits and .rbits and sends
+ * them in an XFRM_MSG_NEWSPDINFO request, but only if they differ from what
+ * get_spd_hash_thresh() currently reports. If the current values can't be
+ * queried, nothing is sent.
+ *
+ * NOTE(review): the pointer returned by netlink_reserve() is dereferenced
+ * without a check - confirm it can't fail for a request of this size.
+ *
+ * @param this         kernel interface
+ * @param key          settings sub-key (the constructor passes "ipv4" and "ipv6")
+ * @param type         netlink attribute type that carries the thresholds
+ * @param def          default for both lbits and rbits if not configured
+ */
+static void setup_spd_hash_thresh(private_kernel_netlink_ipsec_t *this,
+                                                                 char *key, int type, uint8_t def)
+{
+       struct xfrmu_spdhthresh *thresh;
+       struct nlmsghdr *hdr;
+       netlink_buf_t request;
+       uint8_t lbits, rbits;
+
+       if (!get_spd_hash_thresh(this, type, &lbits, &rbits))
+       {       /* current thresholds unavailable, don't send anything */
+               return;
+       }
+       memset(&request, 0, sizeof(request));
+
+       hdr = &request.hdr;
+       hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+       hdr->nlmsg_type = XFRM_MSG_NEWSPDINFO;
+       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(uint32_t)); /* zeroed uint32_t body */
+
+       thresh = netlink_reserve(hdr, sizeof(request), type, sizeof(*thresh));
+       thresh->lbits = lib->settings->get_int(lib->settings,
+                                                       "%s.plugins.kernel-netlink.spdh_thresh.%s.lbits",
+                                                       def, lib->ns, key);
+       thresh->rbits = lib->settings->get_int(lib->settings,
+                                                       "%s.plugins.kernel-netlink.spdh_thresh.%s.rbits",
+                                                       def, lib->ns, key);
+       if (thresh->lbits != lbits || thresh->rbits != rbits)
+       {       /* send the request only if a value actually changes */
+               if (this->socket_xfrm->send_ack(this->socket_xfrm, hdr) != SUCCESS)
+               {
+                       DBG1(DBG_KNL, "setting SPD hash threshold failed");
+               }
+       }
+}
+
 /*
  * Described in header.
  */
 kernel_netlink_ipsec_t *kernel_netlink_ipsec_create()
 {
        private_kernel_netlink_ipsec_t *this;
-       struct sockaddr_nl addr;
-       int fd;
+       bool register_for_events = TRUE;
+       FILE *f;
 
        INIT(this,
                .public = {
                        .interface = {
+                               .get_features = _get_features,
                                .get_spi = _get_spi,
                                .get_cpi = _get_cpi,
                                .add_sa  = _add_sa,
                                .update_sa = _update_sa,
                                .query_sa = _query_sa,
                                .del_sa = _del_sa,
+                               .flush_sas = _flush_sas,
                                .add_policy = _add_policy,
                                .query_policy = _query_policy,
                                .del_policy = _del_policy,
+                               .flush_policies = _flush_policies,
                                .bypass_socket = _bypass_socket,
+                               .enable_udp_decap = _enable_udp_decap,
                                .destroy = _destroy,
                        },
                },
                .policies = hashtable_create((hashtable_hash_t)policy_hash,
                                                                         (hashtable_equals_t)policy_equals, 32),
+               .sas = hashtable_create((hashtable_hash_t)ipsec_sa_hash,
+                                                               (hashtable_equals_t)ipsec_sa_equals, 32),
+               .bypass = array_create(sizeof(bypass_t), 0),
                .mutex = mutex_create(MUTEX_TYPE_DEFAULT),
+               .condvar = condvar_create(CONDVAR_TYPE_DEFAULT),
+               .get_priority = dlsym(RTLD_DEFAULT,
+                                                         "kernel_netlink_get_priority_custom"),
+               .policy_update = lib->settings->get_bool(lib->settings,
+                                       "%s.plugins.kernel-netlink.policy_update", FALSE, lib->ns),
                .install_routes = lib->settings->get_bool(lib->settings,
-                                                                                               "charon.install_routes", TRUE),
+                                                       "%s.install_routes", TRUE, lib->ns),
+               .proto_port_transport = lib->settings->get_bool(lib->settings,
+                                               "%s.plugins.kernel-netlink.set_proto_port_transport_sa",
+                                               FALSE, lib->ns),
        );
 
-       /* disable lifetimes for allocated SPIs in kernel */
-       fd = open("/proc/sys/net/core/xfrm_acq_expires", O_WRONLY);
-       if (fd)
+       if (streq(lib->ns, "starter"))
+       {       /* starter has no threads, so we do not register for kernel events */
+               register_for_events = FALSE;
+       }
+
+       f = fopen("/proc/sys/net/core/xfrm_acq_expires", "w");
+       if (f)
        {
-               ignore_result(write(fd, "165", 3));
-               close(fd);
+               fprintf(f, "%u", lib->settings->get_int(lib->settings,
+                                                               "%s.plugins.kernel-netlink.xfrm_acq_expires",
+                                                               DEFAULT_ACQUIRE_LIFETIME, lib->ns));
+               fclose(f);
        }
 
-       this->socket_xfrm = netlink_socket_create(NETLINK_XFRM);
+       this->socket_xfrm = netlink_socket_create(NETLINK_XFRM, xfrm_msg_names,
+                               lib->settings->get_bool(lib->settings,
+                                       "%s.plugins.kernel-netlink.parallel_xfrm", FALSE, lib->ns));
        if (!this->socket_xfrm)
        {
                destroy(this);
                return NULL;
        }
 
-       memset(&addr, 0, sizeof(addr));
-       addr.nl_family = AF_NETLINK;
+       setup_spd_hash_thresh(this, "ipv4", XFRMA_SPD_IPV4_HTHRESH, 32);
+       setup_spd_hash_thresh(this, "ipv6", XFRMA_SPD_IPV6_HTHRESH, 128);
 
-       /* create and bind XFRM socket for ACQUIRE, EXPIRE, MIGRATE & MAPPING */
-       this->socket_xfrm_events = socket(AF_NETLINK, SOCK_RAW, NETLINK_XFRM);
-       if (this->socket_xfrm_events <= 0)
-       {
-               DBG1(DBG_KNL, "unable to create XFRM event socket");
-               destroy(this);
-               return NULL;
-       }
-       addr.nl_groups = XFRMNLGRP(ACQUIRE) | XFRMNLGRP(EXPIRE) |
-                                        XFRMNLGRP(MIGRATE) | XFRMNLGRP(MAPPING);
-       if (bind(this->socket_xfrm_events, (struct sockaddr*)&addr, sizeof(addr)))
+       if (register_for_events)
        {
-               DBG1(DBG_KNL, "unable to bind XFRM event socket");
-               destroy(this);
-               return NULL;
+               struct sockaddr_nl addr;
+
+               memset(&addr, 0, sizeof(addr));
+               addr.nl_family = AF_NETLINK;
+
+               /* create and bind XFRM socket for ACQUIRE, EXPIRE, MIGRATE & MAPPING */
+               this->socket_xfrm_events = socket(AF_NETLINK, SOCK_RAW, NETLINK_XFRM);
+               if (this->socket_xfrm_events <= 0)
+               {
+                       DBG1(DBG_KNL, "unable to create XFRM event socket: %s (%d)",
+                                strerror(errno), errno);
+                       destroy(this);
+                       return NULL;
+               }
+               addr.nl_groups = XFRMNLGRP(ACQUIRE) | XFRMNLGRP(EXPIRE) |
+                                                XFRMNLGRP(MIGRATE) | XFRMNLGRP(MAPPING);
+               if (bind(this->socket_xfrm_events, (struct sockaddr*)&addr, sizeof(addr)))
+               {
+                       DBG1(DBG_KNL, "unable to bind XFRM event socket: %s (%d)",
+                                strerror(errno), errno);
+                       destroy(this);
+                       return NULL;
+               }
+               lib->watcher->add(lib->watcher, this->socket_xfrm_events, WATCHER_READ,
+                                                 (watcher_cb_t)receive_events, this);
        }
-       this->job = callback_job_create((callback_job_cb_t)receive_events,
-                                                                       this, NULL, NULL);
-       hydra->processor->queue_job(hydra->processor, (job_t*)this->job);
 
        return &this->public;
 }
-