ikev1: Send and verify IPv6 addresses correctly
[strongswan.git] / src / libhydra / plugins / kernel_netlink / kernel_netlink_net.c
index 4b64a8d..4e5e02d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2008-2012 Tobias Brunner
+ * Copyright (C) 2008-2014 Tobias Brunner
  * Copyright (C) 2005-2008 Martin Willi
  * Hochschule fuer Technik Rapperswil
  *
 #include <unistd.h>
 #include <errno.h>
 #include <net/if.h>
+#ifdef HAVE_LINUX_FIB_RULES_H
+#include <linux/fib_rules.h>
+#endif
 
 #include "kernel_netlink_net.h"
 #include "kernel_netlink_shared.h"
 
 #include <hydra.h>
-#include <debug.h>
-#include <threading/thread.h>
-#include <threading/condvar.h>
+#include <utils/debug.h>
 #include <threading/mutex.h>
+#include <threading/rwlock.h>
+#include <threading/rwlock_condvar.h>
 #include <threading/spinlock.h>
-#include <utils/hashtable.h>
-#include <utils/linked_list.h>
+#include <collections/hashtable.h>
+#include <collections/linked_list.h>
 #include <processing/jobs/callback_job.h>
 
 /** delay before firing roam events (ms) */
 /** delay before reinstalling routes (ms) */
 #define ROUTE_DELAY 100
 
+/** maximum recursion when searching for addresses in get_route() */
+#define MAX_ROUTE_RECURSION 2
+
+#ifndef ROUTING_TABLE
+#define ROUTING_TABLE 0
+#endif
+
+#ifndef ROUTING_TABLE_PRIO
+#define ROUTING_TABLE_PRIO 0
+#endif
+
+/**
+ * Names of the rtnetlink message types from RTM_NEWLINK (16) up to
+ * RTM_GETRULE (34), in numerical order.
+ *
+ * One string is required for every value in that range.  The values 23, 27
+ * and 31 are not assigned to a message type, so they are represented by
+ * their number to keep all following names aligned with their values.
+ */
+ENUM(rt_msg_names, RTM_NEWLINK, RTM_GETRULE,
+       "RTM_NEWLINK",
+       "RTM_DELLINK",
+       "RTM_GETLINK",
+       "RTM_SETLINK",
+       "RTM_NEWADDR",
+       "RTM_DELADDR",
+       "RTM_GETADDR",
+       "23",
+       "RTM_NEWROUTE",
+       "RTM_DELROUTE",
+       "RTM_GETROUTE",
+       "27",
+       "RTM_NEWNEIGH",
+       "RTM_DELNEIGH",
+       "RTM_GETNEIGH",
+       "31",
+       "RTM_NEWRULE",
+       "RTM_DELRULE",
+       "RTM_GETRULE",
+);
+
 typedef struct addr_entry_t addr_entry_t;
 
 /**
@@ -74,6 +109,9 @@ struct addr_entry_t {
        /** the ip address */
        host_t *ip;
 
+       /** address flags */
+       u_char flags;
+
        /** scope of the address */
        u_char scope;
 
@@ -253,7 +291,7 @@ static route_entry_t *route_entry_clone(route_entry_t *this)
        INIT(route,
                .if_name = strdup(this->if_name),
                .src_ip = this->src_ip->clone(this->src_ip),
-               .gateway = this->gateway->clone(this->gateway),
+               .gateway = this->gateway ? this->gateway->clone(this->gateway) : NULL,
                .dst_net = chunk_clone(this->dst_net),
                .prefixlen = this->prefixlen,
        );
@@ -286,10 +324,14 @@ static u_int route_entry_hash(route_entry_t *this)
  */
 static bool route_entry_equals(route_entry_t *a, route_entry_t *b)
 {
-       return a->if_name && b->if_name && streq(a->if_name, b->if_name) &&
-                  a->src_ip->ip_equals(a->src_ip, b->src_ip) &&
-                  a->gateway->ip_equals(a->gateway, b->gateway) &&
-                  chunk_equals(a->dst_net, b->dst_net) && a->prefixlen == b->prefixlen;
+       if (a->if_name && b->if_name && streq(a->if_name, b->if_name) &&
+               a->src_ip->ip_equals(a->src_ip, b->src_ip) &&
+               chunk_equals(a->dst_net, b->dst_net) && a->prefixlen == b->prefixlen)
+       {
+               return (!a->gateway && !b->gateway) || (a->gateway && b->gateway &&
+                                       a->gateway->ip_equals(a->gateway, b->gateway));
+       }
+       return FALSE;
 }
 
 typedef struct net_change_t net_change_t;
@@ -339,14 +381,14 @@ struct private_kernel_netlink_net_t {
        kernel_netlink_net_t public;
 
        /**
-        * mutex to lock access to various lists
+        * lock to access various lists and maps
         */
-       mutex_t *mutex;
+       rwlock_t *lock;
 
        /**
         * condition variable to signal virtual IP add/removal
         */
-       condvar_t *condvar;
+       rwlock_condvar_t *condvar;
 
        /**
         * Cached list of interfaces and its addresses (iface_entry_t)
@@ -379,6 +421,11 @@ struct private_kernel_netlink_net_t {
        timeval_t next_roam;
 
        /**
+        * roam event due to address change
+        */
+       bool roam_address;
+
+       /**
         * lock to check and update roam event time
         */
        spinlock_t *roam_lock;
@@ -424,6 +471,11 @@ struct private_kernel_netlink_net_t {
        bool process_route;
 
        /**
+        * whether to trigger roam events
+        */
+       bool roam_events;
+
+       /**
         * whether to actually install virtual IPs
         */
        bool install_virtual_ip;
@@ -439,9 +491,34 @@ struct private_kernel_netlink_net_t {
        bool rta_prefsrc_for_ipv6;
 
        /**
+        * whether marks can be used in route lookups
+        */
+       bool rta_mark;
+
+       /**
+        * the mark excluded from the routing rule used for virtual IPs
+        */
+       mark_t routing_mark;
+
+       /**
+        * whether to prefer temporary IPv6 addresses over public ones
+        */
+       bool prefer_temporary_addrs;
+
+       /**
         * list with routing tables to be excluded from route lookup
         */
        linked_list_t *rt_exclude;
+
+       /**
+        * MTU to set on installed routes
+        */
+       u_int32_t mtu;
+
+       /**
+        * MSS to set on installed routes
+        */
+       u_int32_t mss;
 };
 
 /**
@@ -542,12 +619,7 @@ static void queue_route_reinstall(private_kernel_netlink_net_t *this,
        time_monotonic(&now);
        if (timercmp(&now, &this->last_route_reinstall, >))
        {
-               now.tv_usec += ROUTE_DELAY * 1000;
-               while (now.tv_usec > 1000000)
-               {
-                       now.tv_sec++;
-                       now.tv_usec -= 1000000;
-               }
+               timeval_add_ms(&now, ROUTE_DELAY);
                this->last_route_reinstall = now;
 
                job = (job_t*)callback_job_create((callback_job_cb_t)reinstall_routes,
@@ -563,7 +635,7 @@ static void queue_route_reinstall(private_kernel_netlink_net_t *this,
  * this function will also return TRUE if the virtual IP entry disappeared.
  * in that case the returned entry will be NULL.
  *
- * this->mutex must be locked when calling this function
+ * this->lock must be held when calling this function
  */
 static bool is_vip_installed_or_gone(private_kernel_netlink_net_t *this,
                                                                         host_t *ip, addr_map_entry_t **entry)
@@ -584,7 +656,7 @@ static bool is_vip_installed_or_gone(private_kernel_netlink_net_t *this,
 /**
  * check if the given IP is known as virtual IP
  *
- * this->mutex must be locked when calling this function
+ * this->lock must be held when calling this function
  */
 static bool is_known_vip(private_kernel_netlink_net_t *this, host_t *ip)
 {
@@ -630,20 +702,157 @@ static void addr_map_entry_remove(hashtable_t *map, addr_entry_t *addr,
 }
 
 /**
- * get the first non-virtual ip address on the given interface.
- * if a candidate address is given, we first search for that address and if not
+ * Determine the type or scope of the given unicast IP address.  This is not
+ * the same thing returned in rtm_scope/ifa_scope.
+ *
+ * We use return values as defined in RFC 6724 (referring to RFC 4291).
+ */
+static u_char get_scope(host_t *ip)
+{
+       chunk_t raw = ip->get_address(ip);
+       struct in6_addr *v6;
+
+       if (raw.len == 4)
+       {       /* IPv4 uses the mapping defined in RFC 6724, 3.2 */
+               if (raw.ptr[0] == 127)
+               {       /* loopback is link-local, same as the IPv6 loopback address */
+                       return 2;
+               }
+               if (raw.ptr[0] == 169 && raw.ptr[1] == 254)
+               {       /* 169.254/16 is link-local */
+                       return 2;
+               }
+       }
+       else if (raw.len == 16)
+       {
+               v6 = (struct in6_addr*)raw.ptr;
+               if (IN6_IS_ADDR_LOOPBACK(v6))
+               {       /* link-local, according to RFC 4291, 2.5.3 */
+                       return 2;
+               }
+               if (IN6_IS_ADDR_LINKLOCAL(v6))
+               {
+                       return 2;
+               }
+               if (IN6_IS_ADDR_SITELOCAL(v6))
+               {       /* deprecated, according to RFC 4291, 2.5.7 */
+                       return 5;
+               }
+       }
+       /* anything not matched above, including other address lengths, is
+        * reported as global */
+       return 14;
+}
+
+/**
+ * Count the number of leading bits a and b have in common, looking at the
+ * first 64 bits only.  RFC 6724 defines the prefix as the portion of the
+ * address not including the interface ID, which is 64-bit for most unicast
+ * addresses (see RFC 4291).
+ *
+ * The address lengths are not checked, the first 8 bytes of both addresses
+ * are read unconditionally.
+ *
+ * @return              number of common leading bits, 0 to 64
+ */
+static u_char common_prefix(host_t *a, host_t *b)
+{
+       chunk_t first, second;
+       u_char idx, diff, equal_bits;
+
+       first = a->get_address(a);
+       second = b->get_address(b);
+       for (idx = 0; idx < 8; idx++)
+       {
+               diff = first.ptr[idx] ^ second.ptr[idx];
+               if (!diff)
+               {       /* whole byte matches, continue with the next one */
+                       continue;
+               }
+               /* every shift needed to clear the XOR removes one candidate bit,
+                * what remains is the number of equal high-order bits */
+               equal_bits = 8;
+               while (diff)
+               {
+                       equal_bits--;
+                       diff >>= 1;
+               }
+               return idx * 8 + equal_bits;
+       }
+       /* all 8 bytes are identical */
+       return 64;
+}
+
+/**
+ * Decide which of two source address candidates is the better choice to
+ * reach a destination.  For IPv6 this approximately follows the source
+ * address selection rules of RFC 6724.
+ *
+ * @param this          private data, read for the prefer_temporary_addrs option
+ * @param a                     currently preferred address
+ * @param b                     address to compare against a
+ * @param d                     destination, if NULL the rules that depend on it
+ *                                      (2 and 8) are skipped
+ * @return                      TRUE if b is the better choice, FALSE to keep a
+ */
+static bool is_address_better(private_kernel_netlink_net_t *this,
+                                                         addr_entry_t *a, addr_entry_t *b, host_t *d)
+{
+       u_char scope_a, scope_b, scope_d, prefix_a, prefix_b;
+       u_char deprecated_a, deprecated_b, temporary_a, temporary_b;
+
+       if (d)
+       {       /* rule 2: prefer appropriate scope */
+               scope_a = get_scope(a->ip);
+               scope_b = get_scope(b->ip);
+               scope_d = get_scope(d);
+               if (scope_a < scope_b)
+               {       /* b only wins if the smaller scope of a is too small for d */
+                       return scope_a < scope_d;
+               }
+               if (scope_b < scope_a)
+               {       /* b wins if its smaller scope is still sufficient for d */
+                       return scope_b >= scope_d;
+               }
+       }
+       if (a->ip->get_family(a->ip) == AF_INET)
+       {       /* no further rules for IPv4, keep the address found earlier */
+               return FALSE;
+       }
+       /* rule 3: avoid deprecated addresses (RFC 4862) */
+       deprecated_a = a->flags & IFA_F_DEPRECATED;
+       deprecated_b = b->flags & IFA_F_DEPRECATED;
+       if (deprecated_a != deprecated_b)
+       {       /* exactly one is deprecated, b is better if it is a */
+               return deprecated_a;
+       }
+       /* rule 4 is not applicable as we don't know if an address is a home or
+        * care-of address.
+        * rule 5 does not apply as we only compare addresses from one interface.
+        * rule 6 requires a policy table (optionally configurable) to match
+        * configurable labels.
+        */
+       /* rule 7: prefer temporary addresses (WE REVERSE THIS BY DEFAULT!) */
+       temporary_a = a->flags & IFA_F_TEMPORARY;
+       temporary_b = b->flags & IFA_F_TEMPORARY;
+       if (temporary_a != temporary_b)
+       {       /* exactly one is temporary, by default the other one is better */
+               return this->prefer_temporary_addrs ? temporary_b : temporary_a;
+       }
+       if (d)
+       {       /* rule 8: use longest matching prefix */
+               prefix_a = common_prefix(a->ip, d);
+               prefix_b = common_prefix(b->ip, d);
+               if (prefix_a != prefix_b)
+               {
+                       return prefix_b > prefix_a;
+               }
+       }
+       /* no rule preferred b, keep the address found earlier */
+       return FALSE;
+}
+
+/**
+ * Get a non-virtual IP address on the given interface.
+ *
+ * If a candidate address is given, we first search for that address and if not
  * found return the address as above.
- * returned host is a clone, has to be freed by caller.
+ * Returned host is a clone, has to be freed by caller.
+ *
+ * this->lock must be held when calling this function.
  */
 static host_t *get_interface_address(private_kernel_netlink_net_t *this,
-                                                                        int ifindex, int family, host_t *candidate)
+                                                                        int ifindex, int family, host_t *dest,
+                                                                        host_t *candidate)
 {
        iface_entry_t *iface;
        enumerator_t *addrs;
-       addr_entry_t *addr;
-       host_t *ip = NULL;
+       addr_entry_t *addr, *best = NULL;
 
-       this->mutex->lock(this->mutex);
        if (this->ifaces->find_first(this->ifaces, (void*)iface_entry_by_index,
                                                                 (void**)&iface, &ifindex) == SUCCESS)
        {
@@ -652,42 +861,39 @@ static host_t *get_interface_address(private_kernel_netlink_net_t *this,
                        addrs = iface->addrs->create_enumerator(iface->addrs);
                        while (addrs->enumerate(addrs, &addr))
                        {
-                               if (addr->refcount)
-                               {       /* ignore virtual IP addresses */
+                               if (addr->refcount ||
+                                       addr->ip->get_family(addr->ip) != family)
+                               {       /* ignore virtual IP addresses and ensure family matches */
                                        continue;
                                }
-                               if (addr->ip->get_family(addr->ip) == family)
+                               if (candidate && candidate->ip_equals(candidate, addr->ip))
+                               {       /* stop if we find the candidate */
+                                       best = addr;
+                                       break;
+                               }
+                               else if (!best || is_address_better(this, best, addr, dest))
                                {
-                                       if (!candidate || candidate->ip_equals(candidate, addr->ip))
-                                       {       /* stop at the first address if we don't search for a
-                                                * candidate or if the candidate matches */
-                                               ip = addr->ip;
-                                               break;
-                                       }
-                                       else if (!ip)
-                                       {       /* store the first address as fallback if candidate is
-                                                * not found */
-                                               ip = addr->ip;
-                                       }
+                                       best = addr;
                                }
                        }
                        addrs->destroy(addrs);
                }
        }
-       if (ip)
-       {
-               ip = ip->clone(ip);
-       }
-       this->mutex->unlock(this->mutex);
-       return ip;
+       return best ? best->ip->clone(best->ip) : NULL;
 }
 
 /**
  * callback function that raises the delayed roam event
  */
-static job_requeue_t roam_event(uintptr_t address)
+static job_requeue_t roam_event(private_kernel_netlink_net_t *this)
 {
-       hydra->kernel_interface->roam(hydra->kernel_interface, address != 0);
+       bool address;
+
+       /* read and reset the pending address flag under roam_lock, the same lock
+        * fire_roam_event() holds while it ORs new events into roam_address */
+       this->roam_lock->lock(this->roam_lock);
+       address = this->roam_address;
+       this->roam_address = FALSE;
+       this->roam_lock->unlock(this->roam_lock);
+       hydra->kernel_interface->roam(hydra->kernel_interface, address);
        return JOB_REQUEUE_NONE;
 }
 
@@ -700,32 +906,32 @@ static void fire_roam_event(private_kernel_netlink_net_t *this, bool address)
        timeval_t now;
        job_t *job;
 
+       if (!this->roam_events)
+       {
+               return;
+       }
+
        time_monotonic(&now);
        this->roam_lock->lock(this->roam_lock);
+       this->roam_address |= address;
        if (!timercmp(&now, &this->next_roam, >))
        {
                this->roam_lock->unlock(this->roam_lock);
                return;
        }
-       now.tv_usec += ROAM_DELAY * 1000;
-       while (now.tv_usec > 1000000)
-       {
-               now.tv_sec++;
-               now.tv_usec -= 1000000;
-       }
+       timeval_add_ms(&now, ROAM_DELAY);
        this->next_roam = now;
        this->roam_lock->unlock(this->roam_lock);
 
        job = (job_t*)callback_job_create((callback_job_cb_t)roam_event,
-                                                                         (void*)(uintptr_t)(address ? 1 : 0),
-                                                                          NULL, NULL);
+                                                                         this, NULL, NULL);
        lib->scheduler->schedule_job_ms(lib->scheduler, job, ROAM_DELAY);
 }
 
 /**
  * check if an interface with a given index is up and usable
  *
- * this->mutex must be locked when calling this function
+ * this->lock must be locked when calling this function
  */
 static bool is_interface_up_and_usable(private_kernel_netlink_net_t *this,
                                                                           int index)
@@ -743,7 +949,7 @@ static bool is_interface_up_and_usable(private_kernel_netlink_net_t *this,
 /**
  * unregister the current addr_entry_t from the hashtable it is stored in
  *
- * this->mutex must be locked when calling this function
+ * this->lock must be locked when calling this function
  */
 static void addr_entry_unregister(addr_entry_t *addr, iface_entry_t *iface,
                                                                  private_kernel_netlink_net_t *this)
@@ -763,7 +969,7 @@ static void addr_entry_unregister(addr_entry_t *addr, iface_entry_t *iface,
 static void process_link(private_kernel_netlink_net_t *this,
                                                 struct nlmsghdr *hdr, bool event)
 {
-       struct ifinfomsg* msg = (struct ifinfomsg*)(NLMSG_DATA(hdr));
+       struct ifinfomsg* msg = NLMSG_DATA(hdr);
        struct rtattr *rta = IFLA_RTA(msg);
        size_t rtasize = IFLA_PAYLOAD (hdr);
        enumerator_t *enumerator;
@@ -786,7 +992,7 @@ static void process_link(private_kernel_netlink_net_t *this,
                name = "(unknown)";
        }
 
-       this->mutex->lock(this->mutex);
+       this->lock->write_lock(this->lock);
        switch (hdr->nlmsg_type)
        {
                case RTM_NEWLINK:
@@ -846,7 +1052,7 @@ static void process_link(private_kernel_netlink_net_t *this,
                        break;
                }
        }
-       this->mutex->unlock(this->mutex);
+       this->lock->unlock(this->lock);
 
        if (update_routes && event)
        {
@@ -865,7 +1071,7 @@ static void process_link(private_kernel_netlink_net_t *this,
 static void process_addr(private_kernel_netlink_net_t *this,
                                                 struct nlmsghdr *hdr, bool event)
 {
-       struct ifaddrmsg* msg = (struct ifaddrmsg*)(NLMSG_DATA(hdr));
+       struct ifaddrmsg* msg = NLMSG_DATA(hdr);
        struct rtattr *rta = IFA_RTA(msg);
        size_t rtasize = IFA_PAYLOAD (hdr);
        host_t *host = NULL;
@@ -907,7 +1113,7 @@ static void process_addr(private_kernel_netlink_net_t *this,
                return;
        }
 
-       this->mutex->lock(this->mutex);
+       this->lock->write_lock(this->lock);
        if (this->ifaces->find_first(this->ifaces, (void*)iface_entry_by_index,
                                                                 (void**)&iface, &msg->ifa_index) == SUCCESS)
        {
@@ -933,7 +1139,7 @@ static void process_addr(private_kernel_netlink_net_t *this,
                        }
                        /* no roam events etc. for virtual IPs */
                        this->condvar->broadcast(this->condvar);
-                       this->mutex->unlock(this->mutex);
+                       this->lock->unlock(this->lock);
                        host->destroy(host);
                        return;
                }
@@ -964,6 +1170,7 @@ static void process_addr(private_kernel_netlink_net_t *this,
                                route_ifname = strdup(iface->ifname);
                                INIT(addr,
                                        .ip = host->clone(host),
+                                       .flags = msg->ifa_flags,
                                        .scope = msg->ifa_scope,
                                );
                                iface->addrs->insert_last(iface->addrs, addr);
@@ -983,7 +1190,7 @@ static void process_addr(private_kernel_netlink_net_t *this,
                        update = changed = FALSE;
                }
        }
-       this->mutex->unlock(this->mutex);
+       this->lock->unlock(this->lock);
 
        if (update && event && route_ifname)
        {
@@ -1007,7 +1214,7 @@ static void process_addr(private_kernel_netlink_net_t *this,
  */
 static void process_route(private_kernel_netlink_net_t *this, struct nlmsghdr *hdr)
 {
-       struct rtmsg* msg = (struct rtmsg*)(NLMSG_DATA(hdr));
+       struct rtmsg* msg = NLMSG_DATA(hdr);
        struct rtattr *rta = RTM_RTA(msg);
        size_t rtasize = RTM_PAYLOAD(hdr);
        u_int32_t rta_oif = 0;
@@ -1042,65 +1249,63 @@ static void process_route(private_kernel_netlink_net_t *this, struct nlmsghdr *h
                }
                rta = RTA_NEXT(rta, rtasize);
        }
-       this->mutex->lock(this->mutex);
+       this->lock->read_lock(this->lock);
        if (rta_oif && !is_interface_up_and_usable(this, rta_oif))
        {       /* ignore route changes for interfaces that are ignored or down */
-               this->mutex->unlock(this->mutex);
+               this->lock->unlock(this->lock);
                DESTROY_IF(host);
                return;
        }
        if (!host && rta_oif)
        {
-               host = get_interface_address(this, rta_oif, msg->rtm_family, NULL);
+               host = get_interface_address(this, rta_oif, msg->rtm_family,
+                                                                        NULL, NULL);
        }
-       if (host)
-       {
-               if (!is_known_vip(this, host))
-               {       /* ignore routes added for virtual IPs */
-                       fire_roam_event(this, FALSE);
-               }
-               host->destroy(host);
+       if (!host || is_known_vip(this, host))
+       {       /* ignore routes added for virtual IPs */
+               this->lock->unlock(this->lock);
+               DESTROY_IF(host);
+               return;
        }
-       this->mutex->unlock(this->mutex);
+       this->lock->unlock(this->lock);
+       fire_roam_event(this, FALSE);
+       host->destroy(host);
 }
 
 /**
  * Receives events from kernel
  */
-static job_requeue_t receive_events(private_kernel_netlink_net_t *this)
+static bool receive_events(private_kernel_netlink_net_t *this, int fd,
+                                                  watcher_event_t event)
 {
-       char response[1024];
+       char response[1536];
        struct nlmsghdr *hdr = (struct nlmsghdr*)response;
        struct sockaddr_nl addr;
        socklen_t addr_len = sizeof(addr);
        int len;
-       bool oldstate;
-
-       oldstate = thread_cancelability(TRUE);
-       len = recvfrom(this->socket_events, response, sizeof(response), 0,
-                                  (struct sockaddr*)&addr, &addr_len);
-       thread_cancelability(oldstate);
 
+       len = recvfrom(this->socket_events, response, sizeof(response),
+                                  MSG_DONTWAIT, (struct sockaddr*)&addr, &addr_len);
        if (len < 0)
        {
                switch (errno)
                {
                        case EINTR:
                                /* interrupted, try again */
-                               return JOB_REQUEUE_DIRECT;
+                               return TRUE;
                        case EAGAIN:
                                /* no data ready, select again */
-                               return JOB_REQUEUE_DIRECT;
+                               return TRUE;
                        default:
                                DBG1(DBG_KNL, "unable to receive from rt event socket");
                                sleep(1);
-                               return JOB_REQUEUE_FAIR;
+                               return TRUE;
                }
        }
 
        if (addr.nl_pid != 0)
        {       /* not from kernel. not interested, try another one */
-               return JOB_REQUEUE_DIRECT;
+               return TRUE;
        }
 
        while (NLMSG_OK(hdr, len))
@@ -1128,7 +1333,7 @@ static job_requeue_t receive_events(private_kernel_netlink_net_t *this)
                }
                hdr = NLMSG_NEXT(hdr, len);
        }
-       return JOB_REQUEUE_DIRECT;
+       return TRUE;
 }
 
 /** enumerator over addresses */
@@ -1143,7 +1348,7 @@ typedef struct {
  */
 static void address_enumerator_destroy(address_enumerator_t *data)
 {
-       data->this->mutex->unlock(data->this->mutex);
+       data->this->lock->unlock(data->this->lock);
        free(data);
 }
 
@@ -1157,6 +1362,10 @@ static bool filter_addresses(address_enumerator_t *data,
        {       /* skip virtual interfaces added by us */
                return FALSE;
        }
+       if (!(data->which & ADDR_TYPE_REGULAR) && !(*in)->refcount)
+       {       /* address is regular, but not requested */
+               return FALSE;
+       }
        if ((*in)->scope >= RT_SCOPE_LINK)
        {       /* skip addresses with a unusable scope */
                return FALSE;
@@ -1201,11 +1410,14 @@ static bool filter_interfaces(address_enumerator_t *data, iface_entry_t** in,
 METHOD(kernel_net_t, create_address_enumerator, enumerator_t*,
        private_kernel_netlink_net_t *this, kernel_address_type_t which)
 {
-       address_enumerator_t *data = malloc_thing(address_enumerator_t);
-       data->this = this;
-       data->which = which;
+       address_enumerator_t *data;
+
+       INIT(data,
+               .this = this,
+               .which = which,
+       );
 
-       this->mutex->lock(this->mutex);
+       this->lock->read_lock(this->lock);
        return enumerator_create_nested(
                                enumerator_create_filter(
                                        this->ifaces->create_enumerator(this->ifaces),
@@ -1225,7 +1437,7 @@ METHOD(kernel_net_t, get_interface_name, bool,
        {
                return FALSE;
        }
-       this->mutex->lock(this->mutex);
+       this->lock->read_lock(this->lock);
        /* first try to find it on an up and usable interface */
        entry = this->addrs->get_match(this->addrs, &lookup,
                                                                  (void*)addr_map_entry_match_up_and_usable);
@@ -1236,7 +1448,20 @@ METHOD(kernel_net_t, get_interface_name, bool,
                        *name = strdup(entry->iface->ifname);
                        DBG2(DBG_KNL, "%H is on interface %s", ip, *name);
                }
-               this->mutex->unlock(this->mutex);
+               this->lock->unlock(this->lock);
+               return TRUE;
+       }
+       /* in a second step, consider virtual IPs installed by us */
+       entry = this->vips->get_match(this->vips, &lookup,
+                                                                 (void*)addr_map_entry_match_up_and_usable);
+       if (entry)
+       {
+               if (name)
+               {
+                       *name = strdup(entry->iface->ifname);
+                       DBG2(DBG_KNL, "virtual IP %H is on interface %s", ip, *name);
+               }
+               this->lock->unlock(this->lock);
                return TRUE;
        }
        /* maybe it is installed on an ignored interface */
@@ -1246,7 +1471,7 @@ METHOD(kernel_net_t, get_interface_name, bool,
        {
                DBG2(DBG_KNL, "%H is not a local address or the interface is down", ip);
        }
-       this->mutex->unlock(this->mutex);
+       this->lock->unlock(this->lock);
        return FALSE;
 }
 
@@ -1260,13 +1485,13 @@ static int get_interface_index(private_kernel_netlink_net_t *this, char* name)
 
        DBG2(DBG_KNL, "getting iface index for %s", name);
 
-       this->mutex->lock(this->mutex);
+       this->lock->read_lock(this->lock);
        if (this->ifaces->find_first(this->ifaces, (void*)iface_entry_by_name,
                                                                (void**)&iface, name) == SUCCESS)
        {
                ifindex = iface->ifindex;
        }
-       this->mutex->unlock(this->mutex);
+       this->lock->unlock(this->lock);
 
        if (ifindex == 0)
        {
@@ -1276,9 +1501,10 @@ static int get_interface_index(private_kernel_netlink_net_t *this, char* name)
 }
 
 /**
- * check if an address (chunk) addr is in subnet (net with net_len net bits)
+ * check if an address or net (addr with prefix net bits) is in
+ * subnet (net with net_len net bits)
  */
-static bool addr_in_subnet(chunk_t addr, chunk_t net, int net_len)
+static bool addr_in_subnet(chunk_t addr, int prefix, chunk_t net, int net_len)
 {
        static const u_char mask[] = { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe };
        int byte = 0;
@@ -1287,7 +1513,7 @@ static bool addr_in_subnet(chunk_t addr, chunk_t net, int net_len)
        {       /* any address matches a /0 network */
                return TRUE;
        }
-       if (addr.len != net.len || net_len > 8 * net.len )
+       if (addr.len != net.len || net_len > 8 * net.len || prefix < net_len)
        {
                return FALSE;
        }
@@ -1322,6 +1548,7 @@ typedef struct {
        u_int8_t dst_len;
        u_int32_t table;
        u_int32_t oif;
+       u_int32_t priority;
 } rt_entry_t;
 
 /**
@@ -1334,6 +1561,26 @@ static void rt_entry_destroy(rt_entry_t *this)
 }
 
 /**
+ * Check if the route received with RTM_NEWROUTE is usable based on its type.
+ */
+static bool route_usable(struct nlmsghdr *hdr)
+{
+       struct rtmsg *rtm = NLMSG_DATA(hdr);
+
+       if (rtm->rtm_type == RTN_BLACKHOLE ||
+               rtm->rtm_type == RTN_UNREACHABLE ||
+               rtm->rtm_type == RTN_PROHIBIT ||
+               rtm->rtm_type == RTN_THROW)
+       {       /* routes of these types are reported as not usable */
+               return FALSE;
+       }
+       /* every other route type is accepted */
+       return TRUE;
+}
+
+/**
  * Parse route received with RTM_NEWROUTE. The given rt_entry_t object will be
  * reused if not NULL.
  *
@@ -1345,7 +1592,7 @@ static rt_entry_t *parse_route(struct nlmsghdr *hdr, rt_entry_t *route)
        struct rtmsg *msg;
        size_t rtasize;
 
-       msg = (struct rtmsg*)(NLMSG_DATA(hdr));
+       msg = NLMSG_DATA(hdr);
        rta = RTM_RTA(msg);
        rtasize = RTM_PAYLOAD(hdr);
 
@@ -1357,6 +1604,7 @@ static rt_entry_t *parse_route(struct nlmsghdr *hdr, rt_entry_t *route)
                route->dst_len = msg->rtm_dst_len;
                route->table = msg->rtm_table;
                route->oif = 0;
+               route->priority = 0;
        }
        else
        {
@@ -1385,6 +1633,12 @@ static rt_entry_t *parse_route(struct nlmsghdr *hdr, rt_entry_t *route)
                                        route->oif = *(u_int32_t*)RTA_DATA(rta);
                                }
                                break;
+                       case RTA_PRIORITY:
+                               if (RTA_PAYLOAD(rta) == sizeof(route->priority))
+                               {
+                                       route->priority = *(u_int32_t*)RTA_DATA(rta);
+                               }
+                               break;
 #ifdef HAVE_RTA_TABLE
                        case RTA_TABLE:
                                if (RTA_PAYLOAD(rta) == sizeof(route->table))
@@ -1403,7 +1657,8 @@ static rt_entry_t *parse_route(struct nlmsghdr *hdr, rt_entry_t *route)
  * Get a route: If "nexthop", the nexthop is returned. source addr otherwise.
  */
 static host_t *get_route(private_kernel_netlink_net_t *this, host_t *dest,
-                                                bool nexthop, host_t *candidate)
+                                                int prefix, bool nexthop, host_t *candidate,
+                                                u_int recursion)
 {
        netlink_buf_t request;
        struct nlmsghdr *hdr, *out, *current;
@@ -1414,39 +1669,61 @@ static host_t *get_route(private_kernel_netlink_net_t *this, host_t *dest,
        rt_entry_t *route = NULL, *best = NULL;
        enumerator_t *enumerator;
        host_t *addr = NULL;
+       bool match_net;
+       int family;
+
+       if (recursion > MAX_ROUTE_RECURSION)
+       {
+               return NULL;
+       }
+       chunk = dest->get_address(dest);
+       len = chunk.len * 8;
+       prefix = prefix < 0 ? len : min(prefix, len);
+       match_net = prefix != len;
 
        memset(&request, 0, sizeof(request));
 
-       hdr = (struct nlmsghdr*)request;
+       family = dest->get_family(dest);
+       hdr = &request.hdr;
        hdr->nlmsg_flags = NLM_F_REQUEST;
-       if (dest->get_family(dest) == AF_INET || this->rta_prefsrc_for_ipv6 ||
-               this->routing_table)
+       hdr->nlmsg_type = RTM_GETROUTE;
+       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
+
+       msg = NLMSG_DATA(hdr);
+       msg->rtm_family = family;
+       if (!match_net && this->rta_mark && this->routing_mark.value)
+       {
+               /* if our routing rule excludes packets with a certain mark we can
+                * get the preferred route without having to dump all routes */
+               chunk = chunk_from_thing(this->routing_mark.value);
+               netlink_add_attribute(hdr, RTA_MARK, chunk, sizeof(request));
+       }
+       else if (family == AF_INET || this->rta_prefsrc_for_ipv6 ||
+                        this->routing_table || match_net)
        {       /* kernels prior to 3.0 do not support RTA_PREFSRC for IPv6 routes.
                 * as we want to ignore routes with virtual IPs we cannot use DUMP
                 * if these routes are not installed in a separate table */
                hdr->nlmsg_flags |= NLM_F_DUMP;
        }
-       hdr->nlmsg_type = RTM_GETROUTE;
-       hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
-
-       msg = (struct rtmsg*)NLMSG_DATA(hdr);
-       msg->rtm_family = dest->get_family(dest);
        if (candidate)
        {
                chunk = candidate->get_address(candidate);
                netlink_add_attribute(hdr, RTA_PREFSRC, chunk, sizeof(request));
        }
-       chunk = dest->get_address(dest);
-       netlink_add_attribute(hdr, RTA_DST, chunk, sizeof(request));
+       chunk = dest->get_address(dest); /* also used to match routes below */
+       if (!match_net)
+       {
+               netlink_add_attribute(hdr, RTA_DST, chunk, sizeof(request));
+       }
 
        if (this->socket->send(this->socket, hdr, &out, &len) != SUCCESS)
        {
-               DBG2(DBG_KNL, "getting %s to reach %H failed",
-                        nexthop ? "nexthop" : "address", dest);
+               DBG2(DBG_KNL, "getting %s to reach %H/%d failed",
+                        nexthop ? "nexthop" : "address", dest, prefix);
                return NULL;
        }
        routes = linked_list_create();
-       this->mutex->lock(this->mutex);
+       this->lock->read_lock(this->lock);
 
        for (current = out; NLMSG_OK(current, len);
                 current = NLMSG_NEXT(current, len))
@@ -1460,6 +1737,10 @@ static host_t *get_route(private_kernel_netlink_net_t *this, host_t *dest,
                                rt_entry_t *other;
                                uintptr_t table;
 
+                               if (!route_usable(current))
+                               {
+                                       continue;
+                               }
                                route = parse_route(current, route);
 
                                table = (uintptr_t)route->table;
@@ -1477,7 +1758,7 @@ static host_t *get_route(private_kernel_netlink_net_t *this, host_t *dest,
                                {       /* interface is down */
                                        continue;
                                }
-                               if (!addr_in_subnet(chunk, route->dst, route->dst_len))
+                               if (!addr_in_subnet(chunk, prefix, route->dst, route->dst_len))
                                {       /* route destination does not contain dest */
                                        continue;
                                }
@@ -1492,11 +1773,16 @@ static host_t *get_route(private_kernel_netlink_net_t *this, host_t *dest,
                                        }
                                        route->src_host = src;
                                }
-                               /* insert route, sorted by decreasing network prefix */
+                               /* insert route, sorted by priority and network prefix */
                                enumerator = routes->create_enumerator(routes);
                                while (enumerator->enumerate(enumerator, &other))
                                {
-                                       if (route->dst_len > other->dst_len)
+                                       if (route->priority < other->priority)
+                                       {
+                                               break;
+                                       }
+                                       if (route->priority == other->priority &&
+                                               route->dst_len > other->dst_len)
                                        {
                                                break;
                                        }
@@ -1533,7 +1819,7 @@ static host_t *get_route(private_kernel_netlink_net_t *this, host_t *dest,
                        else if (route->oif)
                        {       /* no match yet, maybe it is assigned to the same interface */
                                host_t *src = get_interface_address(this, route->oif,
-                                                                                                       msg->rtm_family, candidate);
+                                                                                       msg->rtm_family, dest, candidate);
                                if (src && src->ip_equals(src, candidate))
                                {
                                        route->src_host->destroy(route->src_host);
@@ -1552,7 +1838,7 @@ static host_t *get_route(private_kernel_netlink_net_t *this, host_t *dest,
                if (route->oif)
                {       /* no src, but an interface - get address from it */
                        route->src_host = get_interface_address(this, route->oif,
-                                                                                                       msg->rtm_family, candidate);
+                                                                                       msg->rtm_family, dest, candidate);
                        if (route->src_host)
                        {       /* we handle this address the same as the one above */
                                if (!candidate ||
@@ -1570,8 +1856,12 @@ static host_t *get_route(private_kernel_netlink_net_t *this, host_t *dest,
                        host_t *gtw;
 
                        gtw = host_create_from_chunk(msg->rtm_family, route->gtw, 0);
-                       route->src_host = get_route(this, gtw, FALSE, candidate);
-                       gtw->destroy(gtw);
+                       if (gtw && !gtw->ip_equals(gtw, dest))
+                       {
+                               route->src_host = get_route(this, gtw, -1, FALSE, candidate,
+                                                                                       recursion + 1);
+                       }
+                       DESTROY_IF(gtw);
                        if (route->src_host)
                        {       /* more of the same */
                                if (!candidate ||
@@ -1592,7 +1882,10 @@ static host_t *get_route(private_kernel_netlink_net_t *this, host_t *dest,
                {
                        addr = host_create_from_chunk(msg->rtm_family, best->gtw, 0);
                }
-               addr = addr ?: dest->clone(dest);
+               if (!addr && !match_net)
+               {       /* fallback to destination address */
+                       addr = dest->clone(dest);
+               }
        }
        else
        {
@@ -1601,19 +1894,19 @@ static host_t *get_route(private_kernel_netlink_net_t *this, host_t *dest,
                        addr = best->src_host->clone(best->src_host);
                }
        }
-       this->mutex->unlock(this->mutex);
+       this->lock->unlock(this->lock);
        routes->destroy_function(routes, (void*)rt_entry_destroy);
        free(out);
 
        if (addr)
        {
-               DBG2(DBG_KNL, "using %H as %s to reach %H", addr,
-                        nexthop ? "nexthop" : "address", dest);
+               DBG2(DBG_KNL, "using %H as %s to reach %H/%d", addr,
+                        nexthop ? "nexthop" : "address", dest, prefix);
        }
-       else
+       else if (!recursion)
        {
-               DBG2(DBG_KNL, "no %s found to reach %H",
-                        nexthop ? "nexthop" : "address", dest);
+               DBG2(DBG_KNL, "no %s found to reach %H/%d",
+                        nexthop ? "nexthop" : "address", dest, prefix);
        }
        return addr;
 }
@@ -1621,13 +1914,13 @@ static host_t *get_route(private_kernel_netlink_net_t *this, host_t *dest,
 METHOD(kernel_net_t, get_source_addr, host_t*,
        private_kernel_netlink_net_t *this, host_t *dest, host_t *src)
 {
-       return get_route(this, dest, FALSE, src);
+       return get_route(this, dest, -1, FALSE, src, 0);
 }
 
 METHOD(kernel_net_t, get_nexthop, host_t*,
-       private_kernel_netlink_net_t *this, host_t *dest, host_t *src)
+       private_kernel_netlink_net_t *this, host_t *dest, int prefix, host_t *src)
 {
-       return get_route(this, dest, TRUE, src);
+       return get_route(this, dest, prefix, TRUE, src, 0);
 }
 
 /**
@@ -1635,7 +1928,7 @@ METHOD(kernel_net_t, get_nexthop, host_t*,
  * By setting the appropriate nlmsg_type, the ip will be set or unset.
  */
 static status_t manage_ipaddr(private_kernel_netlink_net_t *this, int nlmsg_type,
-                                                         int flags, int if_index, host_t *ip)
+                                                         int flags, int if_index, host_t *ip, int prefix)
 {
        netlink_buf_t request;
        struct nlmsghdr *hdr;
@@ -1646,25 +1939,37 @@ static status_t manage_ipaddr(private_kernel_netlink_net_t *this, int nlmsg_type
 
        chunk = ip->get_address(ip);
 
-       hdr = (struct nlmsghdr*)request;
+       hdr = &request.hdr;
        hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags;
        hdr->nlmsg_type = nlmsg_type;
        hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg));
 
-       msg = (struct ifaddrmsg*)NLMSG_DATA(hdr);
+       msg = NLMSG_DATA(hdr);
        msg->ifa_family = ip->get_family(ip);
        msg->ifa_flags = 0;
-       msg->ifa_prefixlen = 8 * chunk.len;
+       msg->ifa_prefixlen = prefix < 0 ? chunk.len * 8 : prefix;
        msg->ifa_scope = RT_SCOPE_UNIVERSE;
        msg->ifa_index = if_index;
 
        netlink_add_attribute(hdr, IFA_LOCAL, chunk, sizeof(request));
 
+       if (ip->get_family(ip) == AF_INET6 && this->rta_prefsrc_for_ipv6)
+       {       /* if source routes are possible we let the virtual IP get deprecated
+                * immediately (but mark it as valid forever) so it gets only used if
+                * forced by our route, and not by the default IPv6 address selection */
+               struct ifa_cacheinfo cache = {
+                       .ifa_valid = 0xFFFFFFFF,
+                       .ifa_prefered = 0,
+               };
+               netlink_add_attribute(hdr, IFA_CACHEINFO, chunk_from_thing(cache),
+                                                         sizeof(request));
+       }
        return this->socket->send_ack(this->socket, hdr);
 }
 
 METHOD(kernel_net_t, add_ip, status_t,
-       private_kernel_netlink_net_t *this, host_t *virtual_ip, host_t *iface_ip)
+       private_kernel_netlink_net_t *this, host_t *virtual_ip, int prefix,
+       char *iface_name)
 {
        addr_map_entry_t *entry, lookup = {
                .ip = virtual_ip,
@@ -1676,7 +1981,7 @@ METHOD(kernel_net_t, add_ip, status_t,
                return SUCCESS;
        }
 
-       this->mutex->lock(this->mutex);
+       this->lock->write_lock(this->lock);
        /* the virtual IP might actually be installed as regular IP, in which case
         * we don't track it as virtual IP */
        entry = this->addrs->get_match(this->addrs, &lookup,
@@ -1690,12 +1995,12 @@ METHOD(kernel_net_t, add_ip, status_t,
                         * ready, 2) just added by another thread, but not yet confirmed to
                         * be installed by the kernel, 3) just deleted, but not yet gone.
                         * Then while we wait below, several things could happen (as we
-                        * release the mutex).  For instance, the interface could disappear,
+                        * release the lock).  For instance, the interface could disappear,
                         * or the IP is finally deleted, and it reappears on a different
                         * interface. All these cases are handled by the call below. */
                        while (!is_vip_installed_or_gone(this, virtual_ip, &entry))
                        {
-                               this->condvar->wait(this->condvar, this->mutex);
+                               this->condvar->wait(this->condvar, this->lock);
                        }
                        if (entry)
                        {
@@ -1707,7 +2012,7 @@ METHOD(kernel_net_t, add_ip, status_t,
        {
                DBG2(DBG_KNL, "virtual IP %H is already installed on %s", virtual_ip,
                         entry->iface->ifname);
-               this->mutex->unlock(this->mutex);
+               this->lock->unlock(this->lock);
                return SUCCESS;
        }
        /* try to find the target interface, either by config or via src ip */
@@ -1715,21 +2020,17 @@ METHOD(kernel_net_t, add_ip, status_t,
                 this->ifaces->find_first(this->ifaces, (void*)iface_entry_by_name,
                                                (void**)&iface, this->install_virtual_ip_on) != SUCCESS)
        {
-               lookup.ip = iface_ip;
-               entry = this->addrs->get_match(this->addrs, &lookup,
-                                                                         (void*)addr_map_entry_match);
-               if (!entry)
+               if (this->ifaces->find_first(this->ifaces, (void*)iface_entry_by_name,
+                                                                        (void**)&iface, iface_name) != SUCCESS)
                {       /* if we don't find the requested interface we just use the first */
                        this->ifaces->get_first(this->ifaces, (void**)&iface);
                }
-               else
-               {
-                       iface = entry->iface;
-               }
        }
        if (iface)
        {
                addr_entry_t *addr;
+               char *ifname;
+               int ifi;
 
                INIT(addr,
                        .ip = virtual_ip->clone(virtual_ip),
@@ -1738,33 +2039,42 @@ METHOD(kernel_net_t, add_ip, status_t,
                );
                iface->addrs->insert_last(iface->addrs, addr);
                addr_map_entry_add(this->vips, addr, iface);
+               ifi = iface->ifindex;
+               this->lock->unlock(this->lock);
                if (manage_ipaddr(this, RTM_NEWADDR, NLM_F_CREATE | NLM_F_EXCL,
-                                                 iface->ifindex, virtual_ip) == SUCCESS)
+                                                 ifi, virtual_ip, prefix) == SUCCESS)
                {
+                       this->lock->write_lock(this->lock);
                        while (!is_vip_installed_or_gone(this, virtual_ip, &entry))
                        {       /* wait until address appears */
-                               this->condvar->wait(this->condvar, this->mutex);
+                               this->condvar->wait(this->condvar, this->lock);
                        }
                        if (entry)
                        {       /* we fail if the interface got deleted in the meantime */
-                               DBG2(DBG_KNL, "virtual IP %H installed on %s", virtual_ip,
-                                        entry->iface->ifname);
-                               this->mutex->unlock(this->mutex);
+                               ifname = strdup(entry->iface->ifname);
+                               this->lock->unlock(this->lock);
+                               DBG2(DBG_KNL, "virtual IP %H installed on %s",
+                                        virtual_ip, ifname);
+                               /* during IKEv1 reauthentication, children get moved from
+                                * the old to the new SA before the virtual IP is available.
+                                * This kills the route for our virtual IP, so reinstall it. */
+                               queue_route_reinstall(this, ifname);
                                return SUCCESS;
                        }
+                       this->lock->unlock(this->lock);
                }
-               this->mutex->unlock(this->mutex);
                DBG1(DBG_KNL, "adding virtual IP %H failed", virtual_ip);
                return FAILED;
        }
-       this->mutex->unlock(this->mutex);
+       this->lock->unlock(this->lock);
        DBG1(DBG_KNL, "no interface available, unable to install virtual IP %H",
                 virtual_ip);
        return FAILED;
 }
 
 METHOD(kernel_net_t, del_ip, status_t,
-       private_kernel_netlink_net_t *this, host_t *virtual_ip)
+       private_kernel_netlink_net_t *this, host_t *virtual_ip, int prefix,
+       bool wait)
 {
        addr_map_entry_t *entry, lookup = {
                .ip = virtual_ip,
@@ -1777,7 +2087,7 @@ METHOD(kernel_net_t, del_ip, status_t,
 
        DBG2(DBG_KNL, "deleting virtual IP %H", virtual_ip);
 
-       this->mutex->lock(this->mutex);
+       this->lock->write_lock(this->lock);
        entry = this->vips->get_match(this->vips, &lookup,
                                                                 (void*)addr_map_entry_match);
        if (!entry)
@@ -1788,30 +2098,33 @@ METHOD(kernel_net_t, del_ip, status_t,
                {
                        DBG2(DBG_KNL, "not deleting existing IP %H on %s", virtual_ip,
                                 entry->iface->ifname);
-                       this->mutex->unlock(this->mutex);
+                       this->lock->unlock(this->lock);
                        return SUCCESS;
                }
                DBG2(DBG_KNL, "virtual IP %H not cached, unable to delete", virtual_ip);
-               this->mutex->unlock(this->mutex);
+               this->lock->unlock(this->lock);
                return FAILED;
        }
        if (entry->addr->refcount == 1)
        {
                status_t status;
+               int ifi;
 
                /* we set this flag so that threads calling add_ip will block and wait
                 * until the entry is gone, also so we can wait below */
                entry->addr->installed = FALSE;
-               status = manage_ipaddr(this, RTM_DELADDR, 0, entry->iface->ifindex,
-                                                          virtual_ip);
-               if (status == SUCCESS)
+               ifi = entry->iface->ifindex;
+               this->lock->unlock(this->lock);
+               status = manage_ipaddr(this, RTM_DELADDR, 0, ifi, virtual_ip, prefix);
+               if (status == SUCCESS && wait)
                {       /* wait until the address is really gone */
+                       this->lock->write_lock(this->lock);
                        while (is_known_vip(this, virtual_ip))
                        {
-                               this->condvar->wait(this->condvar, this->mutex);
+                               this->condvar->wait(this->condvar, this->lock);
                        }
+                       this->lock->unlock(this->lock);
                }
-               this->mutex->unlock(this->mutex);
                return status;
        }
        else
@@ -1820,7 +2133,7 @@ METHOD(kernel_net_t, del_ip, status_t,
        }
        DBG2(DBG_KNL, "virtual IP %H used by other SAs, not deleting",
                 virtual_ip);
-       this->mutex->unlock(this->mutex);
+       this->lock->unlock(this->lock);
        return SUCCESS;
 }
 
@@ -1836,6 +2149,7 @@ static status_t manage_srcroute(private_kernel_netlink_net_t *this,
        netlink_buf_t request;
        struct nlmsghdr *hdr;
        struct rtmsg *msg;
+       struct rtattr *rta;
        int ifindex;
        chunk_t chunk;
 
@@ -1862,12 +2176,12 @@ static status_t manage_srcroute(private_kernel_netlink_net_t *this,
 
        memset(&request, 0, sizeof(request));
 
-       hdr = (struct nlmsghdr*)request;
+       hdr = &request.hdr;
        hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags;
        hdr->nlmsg_type = nlmsg_type;
        hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
 
-       msg = (struct rtmsg*)NLMSG_DATA(hdr);
+       msg = NLMSG_DATA(hdr);
        msg->rtm_family = src_ip->get_family(src_ip);
        msg->rtm_dst_len = prefixlen;
        msg->rtm_table = this->routing_table;
@@ -1888,6 +2202,30 @@ static status_t manage_srcroute(private_kernel_netlink_net_t *this,
        chunk.len = sizeof(ifindex);
        netlink_add_attribute(hdr, RTA_OIF, chunk, sizeof(request));
 
+       if (this->mtu || this->mss)
+       {
+               chunk = chunk_alloca(RTA_LENGTH((sizeof(struct rtattr) +
+                                                                                sizeof(u_int32_t)) * 2));
+               chunk.len = 0;
+               rta = (struct rtattr*)chunk.ptr;
+               if (this->mtu)
+               {
+                       rta->rta_type = RTAX_MTU;
+                       rta->rta_len = RTA_LENGTH(sizeof(u_int32_t));
+                       memcpy(RTA_DATA(rta), &this->mtu, sizeof(u_int32_t));
+                       chunk.len = rta->rta_len;
+               }
+               if (this->mss)
+               {
+                       rta = (struct rtattr*)(chunk.ptr + RTA_ALIGN(chunk.len));
+                       rta->rta_type = RTAX_ADVMSS;
+                       rta->rta_len = RTA_LENGTH(sizeof(u_int32_t));
+                       memcpy(RTA_DATA(rta), &this->mss, sizeof(u_int32_t));
+                       chunk.len = RTA_ALIGN(chunk.len) + rta->rta_len;
+               }
+               netlink_add_attribute(hdr, RTA_METRICS, chunk, sizeof(request));
+       }
+
        return this->socket->send_ack(this->socket, hdr);
 }
 
@@ -1911,10 +2249,13 @@ METHOD(kernel_net_t, add_route, status_t,
                this->routes_lock->unlock(this->routes_lock);
                return ALREADY_DONE;
        }
-       found = route_entry_clone(&route);
-       this->routes->put(this->routes, found, found);
        status = manage_srcroute(this, RTM_NEWROUTE, NLM_F_CREATE | NLM_F_EXCL,
                                                         dst_net, prefixlen, gateway, src_ip, if_name);
+       if (status == SUCCESS)
+       {
+               found = route_entry_clone(&route);
+               this->routes->put(this->routes, found, found);
+       }
        this->routes_lock->unlock(this->routes_lock);
        return status;
 }
@@ -1964,10 +2305,10 @@ static status_t init_address_list(private_kernel_netlink_net_t *this)
 
        memset(&request, 0, sizeof(request));
 
-       in = (struct nlmsghdr*)&request;
+       in = &request.hdr;
        in->nlmsg_len = NLMSG_LENGTH(sizeof(struct rtgenmsg));
        in->nlmsg_flags = NLM_F_REQUEST | NLM_F_MATCH | NLM_F_ROOT;
-       msg = (struct rtgenmsg*)NLMSG_DATA(in);
+       msg = NLMSG_DATA(in);
        msg->rtgen_family = AF_UNSPEC;
 
        /* get all links */
@@ -2018,7 +2359,7 @@ static status_t init_address_list(private_kernel_netlink_net_t *this)
        }
        free(out);
 
-       this->mutex->lock(this->mutex);
+       this->lock->read_lock(this->lock);
        ifaces = this->ifaces->create_enumerator(this->ifaces);
        while (ifaces->enumerate(ifaces, &iface))
        {
@@ -2034,7 +2375,7 @@ static status_t init_address_list(private_kernel_netlink_net_t *this)
                }
        }
        ifaces->destroy(ifaces);
-       this->mutex->unlock(this->mutex);
+       this->lock->unlock(this->lock);
        return SUCCESS;
 }
 
@@ -2048,9 +2389,10 @@ static status_t manage_rule(private_kernel_netlink_net_t *this, int nlmsg_type,
        struct nlmsghdr *hdr;
        struct rtmsg *msg;
        chunk_t chunk;
+       char *fwmark;
 
        memset(&request, 0, sizeof(request));
-       hdr = (struct nlmsghdr*)request;
+       hdr = &request.hdr;
        hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
        hdr->nlmsg_type = nlmsg_type;
        if (nlmsg_type == RTM_NEWRULE)
@@ -2059,7 +2401,7 @@ static status_t manage_rule(private_kernel_netlink_net_t *this, int nlmsg_type,
        }
        hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
 
-       msg = (struct rtmsg*)NLMSG_DATA(hdr);
+       msg = NLMSG_DATA(hdr);
        msg->rtm_table = table;
        msg->rtm_family = family;
        msg->rtm_protocol = RTPROT_BOOT;
@@ -2069,6 +2411,33 @@ static status_t manage_rule(private_kernel_netlink_net_t *this, int nlmsg_type,
        chunk = chunk_from_thing(prio);
        netlink_add_attribute(hdr, RTA_PRIORITY, chunk, sizeof(request));
 
+       fwmark = lib->settings->get_str(lib->settings,
+                                                       "%s.plugins.kernel-netlink.fwmark", NULL, lib->ns);
+       if (fwmark)
+       {
+#ifdef HAVE_LINUX_FIB_RULES_H
+               mark_t mark;
+
+               if (fwmark[0] == '!')
+               {
+                       msg->rtm_flags |= FIB_RULE_INVERT;
+                       fwmark++;
+               }
+               if (mark_from_string(fwmark, &mark))
+               {
+                       chunk = chunk_from_thing(mark.value);
+                       netlink_add_attribute(hdr, FRA_FWMARK, chunk, sizeof(request));
+                       chunk = chunk_from_thing(mark.mask);
+                       netlink_add_attribute(hdr, FRA_FWMASK, chunk, sizeof(request));
+                       if (msg->rtm_flags & FIB_RULE_INVERT)
+                       {
+                               this->routing_mark = mark;
+                       }
+               }
+#else
+               DBG1(DBG_KNL, "setting firewall mark on routing rule is not supported");
+#endif
+       }
        return this->socket->send_ack(this->socket, hdr);
 }
 
@@ -2087,6 +2456,10 @@ static void check_kernel_features(private_kernel_netlink_net_t *this)
                        case 3:
                                if (a == 2)
                                {
+                                       if (b == 6 && c >= 36)
+                                       {
+                                               this->rta_mark = TRUE;
+                                       }
                                        DBG2(DBG_KNL, "detected Linux %d.%d.%d, no support for "
                                                 "RTA_PREFSRC for IPv6 routes", a, b, c);
                                        break;
@@ -2095,6 +2468,7 @@ static void check_kernel_features(private_kernel_netlink_net_t *this)
                        case 2:
                                /* only 3.x+ uses two part version numbers */
                                this->rta_prefsrc_for_ipv6 = TRUE;
+                               this->rta_mark = TRUE;
                                break;
                        default:
                                break;
@@ -2134,6 +2508,7 @@ METHOD(kernel_net_t, destroy, void,
        }
        if (this->socket_events > 0)
        {
+               lib->watcher->remove(lib->watcher, this->socket_events);
                close(this->socket_events);
        }
        enumerator = this->routes->create_enumerator(this->routes);
@@ -2159,7 +2534,7 @@ METHOD(kernel_net_t, destroy, void,
        this->rt_exclude->destroy(this->rt_exclude);
        this->roam_lock->destroy(this->roam_lock);
        this->condvar->destroy(this->condvar);
-       this->mutex->destroy(this->mutex);
+       this->lock->destroy(this->lock);
        free(this);
 }
 
@@ -2187,7 +2562,9 @@ kernel_netlink_net_t *kernel_netlink_net_create()
                                .destroy = _destroy,
                        },
                },
-               .socket = netlink_socket_create(NETLINK_ROUTE),
+               .socket = netlink_socket_create(NETLINK_ROUTE, rt_msg_names,
+                       lib->settings->get_bool(lib->settings,
+                               "%s.plugins.kernel-netlink.parallel_route", FALSE, lib->ns)),
                .rt_exclude = linked_list_create(),
                .routes = hashtable_create((hashtable_hash_t)route_entry_hash,
                                                                   (hashtable_equals_t)route_entry_equals, 16),
@@ -2202,32 +2579,40 @@ kernel_netlink_net_t *kernel_netlink_net_create()
                .routes_lock = mutex_create(MUTEX_TYPE_DEFAULT),
                .net_changes_lock = mutex_create(MUTEX_TYPE_DEFAULT),
                .ifaces = linked_list_create(),
-               .mutex = mutex_create(MUTEX_TYPE_RECURSIVE),
-               .condvar = condvar_create(CONDVAR_TYPE_DEFAULT),
+               .lock = rwlock_create(RWLOCK_TYPE_DEFAULT),
+               .condvar = rwlock_condvar_create(),
                .roam_lock = spinlock_create(),
                .routing_table = lib->settings->get_int(lib->settings,
-                               "%s.routing_table", ROUTING_TABLE, hydra->daemon),
+                                               "%s.routing_table", ROUTING_TABLE, lib->ns),
                .routing_table_prio = lib->settings->get_int(lib->settings,
-                               "%s.routing_table_prio", ROUTING_TABLE_PRIO, hydra->daemon),
+                                               "%s.routing_table_prio", ROUTING_TABLE_PRIO, lib->ns),
                .process_route = lib->settings->get_bool(lib->settings,
-                               "%s.process_route", TRUE, hydra->daemon),
+                                               "%s.process_route", TRUE, lib->ns),
                .install_virtual_ip = lib->settings->get_bool(lib->settings,
-                               "%s.install_virtual_ip", TRUE, hydra->daemon),
+                                               "%s.install_virtual_ip", TRUE, lib->ns),
                .install_virtual_ip_on = lib->settings->get_str(lib->settings,
-                               "%s.install_virtual_ip_on", NULL, hydra->daemon),
+                                               "%s.install_virtual_ip_on", NULL, lib->ns),
+               .prefer_temporary_addrs = lib->settings->get_bool(lib->settings,
+                                               "%s.prefer_temporary_addrs", FALSE, lib->ns),
+               .roam_events = lib->settings->get_bool(lib->settings,
+                                               "%s.plugins.kernel-netlink.roam_events", TRUE, lib->ns),
+               .mtu = lib->settings->get_int(lib->settings,
+                                               "%s.plugins.kernel-netlink.mtu", 0, lib->ns),
+               .mss = lib->settings->get_int(lib->settings,
+                                               "%s.plugins.kernel-netlink.mss", 0, lib->ns),
        );
        timerclear(&this->last_route_reinstall);
        timerclear(&this->next_roam);
 
        check_kernel_features(this);
 
-       if (streq(hydra->daemon, "starter"))
+       if (streq(lib->ns, "starter"))
        {       /* starter has no threads, so we do not register for kernel events */
                register_for_events = FALSE;
        }
 
        exclude = lib->settings->get_str(lib->settings,
-                                       "%s.ignore_routing_tables", NULL, hydra->daemon);
+                                                                        "%s.ignore_routing_tables", NULL, lib->ns);
        if (exclude)
        {
                char *token;
@@ -2271,10 +2656,8 @@ kernel_netlink_net_t *kernel_netlink_net_create()
                        return NULL;
                }
 
-               lib->processor->queue_job(lib->processor,
-                       (job_t*)callback_job_create_with_prio(
-                                       (callback_job_cb_t)receive_events, this, NULL,
-                                       (callback_job_cancel_t)return_false, JOB_PRIO_CRITICAL));
+               lib->watcher->add(lib->watcher, this->socket_events, WATCHER_READ,
+                                                 (watcher_cb_t)receive_events, this);
        }
 
        if (init_address_list(this) != SUCCESS)