From feda3ba2effd74936f7ba1b03da1e30a9dee3a16 Mon Sep 17 00:00:00 2001
From: Mike Evans <mike.evans@metaswitch.com>
Date: Thu, 4 Jun 2015 07:50:08 +0000
Subject: [PATCH 2/3] Add kernel tunnel dynamic attribute

For IP-in-IP routes, use the original next hop received in the BGP UPDATE.

This is used instead of the recursively resolved next hop, which will
probably be somewhere halfway along the tunnel, from where packets
cannot be routed any further.
---
 nest/route.h           |  1 +
 proto/bgp/packets.c    |  1 +
 sysdep/linux/krt-sys.h |  1 +
 sysdep/linux/netlink.Y |  4 +++-
 sysdep/linux/netlink.c | 23 +++++++++++++++++++++--
 5 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/nest/route.h b/nest/route.h
index 2bfd066..3eb86c8 100644
--- a/nest/route.h
+++ b/nest/route.h
@@ -359,6 +359,7 @@ typedef struct rta {
   byte aflags;				/* Attribute cache flags (RTAF_...) */
   u16 hash_key;				/* Hash over important fields */
   u32 igp_metric;			/* IGP metric to next hop (for iBGP routes) */
+  ip_addr orig_gw;			/* Original next hop from BGP UPDATE */
   ip_addr gw;				/* Next hop */
   ip_addr from;				/* Advertising router */
   struct hostentry *hostentry;		/* Hostentry for recursive next-hops */
diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c
index 0cf38ed..fba9d60 100644
--- a/proto/bgp/packets.c
+++ b/proto/bgp/packets.c
@@ -1180,6 +1180,7 @@ bgp_set_next_hop(struct bgp_proto *p, rta *a)
       if (ipa_zero(*nexthop))
 	  return 0;
 
+      a->orig_gw = *nexthop;
       rta_set_recursive_next_hop(p->p.table, a, p->igp_table, nexthop, nexthop + second);
     }
 
diff --git a/sysdep/linux/krt-sys.h b/sysdep/linux/krt-sys.h
index 6d6586d..cd4d2ec 100644
--- a/sysdep/linux/krt-sys.h
+++ b/sysdep/linux/krt-sys.h
@@ -37,6 +37,7 @@ static inline struct ifa * kif_get_primary_ip(struct iface *i) { return NULL; }
 #define EA_KRT_PREFSRC		EA_CODE(EAP_KRT, 0x10)
 #define EA_KRT_REALM		EA_CODE(EAP_KRT, 0x11)
 #define EA_KRT_SCOPE		EA_CODE(EAP_KRT, 0x12)
+#define EA_KRT_TUNNEL		EA_CODE(EAP_KRT, 0x13)
 
 
 #define KRT_METRICS_MAX		0x10	/* RTAX_QUICKACK+1 */
diff --git a/sysdep/linux/netlink.Y b/sysdep/linux/netlink.Y
index f577244..4266333 100644
--- a/sysdep/linux/netlink.Y
+++ b/sysdep/linux/netlink.Y
@@ -15,7 +15,8 @@ CF_KEYWORDS(KERNEL, TABLE, METRIC, KRT_PREFSRC, KRT_REALM, KRT_SCOPE, KRT_MTU, K
 	    KRT_HOPLIMIT, KRT_INITCWND, KRT_RTO_MIN, KRT_INITRWND, KRT_QUICKACK,
 	    KRT_LOCK_MTU, KRT_LOCK_WINDOW, KRT_LOCK_RTT, KRT_LOCK_RTTVAR,
 	    KRT_LOCK_SSTRESH, KRT_LOCK_CWND, KRT_LOCK_ADVMSS, KRT_LOCK_REORDERING,
-	    KRT_LOCK_HOPLIMIT, KRT_LOCK_RTO_MIN, KRT_FEATURE_ECN, KRT_FEATURE_ALLFRAG)
+	    KRT_LOCK_HOPLIMIT, KRT_LOCK_RTO_MIN, KRT_FEATURE_ECN, KRT_FEATURE_ALLFRAG,
+	    KRT_TUNNEL)
 
 CF_GRAMMAR
 
@@ -29,6 +30,7 @@ kern_sys_item:
 CF_ADDTO(dynamic_attr, KRT_PREFSRC	{ $$ = f_new_dynamic_attr(EAF_TYPE_IP_ADDRESS, T_IP, EA_KRT_PREFSRC); })
 CF_ADDTO(dynamic_attr, KRT_REALM	{ $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_REALM); })
 CF_ADDTO(dynamic_attr, KRT_SCOPE	{ $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_SCOPE); })
+CF_ADDTO(dynamic_attr, KRT_TUNNEL	{ $$ = f_new_dynamic_attr(EAF_TYPE_STRING, T_STRING, EA_KRT_TUNNEL); })
 
 CF_ADDTO(dynamic_attr, KRT_MTU		{ $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_MTU); })
 CF_ADDTO(dynamic_attr, KRT_WINDOW	{ $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_WINDOW); })
diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c
index 1490213..d754df7 100644
--- a/sysdep/linux/netlink.c
+++ b/sysdep/linux/netlink.c
@@ -887,6 +887,7 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int op, int d
     struct rtmsg r;
     char buf[128 + KRT_METRICS_MAX*8 + nh_bufsize(a->nexthops)];
   } r;
+  struct iface* i;
 
   DBG("nl_send_route(%I/%d,op=%x)\n", net->n.prefix, net->n.pxlen, op);
 
@@ -962,8 +963,22 @@ dest:
     {
     case RTD_ROUTER:
       r.r.rtm_type = RTN_UNICAST;
-      nl_add_attr_u32(&r.h, sizeof(r), RTA_OIF, iface->index);
-      nl_add_attr_ipa(&r.h, sizeof(r), RTA_GATEWAY, gw);
+      if ((ea = ea_find(eattrs, EA_KRT_TUNNEL)) &&
+          (i = if_find_by_name(ea->u.ptr->data)))
+      {
+        /*
+         * Tunnel attribute is set, so set the route up using the specified tunnel device
+         * to the originator of the route.
+         */
+        nl_add_attr_u32(&r.h, sizeof(r), RTA_OIF, i->index);
+        nl_add_attr_ipa(&r.h, sizeof(r), RTA_GATEWAY, a->orig_gw);
+        r.r.rtm_flags |= RTNH_F_ONLINK;
+      }
+      else
+      {
+         nl_add_attr_u32(&r.h, sizeof(r), RTA_OIF, iface->index);
+         nl_add_attr_ipa(&r.h, sizeof(r), RTA_GATEWAY, gw);
+      }
       break;
     case RTD_DEVICE:
       r.r.rtm_type = RTN_UNICAST;
@@ -1652,6 +1667,10 @@ krt_sys_get_attr(eattr *a, byte *buf, int buflen UNUSED)
     bsprintf(buf, "scope");
     return GA_NAME;
 
+  case EA_KRT_TUNNEL:
+    bsprintf(buf, "tunnel");
+    return GA_NAME;
+
   case EA_KRT_LOCK:
     buf += bsprintf(buf, "lock:");
     ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX);
-- 
2.7.4

