[PATCH master] IPv6 ECMP support fixes for linux
Hello Fellows, as a follow-up for the recent discussion about BIRD support of IPv6 ECMP on Linux ( http://trubka.network.cz/pipermail/bird-users/2016-January/010163.html ), I've created a patch, which addresses this problem. The current patch is against the master branch, i.e. for BIRD 1.x. I will also post a similar patch against the int-new branch, for BIRD 2. Note that the patch still does not support the learn mode for IPv6 ECMP. The support for it can be added later. Thanks, Mikhail
The API for configuring ECMP for IPv6 on Linux is not symmetrical. Routes can be set via the multipath structures, but Linux kernel splits this up into separate routes internally. As a result, ECMP routes are retorned as separate independent routes when queried. This patch works around this issue by making bird collect individual routes for the same destination in one multipath route. It also implements deletion of multipath routes as a set of delete operations for each route entry. Learn mode is still not supported for now. Signed-off-by: Mikhail Sennikovskii <mikhail.sennikovskii@profitbricks.com> --- nest/route.h | 1 + nest/rt-table.c | 29 ++++++++++ sysdep/linux/netlink.c | 79 ++++++++++++++++++++++++++- sysdep/unix/krt.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++-- sysdep/unix/krt.h | 5 ++ 5 files changed, 254 insertions(+), 4 deletions(-) diff --git a/nest/route.h b/nest/route.h index c435b9e..c24d7ac 100644 --- a/nest/route.h +++ b/nest/route.h @@ -271,6 +271,7 @@ static inline void rte_update(struct proto *p, net *net, rte *new) { rte_update2 void rte_discard(rtable *tab, rte *old); int rt_examine(rtable *t, ip_addr prefix, int pxlen, struct proto *p, struct filter *filter); rte *rt_export_merged(struct announce_hook *ah, net *net, rte **rt_free, struct ea_list **tmpa, int silent); +rte *rt_merge_list(struct announce_hook *ah, rte *e); void rt_refresh_begin(rtable *t, struct announce_hook *ah); void rt_refresh_end(rtable *t, struct announce_hook *ah); void rte_dump(rte *); diff --git a/nest/rt-table.c b/nest/rt-table.c index 57c8b8e..a3437da 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -597,6 +597,35 @@ mpnh_merge_rta(struct mpnh *nhs, rta *a, int max) } rte * +rt_merge_list(struct announce_hook *ah, rte *e) +{ + struct mpnh *nhs = NULL; + rte *cur = e, *next, *ret; + + if (!e->next) + return e; + + for (; cur; cur = next) + { + next = cur->next; + /* sanity */ + cur->next = NULL; + nhs = mpnh_merge_rta(nhs, cur->attrs, ah->proto->merge_limit); + if (cur != e) + rte_free(cur); + } + + ret = rte_cow_rta(e, rte_update_pool); + ret->attrs->dest = RTD_MULTIPATH; + ret->attrs->nexthops = nhs; + + if (e != ret) + rte_free(e); + + return ret; +} + +rte * rt_export_merged(struct announce_hook *ah, net *net, rte **rt_free, ea_list **tmpa, int silent) { // struct proto *p = ah->proto; diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 640d187..31e833c 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -896,6 +896,37 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int new) return nl_exchange(&r.h); } +#ifdef IPV6 +static void +krt_del_rte_multipath(struct krt_proto *p, rte *old) +{ + rta *a = old->attrs; + struct mpnh *nh; + rte *e; + int err; + rta ra = { + .src= p->p.main_source, + .source = RTS_INHERIT, + .scope = SCOPE_UNIVERSE, + .cast = RTC_UNICAST + }; + + e = rte_get_temp(&ra); + + for (nh = a->nexthops; nh; nh = nh->next) + { + ra.gw = nh->gw; + ra.iface = nh->iface; + + err = nl_send_route(p, old, NULL, 0); + if (err < 0) + DBG("deleting route failed %d\n", err); + } + + rte_free(e); +} +#endif + void krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list *eattrs) { @@ -909,7 +940,14 @@ krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list */ if (old) - nl_send_route(p, old, NULL, 0); + { +#ifdef IPV6 + if (old->attrs->dest == RTD_MULTIPATH) + krt_del_rte_multipath(p, old); + else +#endif + nl_send_route(p, old, NULL, 0); + } if (new) err = nl_send_route(p, new, eattrs, 1); @@ -1178,17 +1216,56 @@ nl_parse_route(struct nlmsghdr *h, int scan) krt_got_route_async(p, e, new); } +static void +krt_scan_notify_begin(struct krt_proto *p) +{ +#ifdef IPV6 + if (p) + krt_got_route_begin(p); + else + { + HASH_WALK(nl_table_map, sys.hash_next, cp) + { + krt_got_route_begin(cp); + } + HASH_WALK_END; + } +#endif +} + +static void +krt_scan_notify_end(struct krt_proto *p) +{ +#ifdef IPV6 + if (p) + krt_got_route_end(p); + else + { + HASH_WALK(nl_table_map, sys.hash_next, cp) + { + krt_got_route_end(cp); + } + HASH_WALK_END; + } +#endif +} + void krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */ { struct nlmsghdr *h; nl_request_dump(BIRD_AF, RTM_GETROUTE); + + krt_scan_notify_begin(p); + while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE) nl_parse_route(h, 1); else log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type); + + krt_scan_notify_end(p); } /* diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index 5e78586..1a6ec90 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -660,8 +660,8 @@ krt_same_dest(rte *k, rte *e) * We expect that the route is a temporary rte and its attributes are uncached. */ -void -krt_got_route(struct krt_proto *p, rte *e) +static void +krt_got_route_collected(struct krt_proto *p, rte *e) { net *net = e->net; int verdict; @@ -737,7 +737,8 @@ krt_got_route(struct krt_proto *p, rte *e) /* Get a cached copy of attributes and temporarily link the route */ rta *a = e->attrs; a->source = RTS_DUMMY; - e->attrs = rta_lookup(a); + if (!rta_is_cached(a)) + e->attrs = rta_lookup(a); e->next = net->routes; net->routes = e; } @@ -745,6 +746,143 @@ krt_got_route(struct krt_proto *p, rte *e) rte_free(e); } +static rte * +krt_mp_collect_postprocess(struct krt_proto *p, rte *e) +{ + return rt_merge_list(p->p.main_ahook, e); +} + +static int +krt_mp_is_collectable(struct krt_proto *p, rte *e) +{ + struct rta *a = e->attrs; + + if (a->dest != RTD_ROUTER && a->dest != RTD_DEVICE) + return 0; + + return 1; +} + +static int +krt_mp_is_mergable(struct krt_proto *p, rte *e1, rte *e2) +{ + if (!rte_is_valid(e1) || !rte_is_valid(e2)) + return 0; + + if (e1->pref != e2->pref) + return 0; + + if (e1->attrs->src->proto->proto != e2->attrs->src->proto->proto) + return 0; + + return 1; +} + +static int +krt_mp_collect_add(struct krt_proto *p, rte *mp_collect_rte, rte *e) +{ + struct rte *last; + if (mp_collect_rte->net != e->net) + return -1; + + if (!krt_mp_is_collectable(p, e)) + return -1; + + if (!krt_mp_is_mergable(p, mp_collect_rte, e)) + return -1; + + rta *a = e->attrs; + if (!rta_is_cached(a)) + e->attrs = rta_lookup(a); + + last = mp_collect_rte; + + for ( ; last->next; last = last->next); + + last->next = e; + e->next = NULL; + + return 0; +} + +void +krt_mp_collect(struct krt_proto *p, rte *e) +{ + if (p->mp_collect_rte) + { + if (!krt_mp_collect_add(p, p->mp_collect_rte, e)) + { + krt_trace_in(p, e, "collecting[add]"); + return; + } + + rte *cur = NULL; + + cur = krt_mp_collect_postprocess(p, p->mp_collect_rte); + p->mp_collect_rte = NULL; + krt_trace_in(p, cur, "collected"); + krt_got_route_collected(p, cur); + } + + ASSERT(!p->mp_collect_rte); + if (krt_mp_is_collectable(p, e)) + { + e->attrs = rta_lookup(e->attrs); + e->next = NULL; + p->mp_collect_rte = e; + krt_trace_in(p, e, "collecting"); + return; + } + + krt_got_route_collected(p, e); +} + +void krt_got_route_begin(struct krt_proto *p) +{ + DBG("KRT: mp_collect: begin for proto (%s)\n", p->p.name); + ASSERT(!p->mp_collect_mode); + p->mp_collect_mode = 1; +} + +void krt_got_route_end(struct krt_proto *p) +{ + DBG("KRT: mp_collect: end for proto (%s)\n", p->p.name); + + ASSERT(p->mp_collect_mode); + + p->mp_collect_mode = 0; + + rte *mp_collect_rte = p->mp_collect_rte; + + if (!mp_collect_rte) + { + DBG("KRT: mp_collect: no collected entry on end\n"); + return; + } + + p->mp_collect_rte = NULL; + + mp_collect_rte = krt_mp_collect_postprocess(p, mp_collect_rte); + + krt_trace_in(p, mp_collect_rte, "collected[end]"); + + krt_got_route_collected(p, mp_collect_rte); + + DBG("KRT: mp_collect: route collected on end\n"); +} + +void +krt_got_route(struct krt_proto *p, rte *e) +{ + if (p->mp_collect_mode) + { + krt_mp_collect(p, e); + return; + } + + krt_got_route_collected(p, e); +} + static void krt_prune(struct krt_proto *p) { diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h index d4a8717..7d46888 100644 --- a/sysdep/unix/krt.h +++ b/sysdep/unix/krt.h @@ -68,6 +68,9 @@ struct krt_proto { byte ready; /* Initial feed has been finished */ byte initialized; /* First scan has been finished */ byte reload; /* Next scan is doing reload */ + byte mp_collect_mode; /* Collecting multipath entries from single-path */ + + rte *mp_collect_rte; }; extern pool *krt_pool; @@ -81,6 +84,8 @@ extern pool *krt_pool; struct proto_config * kif_init_config(int class); void kif_request_scan(void); +void krt_got_route_begin(struct krt_proto *p); +void krt_got_route_end(struct krt_proto *p); void krt_got_route(struct krt_proto *p, struct rte *e); void krt_got_route_async(struct krt_proto *p, struct rte *e, int new); -- 2.5.0
On Wed, Feb 10, 2016 at 04:38:03PM +0100, Mikhail Sennikovskii wrote:
Hello Fellows,
as a follow-up for the recent discussion about BIRD support of IPv6 ECMP on Linux ( http://trubka.network.cz/pipermail/bird-users/2016-January/010163.html ), I've created a patch, which addresses this problem.
Hi Thanks for the patch, i have several comments: 1) In the patch, the incoming direction (collection from kernel) is handled in unix/krt.c, while outgoing direction (krt_replace_rte()) is handled in linux/netlink.c . It would make more sense to handle both directions in the same place. As BIRD uses internally representation with one route and mpnh structure, i would prefer to move collection also to linux/netlink.c, so generic code in unix/krt.c would not care about OS-specific interface (and does not need any changes). Linux code would just collect next hops and then call krt_got_route() for the whole multipath route. 2) Function krt_replace_rte() should be smarter than just flushing the whole old multipath rte. It should compare old and new next hops (we could suppose they are sorted) and either add missing or remove surplussing next hops by calling nl_send_route(). 3) When collecting received routes, you should avoid creating list of routes for the same net, and then merging such list by rt_merge_list(), you could simply collect next hops (struct mpnh *) for the current route. 4) Also rt_merge_list() uses mpnh_merge_rta() which uses rte_update_pool. That is probably not safe outside of nest update code. The linux netlink code should have its own linpool (static and shared, because netlink socket is also shared), so you could allocate mpnh structures from it. The linpool should be flushed when the route is collected and propagated to krt_got_route(). So you don't need to do rta_lookup() for temporary next hops. Note that such linpool could be also used for IPv4 multipath next hops in nl_parse_multipath() instead of nh_buffer.
Note that the patch still does not support the learn mode for IPv6 ECMP. The support for it can be added later.
Well, the learn mode is not really a problem. With the modifications mentioned above, learning would work automatically (it is handled by unix/krt.c, regardless of how route is generated). What is problematic is handling asynchronous notifications (because you don't know inside netlink code what are other next hops and whether you will get another one related to the same net). -- Elen sila lumenn' omentielvo Ondrej 'Santiago' Zajicek (email: santiago@crfreenet.org) OpenPGP encrypted e-mails preferred (KeyID 0x11DEADC3, wwwkeys.pgp.net) "To err is human -- to blame it on a computer is even more so."
Hi, 2016-02-16 18:48 GMT+01:00 Ondrej Zajicek <santiago@crfreenet.org>:
On Wed, Feb 10, 2016 at 04:38:03PM +0100, Mikhail Sennikovskii wrote:
Hello Fellows,
as a follow-up for the recent discussion about BIRD support of IPv6 ECMP on Linux ( http://trubka.network.cz/pipermail/bird-users/2016-January/010163.html ), I've created a patch, which addresses this problem.
Hi
Thanks for the patch, i have several comments:
1) In the patch, the incoming direction (collection from kernel) is handled in unix/krt.c, while outgoing direction (krt_replace_rte()) is handled in linux/netlink.c . It would make more sense to handle both directions in the same place. As BIRD uses internally representation with one route and mpnh structure, i would prefer to move collection also to linux/netlink.c, so generic code in unix/krt.c would not care about OS-specific interface (and does not need any changes). Linux code would just collect next hops and then call krt_got_route() for the whole multipath route.
Agreed, will fix. 2) Function krt_replace_rte() should be smarter than just flushing the
whole old multipath rte. It should compare old and new next hops (we could suppose they are sorted) and either add missing
I'm not sure for now whether it's possible to make this work, as netlink returns EEXIST whenever one tries to add a route for the network, already present in the routing table. This is why removing and re-installing seemed the best solution for me. or remove
surplussing next hops by calling nl_send_route().
Just removing should work, yes.
3) When collecting received routes, you should avoid creating list of routes for the same net, and then merging such list by rt_merge_list(), you could simply collect next hops (struct mpnh *) for the current route.
I can do it this way.
4) Also rt_merge_list() uses mpnh_merge_rta() which uses rte_update_pool. That is probably not safe outside of nest update code. The linux netlink code should have its own linpool (static and shared, because netlink socket is also shared), so you could allocate mpnh structures from it. The linpool should be flushed when the route is collected and propagated to krt_got_route(). So you don't need to do rta_lookup() for temporary next hops.
Right, will do.
Note that such linpool could be also used for IPv4 multipath next hops in nl_parse_multipath() instead of nh_buffer.
I can add this fix as well, probably as a separate patch.
Note that the patch still does not support the learn mode for IPv6 ECMP.
The support for it can be added later.
Well, the learn mode is not really a problem. With the modifications mentioned above, learning would work automatically (it is handled by unix/krt.c, regardless of how route is generated).
What is problematic is handling asynchronous notifications (because you don't know inside netlink code what are other next hops and whether you will get another one related to the same net).
Sorry, I guess I just mixed up with bird terminology. I thought the learn mode is directly related to handling these asynchronous notifications and adjusting the bird routes set accordingly. Regards, Mikhail
-- Elen sila lumenn' omentielvo
Ondrej 'Santiago' Zajicek (email: santiago@crfreenet.org) OpenPGP encrypted e-mails preferred (KeyID 0x11DEADC3, wwwkeys.pgp.net) "To err is human -- to blame it on a computer is even more so."
On Wed, Feb 17, 2016 at 02:59:12PM +0100, Mikhail Sennikovskii wrote:
Hi,
Hi Thanks for a quick response.
2) Function krt_replace_rte() should be smarter than just flushing the
whole old multipath rte. It should compare old and new next hops (we could suppose they are sorted) and either add missing
I'm not sure for now whether it's possible to make this work, as netlink returns EEXIST whenever one tries to add a route for the network, already present in the routing table. This is why removing and re-installing seemed the best solution for me.
There is a flag (NLM_F_EXCL) that control such behavior. So, BIRD should use this flag in single path case or if there is no other, but should not when adding additional next hops to multipath rte.
4) Also rt_merge_list() uses mpnh_merge_rta() which uses rte_update_pool. That is probably not safe outside of nest update code. The linux netlink code should have its own linpool (static and shared, because netlink socket is also shared), so you could allocate mpnh structures from it. The linpool should be flushed when the route is collected and propagated to krt_got_route(). So you don't need to do rta_lookup() for temporary next hops.
Right, will do.
I noticed later that nl_parse_route() allocates some other route attributes by alloca(), so naive collecting without rta_lookup() will not work. Probably just replace these alloca() with lp_alloc().
What is problematic is handling asynchronous notifications (because you don't know inside netlink code what are other next hops and whether you will get another one related to the same net).
Sorry, I guess I just mixed up with bird terminology. I thought the learn mode is directly related to handling these asynchronous notifications and adjusting the bird routes set accordingly.
These are two separate things. Learn mode is whether BIRD learn alien routes from kernel table back to BIRD or just ignore them. BIRD uses both periodic scans and asynchronous notifications to monitor kernel routing table for both bird route changes and alien route changes. -- Elen sila lumenn' omentielvo Ondrej 'Santiago' Zajicek (email: santiago@crfreenet.org) OpenPGP encrypted e-mails preferred (KeyID 0x11DEADC3, wwwkeys.pgp.net) "To err is human -- to blame it on a computer is even more so."
participants (2)
-
Mikhail Sennikovskii -
Ondrej Zajicek