[PATCH int-new] IPv6 ECMP support fixes for linux
Hello Fellows, as mentioned in my previous mail, here is a patch for IPv6 ECMP support against the int-new branch, for BIRD 2. The only difference between the master and int-new patches is that the current (int-new) one uses rtable.addr_type to determine the IPv6-style behavior, instead of placing it under an IPV6 #ifdef. Thanks, Mikhail
The API for configuring ECMP for IPv6 on Linux is not symmetrical. Routes can be set via the multipath structures, but the Linux kernel splits this up into separate routes internally. As a result, ECMP routes are returned as separate independent routes when queried. This patch works around this issue by making BIRD collect the individual routes for the same destination into one multipath route. It also implements deletion of multipath routes as a set of delete operations for each route entry. Learn mode is still not supported for now. Signed-off-by: Mikhail Sennikovskii <mikhail.sennikovskii@profitbricks.com> --- nest/route.h | 1 + nest/rt-table.c | 29 ++++++++++ sysdep/linux/netlink.c | 93 +++++++++++++++++++++++++++++++- sysdep/unix/krt.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++-- sysdep/unix/krt.h | 5 ++ 5 files changed, 268 insertions(+), 4 deletions(-) diff --git a/nest/route.h b/nest/route.h index eba3d9b..66d7213 100644 --- a/nest/route.h +++ b/nest/route.h @@ -284,6 +284,7 @@ static inline void rte_update(struct proto *p, net *net, rte *new) { rte_update2 void rte_discard(rtable *tab, rte *old); int rt_examine(rtable *t, net_addr *a, struct proto *p, struct filter *filter); rte *rt_export_merged(struct announce_hook *ah, net *net, rte **rt_free, struct ea_list **tmpa, int silent); +rte *rt_merge_list(struct announce_hook *ah, rte *e); void rt_refresh_begin(rtable *t, struct announce_hook *ah); void rt_refresh_end(rtable *t, struct announce_hook *ah); void rte_dump(rte *); diff --git a/nest/rt-table.c b/nest/rt-table.c index f164ecd..ccc5845 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -698,6 +698,35 @@ mpnh_merge_rta(struct mpnh *nhs, rta *a, int max) } rte * +rt_merge_list(struct announce_hook *ah, rte *e) +{ + struct mpnh *nhs = NULL; + rte *cur = e, *next, *ret; + + if (!e->next) + return e; + + for (; cur; cur = next) + { + next = cur->next; + /* sanity */ + cur->next = NULL; + nhs = mpnh_merge_rta(nhs, cur->attrs, ah->proto->merge_limit); + 
if (cur != e) + rte_free(cur); + } + + ret = rte_cow_rta(e, rte_update_pool); + ret->attrs->dest = RTD_MULTIPATH; + ret->attrs->nexthops = nhs; + + if (e != ret) + rte_free(e); + + return ret; +} + +rte * rt_export_merged(struct announce_hook *ah, net *net, rte **rt_free, ea_list **tmpa, int silent) { // struct proto *p = ah->proto; diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 530cb24..a0eda60 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -1003,6 +1003,48 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int new) return nl_exchange(&r.h); } +static void +krt_del_rte_multipath(struct krt_proto *p, rte *old) +{ + rta *a = old->attrs; + struct mpnh *nh; + rte *e; + int err; + rta ra = { + .src= p->p.main_source, + .source = RTS_INHERIT, + .scope = SCOPE_UNIVERSE, + .cast = RTC_UNICAST + }; + + e = rte_get_temp(&ra); + + for (nh = a->nexthops; nh; nh = nh->next) + { + ra.gw = nh->gw; + ra.iface = nh->iface; + + err = nl_send_route(p, old, NULL, 0); + if (err < 0) + DBG("deleting route failed %d\n", err); + } + + rte_free(e); +} + +static int trk_is_use_collect_mode(struct krt_proto *p) +{ + switch (p->p.table->addr_type) + { + case NET_IP6: + case NET_VPN6: + case NET_ROA6: + return 1; + default: + return 0; + } +} + void krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list *eattrs) { @@ -1016,7 +1058,12 @@ krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list */ if (old) - nl_send_route(p, old, NULL, 0); + { + if (trk_is_use_collect_mode(p) && old->attrs->dest == RTD_MULTIPATH) + krt_del_rte_multipath(p, old); + else + nl_send_route(p, old, NULL, 0); + } if (new) err = nl_send_route(p, new, eattrs, 1); @@ -1277,6 +1324,45 @@ nl_parse_route(struct nlmsghdr *h, int scan) krt_got_route_async(p, e, new); } +static void +krt_scan_notify_begin(struct krt_proto *p) +{ + if (p) + { + if (trk_is_use_collect_mode(p)) + krt_got_route_begin(p); + } + else + { + 
HASH_WALK(nl_table_map, sys.hash_next, cp) + { + if (trk_is_use_collect_mode(cp)) + krt_got_route_begin(cp); + } + HASH_WALK_END; + } +} + +static void +krt_scan_notify_end(struct krt_proto *p) +{ + if (p) + { + DBG("KRT: mp_collect: end: proto is specified (%s)\n", p->p.name); + if (trk_is_use_collect_mode(p)) + krt_got_route_end(p); + } + else + { + HASH_WALK(nl_table_map, sys.hash_next, cp) + { + if (trk_is_use_collect_mode(cp)) + krt_got_route_end(cp); + } + HASH_WALK_END; + } +} + void krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */ { @@ -1290,11 +1376,16 @@ krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NUL log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type); nl_request_dump(AF_INET6, RTM_GETROUTE); + + krt_scan_notify_begin(p); + while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE) nl_parse_route(h, 1); else log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type); + + krt_scan_notify_end(p); } /* diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index a15d00e..ae93ce4 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -658,8 +658,8 @@ krt_same_dest(rte *k, rte *e) * We expect that the route is a temporary rte and its attributes are uncached. 
*/ -void -krt_got_route(struct krt_proto *p, rte *e) +static void +krt_got_route_collected(struct krt_proto *p, rte *e) { net *net = e->net; int verdict; @@ -735,7 +735,8 @@ krt_got_route(struct krt_proto *p, rte *e) /* Get a cached copy of attributes and temporarily link the route */ rta *a = e->attrs; a->source = RTS_DUMMY; - e->attrs = rta_lookup(a); + if (!rta_is_cached(a)) + e->attrs = rta_lookup(a); e->next = net->routes; net->routes = e; } @@ -743,6 +744,143 @@ krt_got_route(struct krt_proto *p, rte *e) rte_free(e); } +static rte * +krt_mp_collect_postprocess(struct krt_proto *p, rte *e) +{ + return rt_merge_list(p->p.main_ahook, e); +} + +static int +krt_mp_is_collectable(struct krt_proto *p, rte *e) +{ + struct rta *a = e->attrs; + + if (a->dest != RTD_ROUTER && a->dest != RTD_DEVICE) + return 0; + + return 1; +} + +static int +krt_mp_is_mergable(struct krt_proto *p, rte *e1, rte *e2) +{ + if (!rte_is_valid(e1) || !rte_is_valid(e2)) + return 0; + + if (e1->pref != e2->pref) + return 0; + + if (e1->attrs->src->proto->proto != e2->attrs->src->proto->proto) + return 0; + + return 1; +} + +static int +krt_mp_collect_add(struct krt_proto *p, rte *mp_collect_rte, rte *e) +{ + struct rte *last; + if (mp_collect_rte->net != e->net) + return -1; + + if (!krt_mp_is_collectable(p, e)) + return -1; + + if (!krt_mp_is_mergable(p, mp_collect_rte, e)) + return -1; + + rta *a = e->attrs; + if (!rta_is_cached(a)) + e->attrs = rta_lookup(a); + + last = mp_collect_rte; + + for ( ; last->next; last = last->next); + + last->next = e; + e->next = NULL; + + return 0; +} + +void +krt_mp_collect(struct krt_proto *p, rte *e) +{ + if (p->mp_collect_rte) + { + if (!krt_mp_collect_add(p, p->mp_collect_rte, e)) + { + krt_trace_in(p, e, "collecting[add]"); + return; + } + + rte *cur = NULL; + + cur = krt_mp_collect_postprocess(p, p->mp_collect_rte); + p->mp_collect_rte = NULL; + krt_trace_in(p, cur, "collected"); + krt_got_route_collected(p, cur); + } + + ASSERT(!p->mp_collect_rte); + 
if (krt_mp_is_collectable(p, e)) + { + e->attrs = rta_lookup(e->attrs); + e->next = NULL; + p->mp_collect_rte = e; + krt_trace_in(p, e, "collecting"); + return; + } + + krt_got_route_collected(p, e); +} + +void krt_got_route_begin(struct krt_proto *p) +{ + DBG("KRT: mp_collect: begin for proto (%s)\n", p->p.name); + ASSERT(!p->mp_collect_mode); + p->mp_collect_mode = 1; +} + +void krt_got_route_end(struct krt_proto *p) +{ + DBG("KRT: mp_collect: end for proto (%s)\n", p->p.name); + + ASSERT(p->mp_collect_mode); + + p->mp_collect_mode = 0; + + rte *mp_collect_rte = p->mp_collect_rte; + + if (!mp_collect_rte) + { + DBG("KRT: mp_collect: no collected entry on end\n"); + return; + } + + p->mp_collect_rte = NULL; + + mp_collect_rte = krt_mp_collect_postprocess(p, mp_collect_rte); + + krt_trace_in(p, mp_collect_rte, "collected[end]"); + + krt_got_route_collected(p, mp_collect_rte); + + DBG("KRT: mp_collect: route collected on end\n"); +} + +void +krt_got_route(struct krt_proto *p, rte *e) +{ + if (p->mp_collect_mode) + { + krt_mp_collect(p, e); + return; + } + + krt_got_route_collected(p, e); +} + static void krt_prune(struct krt_proto *p) { diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h index f05dc37..3a1781f 100644 --- a/sysdep/unix/krt.h +++ b/sysdep/unix/krt.h @@ -69,6 +69,9 @@ struct krt_proto { byte ready; /* Initial feed has been finished */ byte initialized; /* First scan has been finished */ byte reload; /* Next scan is doing reload */ + byte mp_collect_mode; /* Collecting multipath entries from single-path */ + + rte *mp_collect_rte; }; extern pool *krt_pool; @@ -82,6 +85,8 @@ extern pool *krt_pool; struct proto_config * kif_init_config(int class); void kif_request_scan(void); +void krt_got_route_begin(struct krt_proto *p); +void krt_got_route_end(struct krt_proto *p); void krt_got_route(struct krt_proto *p, struct rte *e); void krt_got_route_async(struct krt_proto *p, struct rte *e, int new); -- 2.5.0
participants (1)
-
Mikhail Sennikovskii