[PATCH 1/2] Babel: Add option to support ecmp routes
--- The FIB walks in babel_reconfigure_iface are a bit ugly, I tried to loop over ifa->neigh_list instead but couldn't get it to work. I'm open to suggestions :) doc/bird.sgml | 8 +++++ proto/babel/babel.c | 70 +++++++++++++++++++++++++++++++++++++------- proto/babel/babel.h | 3 ++ proto/babel/config.Y | 3 ++ 4 files changed, 73 insertions(+), 11 deletions(-) diff --git a/doc/bird.sgml b/doc/bird.sgml index 1580facd..5e85d8ec 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -1879,6 +1879,7 @@ protocol babel [<name>] { check link <switch>; next hop ipv4 <address>; next hop ipv6 <address>; + ecmp <switch> [limit <num>]; authentication none|mac [permissive]; password "<text>"; password "<text>" { @@ -1983,6 +1984,13 @@ protocol babel [<name>] { source for Babel packets will be used. In normal operation, it should not be necessary to set this option. + <tag><label id="babel-ecmp">ecmp <m>switch</m> [limit <m>number</m>]</tag> + + Determines whether babel will emit ECMP (equal-cost multipath) + routes, allowing to load-balancing traffic across multiple paths. If + enabled the maximum number of next-hops to allow can be specified, + defaulting to 16. + <tag><label id="babel-authentication">authentication none|mac [permissive]</tag> Selects authentication method to be used. <cf/none/ means that packets are not authenticated at all, <cf/mac/ means MAC authentication is diff --git a/proto/babel/babel.c b/proto/babel/babel.c index 4a7d550f..4d9c657c 100644 --- a/proto/babel/babel.c +++ b/proto/babel/babel.c @@ -623,6 +623,34 @@ done: } } +/** + * babel_nexthop_insert - add next_hop of route to nexthop list + * @p: Babel protocol instance + * @r: Babel route + * @nhs: nexthop list head to append onto + * @nh: freshly allocated buffer to fill + */ +static void +babel_nexthop_insert( + struct babel_proto *p, + struct babel_route *r, + struct nexthop **nhs, + struct nexthop *nh) +{ + nh->gw = r->next_hop; + nh->iface = r->neigh->ifa->iface; + + /* + * If we cannot find a reachable neighbour, set the entry to be onlink. This + * makes it possible to, e.g., assign /32 addresses on a mesh interface and + * have routing work. + */ + if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0)) + nh->flags = RNF_ONLINK; + + nexthop_insert(nhs, nh); +} + /** * babel_announce_rte - announce selected route to the core * @p: Babel protocol instance @@ -635,6 +663,7 @@ done: static void babel_announce_rte(struct babel_proto *p, struct babel_entry *e) { + struct babel_config *cf = (void *) p->p.cf; struct babel_route *r = e->selected; struct channel *c = (e->n.addr->type == NET_IP4) ? p->ip4_channel : p->ip6_channel; @@ -645,18 +674,24 @@ babel_announce_rte(struct babel_proto *p, struct babel_entry *e) .source = RTS_BABEL, .scope = SCOPE_UNIVERSE, .dest = RTD_UNICAST, - .from = r->neigh->addr, - .nh.gw = r->next_hop, - .nh.iface = r->neigh->ifa->iface, }; - /* - * If we cannot find a reachable neighbour, set the entry to be onlink. This - * makes it possible to, e.g., assign /32 addresses on a mesh interface and - * have routing work. - */ - if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0)) - a0.nh.flags = RNF_ONLINK; + struct nexthop *nhs = NULL; + babel_nexthop_insert(p, r, &nhs, allocz(sizeof(struct nexthop))); + int num_nexthops = 1; + + struct babel_route *cr; + WALK_LIST(cr, e->routes) { + if (num_nexthops++ >= cf->ecmp) + break; + + if (cr == r || !cr->feasible || cr->metric != r->metric) + continue; + + babel_nexthop_insert(p, cr, &nhs, allocz(sizeof(struct nexthop))); + } + + a0.nh = *nhs; rta *a = rta_lookup(&a0); rte *rte = rte_get_temp(a); @@ -736,6 +771,7 @@ babel_announce_retraction(struct babel_proto *p, struct babel_entry *e) static void babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_route *mod) { + struct babel_config *cf = (void *) p->p.cf; struct babel_route *r, *best = e->selected; /* Shortcut if only non-best was modified */ @@ -744,8 +780,10 @@ babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_ro /* Either select modified route, or keep old best route */ if ((mod->metric < (best ? best->metric : BABEL_INFINITY)) && mod->feasible) best = mod; - else + else if (cf->ecmp==1) return; + /* With ecmp one of the non-selected but equal metric routes might have + * changed so contine on with the announcement in that case. */ } else { @@ -1879,6 +1917,16 @@ babel_reconfigure_iface(struct babel_proto *p, struct babel_iface *ifa, struct b if (ifa->up) babel_iface_kick_timer(ifa); + /* Update all routes to refresh ecmp settings. */ + FIB_WALK(&p->ip6_rtable, struct babel_entry, e) { + babel_select_route(p, e, NULL); + } + FIB_WALK_END; + FIB_WALK(&p->ip4_rtable, struct babel_entry, e) { + babel_select_route(p, e, NULL); + } + FIB_WALK_END; + return 1; } diff --git a/proto/babel/babel.h b/proto/babel/babel.h index 84feb085..3868dcb2 100644 --- a/proto/babel/babel.h +++ b/proto/babel/babel.h @@ -62,6 +62,8 @@ #define BABEL_OVERHEAD (IP6_HEADER_LENGTH+UDP_HEADER_LENGTH) #define BABEL_MIN_MTU (512 + BABEL_OVERHEAD) +#define BABEL_DEFAULT_ECMP_LIMIT 16 + #define BABEL_AUTH_NONE 0 #define BABEL_AUTH_MAC 1 @@ -120,6 +122,7 @@ struct babel_config { list iface_list; /* List of iface configs (struct babel_iface_config) */ uint hold_time; /* Time to hold stale entries and unreachable routes */ u8 randomize_router_id; + u8 ecmp; /* Maximum number of nexthops to install */ struct channel_config *ip4_channel; struct channel_config *ip6_channel; diff --git a/proto/babel/config.Y b/proto/babel/config.Y index 05210fa4..8f18c790 100644 --- a/proto/babel/config.Y +++ b/proto/babel/config.Y @@ -36,6 +36,7 @@ babel_proto_start: proto_start BABEL this_proto = proto_config_new(&proto_babel, $1); init_list(&BABEL_CFG->iface_list); BABEL_CFG->hold_time = 1 S_; + BABEL_CFG->ecmp = 1; }; babel_proto_item: @@ -43,6 +44,8 @@ babel_proto_item: | proto_channel | INTERFACE babel_iface | RANDOMIZE ROUTER ID bool { BABEL_CFG->randomize_router_id = $4; } + | ECMP bool { BABEL_CFG->ecmp = $2 ? BABEL_DEFAULT_ECMP_LIMIT : 1; } + | ECMP bool LIMIT expr { BABEL_CFG->ecmp = $2 ? $4 : 1; } ; babel_proto_opts: -- 2.30.2
(+ Juliusz) Daniel Gröber <dxld@darkboxed.org> writes:
--- The FIB walks in babel_reconfigure_iface are a bit ugly, I tried to loop over ifa->neigh_list instead but couldn't get it to work. I'm open to suggestions :)
doc/bird.sgml | 8 +++++ proto/babel/babel.c | 70 +++++++++++++++++++++++++++++++++++++------- proto/babel/babel.h | 3 ++ proto/babel/config.Y | 3 ++ 4 files changed, 73 insertions(+), 11 deletions(-)
diff --git a/doc/bird.sgml b/doc/bird.sgml index 1580facd..5e85d8ec 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -1879,6 +1879,7 @@ protocol babel [<name>] { check link <switch>; next hop ipv4 <address>; next hop ipv6 <address>; + ecmp <switch> [limit <num>]; authentication none|mac [permissive]; password "<text>"; password "<text>" { @@ -1983,6 +1984,13 @@ protocol babel [<name>] { source for Babel packets will be used. In normal operation, it should not be necessary to set this option.
+ <tag><label id="babel-ecmp">ecmp <m>switch</m> [limit <m>number</m>]</tag> + + Determines whether babel will emit ECMP (equal-cost multipath) + routes, allowing to load-balancing traffic across multiple paths. If + enabled the maximum number of next-hops to allow can be specified, + defaulting to 16.
This should probably explain the constraints: i.e., that only equal-weight routes are added (which in practice limits it to interfaces of type 'wired', or wireless interfaces with no packet loss, I suppose?) and that the routes all have to be feasible (though I'm not sure if we explain feasibility in the docs anywhere).
<tag><label id="babel-authentication">authentication none|mac [permissive]</tag> Selects authentication method to be used. <cf/none/ means that packets are not authenticated at all, <cf/mac/ means MAC authentication is diff --git a/proto/babel/babel.c b/proto/babel/babel.c index 4a7d550f..4d9c657c 100644 --- a/proto/babel/babel.c +++ b/proto/babel/babel.c @@ -623,6 +623,34 @@ done: } }
+/** + * babel_nexthop_insert - add next_hop of route to nexthop list + * @p: Babel protocol instance + * @r: Babel route + * @nhs: nexthop list head to append onto + * @nh: freshly allocated buffer to fill + */ +static void +babel_nexthop_insert( + struct babel_proto *p, + struct babel_route *r, + struct nexthop **nhs, + struct nexthop *nh) +{ + nh->gw = r->next_hop; + nh->iface = r->neigh->ifa->iface; + + /* + * If we cannot find a reachable neighbour, set the entry to be onlink. This + * makes it possible to, e.g., assign /32 addresses on a mesh interface and + * have routing work. + */ + if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0)) + nh->flags = RNF_ONLINK; + + nexthop_insert(nhs, nh); +} + /** * babel_announce_rte - announce selected route to the core * @p: Babel protocol instance @@ -635,6 +663,7 @@ done: static void babel_announce_rte(struct babel_proto *p, struct babel_entry *e) { + struct babel_config *cf = (void *) p->p.cf; struct babel_route *r = e->selected; struct channel *c = (e->n.addr->type == NET_IP4) ? p->ip4_channel : p->ip6_channel;
@@ -645,18 +674,24 @@ babel_announce_rte(struct babel_proto *p, struct babel_entry *e) .source = RTS_BABEL, .scope = SCOPE_UNIVERSE, .dest = RTD_UNICAST, - .from = r->neigh->addr, - .nh.gw = r->next_hop, - .nh.iface = r->neigh->ifa->iface, };
- /* - * If we cannot find a reachable neighbour, set the entry to be onlink. This - * makes it possible to, e.g., assign /32 addresses on a mesh interface and - * have routing work. - */ - if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0)) - a0.nh.flags = RNF_ONLINK; + struct nexthop *nhs = NULL; + babel_nexthop_insert(p, r, &nhs, allocz(sizeof(struct nexthop))); + int num_nexthops = 1; + + struct babel_route *cr; + WALK_LIST(cr, e->routes) { + if (num_nexthops++ >= cf->ecmp) + break; + + if (cr == r || !cr->feasible || cr->metric != r->metric) + continue;
You should probably switch the order of those two checks? You're increasing the counter even if the route is subsequently discarded...
+ + babel_nexthop_insert(p, cr, &nhs, allocz(sizeof(struct nexthop))); + } + + a0.nh = *nhs;
rta *a = rta_lookup(&a0); rte *rte = rte_get_temp(a); @@ -736,6 +771,7 @@ babel_announce_retraction(struct babel_proto *p, struct babel_entry *e) static void babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_route *mod) { + struct babel_config *cf = (void *) p->p.cf; struct babel_route *r, *best = e->selected;
/* Shortcut if only non-best was modified */ @@ -744,8 +780,10 @@ babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_ro /* Either select modified route, or keep old best route */ if ((mod->metric < (best ? best->metric : BABEL_INFINITY)) && mod->feasible) best = mod; - else + else if (cf->ecmp==1)
This looks like you're returning if ECMP is enabled; I think finding a better name for that config variable would be good; maybe just "max_nexthops"?
return; + /* With ecmp one of the non-selected but equal metric routes might have + * changed so contine on with the announcement in that case. */ } else { @@ -1879,6 +1917,16 @@ babel_reconfigure_iface(struct babel_proto *p, struct babel_iface *ifa, struct b if (ifa->up) babel_iface_kick_timer(ifa);
+ /* Update all routes to refresh ecmp settings. */ + FIB_WALK(&p->ip6_rtable, struct babel_entry, e) { + babel_select_route(p, e, NULL); + } + FIB_WALK_END; + FIB_WALK(&p->ip4_rtable, struct babel_entry, e) { + babel_select_route(p, e, NULL); + } + FIB_WALK_END; +
I don't think this is the right thing to do; you should probably just refuse the reconfiguration if ECMP settings changed (see the 0-return at the top when TOS changes)?
return 1; }
diff --git a/proto/babel/babel.h b/proto/babel/babel.h index 84feb085..3868dcb2 100644 --- a/proto/babel/babel.h +++ b/proto/babel/babel.h @@ -62,6 +62,8 @@ #define BABEL_OVERHEAD (IP6_HEADER_LENGTH+UDP_HEADER_LENGTH) #define BABEL_MIN_MTU (512 + BABEL_OVERHEAD)
+#define BABEL_DEFAULT_ECMP_LIMIT 16 + #define BABEL_AUTH_NONE 0 #define BABEL_AUTH_MAC 1
@@ -120,6 +122,7 @@ struct babel_config { list iface_list; /* List of iface configs (struct babel_iface_config) */ uint hold_time; /* Time to hold stale entries and unreachable routes */ u8 randomize_router_id; + u8 ecmp; /* Maximum number of nexthops to install */
See above re: the naming of this; 'max_nexthops'?
struct channel_config *ip4_channel; struct channel_config *ip6_channel; diff --git a/proto/babel/config.Y b/proto/babel/config.Y index 05210fa4..8f18c790 100644 --- a/proto/babel/config.Y +++ b/proto/babel/config.Y @@ -36,6 +36,7 @@ babel_proto_start: proto_start BABEL this_proto = proto_config_new(&proto_babel, $1); init_list(&BABEL_CFG->iface_list); BABEL_CFG->hold_time = 1 S_; + BABEL_CFG->ecmp = 1; };
babel_proto_item: @@ -43,6 +44,8 @@ babel_proto_item: | proto_channel | INTERFACE babel_iface | RANDOMIZE ROUTER ID bool { BABEL_CFG->randomize_router_id = $4; } + | ECMP bool { BABEL_CFG->ecmp = $2 ? BABEL_DEFAULT_ECMP_LIMIT : 1; } + | ECMP bool LIMIT expr { BABEL_CFG->ecmp = $2 ? $4 : 1; } ;
babel_proto_opts: -- 2.30.2
Hi Toke, On Sun, May 08, 2022 at 09:06:43PM +0200, Toke Høiland-Jørgensen wrote:
Daniel Gröber <dxld@darkboxed.org> writes:
--- The FIB walks in babel_reconfigure_iface are a bit ugly, I tried to loop over ifa->neigh_list instead but couldn't get it to work. I'm open to suggestions :)
doc/bird.sgml | 8 +++++ proto/babel/babel.c | 70 +++++++++++++++++++++++++++++++++++++------- proto/babel/babel.h | 3 ++ proto/babel/config.Y | 3 ++ 4 files changed, 73 insertions(+), 11 deletions(-)
diff --git a/doc/bird.sgml b/doc/bird.sgml index 1580facd..5e85d8ec 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -1879,6 +1879,7 @@ protocol babel [<name>] { check link <switch>; next hop ipv4 <address>; next hop ipv6 <address>; + ecmp <switch> [limit <num>]; authentication none|mac [permissive]; password "<text>"; password "<text>" { @@ -1983,6 +1984,13 @@ protocol babel [<name>] { source for Babel packets will be used. In normal operation, it should not be necessary to set this option.
+ <tag><label id="babel-ecmp">ecmp <m>switch</m> [limit <m>number</m>]</tag> + + Determines whether babel will emit ECMP (equal-cost multipath) + routes, allowing to load-balancing traffic across multiple paths. If + enabled the maximum number of next-hops to allow can be specified, + defaulting to 16.
This should probably explain the constraints: i.e., that only equal-weight routes are added (which in practice limits it to interfaces of type 'wired', or wireless interfaces with no packet loss, I suppose?) and that the routes all have to be feasible (though I'm not sure if we explain feasibility in the docs anywhere).
Right, I have this now: When neibours are using a dynamic link-quality metric this is unlikely to be useful. For best results <ref id="babel-type" name="type wired"> should be used throughout the network to get what amounts to a hop count metric. Once RTT metric are merged I'm going to add another paragraph touting the virtues of using (the equivalents of rtt-min/max from babeld) to kick bufferbloated or weirdly re-routed paths out of the ECMP set. That's my main use-case anyway.
<tag><label id="babel-authentication">authentication none|mac [permissive]</tag> Selects authentication method to be used. <cf/none/ means that packets are not authenticated at all, <cf/mac/ means MAC authentication is diff --git a/proto/babel/babel.c b/proto/babel/babel.c index 4a7d550f..4d9c657c 100644 --- a/proto/babel/babel.c +++ b/proto/babel/babel.c @@ -623,6 +623,34 @@ done: } }
+/** + * babel_nexthop_insert - add next_hop of route to nexthop list + * @p: Babel protocol instance + * @r: Babel route + * @nhs: nexthop list head to append onto + * @nh: freshly allocated buffer to fill + */ +static void +babel_nexthop_insert( + struct babel_proto *p, + struct babel_route *r, + struct nexthop **nhs, + struct nexthop *nh) +{ + nh->gw = r->next_hop; + nh->iface = r->neigh->ifa->iface; + + /* + * If we cannot find a reachable neighbour, set the entry to be onlink. This + * makes it possible to, e.g., assign /32 addresses on a mesh interface and + * have routing work. + */ + if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0)) + nh->flags = RNF_ONLINK; + + nexthop_insert(nhs, nh); +} + /** * babel_announce_rte - announce selected route to the core * @p: Babel protocol instance @@ -635,6 +663,7 @@ done: static void babel_announce_rte(struct babel_proto *p, struct babel_entry *e) { + struct babel_config *cf = (void *) p->p.cf; struct babel_route *r = e->selected; struct channel *c = (e->n.addr->type == NET_IP4) ? p->ip4_channel : p->ip6_channel;
@@ -645,18 +674,24 @@ babel_announce_rte(struct babel_proto *p, struct babel_entry *e) .source = RTS_BABEL, .scope = SCOPE_UNIVERSE, .dest = RTD_UNICAST, - .from = r->neigh->addr, - .nh.gw = r->next_hop, - .nh.iface = r->neigh->ifa->iface, };
- /* - * If we cannot find a reachable neighbour, set the entry to be onlink. This - * makes it possible to, e.g., assign /32 addresses on a mesh interface and - * have routing work. - */ - if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0)) - a0.nh.flags = RNF_ONLINK; + struct nexthop *nhs = NULL; + babel_nexthop_insert(p, r, &nhs, allocz(sizeof(struct nexthop))); + int num_nexthops = 1; + + struct babel_route *cr; + WALK_LIST(cr, e->routes) { + if (num_nexthops++ >= cf->ecmp) + break; + + if (cr == r || !cr->feasible || cr->metric != r->metric) + continue;
You should probably switch the order of those two checks? You're increasing the counter even if the route is subsequently discarded...
Whoops, last minute change and brainfart.
+ + babel_nexthop_insert(p, cr, &nhs, allocz(sizeof(struct nexthop))); + } + + a0.nh = *nhs;
rta *a = rta_lookup(&a0); rte *rte = rte_get_temp(a); @@ -736,6 +771,7 @@ babel_announce_retraction(struct babel_proto *p, struct babel_entry *e) static void babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_route *mod) { + struct babel_config *cf = (void *) p->p.cf; struct babel_route *r, *best = e->selected;
/* Shortcut if only non-best was modified */ @@ -744,8 +780,10 @@ babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_ro /* Either select modified route, or keep old best route */ if ((mod->metric < (best ? best->metric : BABEL_INFINITY)) && mod->feasible) best = mod; - else + else if (cf->ecmp==1)
This looks like you're returning if ECMP is enabled; I think finding a better name for that config variable would be good; maybe just "max_nexthops"?
Yeah that's neater. Fixed.
return; + /* With ecmp one of the non-selected but equal metric routes might have + * changed so contine on with the announcement in that case. */ } else { @@ -1879,6 +1917,16 @@ babel_reconfigure_iface(struct babel_proto *p, struct babel_iface *ifa, struct b if (ifa->up) babel_iface_kick_timer(ifa);
+ /* Update all routes to refresh ecmp settings. */ + FIB_WALK(&p->ip6_rtable, struct babel_entry, e) { + babel_select_route(p, e, NULL); + } + FIB_WALK_END; + FIB_WALK(&p->ip4_rtable, struct babel_entry, e) { + babel_select_route(p, e, NULL); + } + FIB_WALK_END; +
I don't think this is the right thing to do; you should probably just refuse the reconfiguration if ECMP settings changed (see the 0-return at the top when TOS changes)?
Why not? Works fine AFAICT. I tested the reload stuff and the nexthops get updated when the limit changes just as I would expect. If I refuse the reconfiguration the user would have to restart bird to get this to apply, right? I don't want that.
return 1; }
diff --git a/proto/babel/babel.h b/proto/babel/babel.h index 84feb085..3868dcb2 100644 --- a/proto/babel/babel.h +++ b/proto/babel/babel.h @@ -62,6 +62,8 @@ #define BABEL_OVERHEAD (IP6_HEADER_LENGTH+UDP_HEADER_LENGTH) #define BABEL_MIN_MTU (512 + BABEL_OVERHEAD)
+#define BABEL_DEFAULT_ECMP_LIMIT 16 + #define BABEL_AUTH_NONE 0 #define BABEL_AUTH_MAC 1
@@ -120,6 +122,7 @@ struct babel_config { list iface_list; /* List of iface configs (struct babel_iface_config) */ uint hold_time; /* Time to hold stale entries and unreachable routes */ u8 randomize_router_id; + u8 ecmp; /* Maximum number of nexthops to install */
See above re: the naming of this; 'max_nexthops'?
struct channel_config *ip4_channel; struct channel_config *ip6_channel; diff --git a/proto/babel/config.Y b/proto/babel/config.Y index 05210fa4..8f18c790 100644 --- a/proto/babel/config.Y +++ b/proto/babel/config.Y @@ -36,6 +36,7 @@ babel_proto_start: proto_start BABEL this_proto = proto_config_new(&proto_babel, $1); init_list(&BABEL_CFG->iface_list); BABEL_CFG->hold_time = 1 S_; + BABEL_CFG->ecmp = 1; };
babel_proto_item: @@ -43,6 +44,8 @@ babel_proto_item: | proto_channel | INTERFACE babel_iface | RANDOMIZE ROUTER ID bool { BABEL_CFG->randomize_router_id = $4; } + | ECMP bool { BABEL_CFG->ecmp = $2 ? BABEL_DEFAULT_ECMP_LIMIT : 1; } + | ECMP bool LIMIT expr { BABEL_CFG->ecmp = $2 ? $4 : 1; } ;
babel_proto_opts: -- 2.30.2
--Daniel
dxld@darkboxed.org writes:
Hi Toke,
On Sun, May 08, 2022 at 09:06:43PM +0200, Toke Høiland-Jørgensen wrote:
Daniel Gröber <dxld@darkboxed.org> writes:
--- The FIB walks in babel_reconfigure_iface are a bit ugly, I tried to loop over ifa->neigh_list instead but couldn't get it to work. I'm open to suggestions :)
doc/bird.sgml | 8 +++++ proto/babel/babel.c | 70 +++++++++++++++++++++++++++++++++++++------- proto/babel/babel.h | 3 ++ proto/babel/config.Y | 3 ++ 4 files changed, 73 insertions(+), 11 deletions(-)
diff --git a/doc/bird.sgml b/doc/bird.sgml index 1580facd..5e85d8ec 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -1879,6 +1879,7 @@ protocol babel [<name>] { check link <switch>; next hop ipv4 <address>; next hop ipv6 <address>; + ecmp <switch> [limit <num>]; authentication none|mac [permissive]; password "<text>"; password "<text>" { @@ -1983,6 +1984,13 @@ protocol babel [<name>] { source for Babel packets will be used. In normal operation, it should not be necessary to set this option.
+ <tag><label id="babel-ecmp">ecmp <m>switch</m> [limit <m>number</m>]</tag> + + Determines whether babel will emit ECMP (equal-cost multipath) + routes, allowing to load-balancing traffic across multiple paths. If + enabled the maximum number of next-hops to allow can be specified, + defaulting to 16.
This should probably explain the constraints: i.e., that only equal-weight routes are added (which in practice limits it to interfaces of type 'wired', or wireless interfaces with no packet loss, I suppose?) and that the routes all have to be feasible (though I'm not sure if we explain feasibility in the docs anywhere).
Right, I have this now:
When neibours are using a dynamic link-quality metric this is unlikely to be useful. For best results <ref id="babel-type" name="type wired"> should be used throughout the network to get what amounts to a hop count metric.
Once RTT metric are merged I'm going to add another paragraph touting the virtues of using (the equivalents of rtt-min/max from babeld) to kick bufferbloated or weirdly re-routed paths out of the ECMP set. That's my main use-case anyway.
<tag><label id="babel-authentication">authentication none|mac [permissive]</tag> Selects authentication method to be used. <cf/none/ means that packets are not authenticated at all, <cf/mac/ means MAC authentication is diff --git a/proto/babel/babel.c b/proto/babel/babel.c index 4a7d550f..4d9c657c 100644 --- a/proto/babel/babel.c +++ b/proto/babel/babel.c @@ -623,6 +623,34 @@ done: } }
+/** + * babel_nexthop_insert - add next_hop of route to nexthop list + * @p: Babel protocol instance + * @r: Babel route + * @nhs: nexthop list head to append onto + * @nh: freshly allocated buffer to fill + */ +static void +babel_nexthop_insert( + struct babel_proto *p, + struct babel_route *r, + struct nexthop **nhs, + struct nexthop *nh) +{ + nh->gw = r->next_hop; + nh->iface = r->neigh->ifa->iface; + + /* + * If we cannot find a reachable neighbour, set the entry to be onlink. This + * makes it possible to, e.g., assign /32 addresses on a mesh interface and + * have routing work. + */ + if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0)) + nh->flags = RNF_ONLINK; + + nexthop_insert(nhs, nh); +} + /** * babel_announce_rte - announce selected route to the core * @p: Babel protocol instance @@ -635,6 +663,7 @@ done: static void babel_announce_rte(struct babel_proto *p, struct babel_entry *e) { + struct babel_config *cf = (void *) p->p.cf; struct babel_route *r = e->selected; struct channel *c = (e->n.addr->type == NET_IP4) ? p->ip4_channel : p->ip6_channel;
@@ -645,18 +674,24 @@ babel_announce_rte(struct babel_proto *p, struct babel_entry *e) .source = RTS_BABEL, .scope = SCOPE_UNIVERSE, .dest = RTD_UNICAST, - .from = r->neigh->addr, - .nh.gw = r->next_hop, - .nh.iface = r->neigh->ifa->iface, };
- /* - * If we cannot find a reachable neighbour, set the entry to be onlink. This - * makes it possible to, e.g., assign /32 addresses on a mesh interface and - * have routing work. - */ - if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0)) - a0.nh.flags = RNF_ONLINK; + struct nexthop *nhs = NULL; + babel_nexthop_insert(p, r, &nhs, allocz(sizeof(struct nexthop))); + int num_nexthops = 1; + + struct babel_route *cr; + WALK_LIST(cr, e->routes) { + if (num_nexthops++ >= cf->ecmp) + break; + + if (cr == r || !cr->feasible || cr->metric != r->metric) + continue;
You should probably switch the order of those two checks? You're increasing the counter even if the route is subsequently discarded...
Whoops, last minute change and brainfart.
+ + babel_nexthop_insert(p, cr, &nhs, allocz(sizeof(struct nexthop))); + } + + a0.nh = *nhs;
rta *a = rta_lookup(&a0); rte *rte = rte_get_temp(a); @@ -736,6 +771,7 @@ babel_announce_retraction(struct babel_proto *p, struct babel_entry *e) static void babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_route *mod) { + struct babel_config *cf = (void *) p->p.cf; struct babel_route *r, *best = e->selected;
/* Shortcut if only non-best was modified */ @@ -744,8 +780,10 @@ babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_ro /* Either select modified route, or keep old best route */ if ((mod->metric < (best ? best->metric : BABEL_INFINITY)) && mod->feasible) best = mod; - else + else if (cf->ecmp==1)
This looks like you're returning if ECMP is enabled; I think finding a better name for that config variable would be good; maybe just "max_nexthops"?
Yeah that's neater. Fixed.
return; + /* With ecmp one of the non-selected but equal metric routes might have + * changed so contine on with the announcement in that case. */ } else { @@ -1879,6 +1917,16 @@ babel_reconfigure_iface(struct babel_proto *p, struct babel_iface *ifa, struct b if (ifa->up) babel_iface_kick_timer(ifa);
+ /* Update all routes to refresh ecmp settings. */ + FIB_WALK(&p->ip6_rtable, struct babel_entry, e) { + babel_select_route(p, e, NULL); + } + FIB_WALK_END; + FIB_WALK(&p->ip4_rtable, struct babel_entry, e) { + babel_select_route(p, e, NULL); + } + FIB_WALK_END; +
I don't think this is the right thing to do; you should probably just refuse the reconfiguration if ECMP settings changed (see the 0-return at the top when TOS changes)?
Why not? Works fine AFAICT. I tested the reload stuff and the nexthops get updated when the limit changes just as I would expect.
Because walking the FIB like this is an ugly hack (as you yourself said in the description)? Also, it won't work; babel_select_route() can modify the fib, so calling that from within FIB_WALK() doesn't work. You'd have to do the full iterator thing (see babel_expire_routes_()), and that's way overkill for this.
If I refuse the reconfiguration the user would have to restart bird to get this to apply, right? I don't want that.
No, if you refuse reconfig, bird will automatically bring the interface down, then up again. This will flush all the neighbours, but I think that's OK; switching ECMP on and off doesn't have to be a non-destructive operation. Oh, and when looking at this, I noticed that babel_retract_route() only conditionally calls babel_select_route(); this is the case in quite a few places, actually, you'd have to update all of them. -Toke
Hi Toke, On Mon, May 09, 2022 at 03:21:15PM +0200, Toke Høiland-Jørgensen wrote:
+ /* Update all routes to refresh ecmp settings. */ + FIB_WALK(&p->ip6_rtable, struct babel_entry, e) { + babel_select_route(p, e, NULL); + } + FIB_WALK_END; + FIB_WALK(&p->ip4_rtable, struct babel_entry, e) { + babel_select_route(p, e, NULL); + } + FIB_WALK_END; +
I don't think this is the right thing to do; you should probably just refuse the reconfiguration if ECMP settings changed (see the 0-return at the top when TOS changes)?
Why not? Works fine AFAICT. I tested the reload stuff and the nexthops get updated when the limit changes just as I would expect.
Because walking the FIB like this is an ugly hack (as you yourself said in the description)?
Well I really just meant inlining the interations, I hadn't though about the iterator invalidation ascpect actually.
If I refuse the reconfiguration the user would have to restart bird to get this to apply, right? I don't want that.
No, if you refuse reconfig, bird will automatically bring the interface down, then up again. This will flush all the neighbours, but I think that's OK; switching ECMP on and off doesn't have to be a non-destructive operation.
Hmm, that might be acceptable. Given that I might need a babel_route->fib mapping anyway (see below) this might end up coming for free though.
Oh, and when looking at this, I noticed that babel_retract_route() only conditionally calls babel_select_route(); this is the case in quite a few places, actually, you'd have to update all of them.
Right to do babel_route removal efficiently I'll probably need some more machinery to keep track of which fib route/nexhop a given babel_route maps to. Just iterating over all fib routes to remove one babel_route is definetly too inefficient. --Daniel
dxld@darkboxed.org writes:
Hi Toke,
On Mon, May 09, 2022 at 03:21:15PM +0200, Toke Høiland-Jørgensen wrote:
+ /* Update all routes to refresh ecmp settings. */ + FIB_WALK(&p->ip6_rtable, struct babel_entry, e) { + babel_select_route(p, e, NULL); + } + FIB_WALK_END; + FIB_WALK(&p->ip4_rtable, struct babel_entry, e) { + babel_select_route(p, e, NULL); + } + FIB_WALK_END; +
I don't think this is the right thing to do; you should probably just refuse the reconfiguration if ECMP settings changed (see the 0-return at the top when TOS changes)?
Why not? Works fine AFAICT. I tested the reload stuff and the nexthops get updated when the limit changes just as I would expect.
Because walking the FIB like this is an ugly hack (as you yourself said in the description)?
Well I really just meant inlining the interations, I hadn't though about the iterator invalidation ascpect actually.
If I refuse the reconfiguration the user would have to restart bird to get this to apply, right? I don't want that.
No, if you refuse reconfig, bird will automatically bring the interface down, then up again. This will flush all the neighbours, but I think that's OK; switching ECMP on and off doesn't have to be a non-destructive operation.
Hmm, that might be acceptable. Given that I might need a babel_route->fib mapping anyway (see below) this might end up coming for free though.
Oh, and when looking at this, I noticed that babel_retract_route() only conditionally calls babel_select_route(); this is the case in quite a few places, actually, you'd have to update all of them.
Right to do babel_route removal efficiently I'll probably need some more machinery to keep track of which fib route/nexhop a given babel_route maps to.
Just iterating over all fib routes to remove one babel_route is definetly too inefficient.
I think what you need to do is to add something that's equivalent to 'babel_entry->selected' but for multiple 'babel_route' objects. So that everywhere there's a 'r == e->selected' check you can do 'babel_is_selected(e, r)', and that will check membership of that set. One option could just be to add a boolean flag to struct babel_route 'is_selected' which will be set for all routes that are part of an ECMP route; and make sure babel_select_route() clears that flag for all routes it does not add as nexthops. Then you can just check that field all the places where there's a conditional call to babel_select_route(); you don't need to do any separate fib walking apart from what is already done to expire routes... -Toke
--- doc/bird.sgml | 25 ++++++++++++++++ proto/babel/babel.c | 70 +++++++++++++++++++++++++++++++++++++------- proto/babel/babel.h | 5 ++++ proto/babel/config.Y | 3 ++ 4 files changed, 92 insertions(+), 11 deletions(-) diff --git a/doc/bird.sgml b/doc/bird.sgml index 1580facd..d1e6376b 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -1865,6 +1865,7 @@ protocol babel [<name>] { ipv4 { <channel config> }; ipv6 [sadr] { <channel config> }; randomize router id <switch>; + ecmp <switch> [limit <num>]; interface <interface pattern> { type <wired|wireless>; rxcost <number>; @@ -1909,6 +1910,18 @@ protocol babel [<name>] { router ID every time it starts up, which avoids this problem at the cost of not having stable router IDs in the network. Default: no. + <tag><label id="babel-ecmp">ecmp <m>switch</m> [limit <m>number</m>]</tag> + + Determines whether babel will emit ECMP (equal-cost multipath) + routes, allowing to load-balancing traffic across multiple paths. If + enabled the maximum number of next-hops to allow can be specified, + defaulting to 16. + + When neibours are using a dynamic link-quality metric this is + unlikely to be useful. For best results <ref id="babel-type" + name="type wired"> should be used throughout the network to get what + amounts to a hop count metric. + <tag><label id="babel-type">type wired|wireless </tag> This option specifies the interface type: Wired or wireless. On wired interfaces a neighbor is considered unreachable after a small number of @@ -1928,6 +1941,18 @@ protocol babel [<name>] { selection and not local route selection. Default: 96 for wired interfaces, 256 for wireless. + <tag><label id="babel-ecmp">ecmp <m>switch</m> [limit <m>number</m>]</tag> + + Determines whether babel will emit ECMP (equal-cost multipath) + routes, allowing to load-balancing traffic across multiple paths. If + enabled the maximum number of next-hops to allow can be specified, + defaulting to 16. + + When neibours are using a dynamic link-quality metric this is + unlikely to be useful. For best results <ref id="babel-type" + name="type wired"> should be used throughout the network to get what + amounts to a hop count metric. + <tag><label id="babel-limit">limit <m/num/</tag> BIRD keeps track of received Hello messages from each neighbor to establish neighbor reachability. For wired type interfaces, this option diff --git a/proto/babel/babel.c b/proto/babel/babel.c index 4a7d550f..01ea3a44 100644 --- a/proto/babel/babel.c +++ b/proto/babel/babel.c @@ -623,6 +623,34 @@ done: } } +/** + * babel_nexthop_insert - add next_hop of route to nexthop list + * @p: Babel protocol instance + * @r: Babel route + * @nhs: nexthop list head to append onto + * @nh: freshly allocated buffer to fill + */ +static void +babel_nexthop_insert( + struct babel_proto *p, + struct babel_route *r, + struct nexthop **nhs, + struct nexthop *nh) +{ + nh->gw = r->next_hop; + nh->iface = r->neigh->ifa->iface; + + /* + * If we cannot find a reachable neighbour, set the entry to be onlink. This + * makes it possible to, e.g., assign /32 addresses on a mesh interface and + * have routing work. + */ + if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0)) + nh->flags = RNF_ONLINK; + + nexthop_insert(nhs, nh); +} + /** * babel_announce_rte - announce selected route to the core * @p: Babel protocol instance @@ -635,6 +663,7 @@ done: static void babel_announce_rte(struct babel_proto *p, struct babel_entry *e) { + struct babel_config *cf = (void *) p->p.cf; struct babel_route *r = e->selected; struct channel *c = (e->n.addr->type == NET_IP4) ? p->ip4_channel : p->ip6_channel; @@ -645,18 +674,24 @@ babel_announce_rte(struct babel_proto *p, struct babel_entry *e) .source = RTS_BABEL, .scope = SCOPE_UNIVERSE, .dest = RTD_UNICAST, - .from = r->neigh->addr, - .nh.gw = r->next_hop, - .nh.iface = r->neigh->ifa->iface, }; - /* - * If we cannot find a reachable neighbour, set the entry to be onlink. This - * makes it possible to, e.g., assign /32 addresses on a mesh interface and - * have routing work. - */ - if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0)) - a0.nh.flags = RNF_ONLINK; + struct nexthop *nhs = NULL; + babel_nexthop_insert(p, r, &nhs, allocz(sizeof(struct nexthop))); + int num_nexthops = 1; + + struct babel_route *cr; + WALK_LIST(cr, e->routes) { + if (cr == r || !cr->feasible || cr->metric != r->metric) + continue; + + if (num_nexthops++ >= cf->max_nexthops) + break; + + babel_nexthop_insert(p, cr, &nhs, allocz(sizeof(struct nexthop))); + } + + a0.nh = *nhs; rta *a = rta_lookup(&a0); rte *rte = rte_get_temp(a); @@ -736,6 +771,7 @@ babel_announce_retraction(struct babel_proto *p, struct babel_entry *e) static void babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_route *mod) { + struct babel_config *cf = (void *) p->p.cf; struct babel_route *r, *best = e->selected; /* Shortcut if only non-best was modified */ @@ -744,8 +780,10 @@ babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_ro /* Either select modified route, or keep old best route */ if ((mod->metric < (best ? best->metric : BABEL_INFINITY)) && mod->feasible) best = mod; - else + else if (cf->max_nexthops == 1) return; + /* With ecmp one of the non-selected but equal metric routes might have + * changed so contine on with the announcement in that case. */ } else { @@ -1879,6 +1917,16 @@ babel_reconfigure_iface(struct babel_proto *p, struct babel_iface *ifa, struct b if (ifa->up) babel_iface_kick_timer(ifa); + /* Update all routes to refresh ecmp settings. */ + FIB_WALK(&p->ip6_rtable, struct babel_entry, e) { + babel_select_route(p, e, NULL); + } + FIB_WALK_END; + FIB_WALK(&p->ip4_rtable, struct babel_entry, e) { + babel_select_route(p, e, NULL); + } + FIB_WALK_END; + return 1; } diff --git a/proto/babel/babel.h b/proto/babel/babel.h index 84feb085..46e268fd 100644 --- a/proto/babel/babel.h +++ b/proto/babel/babel.h @@ -62,6 +62,8 @@ #define BABEL_OVERHEAD (IP6_HEADER_LENGTH+UDP_HEADER_LENGTH) #define BABEL_MIN_MTU (512 + BABEL_OVERHEAD) +#define BABEL_DEFAULT_ECMP_LIMIT 16 + #define BABEL_AUTH_NONE 0 #define BABEL_AUTH_MAC 1 @@ -120,6 +122,9 @@ struct babel_config { list iface_list; /* List of iface configs (struct babel_iface_config) */ uint hold_time; /* Time to hold stale entries and unreachable routes */ u8 randomize_router_id; + u8 max_nexthops; /* Maximum number of nexthops to + install. Defaults to 1. It's >1 if + ECMP is enabled. */ struct channel_config *ip4_channel; struct channel_config *ip6_channel; diff --git a/proto/babel/config.Y b/proto/babel/config.Y index 05210fa4..92e754cd 100644 --- a/proto/babel/config.Y +++ b/proto/babel/config.Y @@ -36,6 +36,7 @@ babel_proto_start: proto_start BABEL this_proto = proto_config_new(&proto_babel, $1); init_list(&BABEL_CFG->iface_list); BABEL_CFG->hold_time = 1 S_; + BABEL_CFG->max_nexthops = 1; }; babel_proto_item: @@ -43,6 +44,8 @@ babel_proto_item: | proto_channel | INTERFACE babel_iface | RANDOMIZE ROUTER ID bool { BABEL_CFG->randomize_router_id = $4; } + | ECMP bool { BABEL_CFG->max_nexthops = $2 ? BABEL_DEFAULT_ECMP_LIMIT : 1; } + | ECMP bool LIMIT expr { BABEL_CFG->max_nexthops = $2 ? $4 : 1; } ; babel_proto_opts: -- 2.30.2
--- doc/bird.sgml | 7 +++++++ proto/babel/babel.c | 1 + proto/babel/babel.h | 2 ++ proto/babel/config.Y | 2 ++ 4 files changed, 12 insertions(+) diff --git a/doc/bird.sgml b/doc/bird.sgml index d1e6376b..8d159b22 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -1880,6 +1880,7 @@ protocol babel [<name>] { check link <switch>; next hop ipv4 <address>; next hop ipv6 <address>; + ecmp weight <num>; authentication none|mac [permissive]; password "<text>"; password "<text>" { @@ -2008,6 +2009,12 @@ protocol babel [<name>] { source for Babel packets will be used. In normal operation, it should not be necessary to set this option. + <tag><label id="babel-ecmp-weight">ecmp weight <m>number</m></tag> + This specifies the relative weight used for nexthops going through + the iface when ECMP is enabled. Larger weight values relative to other + nexthops attract more traffic. Valid values are 1-256. Default value + is 1. + <tag><label id="babel-authentication">authentication none|mac [permissive]</tag> Selects authentication method to be used. <cf/none/ means that packets are not authenticated at all, <cf/mac/ means MAC authentication is diff --git a/proto/babel/babel.c b/proto/babel/babel.c index 01ea3a44..cd732eab 100644 --- a/proto/babel/babel.c +++ b/proto/babel/babel.c @@ -639,6 +639,7 @@ babel_nexthop_insert( { nh->gw = r->next_hop; nh->iface = r->neigh->ifa->iface; + nh->weight = r->neigh->ifa->cf->ecmp_weight; /* * If we cannot find a reachable neighbour, set the entry to be onlink. This diff --git a/proto/babel/babel.h b/proto/babel/babel.h index 46e268fd..06b82799 100644 --- a/proto/babel/babel.h +++ b/proto/babel/babel.h @@ -147,6 +147,8 @@ struct babel_iface_config { int tx_tos; int tx_priority; + u8 ecmp_weight; + ip_addr next_hop_ip4; ip_addr next_hop_ip6; diff --git a/proto/babel/config.Y b/proto/babel/config.Y index 92e754cd..0da5025d 100644 --- a/proto/babel/config.Y +++ b/proto/babel/config.Y @@ -70,6 +70,7 @@ babel_iface_start: BABEL_IFACE->tx_tos = IP_PREC_INTERNET_CONTROL; BABEL_IFACE->tx_priority = sk_priority_control; BABEL_IFACE->check_link = 1; + BABEL_IFACE->ecmp_weight = 0; }; @@ -149,6 +150,7 @@ babel_iface_item: | AUTHENTICATION NONE { BABEL_IFACE->auth_type = BABEL_AUTH_NONE; } | AUTHENTICATION MAC { BABEL_IFACE->auth_type = BABEL_AUTH_MAC; BABEL_IFACE->auth_permissive = 0; } | AUTHENTICATION MAC PERMISSIVE { BABEL_IFACE->auth_type = BABEL_AUTH_MAC; BABEL_IFACE->auth_permissive = 1; } + | ECMP WEIGHT expr { BABEL_IFACE->ecmp_weight = $3 - 1; if (($3<1) || ($3>256)) cf_error("ECMP weight must be in range 1-256"); } | password_list ; -- 2.30.2
Daniel Gröber <dxld@darkboxed.org> writes:
--- doc/bird.sgml | 25 ++++++++++++++++ proto/babel/babel.c | 70 +++++++++++++++++++++++++++++++++++++------- proto/babel/babel.h | 5 ++++ proto/babel/config.Y | 3 ++ 4 files changed, 92 insertions(+), 11 deletions(-)
See comments on the previous patch; in general, it's better to wait until any discussion has been fully resolved between posting a new version :) (also, why is it still two patches, and where are the commit messages?) -Toke
We introduce ecmp support for the babel protocol by extending it's definition of a route being selected to mean the route being in the ECMP set. In order to keep code changes minimal we keep the pointer to an arbitrary member of the ECMP set in the FIB entry and add a new flag to babel_route which indicates which routes were actually announced to the core. Since keeping this flag update at all times is a hassle we take a lazy approach and simply check the metric of the selected route against the route in question whenever we want to know for sure if a route is in the ECMP set. --- doc/bird.sgml | 32 ++++++++++++++ proto/babel/babel.c | 103 +++++++++++++++++++++++++++++++++++-------- proto/babel/babel.h | 16 +++++++ proto/babel/config.Y | 5 +++ 4 files changed, 138 insertions(+), 18 deletions(-) Changes in v3: - Squash with ecmp weigth patch - Add babel_route_is_selected() as replacement for e->selected I'm not totally sure the lazy approach is safe yet. We might need additional bookeeping to reset r->active_nexthop on route retraction/flush instead. diff --git a/doc/bird.sgml b/doc/bird.sgml index 1580facd..8d159b22 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -1865,6 +1865,7 @@ protocol babel [<name>] { ipv4 { <channel config> }; ipv6 [sadr] { <channel config> }; randomize router id <switch>; + ecmp <switch> [limit <num>]; interface <interface pattern> { type <wired|wireless>; rxcost <number>; @@ -1879,6 +1880,7 @@ protocol babel [<name>] { check link <switch>; next hop ipv4 <address>; next hop ipv6 <address>; + ecmp weight <num>; authentication none|mac [permissive]; password "<text>"; password "<text>" { @@ -1909,6 +1911,18 @@ protocol babel [<name>] { router ID every time it starts up, which avoids this problem at the cost of not having stable router IDs in the network. Default: no. + <tag><label id="babel-ecmp">ecmp <m>switch</m> [limit <m>number</m>]</tag> + + Determines whether babel will emit ECMP (equal-cost multipath) + routes, allowing to load-balancing traffic across multiple paths. If + enabled the maximum number of next-hops to allow can be specified, + defaulting to 16. + + When neibours are using a dynamic link-quality metric this is + unlikely to be useful. For best results <ref id="babel-type" + name="type wired"> should be used throughout the network to get what + amounts to a hop count metric. + <tag><label id="babel-type">type wired|wireless </tag> This option specifies the interface type: Wired or wireless. On wired interfaces a neighbor is considered unreachable after a small number of @@ -1928,6 +1942,18 @@ protocol babel [<name>] { selection and not local route selection. Default: 96 for wired interfaces, 256 for wireless. + <tag><label id="babel-ecmp">ecmp <m>switch</m> [limit <m>number</m>]</tag> + + Determines whether babel will emit ECMP (equal-cost multipath) + routes, allowing to load-balancing traffic across multiple paths. If + enabled the maximum number of next-hops to allow can be specified, + defaulting to 16. + + When neibours are using a dynamic link-quality metric this is + unlikely to be useful. For best results <ref id="babel-type" + name="type wired"> should be used throughout the network to get what + amounts to a hop count metric. + <tag><label id="babel-limit">limit <m/num/</tag> BIRD keeps track of received Hello messages from each neighbor to establish neighbor reachability. For wired type interfaces, this option @@ -1983,6 +2009,12 @@ protocol babel [<name>] { source for Babel packets will be used. In normal operation, it should not be necessary to set this option. + <tag><label id="babel-ecmp-weight">ecmp weight <m>number</m></tag> + This specifies the relative weight used for nexthops going through + the iface when ECMP is enabled. Larger weight values relative to other + nexthops attract more traffic. Valid values are 1-256. Default value + is 1. + <tag><label id="babel-authentication">authentication none|mac [permissive]</tag> Selects authentication method to be used. <cf/none/ means that packets are not authenticated at all, <cf/mac/ means MAC authentication is diff --git a/proto/babel/babel.c b/proto/babel/babel.c index 4a7d550f..cd5e7a20 100644 --- a/proto/babel/babel.c +++ b/proto/babel/babel.c @@ -164,12 +164,23 @@ babel_get_route(struct babel_proto *p, struct babel_entry *e, struct babel_neigh return r; } +/* Check if a route is currently active as part of any RTE nexthop. + */ +static inline u8 +babel_route_is_selected(struct babel_route *r) +{ + if (!r->e->selected || r->e->selected->metric != r->metric) + r->active_nexthop = 0; + + return r->active_nexthop; +} + static inline void babel_retract_route(struct babel_proto *p, struct babel_route *r) { r->metric = r->advert_metric = BABEL_INFINITY; - if (r == r->e->selected) + if (babel_route_is_selected(r)) babel_select_route(p, r->e, r); } @@ -210,7 +221,7 @@ babel_expire_route(struct babel_proto *p, struct babel_route *r) static void babel_refresh_route(struct babel_proto *p, struct babel_route *r) { - if (r == r->e->selected) + if (babel_route_is_selected(r)) babel_send_route_request(p, r->e, r->neigh); r->refresh_time = 0; @@ -238,7 +249,7 @@ loop: if (r->expires && r->expires <= now_) { - changed = changed || (r == e->selected); + changed = changed || babel_route_is_selected(r); babel_expire_route(p, r); } } @@ -624,7 +635,38 @@ done: } /** - * babel_announce_rte - announce selected route to the core + * babel_nexthop_insert - add next_hop of route to nexthop list + * @p: Babel protocol instance + * @r: Babel route + * @nhs: nexthop list head to append onto + * @nh: freshly allocated buffer to fill + */ +static void +babel_nexthop_insert( + struct babel_proto *p, + struct babel_route *r, + struct nexthop **nhs, + struct nexthop *nh) +{ + r->active_nexthop = 1; + + nh->gw = r->next_hop; + nh->iface = r->neigh->ifa->iface; + nh->weight = r->neigh->ifa->cf->ecmp_weight; + + /* + * If we cannot find a reachable neighbour, set the entry to be onlink. This + * makes it possible to, e.g., assign /32 addresses on a mesh interface and + * have routing work. + */ + if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0)) + nh->flags = RNF_ONLINK; + + nexthop_insert(nhs, nh); +} + +/** + * babel_announce_rte - announce selected routes to the core * @p: Babel protocol instance * @e: Babel route entry to announce * @@ -635,6 +677,7 @@ done: static void babel_announce_rte(struct babel_proto *p, struct babel_entry *e) { + struct babel_config *cf = (void *) p->p.cf; struct babel_route *r = e->selected; struct channel *c = (e->n.addr->type == NET_IP4) ? p->ip4_channel : p->ip6_channel; @@ -645,18 +688,24 @@ babel_announce_rte(struct babel_proto *p, struct babel_entry *e) .source = RTS_BABEL, .scope = SCOPE_UNIVERSE, .dest = RTD_UNICAST, - .from = r->neigh->addr, - .nh.gw = r->next_hop, - .nh.iface = r->neigh->ifa->iface, }; - /* - * If we cannot find a reachable neighbour, set the entry to be onlink. This - * makes it possible to, e.g., assign /32 addresses on a mesh interface and - * have routing work. - */ - if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0)) - a0.nh.flags = RNF_ONLINK; + struct nexthop *nhs = NULL; + babel_nexthop_insert(p, r, &nhs, allocz(sizeof(struct nexthop))); + int num_nexthops = 1; + + struct babel_route *cr; + WALK_LIST(cr, e->routes) { + if (cr == r || !cr->feasible || cr->metric != r->metric) + continue; + + if (num_nexthops++ >= cf->max_nexthops) + break; + + babel_nexthop_insert(p, cr, &nhs, allocz(sizeof(struct nexthop))); + } + + a0.nh = *nhs; rta *a = rta_lookup(&a0); rte *rte = rte_get_temp(a); @@ -736,6 +785,7 @@ babel_announce_retraction(struct babel_proto *p, struct babel_entry *e) static void babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_route *mod) { + struct babel_config *cf = (void *) p->p.cf; struct babel_route *r, *best = e->selected; /* Shortcut if only non-best was modified */ @@ -744,8 +794,10 @@ babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_ro /* Either select modified route, or keep old best route */ if ((mod->metric < (best ? best->metric : BABEL_INFINITY)) && mod->feasible) best = mod; - else + else if (cf->max_nexthops == 1) return; + /* With ecmp one of the non-selected but equal metric routes might have + * changed so contine on with the announcement in that case. */ } else { @@ -754,9 +806,10 @@ babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_ro best = NULL; /* Find the best feasible route from all routes */ - WALK_LIST(r, e->routes) + WALK_LIST(r, e->routes) { if ((r->metric < (best ? best->metric : BABEL_INFINITY)) && r->feasible) best = r; + } } if (best) @@ -1956,7 +2009,7 @@ babel_dump_entry(struct babel_entry *e) WALK_LIST(r,e->routes) { debug(" "); - if (r == e->selected) debug("*"); + if (babel_route_is_selected(r)) debug("*"); babel_dump_route(r); } } @@ -2170,7 +2223,7 @@ babel_show_routes_(struct babel_proto *p, struct fib *rtable) struct babel_route *r; WALK_LIST(r, e->routes) { - char c = (r == e->selected) ? '*' : (r->feasible ? '+' : ' '); + char c = (babel_route_is_selected(r)) ? '*' : (r->feasible ? '+' : ' '); btime time = r->expires ? r->expires - current_time() : 0; cli_msg(-1025, "%-*N %-25I %-10s %5u %c %5u %7t", width, e->n.addr, r->next_hop, r->neigh->ifa->ifname, @@ -2441,6 +2494,16 @@ babel_shutdown(struct proto *P) return PS_DOWN; } +static void +babel_reconfigure_routes(struct babel_proto *p, struct fib *rtable) +{ + struct fib_iterator fit; + FIB_ITERATE_INIT(&fit, rtable); + FIB_ITERATE_START(rtable, &fit, struct babel_entry, e) + babel_announce_rte(p, e); + FIB_ITERATE_END; +} + static int babel_reconfigure(struct proto *P, struct proto_config *CF) { @@ -2460,6 +2523,10 @@ babel_reconfigure(struct proto *P, struct proto_config *CF) p->p.cf = CF; babel_reconfigure_ifaces(p, new); + /* Update all routes to refresh ecmp settings. */ + babel_reconfigure_routes(p, &p->ip6_rtable); + babel_reconfigure_routes(p, &p->ip4_rtable); + babel_trigger_update(p); babel_kick_timer(p); diff --git a/proto/babel/babel.h b/proto/babel/babel.h index 84feb085..153ef78c 100644 --- a/proto/babel/babel.h +++ b/proto/babel/babel.h @@ -62,6 +62,8 @@ #define BABEL_OVERHEAD (IP6_HEADER_LENGTH+UDP_HEADER_LENGTH) #define BABEL_MIN_MTU (512 + BABEL_OVERHEAD) +#define BABEL_DEFAULT_ECMP_LIMIT 16 + #define BABEL_AUTH_NONE 0 #define BABEL_AUTH_MAC 1 @@ -120,6 +122,9 @@ struct babel_config { list iface_list; /* List of iface configs (struct babel_iface_config) */ uint hold_time; /* Time to hold stale entries and unreachable routes */ u8 randomize_router_id; + u8 max_nexthops; /* Maximum number of nexthops to + install. Defaults to 1. It's >1 if + ECMP is enabled. */ struct channel_config *ip4_channel; struct channel_config *ip6_channel; @@ -142,6 +147,8 @@ struct babel_iface_config { int tx_tos; int tx_priority; + u8 ecmp_weight; + ip_addr next_hop_ip4; ip_addr next_hop_ip6; @@ -258,6 +265,11 @@ struct babel_route { struct babel_neighbor *neigh; u8 feasible; + u8 active_nexthop; + /* If true route could be in the ecmp set (i.e. it's in use as a nexthop), if + * false definetly not. Use babel_route_is_selected() to get a boolean + * answer. */ + u16 seqno; u16 metric; u16 advert_metric; @@ -280,6 +292,10 @@ struct babel_seqno_request { struct babel_entry { struct babel_route *selected; + /* Route currently being announced to neigbours. When ECMP is enabled this is + * but one of the full set of routes in use for nexthop purposes. To check if + * a particular route is part of the ECMP set use babel_route_is_selected(). + */ list routes; /* Routes for this prefix (struct babel_route) */ list sources; /* Source entries for this prefix (struct babel_source). */ diff --git a/proto/babel/config.Y b/proto/babel/config.Y index 05210fa4..0da5025d 100644 --- a/proto/babel/config.Y +++ b/proto/babel/config.Y @@ -36,6 +36,7 @@ babel_proto_start: proto_start BABEL this_proto = proto_config_new(&proto_babel, $1); init_list(&BABEL_CFG->iface_list); BABEL_CFG->hold_time = 1 S_; + BABEL_CFG->max_nexthops = 1; }; babel_proto_item: @@ -43,6 +44,8 @@ babel_proto_item: | proto_channel | INTERFACE babel_iface | RANDOMIZE ROUTER ID bool { BABEL_CFG->randomize_router_id = $4; } + | ECMP bool { BABEL_CFG->max_nexthops = $2 ? BABEL_DEFAULT_ECMP_LIMIT : 1; } + | ECMP bool LIMIT expr { BABEL_CFG->max_nexthops = $2 ? $4 : 1; } ; babel_proto_opts: @@ -67,6 +70,7 @@ babel_iface_start: BABEL_IFACE->tx_tos = IP_PREC_INTERNET_CONTROL; BABEL_IFACE->tx_priority = sk_priority_control; BABEL_IFACE->check_link = 1; + BABEL_IFACE->ecmp_weight = 0; }; @@ -146,6 +150,7 @@ babel_iface_item: | AUTHENTICATION NONE { BABEL_IFACE->auth_type = BABEL_AUTH_NONE; } | AUTHENTICATION MAC { BABEL_IFACE->auth_type = BABEL_AUTH_MAC; BABEL_IFACE->auth_permissive = 0; } | AUTHENTICATION MAC PERMISSIVE { BABEL_IFACE->auth_type = BABEL_AUTH_MAC; BABEL_IFACE->auth_permissive = 1; } + | ECMP WEIGHT expr { BABEL_IFACE->ecmp_weight = $3 - 1; if (($3<1) || ($3>256)) cf_error("ECMP weight must be in range 1-256"); } | password_list ; -- 2.30.2
On Thu, May 26, 2022 at 03:33:44PM +0200, Daniel Gröber wrote:
Changes in v3: - Squash with ecmp weigth patch - Add babel_route_is_selected() as replacement for e->selected I'm not totally sure the lazy approach is safe yet. We might need additional bookeeping to reset r->active_nexthop on route retraction/flush instead.
Forgot to mention: - Refactor route reload and use babel_announce_rte directly instead of via babel_select_route.
+static void +babel_reconfigure_routes(struct babel_proto *p, struct fib *rtable) +{ + struct fib_iterator fit; + FIB_ITERATE_INIT(&fit, rtable); + FIB_ITERATE_START(rtable, &fit, struct babel_entry, e) + babel_announce_rte(p, e); + FIB_ITERATE_END; +} + static int babel_reconfigure(struct proto *P, struct proto_config *CF) { @@ -2460,6 +2523,10 @@ babel_reconfigure(struct proto *P, struct proto_config *CF) p->p.cf = CF; babel_reconfigure_ifaces(p, new);
+ /* Update all routes to refresh ecmp settings. */ + babel_reconfigure_routes(p, &p->ip6_rtable); + babel_reconfigure_routes(p, &p->ip4_rtable); + babel_trigger_update(p); babel_kick_timer(p);
--Daniel
participants (3)
-
Daniel Gröber -
dxld@darkboxed.org -
Toke Høiland-Jørgensen