[PATCH] Babel: Remove unecessary FIB_ITERATE restart
The route expiration code appears to have been stolen from rip.c, in that code the rt_notify function actually does modify the rtable fib by calling fib_get. The babel code however does no such thing, so this inefficient restart is just entirely uneccesarry. To prove this is true I add a bunch of checking code that can be removed later. --- proto/babel/babel.c | 64 ++++++++++++++++++++++++++------------------- proto/babel/babel.h | 8 +++--- 2 files changed, 42 insertions(+), 30 deletions(-) diff --git a/proto/babel/babel.c b/proto/babel/babel.c index ff8b6b52..bed2e1e4 100644 --- a/proto/babel/babel.c +++ b/proto/babel/babel.c @@ -58,6 +58,9 @@ static void babel_update_cost(struct babel_neighbor *n); static inline void babel_kick_timer(struct babel_proto *p); static inline void babel_iface_kick_timer(struct babel_iface *ifa); +#define rtable_lock(rtable) ASSERT(!(rtable)->lock++) +#define rtable_unlock(rtable) ASSERT(!--(rtable)->lock) + /* * Functions to maintain data structures */ @@ -76,15 +79,16 @@ babel_init_entry(struct fib *f UNUSED, void *E) static inline struct babel_entry * babel_find_entry(struct babel_proto *p, const net_addr *n) { - struct fib *rtable = (n->type == NET_IP4) ? &p->ip4_rtable : &p->ip6_rtable; + struct fib *rtable = (n->type == NET_IP4) ? &p->ip4_rtable.fib : &p->ip6_rtable.fib; return fib_find(rtable, n); } static struct babel_entry * babel_get_entry(struct babel_proto *p, const net_addr *n) { - struct fib *rtable = (n->type == NET_IP4) ? &p->ip4_rtable : &p->ip6_rtable; - struct babel_entry *e = fib_get(rtable, n); + struct babel_rtable *rtable = (n->type == NET_IP4) ? &p->ip4_rtable : &p->ip6_rtable; + ASSERT(!rtable->lock); + struct babel_entry *e = fib_get(&rtable->fib, n); return e; } @@ -217,17 +221,18 @@ babel_refresh_route(struct babel_proto *p, struct babel_route *r) } static void -babel_expire_routes_(struct babel_proto *p, struct fib *rtable) +babel_expire_routes_(struct babel_proto *p, struct babel_rtable *rtable) { struct babel_config *cf = (void *) p->p.cf; struct babel_route *r, *rx; struct fib_iterator fit; btime now_ = current_time(); - FIB_ITERATE_INIT(&fit, rtable); + rtable_lock(rtable); + FIB_ITERATE_INIT(&fit, &rtable->fib); loop: - FIB_ITERATE_START(rtable, &fit, struct babel_entry, e) + FIB_ITERATE_START(&rtable->fib, &fit, struct babel_entry, e) { int changed = 0; @@ -244,16 +249,7 @@ loop: } if (changed) - { - /* - * We have to restart the iteration because there may be a cascade of - * synchronous events babel_select_route() -> nest table change -> - * babel_rt_notify() -> rtable change, invalidating hidden variables. - */ - FIB_ITERATE_PUT(&fit); babel_select_route(p, e, NULL); - goto loop; - } /* Clean up stale entries */ if ((e->valid == BABEL_ENTRY_STALE) && ((e->updated + cf->hold_time) <= now_)) @@ -263,6 +259,7 @@ loop: if (e->unreachable && (!e->valid || (e->router_id == p->router_id))) { FIB_ITERATE_PUT(&fit); + rtable_unlock(rtable); babel_announce_retraction(p, e); goto loop; } @@ -274,11 +271,13 @@ loop: if (!e->valid && EMPTY_LIST(e->routes) && EMPTY_LIST(e->sources) && EMPTY_LIST(e->requests)) { FIB_ITERATE_PUT(&fit); - fib_delete(rtable, e); + rtable_unlock(rtable); + fib_delete(&rtable->fib, e); goto loop; } } FIB_ITERATE_END; + rtable_unlock(rtable); } static void @@ -959,7 +958,7 @@ babel_send_seqno_request(struct babel_proto *p, struct babel_entry *e, * transmitted entry is updated. */ static void -babel_send_update_(struct babel_iface *ifa, btime changed, struct fib *rtable) +babel_send_update_(struct babel_iface *ifa, btime changed, struct babel_rtable *rtable) { struct babel_proto *p = ifa->proto; @@ -970,7 +969,8 @@ babel_send_update_(struct babel_iface *ifa, btime changed, struct fib *rtable) p->update_seqno_inc = 0; } - FIB_WALK(rtable, struct babel_entry, e) + rtable_lock(rtable); + FIB_WALK(&rtable->fib, struct babel_entry, e) { if (!e->valid) continue; @@ -1022,6 +1022,7 @@ babel_send_update_(struct babel_iface *ifa, btime changed, struct fib *rtable) } } FIB_WALK_END; + rtable_unlock(rtable); } static void @@ -2055,16 +2056,21 @@ babel_dump(struct proto *P) WALK_LIST(ifa, p->interfaces) babel_dump_iface(ifa); - FIB_WALK(&p->ip4_rtable, struct babel_entry, e) + rtable_lock(&p->ip4_rtable); + FIB_WALK(&p->ip4_rtable.fib, struct babel_entry, e) { babel_dump_entry(e); } FIB_WALK_END; - FIB_WALK(&p->ip6_rtable, struct babel_entry, e) + rtable_unlock(&p->ip4_rtable); + + rtable_lock(&p->ip6_rtable); + FIB_WALK(&p->ip6_rtable.fib, struct babel_entry, e) { babel_dump_entry(e); } FIB_WALK_END; + rtable_unlock(&p->ip6_rtable); } static void @@ -2180,11 +2186,12 @@ babel_show_neighbors(struct proto *P, const char *iff) } static void -babel_show_entries_(struct babel_proto *p, struct fib *rtable) +babel_show_entries_(struct babel_proto *p, struct babel_rtable *rtable) { int width = babel_sadr_enabled(p) ? -54 : -24; - FIB_WALK(rtable, struct babel_entry, e) + rtable_lock(rtable); + FIB_WALK(&rtable->fib, struct babel_entry, e) { struct babel_route *r = NULL; uint rts = 0, srcs = 0; @@ -2207,6 +2214,7 @@ babel_show_entries_(struct babel_proto *p, struct fib *rtable) e->n.addr, "<none>", "-", "-", rts, srcs); } FIB_WALK_END; + rtable_unlock(rtable); } void @@ -2230,11 +2238,12 @@ babel_show_entries(struct proto *P) } static void -babel_show_routes_(struct babel_proto *p, struct fib *rtable) +babel_show_routes_(struct babel_proto *p, struct babel_rtable *rtable) { int width = babel_sadr_enabled(p) ? -54 : -24; - FIB_WALK(rtable, struct babel_entry, e) + rtable_lock(rtable); + FIB_WALK(&rtable->fib, struct babel_entry, e) { struct babel_route *r; WALK_LIST(r, e->routes) @@ -2247,6 +2256,7 @@ babel_show_routes_(struct babel_proto *p, struct fib *rtable) } } FIB_WALK_END; + rtable_unlock(rtable); } void @@ -2446,9 +2456,9 @@ babel_start(struct proto *P) struct babel_config *cf = (void *) P->cf; u8 ip6_type = cf->ip6_channel ? cf->ip6_channel->net_type : NET_IP6; - fib_init(&p->ip4_rtable, P->pool, NET_IP4, sizeof(struct babel_entry), + fib_init(&p->ip4_rtable.fib, P->pool, NET_IP4, sizeof(struct babel_entry), OFFSETOF(struct babel_entry, n), 0, babel_init_entry); - fib_init(&p->ip6_rtable, P->pool, ip6_type, sizeof(struct babel_entry), + fib_init(&p->ip6_rtable.fib, P->pool, ip6_type, sizeof(struct babel_entry), OFFSETOF(struct babel_entry, n), 0, babel_init_entry); init_list(&p->interfaces); @@ -2508,7 +2518,7 @@ babel_reconfigure(struct proto *P, struct proto_config *CF) TRACE(D_EVENTS, "Reconfiguring"); - if (p->ip6_rtable.addr_type != ip6_type) + if (p->ip6_rtable.fib.addr_type != ip6_type) return 0; if (!proto_configure_channel(P, &p->ip4_channel, new->ip4_channel) || diff --git a/proto/babel/babel.h b/proto/babel/babel.h index da8386b3..e55523ed 100644 --- a/proto/babel/babel.h +++ b/proto/babel/babel.h @@ -158,8 +158,10 @@ struct babel_iface_config { struct babel_proto { struct proto p; timer *timer; - struct fib ip4_rtable; - struct fib ip6_rtable; + struct babel_rtable { + struct fib fib; + u8 lock; /* Ensure no changes during FIB_ITERATE */ + } ip4_rtable, ip6_rtable; struct channel *ip4_channel; struct channel *ip6_channel; @@ -408,7 +410,7 @@ struct babel_msg_auth { }; static inline int babel_sadr_enabled(struct babel_proto *p) -{ return p->ip6_rtable.addr_type == NET_IP6_SADR; } +{ return p->ip6_rtable.fib.addr_type == NET_IP6_SADR; } /* babel.c */ void babel_handle_ack_req(union babel_msg *msg, struct babel_iface *ifa); -- 2.30.2
Daniel Gröber <dxld@darkboxed.org> writes:
The route expiration code appears to have been stolen from rip.c, in that code the rt_notify function actually does modify the rtable fib by calling fib_get. The babel code however does no such thing, so this inefficient restart is just entirely uneccesarry.
Erm, huh? The babel rt_notify() function calls babel_get_entry(), which calls fib_get()? It's quite similar to the rip one, in fact (which, as you note, is no coincidence).
To prove this is true I add a bunch of checking code that can be removed later.
That doesn't really prove anything, though, unless you can also show that you did an exhaustive torture test of all possible route expiry/retraction possibilities? :) More to the point, what problem are you really trying to solve here? Did you observe any particular performance issue with the current code, for instance? -Toke
Hi Toke, On Mon, Jan 30, 2023 at 10:50:14PM +0100, Toke Høiland-Jørgensen wrote:
Daniel Gröber <dxld@darkboxed.org> writes:
The route expiration code appears to have been stolen from rip.c, in that code the rt_notify function actually does modify the rtable fib by calling fib_get. The babel code however does no such thing, so this inefficient restart is just entirely uneccesarry.
Erm, huh? The babel rt_notify() function calls babel_get_entry(), which calls fib_get()? It's quite similar to the rip one, in fact (which, as you note, is no coincidence).
Rats, looks like I missed that codepath in my review. Interesting that my test code never blew up because of it though. Looking at it again it should blow up when rt_notify gets a route that's announced from another protocol, not babel, as babel_announce_rte will always already have a babel_entry in hand. Is it perhaps that the problematic sync callchain only happens with babel routes and so we always already have an entry?
To prove this is true I add a bunch of checking code that can be removed later.
That doesn't really prove anything, though, unless you can also show that you did an exhaustive torture test of all possible route expiry/retraction possibilities? :)
Prove is perhaps the wrong word, I just wanted to see if it blows up in my face haha.
More to the point, what problem are you really trying to solve here? Did you observe any particular performance issue with the current code, for instance?
With my recent "announce all possible routes to the core" changes I'm concerned about cubic blowup here. Since now every expired route (not just the selected one) causes a restart. We can always go back to the solution I had before this: keep a singly-linked list of routes to be expired and do them all after FIB_ITERATE_PUT. --Daniel
dxld@darkboxed.org writes:
Hi Toke,
On Mon, Jan 30, 2023 at 10:50:14PM +0100, Toke Høiland-Jørgensen wrote:
Daniel Gröber <dxld@darkboxed.org> writes:
The route expiration code appears to have been stolen from rip.c, in that code the rt_notify function actually does modify the rtable fib by calling fib_get. The babel code however does no such thing, so this inefficient restart is just entirely uneccesarry.
Erm, huh? The babel rt_notify() function calls babel_get_entry(), which calls fib_get()? It's quite similar to the rip one, in fact (which, as you note, is no coincidence).
Rats, looks like I missed that codepath in my review. Interesting that my test code never blew up because of it though.
Looking at it again it should blow up when rt_notify gets a route that's announced from another protocol, not babel, as babel_announce_rte will always already have a babel_entry in hand.
Is it perhaps that the problematic sync callchain only happens with babel routes and so we always already have an entry?
Well, assuming babel_select_route()->babel_announce_rte()->rt_notify() only ever results in rt_notify() receiving (or not, in case of filters) its own routes back, I guess that fib_get() shouldn't actually modify anything. However, that's an awful lot of assumptions to base things on, so that doesn't quite seem safe... Especially since "blow up" here probably means subtle data corruption rather than an outright crash?
To prove this is true I add a bunch of checking code that can be removed later.
That doesn't really prove anything, though, unless you can also show that you did an exhaustive torture test of all possible route expiry/retraction possibilities? :)
Prove is perhaps the wrong word, I just wanted to see if it blows up in my face haha.
More to the point, what problem are you really trying to solve here? Did you observe any particular performance issue with the current code, for instance?
With my recent "announce all possible routes to the core" changes I'm concerned about cubic blowup here. Since now every expired route (not just the selected one) causes a restart.
We can always go back to the solution I had before this: keep a singly-linked list of routes to be expired and do them all after FIB_ITERATE_PUT.
Yeah, that seems like a better solution, cf. the above :) -Toke
On Mon, Jan 30, 2023 at 11:07:59PM +0100, dxld@darkboxed.org wrote:
Hi Toke,
On Mon, Jan 30, 2023 at 10:50:14PM +0100, Toke Høiland-Jørgensen wrote:
Daniel Gröber <dxld@darkboxed.org> writes:
The route expiration code appears to have been stolen from rip.c, in that code the rt_notify function actually does modify the rtable fib by calling fib_get. The babel code however does no such thing, so this inefficient restart is just entirely uneccesarry.
Erm, huh? The babel rt_notify() function calls babel_get_entry(), which calls fib_get()? It's quite similar to the rip one, in fact (which, as you note, is no coincidence).
Rats, looks like I missed that codepath in my review. Interesting that my test code never blew up because of it though.
Looking at it again it should blow up when rt_notify gets a route that's announced from another protocol, not babel, as babel_announce_rte will always already have a babel_entry in hand.
Is it perhaps that the problematic sync callchain only happens with babel routes and so we always already have an entry?
The general reason to use FIB_ITERATE_PUT() here is that we are leaving the local code path, going to completely different module (nest) that may do callbacks back to Babel code. These callbacks may or may not use/modify Babel FIB table, but assuming they would not and keeping the data structure 'dirty' (with active iterators) makes the code super brittle and even if done correctly it may break later during some refactorization.
More to the point, what problem are you really trying to solve here? Did you observe any particular performance issue with the current code, for instance?
With my recent "announce all possible routes to the core" changes I'm concerned about cubic blowup here. Since now every expired route (not just the selected one) causes a restart.
Are you aware that it is not really restart, as FIB_ITERATE_START() continues from last stored FIB_ITERATE_PUT() position, so it is only a restart for that babel_entry? -- Elen sila lumenn' omentielvo Ondrej 'Santiago' Zajicek (email: santiago@crfreenet.org) OpenPGP encrypted e-mails preferred (KeyID 0x11DEADC3, wwwkeys.pgp.net) "To err is human -- to blame it on a computer is even more so."
participants (4)
-
Daniel Gröber -
dxld@darkboxed.org -
Ondrej Zajicek -
Toke Høiland-Jørgensen