[PATCH] [RFC] Babel: Implement route daming with fixed delay
In order to prevent RTT based routing from causing persistent traffic oscillations we delay core rte announcement of each prefix by a configurable but metric invariant amount of time. Initial announcements and withdrawals will go through undelayed but a chnage in route for a prefix will cancel the previous announcement and reset the delay. --- This is just quickly thrown together to demonstrate the idea of how to replace metric smoothing with something more simplistic. Obviously the performance of using a linked list for keeping the scheduled announcements is suboptimal, our binary heap could be used instead. I'm not quite sure all the logic works out but let's discuss the concept before spending more time on it :) This could be extended to make the delay depend on metric difference if it turns out that's necessary I just skipped it for simplicity. proto/babel/babel.c | 79 +++++++++++++++++++++++++++++++++++++++++++-- proto/babel/babel.h | 11 +++++-- 2 files changed, 86 insertions(+), 4 deletions(-) diff --git a/proto/babel/babel.c b/proto/babel/babel.c index 2243fc89..d6d99a37 100644 --- a/proto/babel/babel.c +++ b/proto/babel/babel.c @@ -62,6 +62,7 @@ static inline int gt_mod64k(uint a, uint b) static void babel_expire_requests(struct babel_proto *p, struct babel_entry *e); static void babel_announce_rte(struct babel_proto *p, struct babel_entry *e, struct babel_route *r); +static void babel_sched_announce_rte(struct babel_proto *p, struct babel_entry *e, struct babel_route *r); static void babel_rte_update_unreachable(struct babel_proto *p, struct babel_entry *e, u8 announce); static void babel_send_route_request(struct babel_proto *p, struct babel_entry *e, struct babel_neighbor *n); static void babel_send_seqno_request(struct babel_proto *p, struct babel_entry *e, struct babel_seqno_request *sr, struct babel_neighbor *n); @@ -653,7 +654,7 @@ done: WALK_LIST2(r, n, nbr->routes, neigh_route) { r->metric = babel_compute_metric(nbr, r->advert_metric); - babel_announce_rte(p, r->e, r); + babel_sched_announce_rte(p, r->e, r); } } } @@ -772,6 +773,50 @@ babel_announce_rte(struct babel_proto *p, struct babel_entry *e, struct babel_ro } } +/* + * When route damping is enabled we have to delay updates to the kernel FIB. We + * do this by queueing some updates before actually announcing them to core. + */ +static void +babel_sched_announce_rte(struct babel_proto *p, struct babel_entry *e, struct babel_route *r) +{ + struct babel_config *cf = (void *) p->p.cf; + + if (!cf->route_damping || + !e->selected || + e->selected->metric < r->metric || + /*^ When the best route's metric is worse than the one we're about to + * announce it's safe to do so immediately even if route damping is on as + * it won't get exported to most other protocols. Note add-path BGP is + * still notified but we're not concerned about micromanaging the + * stability of exotic setups, only our local kernel FIB's impact. */ + r->metric == BABEL_INFINITY || !r->feasible) + /*^ This checks if announce_rte will do a withdrawal. */ + { + if (e->scheduled) { + rem_node(&e->sched); + e->scheduled = NULL; + } + + babel_announce_rte(p, r->e, r); + } + else + { + struct babel_iface *ifa = r->neigh->ifa; + + if (!e->scheduled) + add_tail(&ifa->sched_rtes, &e->sched); + e->scheduled = r; + + r->sched_time = current_time() + cf->route_damping; + + if (!ifa->want_sched) { + ifa->next_sched = r->sched_time; + ifa->want_sched = 1; + } + } +} + /* * Functions to send replies */ @@ -1371,7 +1416,7 @@ babel_handle_update(union babel_msg *m, struct babel_iface *ifa) e->updated = current_time(); } - babel_announce_rte(p, e, r); + babel_sched_announce_rte(p, e, r); } void @@ -1683,6 +1728,32 @@ babel_iface_timer(timer *t) p->triggered = 0; } + if (ifa->want_sched && now_ >= ifa->next_sched) { + TRACE(D_EVENTS, "Announcing scheduled rtes on %s", ifa->ifname); + + struct babel_entry *e, *ex; + + ifa->next_sched = TIME_INFINITY; + + u8 all_announced = 1; + WALK_LIST_DELSAFE(e, ex, ifa->sched_rtes) { + struct babel_route *r = e->scheduled; + ASSERT(r); + if (now_ >= r->sched_time) { + rem_node(&e->sched); + babel_announce_rte(p, e, r); + } else { + all_announced = 0; + if (r->sched_time < ifa->next_sched) + ifa->next_sched = r->sched_time; + } + } + + if (all_announced) { + ifa->want_sched = 0; + } + } + btime next_event = MIN(ifa->next_hello, ifa->next_regular); if (ifa->want_triggered) next_event = MIN(next_event, ifa->next_triggered); tm_set(ifa->timer, next_event); @@ -1862,6 +1933,8 @@ babel_add_iface(struct babel_proto *p, struct iface *new, struct babel_iface_con init_list(&ifa->msg_queue); ifa->send_event = ev_new_init(ifa->pool, babel_send_queue, ifa); + init_list(&ifa->sched_rtes); + struct object_lock *lock = olock_new(ifa->pool); lock->type = OBJLOCK_UDP; lock->addr = IP6_BABEL_ROUTERS; @@ -2523,6 +2596,8 @@ babel_init(struct proto_config *CF) P->rte_better = babel_rte_better; P->rte_igp_metric = babel_rte_igp_metric; + cf->route_damping = 15 S_; // HACK for testing + return P; } diff --git a/proto/babel/babel.h b/proto/babel/babel.h index 3868d7c4..186d4297 100644 --- a/proto/babel/babel.h +++ b/proto/babel/babel.h @@ -134,6 +134,7 @@ struct babel_config { list iface_list; /* List of iface configs (struct babel_iface_config) */ uint hold_time; /* Time to hold stale entries and unreachable routes */ u8 randomize_router_id; + uint route_damping; struct channel_config *ip4_channel; struct channel_config *ip6_channel; @@ -216,6 +217,7 @@ struct babel_iface { int tx_length; list neigh_list; /* List of neighbors seen on this iface (struct babel_neighbor) */ list msg_queue; + list sched_rtes; /* Scheduled rte announcments for route dampening */ u16 hello_seqno; /* To be increased on each hello */ @@ -226,7 +228,9 @@ struct babel_iface { btime next_hello; btime next_regular; btime next_triggered; - btime want_triggered; + u8 want_triggered; + btime next_sched; + u8 want_sched; timer *timer; event *send_event; @@ -292,6 +296,7 @@ struct babel_route { ip_addr next_hop; btime refresh_time; btime expires; + btime sched_time; }; struct babel_seqno_request { @@ -306,7 +311,9 @@ struct babel_seqno_request { }; struct babel_entry { - struct babel_route *selected; + node sched; /* for babel_iface.sched_rtes */ + + struct babel_route *selected, *scheduled; list routes; /* Routes for this prefix (struct babel_route) */ list sources; /* Source entries for this prefix (struct babel_source). */ -- 2.30.2
Hi Daniel,
In order to prevent RTT based routing from causing persistent traffic oscillations we delay core rte announcement of each prefix by a configurable but metric invariant amount of time.
Could you please explain how that works? How does it interact with cost smoothing and route selection hysteresis? Thanks, -- Juliusz
Hi Juliusz, On Mon, Mar 06, 2023 at 03:59:14PM +0100, Juliusz Chroboczek wrote:
In order to prevent RTT based routing from causing persistent traffic oscillations we delay core rte announcement of each prefix by a configurable but metric invariant amount of time.
Could you please explain how that works? How does it interact with cost smoothing and route selection hysteresis?
I would approach the stability problem from an electronics/signal processing/control theory background. In my view the current approach is a roundabout way of implementing a simple filter to prevent (high-frequency) oscillations from propagating from the input (measured link cost) to the output (the datapath). I have essentially two gripes with the current approach: 1) we use not one, but two nonlinear (i.e. hard to understand) filters for what one well designed linear (i.e. easy to reason about) filter could do and 2) the continous nature of the "smoothing" causes a blowup of the number of bird core announcements that could be avoided. In the present patch I'm mainly concerned with 2) so I just threw out the smoothing to keep things simple. Frankly I haven't yet been able to reverse-engineer the weird exponential smoothing filter behaviour enough to build a model, perhaps you (or someone) could expand on the rationale for the smoothing filter construction and the addition of hysteresis? Thanks, --Daniel
I would approach the stability problem from an electronics/signal processing/control theory background.
I have no signal processing background whatsoever; to my eyes, signal processing is a fairly advanced for of magic. (My background is in logic and programming languages.)
Frankly I haven't yet been able to reverse-engineer the weird exponential smoothing filter behaviour enough to build a model, perhaps you (or someone) could expand on the rationale for the smoothing filter construction and the addition of hysteresis?
To be honest, we hacked things until we had acceptable worst-case behaviour. We had two networks to experiment with: Nexedi's production network (hundreds of tunnels over the public IPv6 Internet) and a simulated network we built ourselves which we believed represented the worst case (a bufferbloated diamond network). We built a first prototype, which we instrumented to log RTT samples and route flaps, and noticed three things: 1. in the production network, the RTT signal is noisy (see figures 4 and 6 of [1]); 2. in the bufferbloated diamond network, when we switch away from a congested route, we switch back too early, before the buffers have had time to drain; 3. in the diamond network, we tend to switch routes as we oscillate around a common value. Hence, the three mechanisms: 1. smoothing of the RTT, to makes the signal less noisy; the smoothing is exponential just because it's easy to implement; 2. saturating map from RTT to metric, so that congested routes all appear equally bad, and we don't switch back before the buffers drain; this was stolen from [2]; 3. hysteresis, in order to avoid switching with too high a frequency. Daniel, if you feel you're competent to work on that, I'd be interested in collaborating. I don't currently have funding for Babel, but it should be easy enough to find some. References: [1] https://arxiv.org/pdf/1403.3488.pdf [2] Atul Khanna and John Zinky. The Revised ARPANET Routing Metric. SIGCOMM ’89 Symposium proceedings on Communications architectures and protocols, pp. 45-56. ACM New York, NY, USA, 1989.
Hi Juliusz, On Tue, Mar 07, 2023 at 01:20:28PM +0100, Juliusz Chroboczek wrote:
I have no signal processing background whatsoever; to my eyes, signal processing is a fairly advanced for of magic. (My background is in logic and programming languages.)
To be honest, we hacked things until we had acceptable worst-case behaviour.
Not to worry, there's really not that much to know that's not on Wikipedia :)
We had two networks to experiment with: Nexedi's production network (hundreds of tunnels over the public IPv6 Internet) and a simulated network we built ourselves which we believed represented the worst case (a bufferbloated diamond network). We built a first prototype, which we instrumented to log RTT samples and route flaps, and noticed three things:
How was the simulation setup built? I'd be interested in trying out my implementation in the worst-case with real queues in the feedback path. See, from a signals/systems perspective what we're trying to do here looks an awful lot like a closed-loop control system[1], we measure something and feed the measurment back into the system (via the FIB) which influences the measurments (RTT). In this framework one thing you can do to analyze system stability is to "cut open" the feedback path, look at the combination of feedback path and controller (the transfer function) and apply poles and zeros analysis[2] to determine stability. Note this can be done either experimentally by measuring the frequency response (the "bode plot") or by theoretical modelling. [1]: https://en.wikipedia.org/wiki/Closed-loop_transfer_function [2]: https://en.wikipedia.org/wiki/Control_theory#Stability This is all very well studied since engineers generally don't like their physical systems shaking themselves apart :) Problem is our feedback path includes network queues, which AFAICT are highly nonlinear and due to the nature of the internet certainly not time-invariant so treating this as LTI systems is questionable. However perhaps the timescales at which the network queues operate and the network changes are far enough from our control timescale that such an analysis could still be useful.
1. in the production network, the RTT signal is noisy (see figures 4 and 6 of [1]); 2. in the bufferbloated diamond network, when we switch away from a congested route, we switch back too early, before the buffers have had time to drain;
3. in the diamond network, we tend to switch routes as we oscillate around a common value.
Hence, the three mechanisms:
1. smoothing of the RTT, to makes the signal less noisy; the smoothing is exponential just because it's easy to implement;
2. saturating map from RTT to metric, so that congested routes all appear equally bad, and we don't switch back before the buffers drain; this was stolen from [2];
3. hysteresis, in order to avoid switching with too high a frequency.
Right, so my plan of attack is to combat 3. with the filter in this patch instead or (depending on results) in addition to hysteresis. If it turns out hysteresis is needed I would go for a simple stateful implementation so as to not have to reintroduce the smoothing. I'm not sure we really need to do anything about 1. The route damping filter (potentially plus hysteresis) will already limit the frequency of route changes as desired, why should we want to apply additional filtering on the RTT estimates? Instead of cascading filters I would just the tweak the frequency response of that one filter. I managed to build a simple digital simulation for the rearming-timer filter in this patch and when testing with random frequency or sweep inputs and it does work as expected. There's some artifacts (lower than expected output frequency) around the cutoff frequency which is due to the rarming nature of the timer but that seems fine. Next step would be to add the smoothing filter to the simulation too and compare. I haven't figured out how to charactarize the response of the hysteresis yet so that's something I still have to look at.
Daniel, if you feel you're competent to work on that, I'd be interested in collaborating. I don't currently have funding for Babel, but it should be easy enough to find some.
I'm not sure about "competent", TBH I haven't used any of this knowledge in years. I just learned this stuff in my electronics engineering centered secondary education. I am definetly not the right person to write this up rigorously, but I might just know enough to be dangerous (: Thanks, --Daniel
Hi Juliusz, On Tue, Mar 07, 2023 at 01:20:28PM +0100, Juliusz Chroboczek wrote:
To be honest, we hacked things until we had acceptable worst-case behaviour.
We had two networks to experiment with: Nexedi's production network (hundreds of tunnels over the public IPv6 Internet) and a simulated network we built ourselves which we believed represented the worst case (a bufferbloated diamond network). We built a first prototype, which we instrumented to log RTT samples and route flaps, and noticed three things:
1. in the production network, the RTT signal is noisy (see figures 4 and 6 of [1]);
2. in the bufferbloated diamond network, when we switch away from a congested route, we switch back too early, before the buffers have had time to drain;
3. in the diamond network, we tend to switch routes as we oscillate around a common value.
I'm working on comparing the exponential filter behaviour to my damping approach, I was wondering if you still have any scripts/notes on how the simulations for Fig 10 and 11 were setup?
References:
Hence, the three mechanisms:
1. smoothing of the RTT, to makes the signal less noisy; the smoothing is exponential just because it's easy to implement;
2. saturating map from RTT to metric, so that congested routes all appear equally bad, and we don't switch back before the buffers drain;
Re-reading this, was the time until a route is reconsidered made dependent on the metric/RTT difference on purpose to get this draining behaviour? While my timer approach doesn't mirror this currently I'm not opposed to it, I just wonder if metric difference is the right variable here. Thanks, --Daniel
participants (3)
-
Daniel Gröber -
dxld@darkboxed.org -
Juliusz Chroboczek