--- filter/filter.c | 6 + nest/proto-hooks.c | 3 +- nest/protocol.h | 6 +- nest/route.h | 8 ++ nest/rt-table.c | 58 +++++++++-- proto/bgp/attrs.c | 289 +++++++++++++++++++++++++++++++++++++++++++++++++++- proto/bgp/bgp.c | 4 +- proto/bgp/bgp.h | 9 ++- proto/ospf/ospf.c | 4 +- proto/rip/rip.c | 4 +- 10 files changed, 374 insertions(+), 17 deletions(-) diff --git a/filter/filter.c b/filter/filter.c index b3b17dc..77af39a 100644 --- a/filter/filter.c +++ b/filter/filter.c @@ -935,6 +935,12 @@ interpret(struct f_inst *what) if (v1.type != T_PATH) runtime( "Setting path attribute to non-path value" ); l->attrs[0].u.ptr = v1.val.ad; + /* + * Zero cache on AS-PATH change. + * XXX: This breaks COW concept but + * removing this flag is harmless + */ + (*f_rte)->pflags &= ~0x01; break; case EAF_TYPE_INT_SET: if (v1.type != T_CLIST) diff --git a/nest/proto-hooks.c b/nest/proto-hooks.c index f026192..1dec95c 100644 --- a/nest/proto-hooks.c +++ b/nest/proto-hooks.c @@ -271,6 +271,7 @@ int import_control(struct proto *p, rte **e, ea_list **attrs, struct linpool *po * rte_better - compare metrics of two routes * @new: the new route * @old: the original route + * @flags: protocol-specific comparison flags * * This hook gets called when the routing table contains two routes * for the same network which have originated from different instances @@ -279,7 +280,7 @@ int import_control(struct proto *p, rte **e, ea_list **attrs, struct linpool *po * * Result: 1 if @new is better (more preferred) than @old, 0 otherwise. */ -int rte_better(rte *new, rte *old) +int rte_better(rte *new, rte *old, u32 flags) { DUMMY; } /** diff --git a/nest/protocol.h b/nest/protocol.h index a7518c2..02defb4 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -178,13 +178,15 @@ struct proto { /* * Routing entry hooks (called only for rte's belonging to this protocol): * - * rte_better Compare two rte's and decide which one is better (1=first, 0=second). + * rte_best Find best rte starting from given. + * rte_better Compare two rte's and decide if better (1=first, 0=second). * rte_same Compare two rte's and decide whether they are identical (1=yes, 0=no). * rte_insert Called whenever a rte is inserted to a routing table. * rte_remove Called whenever a rte is removed from the routing table. */ - int (*rte_better)(struct rte *, struct rte *); + struct rte *(*rte_best)(struct rte *); + int (*rte_better)(struct rte *, struct rte *, u32 flags); int (*rte_same)(struct rte *, struct rte *); void (*rte_insert)(struct network *, struct rte *); void (*rte_remove)(struct network *, struct rte *); diff --git a/nest/route.h b/nest/route.h index a4c0154..31a7a56 100644 --- a/nest/route.h +++ b/nest/route.h @@ -201,6 +201,12 @@ typedef struct rte { u32 router_id; /* Router that originated this route */ } ospf; #endif +#ifdef CONFIG_BGP + struct { + u32 first_as; /* First as number in AS-PATH or local AS */ + struct rte *next_item; /* Used in bgp_get_best */ + } bgp; +#endif struct { /* Routes generated by krt sync (both temporary and inherited ones) */ s8 src; /* Alleged route source (see krt.h) */ u8 proto; /* Kernel source protocol ID */ @@ -212,6 +218,8 @@ typedef struct rte { } rte; #define REF_COW 1 /* Copy this rte on write */ +/* Protocol-specific flags */ +#define PFR_FLAG1 0x01 /* Definition of first possible flag */ /* Types of route announcement, also used as flags */ #define RA_OPTIMAL 1 /* Announcement of optimal route change */ diff --git a/nest/rt-table.c b/nest/rt-table.c index e20d2f6..574711a 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -135,10 +135,23 @@ rte_do_cow(rte *r) return e; } -static int /* Actually better or at least as good as */ +/** + * rte_better - returns 1 if new rte is better + * @new: new rte + * @old: old BEST rte + * + * Functions compares old and new rte returning 1 IFF new rte is better. + * Best rte in @old->net is required to be specified in @old if @old and @new are + * originated from same protocol type. Specifying different rte can produce + * incorrect calculations in some protocols (see bgp_rte_better) + * + * Returns 1 if @new is better, 0 overwise + */ +static int rte_better(rte *new, rte *old) { - int (*better)(rte *, rte *); + int (*better)(rte *, rte *, u32); + struct proto *p_old, *p_new; if (!old) return 1; @@ -146,17 +159,19 @@ rte_better(rte *new, rte *old) return 1; if (new->pref < old->pref) return 0; - if (new->attrs->proto->proto != old->attrs->proto->proto) + p_old = old->attrs->proto; + p_new = new->attrs->proto; + if (p_new->proto != p_old->proto) { /* * If the user has configured protocol preferences, so that two different protocols * have the same preference, try to break the tie by comparing addresses. Not too * useful, but keeps the ordering of routes unambiguous. */ - return new->attrs->proto->proto > old->attrs->proto->proto; + return p_new->proto > p_old->proto; } - if (better = new->attrs->proto->rte_better) - return better(new, old); + if (better = p_new->rte_better) + return better(new, old, 0); return 0; } @@ -426,9 +441,11 @@ static void rte_recalculate(rtable *table, net *net, struct proto *p, struct proto *src, rte *new, ea_list *tmpa) { struct proto_stats *stats = &p->stats; + struct proto *rte_proto; rte *old_best = net->routes; rte *old = NULL; - rte **k, *r, *s; + rte **k, *r, *s, *t; + u32 proto_mask, attr_class; #ifdef CONFIG_PIPE if (proto_is_pipe(p) && (p->table == table)) @@ -526,9 +543,36 @@ rte_recalculate(rtable *table, net *net, struct proto *p, struct proto *src, rte /* Find new optimal route */ r = NULL; + proto_mask = 0; for (s=net->routes; s; s=s->next) + { + rte_proto = s->attrs->proto; + attr_class = rte_proto->proto->attr_class; + /* Protocol has non-zero attr_class and rte_best hook */ + if (attr_class && rte_proto->rte_best) + { + /* Check if this is first hook invocation */ + if (!((2 << attr_class) & proto_mask)) + { + proto_mask |= 2 << attr_class; + /* + * Find best route from protocol point of view + * and compare it with current best route if exists + */ + t = rte_proto->rte_best(s); + if (!r || rte_better(r, t)) + r = t; + } + /* + * Protocol has already supplied its best route. + * Skip this rte + */ + continue; + } + if (rte_better(s, r)) r = s; + } /* Announce optimal route */ rte_announce(table, RA_OPTIMAL, net, r, old_best, tmpa); diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index 2832f42..0b417df 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -1118,7 +1118,7 @@ rte_resolvable(rte *rt) } int -bgp_rte_better(rte *new, rte *old) +bgp_rte_better(rte *new, rte *old, u32 flags UNUSED) { struct bgp_proto *new_bgp = (struct bgp_proto *) new->attrs->proto; struct bgp_proto *old_bgp = (struct bgp_proto *) old->attrs->proto; @@ -1236,6 +1236,293 @@ bgp_rte_better(rte *new, rte *old) return (ipa_compare(new_bgp->cf->remote_ip, old_bgp->cf->remote_ip) < 0); } + + +static inline u32 +bgp_get_neighbor_as(rte *s) +{ + u32 first_as; + + if (s->flags & PFR_BGP_ASCACHE) + return s->u.bgp.first_as; + + first_as = bgp_get_neighbor(s); + s->u.bgp.first_as = first_as; + s->pflags |= PFR_BGP_ASCACHE; + + return first_as; +} + +/** + * bgp_rte_better_med - Determine best route + * @new: new rte + * @old: old BEST rte in @old->net + * + * Function implements RFC 4271 best route selection + * with so-called 'deterministic' MED comparison. + * If @new and @old are announced from different neighbours + * @old is required to be the best route. + * + * Returns: 1 if @new is better, 0 overwise + */ +int +bgp_rte_better_med(rte *new, rte *old, u32 flags) +{ + struct bgp_proto *new_bgp = (struct bgp_proto *) new->attrs->proto; + struct bgp_proto *old_bgp = (struct bgp_proto *) old->attrs->proto; + struct bgp_config *new_config, *old_config; + ea_list *new_eattrs = new->attrs->eattrs; + ea_list *old_eattrs = old->attrs->eattrs; + eattr *x, *y; + u32 n, o; + u32 new_as, old_as, tmp_as; + rte *s, *r; + + /* RFC 4271 9.1.2.1. Route resolvability test */ + n = rte_resolvable(new); + o = rte_resolvable(old); + if (n > o) + return 1; + if (n < o) + return 0; + + /* Start with local preferences */ + x = ea_find(new_eattrs, EA_CODE(EAP_BGP, BA_LOCAL_PREF)); + y = ea_find(old_eattrs, EA_CODE(EAP_BGP, BA_LOCAL_PREF)); + n = x ? x->u.data : new_bgp->cf->default_local_pref; + o = y ? y->u.data : old_bgp->cf->default_local_pref; + if (n > o) + return 1; + if (n < o) + return 0; + + /* RFC 4271 9.1.2.2. a) Use AS path lengths */ + if (new_bgp->cf->compare_path_lengths || old_bgp->cf->compare_path_lengths) + { + x = ea_find(new_eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); + y = ea_find(old_eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); + n = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN; + o = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN; + if (n < o) + return 1; + if (n > o) + return 0; + } + + /* RFC 4271 9.1.2.2. b) Use origins */ + x = ea_find(new_eattrs, EA_CODE(EAP_BGP, BA_ORIGIN)); + y = ea_find(old_eattrs, EA_CODE(EAP_BGP, BA_ORIGIN)); + n = x ? x->u.data : ORIGIN_INCOMPLETE; + o = y ? y->u.data : ORIGIN_INCOMPLETE; + if (n < o) + return 1; + if (n > o) + return 0; + + /* RFC 4271 9.1.2.2. c) Compare MED's */ + /* + * This is a bit tricky. + * + * First we retrieve/update neighbor ASNs from rte cache. + * If @old and @new ASNs are the same, we can immediately + * compare MED. If not, we need to compare best routes + * from both ASNs. Since @old is the best route already nothing + * needs to be done with @old. However, we need to scan all + * routes for given network (another place where @old is required + * to be the best (and the first) route) to get best rte in + * @new neighbor ASN. This changes complexity from O(1) to O(N). + * If found rte is not @new, we simply returns 0 + * since @old is the best rte of all routes. Overwise, we + * continue to compare @old and @new skippind MED comparison. + */ + /* Get neighbor info first as && update cache if nesessary */ + old_as = bgp_get_neighbor_as(old); + new_as = bgp_get_neighbor_as(new); + + new_config = new_bgp->cf; + old_config = old_bgp->cf; + + if (new_as == old_as) + { + /* We can compare MED */ + x = ea_find(new_eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); + y = ea_find(old_eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); + n = x ? x->u.data : new_config->default_med; + o = y ? y->u.data : old_config->default_med; + if (n < o) + return 1; + if (n > o) + return 0; + } + else + { + /* + * routes came from different neighbor ASses. + * We need to find best route within routes + * from new_as + */ + r = new; + for (s = old; s; s = s->next) + { + /* Get neighbor ASN */ + tmp_as = bgp_get_neighbor_as(s); + if (tmp_as != new_as) + continue; + + /* + * Okay, we've found another rte from the same neighbor. + * Let's check it. + */ + if (bgp_rte_better_med(s, r, 0)) + r = s; + } + + /* + * Best route for neighbor routes is saved in r. + * If new is not the best route in the group this + * means that current best route remains best. + */ + if (r != new) + return 0; + } + + /* RFC 4271 9.1.2.2. d) Prefer external peers */ + if (new_bgp->is_internal > old_bgp->is_internal) + return 0; + if (new_bgp->is_internal < old_bgp->is_internal) + return 1; + + /* RFC 4271 9.1.2.2. e) Compare IGP metrics */ + n = new_config->igp_metric ? new->attrs->igp_metric : 0; + o = old_config->igp_metric ? old->attrs->igp_metric : 0; + if (n < o) + return 1; + if (n > o) + return 0; + + /* RFC 4271 9.1.2.2. f) Compare BGP identifiers */ + /* RFC 4456 9. a) Use ORIGINATOR_ID instead of local neighor ID */ + x = ea_find(new_eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID)); + y = ea_find(old_eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID)); + n = x ? x->u.data : new_bgp->remote_id; + o = y ? y->u.data : old_bgp->remote_id; + + /* RFC 5004 - prefer older routes */ + /* (if both are external and from different peer) */ + if ((new_config->prefer_older || old_config->prefer_older) && + !new_bgp->is_internal && n != o) + return 0; + + /* rest of RFC 4271 9.1.2.2. f) */ + if (n < o) + return 1; + if (n > o) + return 0; + + /* RFC 4456 9. b) Compare cluster list lengths */ + x = ea_find(new_eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST)); + y = ea_find(old_eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST)); + n = x ? int_set_get_size(x->u.ptr) : 0; + o = y ? int_set_get_size(y->u.ptr) : 0; + if (n < o) + return 1; + if (n > o) + return 0; + + /* RFC 4271 9.1.2.2. g) Compare peer IP adresses */ + return (ipa_compare(new_config->remote_ip, old_config->remote_ip) < 0); +} + +#define LOCAL_HASH 16 +/** + * bgp_rte_best - returns the best rte in chain originated by BGP + * + */ +rte * +bgp_rte_best(rte *old) +{ + rte **rgroups, *s, *ss, *best = NULL; + u32 local_as, best_as; + int i; + + if (!old->next) + return old; + + /* Allocate hash for ASn groups */ + rgroups = alloca(LOCAL_HASH * sizeof(rte *)); + memset(rgroups, 0, LOCAL_HASH * sizeof(rte *)); + + for (s = old; s; s = s->next) + { + local_as = bgp_get_neighbor_as(s); + best = rgroups[local_as & (LOCAL_HASH - 1)]; + if (!best) + { + /* New group */ + rgroups[local_as & (LOCAL_HASH - 1)] = s; + s->u.bgp.next_item = NULL; + continue; + } + + ss = best; + best_as = bgp_get_neighbor_as(ss); + while (ss->u.bgp.next_item) + { + if ((best_as = bgp_get_neighbor_as(ss)) == local_as) + break; + /* Hash collision */ + ss = ss->u.bgp.next_item; + } + + /* List ended */ + if (best_as == local_as) + { + /* Both rte are from the same group, let's compare */ + if (bgp_rte_better_med(s, best, 0)) + { + /* New best rte in group */ + rgroups[local_as & (LOCAL_HASH - 1)] = s; + s->u.bgp.next_item = best->u.bgp.next_item; + } + + continue; + } + + /* New group */ + ss->u.bgp.next_item = s; + s->u.bgp.next_item = NULL; + } + + /* Cycle ended, now we have to compare all per-group best rtes */ + for (i = 0, best = NULL; i < LOCAL_HASH; i++) + { + for (s = rgroups[i]; s; s = s->u.bgp.next_item) + { + if (!best) + { + best = s; + continue; + } + + /* Do general preference checking */ + if (best->pref > s->pref) + continue; + if (s->pref > best->pref) + { + best = s; + continue; + } + + /* Do full bgp-specific checking without MED comparison */ + if (bgp_rte_better_med(s, best, BGP_CFLAG_SKIPMED)) + best = s; + } + } + + return best; +} +#undef LOCAL_HASH + static struct adata * bgp_aggregator_convert_to_new(struct adata *old, struct linpool *pool) { diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 675342d..a277680 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -904,7 +904,9 @@ bgp_init(struct proto_config *C) P->accept_ra_types = RA_OPTIMAL; P->rt_notify = bgp_rt_notify; - P->rte_better = bgp_rte_better; + //P->rte_better = bgp_rte_better; + P->rte_better = bgp_rte_better_med; + P->rte_best = bgp_rte_best; P->import_control = bgp_import_control; P->neigh_notify = bgp_neigh_notify; P->reload_routes = bgp_reload_routes; diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index 437ba33..70e1ff9 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -184,7 +184,9 @@ void bgp_attach_attr(struct ea_list **to, struct linpool *pool, unsigned attr, u byte *bgp_attach_attr_wa(struct ea_list **to, struct linpool *pool, unsigned attr, unsigned len); struct rta *bgp_decode_attrs(struct bgp_conn *conn, byte *a, unsigned int len, struct linpool *pool, int mandatory); int bgp_get_attr(struct eattr *e, byte *buf, int buflen); -int bgp_rte_better(struct rte *, struct rte *); +int bgp_rte_better(struct rte *, struct rte *, u32); +int bgp_rte_better_med(struct rte *, struct rte *, u32); +rte *bgp_rte_best(rte *old); void bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs); int bgp_import_control(struct proto *, struct rte **, struct ea_list **, struct linpool *); void bgp_attr_init(struct bgp_proto *); @@ -241,6 +243,11 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi #define BA_AS4_PATH 0x11 /* [RFC4893] */ #define BA_AS4_AGGREGATOR 0x12 +/* Protocol-specififc rte flags */ +#define PFR_BGP_ASCACHE PFR_FLAG1 /* 0x01 Set if bgp.first_as rte field is valid */ +/* Protocol-specific rte comparison flags */ +#define BGP_CFLAG_SKIPMED 0x01 /* Skip MED when comparing rte from different neigbour ASNs */ + /* BGP connection states */ #define BS_IDLE 0 diff --git a/proto/ospf/ospf.c b/proto/ospf/ospf.c index ce7ad37..12b209f 100644 --- a/proto/ospf/ospf.c +++ b/proto/ospf/ospf.c @@ -105,7 +105,7 @@ static int ospf_reload_routes(struct proto *p); static void ospf_rt_notify(struct proto *p, struct rtable *table UNUSED, net * n, rte * new, rte * old UNUSED, ea_list * attrs); -static int ospf_rte_better(struct rte *new, struct rte *old); +static int ospf_rte_better(struct rte *new, struct rte *old, u32 flags UNUSED); static int ospf_rte_same(struct rte *new, struct rte *old); static void ospf_disp(timer *timer); @@ -314,7 +314,7 @@ ospf_init(struct proto_config *c) /* If new is better return 1 */ static int -ospf_rte_better(struct rte *new, struct rte *old) +ospf_rte_better(struct rte *new, struct rte *old, u32 flags UNUSED) { if (new->u.ospf.metric1 == LSINFINITY) return 0; diff --git a/proto/rip/rip.c b/proto/rip/rip.c index 543aa30..6967c68 100644 --- a/proto/rip/rip.c +++ b/proto/rip/rip.c @@ -265,7 +265,7 @@ rip_rte_update_if_better(rtable *tab, net *net, struct proto *p, rte *new) rte *old; old = rte_find(net, p); - if (!old || p->rte_better(new, old) || + if (!old || p->rte_better(new, old, 0) || (ipa_equal(old->attrs->from, new->attrs->from) && (old->u.rip.metric != new->u.rip.metric)) ) rte_update(tab, net, p, p, new); @@ -909,7 +909,7 @@ rip_rte_same(struct rte *new, struct rte *old) static int -rip_rte_better(struct rte *new, struct rte *old) +rip_rte_better(struct rte *new, struct rte *old, u32 flags UNUSED) { struct proto *p = new->attrs->proto; -- 1.7.3.2 --------------070507060406060400060500--