On Mon, Jan 10, 2022 at 11:47:57PM +0100, Tomas Hlavacek wrote:
Add a netlink KRT dump filter on Linux to avoid receiving PMTU cache records from the FNHE table dump along with the KRT.
The Linux kernel added the FNHE table dump to the netlink API in patch https://patchwork.ozlabs.org/project/netdev/patch/8d3b68cd37fb5fddc470904cdd...
The filter mitigates the risk of receiving an unknown and potentially large number of FNHE records that would block BIRD I/O in each sync. There is a known issue caused by GRE tunnels on Linux, which seem to create one FNHE record for each destination IP address that is routed through the tunnel, even when the PMTU is equal to the GRE interface MTU (tested with kernels 5.5 - 5.16-rc7).
Thanks, merged with some modifications: https://gitlab.nic.cz/labs/bird/-/commit/e818f16448e918ed07633480291283f3449... Instead of switching NETLINK_GET_STRICT_CHK on and off, I just used strict checking for all dumps (including link and address). I also removed the SO_SNDBUF/SO_RCVBUF change. It seems unrelated and has some issues: 1) Why these values? 32k for SO_SNDBUF is smaller than the default value (208k), so it in fact makes the buffer smaller (which probably does not matter), while 1M for SO_RCVBUF is bigger than the max value, so it is capped at 416k. 2) It applies just to nl_scan and nl_req, and not to the async fd, where it makes the most sense. 3) We may want a big rx buffer for the async fd; in that case we may consider using SO_RCVBUFFORCE. I am not sure which netlink socket operations are really synchronous or subject to flow control, and therefore do not need a big buffer.
--- sysdep/linux/netlink.c | 44 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-)
diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index f85bcf35..79414122 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -128,7 +128,7 @@ struct nl_sock uint last_size; };
-#define NL_RX_SIZE 8192 +#define NL_RX_SIZE 32768
#define NL_OP_DELETE 0 #define NL_OP_ADD (NLM_F_CREATE|NLM_F_EXCL) @@ -143,11 +143,18 @@ static struct nl_sock nl_req = {.fd = -1}; /* Netlink socket for requests */ static void nl_open_sock(struct nl_sock *nl) { + int sndbuf = 32768; + int rcvbuf = 1024*1024; + if (nl->fd < 0) { - nl->fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + nl->fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); if (nl->fd < 0) die("Unable to open rtnetlink socket: %m"); + + setsockopt(nl->fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf)); + setsockopt(nl->fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)); + nl->seq = (u32) (current_time() TO_S); /* Or perhaps random_u32() ? */ nl->rx_buffer = xmalloc(NL_RX_SIZE); nl->last_hdr = NULL; @@ -155,6 +162,12 @@ nl_open_sock(struct nl_sock *nl) } }
+static void +nl_set_strict_dump(struct nl_sock *nl, int strict) +{ + setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); +} + static void nl_open(void) { @@ -192,6 +205,29 @@ nl_request_dump(int af, int cmd) nl_send(&nl_scan, &req.nh); }
+static void +nl_request_dump_rt(int af, int cmd) +{ + struct { + struct nlmsghdr nh; + struct rtmsg rtm; + char buf[128]; + } req = { + .nh.nlmsg_type = cmd, + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)), + .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, + .nh.nlmsg_seq = ++(nl_scan.seq), + .nh.nlmsg_pid = 0, + .rtm.rtm_protocol = RTPROT_UNSPEC, + .rtm.rtm_family = af + /* .rtm.rtm_flags is defaults to zero, hence RTM_F_CLONED is not set */ + }; + + send(nl_scan.fd, &req, sizeof(req), 0); + nl_scan.last_hdr = NULL; +} + + static struct nlmsghdr * nl_get_reply(struct nl_sock *nl) { @@ -1864,13 +1900,15 @@ krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NUL struct nl_parse_state s;
nl_parse_begin(&s, 1); - nl_request_dump(AF_UNSPEC, RTM_GETROUTE); + nl_set_strict_dump(&nl_scan, 1); + nl_request_dump_rt(AF_UNSPEC, RTM_GETROUTE); while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE) nl_parse_route(&s, h); else log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type); nl_parse_end(&s); + nl_set_strict_dump(&nl_scan, 0); }
/* -- 2.25.1
-- Elen sila lumenn' omentielvo Ondrej 'Santiago' Zajicek (email: santiago@crfreenet.org) OpenPGP encrypted e-mails preferred (KeyID 0x11DEADC3, wwwkeys.pgp.net) "To err is human -- to blame it on a computer is even more so."