Hi! Attached is a crude hack to allow profiling filter processing. When you're done implementing all of your filter logic and are wondering exactly why it now takes bird so friggin' long to propagate routes while sitting for several minutes at 100% CPU on the anemic embedded CPUs you're trying to run it on, tools like 'perf' aren't too useful, since they'll just tell you that all CPU time is being spent in interpret() or related functions and won't tell you which filter language statements are responsible for how much of the CPU time. The attached patch is a very rough way of attempting to enable you to figure out what's eating your cycles. The basic idea is that it sets up a SIGUSR2 handler that will write to stderr the config file name and line currently being processed, and the prefix being operated on if there is one. You can then send bird SIGUSR2 in a tight loop whenever it seems to be spinning at 100% CPU and the resulting output will give you a clue of what's going on. This is a very dirty hack, and it does some things that probably aren't entirely safe, like calling into snprintf from a signal handler, and it shouldn't be merged in this form, but for our purposes, it seems to be working well, and seems to be able to provide useful insights into some of our self-inflicted filter performance problems. Feedback appreciated! (Better ideas also appreciated. :D) Cheers, Lennert diff --git a/filter/f-util.c b/filter/f-util.c index def2b248..452dbcb2 100644 --- a/filter/f-util.c +++ b/filter/f-util.c @@ -20,6 +20,7 @@ f_new_inst(void) ret->code = ret->aux = 0; ret->arg1 = ret->arg2 = ret->next = NULL; ret->lineno = ifs->lino; + ret->file_name = ifs->file_name; return ret; } diff --git a/filter/filter.c b/filter/filter.c index f18970e0..d58c6cb4 100644 --- a/filter/filter.c +++ b/filter/filter.c @@ -33,6 +33,10 @@ #undef LOCAL_DEBUG +#include <stdio.h> +#include <signal.h> +#include <unistd.h> + #include "nest/bird.h" #include "lib/lists.h" #include "lib/resource.h" @@ -551,7 +555,7 @@ val_format(struct f_val v, buffer *buf) } } -static struct rte **f_rte; +static struct rte ** volatile f_rte; static struct rta *f_old_rta; static struct ea_list **f_tmp_attrs; static struct linpool *f_pool; @@ -630,8 +634,10 @@ static struct tbf rl_runtime_err = TBF_DEFAULT_LOG_LIMITS; * &f_val structures are copied around, so there are no problems with * memory managment. */ +static struct f_val interpret(struct f_inst *what); + static struct f_val -interpret(struct f_inst *what) +__interpret(struct f_inst *what) { struct symbol *sym; struct f_val v1, v2, res, *vp; @@ -1515,6 +1521,70 @@ interpret(struct f_inst *what) return res; } +#define barrier() __asm__ __volatile__("": : :"memory") + +static char *file_name; +static int lino; + +static struct f_val +interpret(struct f_inst *what) +{ + struct f_val ret; + + if (what != NULL) { + lino = what->lineno; + barrier(); + file_name = what->file_name; + } + + ret = __interpret(what); + + if (what != NULL) { + file_name = NULL; + barrier(); + lino = -1; + } + + return ret; +} + +static void handle_sigusr2(int sig UNUSED) +{ + char prefix[64]; + char buf[1024]; + + if (file_name == NULL) { +// write(2, "not interpreting an f_inst right now\n", 37); + return; + } + + prefix[0] = 0; +#ifndef IPV6 + if (f_rte != NULL) { + struct fib_node *n = &(*f_rte)->net->n; + ip_addr addr = n->prefix; + + snprintf(prefix, sizeof(prefix), "%d.%d.%d.%d/%d", + (addr >> 24) & 0xff, (addr >> 16) & 0xff, + (addr >> 8) & 0xff, addr & 0xff, n->pxlen); + } +#endif + + snprintf(buf, sizeof(buf), "%s:%d %s\n", file_name, lino, prefix); + + write(2, buf, strlen(buf)); +} + +void filter_profile_init(void) +{ + struct sigaction sa; + + bzero(&sa, sizeof(sa)); + sa.sa_handler = handle_sigusr2; + sa.sa_flags = SA_RESTART; + sigaction(SIGUSR2, &sa, NULL); +} + #undef ARG #define ARG(x,y) \ if (!i_same(f1->y, f2->y)) \ @@ -1720,6 +1790,7 @@ f_run(struct filter *filter, struct rte **rte, struct ea_list **tmp_attrs, struc rta_free(f_old_rta); } + f_rte = NULL; if (res.type != T_RETURN) { log_rl(&rl_runtime_err, L_ERR "Filter %s did not return accept nor reject. Make up your mind", filter->name); @@ -1747,6 +1818,8 @@ f_eval_rte(struct f_inst *expr, struct rte **rte, struct linpool *tmp_pool) /* Note that in this function we assume that rte->attrs is private / uncached */ struct f_val res = interpret(expr); + f_rte = NULL; + /* Hack to include EAF_TEMP attributes to the main list */ (*rte)->attrs->eattrs = ea_append(tmp_attrs, (*rte)->attrs->eattrs); diff --git a/filter/filter.h b/filter/filter.h index 049ceb76..4349fdb3 100644 --- a/filter/filter.h +++ b/filter/filter.h @@ -26,6 +26,7 @@ struct f_inst { /* Instruction */ int i; void *p; } a2; + char *file_name; int lineno; }; diff --git a/sysdep/unix/main.c b/sysdep/unix/main.c index 8aa19fce..4ef84196 100644 --- a/sysdep/unix/main.c +++ b/sysdep/unix/main.c @@ -870,6 +870,7 @@ main(int argc, char **argv) write_pid_file(); signal_init(); + filter_profile_init(); config_commit(conf, RECONFIG_HARD, 0);
❦ 29 août 2017 06:14 +0300, Lennert Buytenhek <buytenh@wantstofly.org> :
Feedback appreciated! (Better ideas also appreciated. :D)
Using USDT probes? You can attach arbitrary strings to them. I know perf supports them (with a recent kernel) but I don't know how exactly. However, with systemtap, it's dead easy to see them: stap -e 'probe bird.* { print($$vars) }' For implementation, see: https://github.com/vincentbernat/lldpd/commit/bdfe419389075af3cdfeadc78008a1... https://github.com/vincentbernat/lldpd/commit/140e34f057462d5ff7818ab4af368f... -- Say what you mean, simply and directly. - The Elements of Programming Style (Kernighan & Plauger)
On Tue, Aug 29, 2017 at 07:57:20AM +0200, Vincent Bernat wrote:
Feedback appreciated! (Better ideas also appreciated. :D)
Using USDT probes? You can attach arbitrary strings to them. I know perf supports them (with a recent kernel) but I don't know how exactly. However, with systemtap, it's dead easy to see them:
stap -e 'probe bird.* { print($$vars) }'
For implementation, see: https://github.com/vincentbernat/lldpd/commit/bdfe419389075af3cdfeadc78008a1... https://github.com/vincentbernat/lldpd/commit/140e34f057462d5ff7818ab4af368f...
As far as I can see, these are tracepoints, but they wouldn't let me do profiling? What I need is profiling, as I want to know what's consuming the most CPU, so I want to be able to fire an event N times per second to tell me what bird is doing right at that specific moment, and listing or counting tracepoint invocations won't necessarily tell me what's using up the most CPU.
❦ 29 août 2017 10:42 +0300, Lennert Buytenhek <buytenh@wantstofly.org> :
Feedback appreciated! (Better ideas also appreciated. :D)
Using USDT probes? You can attach arbitrary strings to them. I know perf supports them (with a recent kernel) but I don't know how exactly. However, with systemtap, it's dead easy to see them:
stap -e 'probe bird.* { print($$vars) }'
For implementation, see: https://github.com/vincentbernat/lldpd/commit/bdfe419389075af3cdfeadc78008a1... https://github.com/vincentbernat/lldpd/commit/140e34f057462d5ff7818ab4af368f...
As far as I can see, these are tracepoints, but they wouldn't let me do profiling? What I need is profiling, as I want to know what's consuming the most CPU, so I want to be able to fire an event N times per second to tell me what bird is doing right at that specific moment, and listing or counting tracepoint invocations won't necessarily tell me what's using up the most CPU.
I didn't look at your patch, but from my understanding, when you see a CPU pike, you sent USR2 and it started "tracing". Use of tracepoints would do the same thing without having to send a signal. You could also have a tracepoint for "begin", a tracepoint for "end", you record the current filter by intercepting "begin", remove it on "end" and you can profile on CPU to know which is the current filter if BIRD is currently running on a CPU. -- Conscience doth make cowards of us all. -- Shakespeare
On Tue, Aug 29, 2017 at 10:07:03AM +0200, Vincent Bernat wrote:
Feedback appreciated! (Better ideas also appreciated. :D)
Using USDT probes? You can attach arbitrary strings to them. I know perf supports them (with a recent kernel) but I don't know how exactly. However, with systemtap, it's dead easy to see them:
stap -e 'probe bird.* { print($$vars) }'
For implementation, see: https://github.com/vincentbernat/lldpd/commit/bdfe419389075af3cdfeadc78008a1... https://github.com/vincentbernat/lldpd/commit/140e34f057462d5ff7818ab4af368f...
As far as I can see, these are tracepoints, but they wouldn't let me do profiling? What I need is profiling, as I want to know what's consuming the most CPU, so I want to be able to fire an event N times per second to tell me what bird is doing right at that specific moment, and listing or counting tracepoint invocations won't necessarily tell me what's using up the most CPU.
I didn't look at your patch, but from my understanding, when you see a CPU pike, you sent USR2 and it started "tracing". Use of tracepoints would do the same thing without having to send a signal.
The patch outputs a single location (config file + line number) sample every time you send a SIGUSR2, so you have to send SIGUSR2 signals in a loop to collect enough samples (and then aggregate those samples somehow). Sample output while running 'while true; killall -USR2 bird; done' while having bird receive 1M /32 routes from a remote test peer is as follows (where each line corresponds to one SIGUSR2 signal): # ./bird -c test.conf -s test.sock -f test.conf:55 200.6.170.145/32 test.conf:30 200.7.98.189/32 test.conf:37 200.14.187.67/32 test.conf:47 200.7.183.137/32 test.conf:28 200.15.56.225/32 test.conf:44 200.5.53.104/32 test.conf:33 200.1.38.168/32 test.conf:25 200.12.74.254/32 test.conf:26 200.1.111.250/32 test.conf:42 200.9.63.39/32 test.conf:37 200.15.180.68/32 test.conf:44 200.11.188.128/32 test.conf:36 200.9.193.157/32 test.conf:32 200.13.155.79/32 test.conf:32 200.6.171.136/32 test.conf:38 200.3.169.76/32 test.conf:31 200.6.189.140/32 test.conf:36 200.14.178.7/32 test.conf:49 200.13.55.165/32 test.conf:33 200.6.83.248/32 test.conf:51 200.4.150.202/32 test.conf:39 200.3.215.221/32 test.conf:26 200.11.220.20/32 test.conf:31 200.10.166.139/32 test.conf:32 200.4.250.210/32 test.conf:54 200.6.3.46/32 test.conf:30 200.9.48.108/32 test.conf:38 200.11.118.191/32 test.conf:39 200.10.199.82/32 test.conf:47 200.0.168.132/32 [...]
You could also have a tracepoint for "begin", a tracepoint for "end", you record the current filter by intercepting "begin", remove it on "end" and you can profile on CPU to know which is the current filter if BIRD is currently running on a CPU.
Right, my patch does something like that, but it unconditionally does the "begin" and "end" actions, because each of them is only two global variable assignments, the cost of which is likely negligible, so you don't really need to conditionalize those with tracepoints -- and you'd do the heavy lifting from a timer anyway. Ideally, I would like to be able to use a timer-based profiling mode (e.g. perf record -F) with stack trace collection, but then instead of collecting bird stack traces, which would just show many stacks of nested interpret() calls, I'd like it to collect traces of f_inst call stacks. (Right now, my patch only prints the top 'stack frame' when receiving SIGUSR2, but there's no reason you wouldn't be able to instrument interpret() to thread the stack 'frames', so that you'd be able to get a full config language stack trace when a profiling event is triggered.) That and JITting filter statements... Thanks! Lennert
On Tue, Aug 29, 2017 at 12:05:26PM +0300, Lennert Buytenhek wrote:
(Right now, my patch only prints the top 'stack frame' when receiving SIGUSR2, but there's no reason you wouldn't be able to instrument interpret() to thread the stack 'frames', so that you'd be able to get a full config language stack trace when a profiling event is triggered.)
Something like this (on top of the previous patch). This is complicated slightly by how bird builds its parse tree and that it uses tail recursion in interpret() to jump to the next statement. diff --git a/filter/filter.c b/filter/filter.c index d58c6cb4..e0e4bc56 100644 --- a/filter/filter.c +++ b/filter/filter.c @@ -1517,62 +1517,84 @@ __interpret(struct f_inst *what) bug( "Unknown instruction %d (%c)", what->code, what->code & 0xff); } if (what->next) - return interpret(what->next); + return __interpret(what->next); return res; } +struct frame +{ + struct frame *parent; + struct f_inst *inst; +}; + #define barrier() __asm__ __volatile__("": : :"memory") -static char *file_name; -static int lino; +static struct frame *frame; static struct f_val interpret(struct f_inst *what) { + struct frame f; struct f_val ret; if (what != NULL) { - lino = what->lineno; + f.parent = frame; + f.inst = what; barrier(); - file_name = what->file_name; + frame = &f; } ret = __interpret(what); if (what != NULL) { - file_name = NULL; + frame = f.parent; barrier(); - lino = -1; } return ret; } +static void dump_frame(struct frame *f) +{ + char buf[1024]; + + if (f->parent != NULL) + dump_frame(f->parent); + + snprintf(buf, sizeof(buf), "%s%s:%d\n", + (f->parent != NULL) ? " " : "", + f->inst->file_name, f->inst->lineno); + + write(2, buf, strlen(buf)); +} + static void handle_sigusr2(int sig UNUSED) { char prefix[64]; - char buf[1024]; - if (file_name == NULL) { + if (frame == NULL) { // write(2, "not interpreting an f_inst right now\n", 37); return; } + write(2, "===\n", 4); + prefix[0] = 0; #ifndef IPV6 if (f_rte != NULL) { struct fib_node *n = &(*f_rte)->net->n; ip_addr addr = n->prefix; - snprintf(prefix, sizeof(prefix), "%d.%d.%d.%d/%d", + snprintf(prefix, sizeof(prefix), "%d.%d.%d.%d/%d\n", (addr >> 24) & 0xff, (addr >> 16) & 0xff, (addr >> 8) & 0xff, addr & 0xff, n->pxlen); } #endif + write(2, prefix, strlen(prefix)); - snprintf(buf, sizeof(buf), "%s:%d %s\n", file_name, lino, prefix); + dump_frame(frame); - write(2, buf, strlen(buf)); + write(2, "===\n", 4); } void filter_profile_init(void)
participants (2)
-
Lennert Buytenhek -
Vincent Bernat