Wrangling wayward event loops with Linux perf and eBPF

It’s a tale as old as asynchronous time: your event loop is humming along, right up until something makes a blocking syscall. Everything grinds to a halt while the app waits for the function to return. Or, worse: things don’t grind to a halt. The syscall happens fast enough that the synchronous actions go unnoticed, until (at the worst possible moment) some downstream dependency shits the bed and only then do things grind to a halt, long after the app has been declared “stable in production”.

This is exactly the sort of thing that we might want to solve with system call interception tools like LD_PRELOAD or seccomp. Unfortunately there’s a catch: we don’t know a priori whether a given syscall (e.g. recvfrom()) is going to block. We could try to intuit that information, but any such intuition would be highly stateful. For instance, we could:

trap on socket operations, and check whether the socket in question is currently configured in a blocking mode. But that’s a lot of work in the hot path.
monitor the data structures returned by our event-loop syscall of choice (epoll_wait), and validate that any recvfrom syscall was in response to a notification of available data. But that’s still in the hot path, and only covers receiving operations: it doesn’t handle outbound operations like connect() or sendto().

Ideally we’d intercept syscalls only when they’re already on the slow path. If we wait until the operating system has decided to put our event loop to sleep, we might just delay its slumber by some nanoseconds. Going to sleep indicates that our event loop either: (a) ran out of pending work to do, which is a good thing, or (b) is blocking on a syscall, which is a bad thing.

The approach demonstrated here uses Linux’s perf subsystem, specifically the PERF_COUNT_SW_CONTEXT_SWITCHES event. We then do in-kernel filtering of the events using an eBPF program. The semantics are:

make sure the program is actually going to sleep (entering the TASK_INTERRUPTIBLE state, displayed as “S” by tools like ps), and is not context switching for some unrelated reason (e.g. time-sharing, interrupts).
check that we aren’t executing one of the expected syscalls, like epoll_wait() or futex(). I also chose to exempt read/write syscalls against stdin/stdout/stderr, because I don’t want my program to log itself to death.
if none of the exceptions apply, send the thread a SIGABRT signal (6). This produces a core dump with a stack trace that should stick out like a sore thumb from the rest of the asynchronous code in the app.

The eBPF program looks like this:

// bpf_program.c
// bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h
// clang -O2 -g -target bpf -c bpf_program.c -o bpf_program.o
#define __TARGET_ARCH_x86
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <asm/unistd.h>

#define TASK_INTERRUPTIBLE 0x0001

#define STDIN_FILENO 0
#define STDOUT_FILENO 1
#define STDERR_FILENO 2

// Dual licensed GPLv2 + Apache 2
char _license[] SEC("license") = "GPL and additional rights";

void *bpf_cast_to_kern_ctx(void *) __ksym;

// perf_event BPF program, attached by the runner to a
// PERF_COUNT_SW_CONTEXT_SWITCHES event.
//
// If the current task is switching out in the TASK_INTERRUPTIBLE state and
// the syscall it is in is not on the allow-list below, the current thread is
// sent SIGABRT. Always returns 0.
SEC("perf_event")
int check_and_signal(struct bpf_perf_event_data *ctx)
{
    struct bpf_perf_event_data_kern *kctx = bpf_cast_to_kern_ctx(ctx);
    struct task_struct *task = (struct task_struct *)bpf_get_current_task();
    // Destination is zero-initialised and the copy length is taken from the
    // destination, so every byte tested below is defined.
    unsigned int state = 0;
    const __u32 sigabrt = 6;

    // Are we going to sleep, or context switching for some other reason?
    // If the task state cannot be read, err on the side of not signalling.
    if (bpf_probe_read_kernel(&state, sizeof(state), &task->__state) != 0)
        return 0;
    if ((state & TASK_INTERRUPTIBLE) == 0)
        return 0;

    // Without user-space registers there is no syscall to classify.
    struct pt_regs *regs = kctx->data->regs_user.regs;
    if (!regs)
        return 0;

    // orig_ax holds the syscall number; the first syscall argument is used
    // below as the file descriptor for read/write.
    __u64 syscall_nr = regs->orig_ax;
    __u64 syscall_rdi = PT_REGS_PARM1(regs);

    // Exempt writes to stdout/stderr so that logging cannot trigger an abort.
    if (
        syscall_nr == __NR_write &&
        (syscall_rdi == STDOUT_FILENO || syscall_rdi == STDERR_FILENO)
    ) {
        return 0;
    }

    // Exempt reads from stdin.
    if (syscall_nr == __NR_read && syscall_rdi == STDIN_FILENO) {
        return 0;
    }

    // Syscalls in which the event loop is expected to sleep.
    if (syscall_nr == __NR_epoll_wait
        || syscall_nr == __NR_epoll_wait_old
        || syscall_nr == __NR_futex
        || syscall_nr == __NR_futex_wait
        || syscall_nr == __NR_futex_waitv
    ) {
        return 0;
    }

    // Anything else is an unexpected blocking call: abort this thread.
    bpf_send_signal_thread(sigabrt);
    return 0;
}

Once compiled, we’ll need to embed it into a runner program, and the easiest way I found to do so was with a little bit of assembly:

// bpf.S
// Embeds the compiled BPF object file (bpf_program.o) into the runner binary
// as raw bytes, bracketed by two exported symbols. The runner declares both
// symbols as `extern const char[]` and uses their difference as the length.
//
// "a" marks the section allocatable (not writable, not executable);
// @progbits marks it as containing data.
.section .rodata.bpf,"a",@progbits
// Export the start and end markers so the C code can link against them.
.global bpf_program
.global bpf_program_end
// Start the embedded object on a 4-byte boundary.
.balign 4
bpf_program:
// Copy the bytes of bpf_program.o verbatim into the section at assembly time.
.incbin "bpf_program.o"
// Marks the first byte past the embedded object.
bpf_program_end:

The runner is shown below. It divides neatly into a privileged operation, which loads the BPF program, and “the rest”. The privileges required are:

CAP_BPF: to interact with the BPF subsystem
CAP_PERFMON: to make perf_event BPF filters
perf_event_paranoid <= 1: to allow a process to trace itself

// eventloop-guardian.c
// Dual licensed GPLv2 + Apache 2
// Compile with:
// clang -fuse-ld=lld -o eventloop-guardian eventloop-guardian.c bpf.S \
//   -lbpf -lcap
// sudo setcap cap_perfmon,cap_bpf+ep eventloop-guardian

#include <bpf/libbpf.h>        // for bpf_map__update_elem, bpf_object__load
#include <fcntl.h>             // for fcntl
#include <linux/perf_event.h>  // for perf_event_attr, perf_event_attr::(ano...
#include <stdint.h>            // for uint64_t, uint32_t
#include <stdio.h>             // for perror, size_t, NULL, fclose, fgets
#include <stdlib.h>            // for exit, EXIT_FAILURE, calloc
#include <string.h>            // for memset, strcmp
#include <sys/capability.h>    // for _cap_struct, cap_free, cap_init, cap_s...
#include <sys/ioctl.h>         // for ioctl
#include <sys/syscall.h>       // for __NR_perf_event_open
#include <unistd.h>            // for execvp, syscall, pid_t

#define MAX_LINE_LEN 256

// Thin wrapper around the raw perf_event_open system call, invoked through
// syscall(). Arguments are forwarded unchanged and the result of syscall()
// is returned as-is.
static long perf_event_open(
    struct perf_event_attr *hw_event,
    pid_t pid,
    int cpu,
    int group_fd,
    unsigned long flags
) {
    long result;

    result = syscall(
        __NR_perf_event_open,
        hw_event,
        pid,
        cpu,
        group_fd,
        flags
    );

    return result;
}

// Replace the process's capabilities with the freshly initialised set
// returned by cap_init().
//
// Prints a diagnostic and exits with EXIT_FAILURE if the capability set
// cannot be allocated or applied. The temporary cap_t is released on every
// path that returns or exits after allocation succeeded.
void drop_capabilities(void) {
    cap_t caps = cap_init();
    if (caps == NULL) {
        perror("cap_init");
        exit(EXIT_FAILURE);
    }

    if (cap_set_proc(caps) == -1) {
        // Report first: cap_free() could overwrite errno.
        perror("cap_set_proc");
        cap_free(caps);
        exit(EXIT_FAILURE);
    }

    // The set has been applied; the handle itself is no longer needed.
    cap_free(caps);
}

extern const char bpf_program[];
extern const char bpf_program_end[];

// Entry point of the runner.
//
// Steps, in order:
//   1. open and load the embedded BPF object (the privileged part),
//   2. drop capabilities,
//   3. create a context-switch perf event for this thread and attach the
//      BPF program to it,
//   4. execvp() the command given in argv[1..].
//
// Every setup failure prints a diagnostic and terminates with EXIT_FAILURE.
// On success this function does not return, because execvp() replaces the
// process image.
int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <command to run>\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    struct bpf_object * bpf_obj = bpf_object__open_mem(
        bpf_program,
        (size_t)(bpf_program_end - bpf_program),
        NULL
    );
    if (bpf_obj == NULL) {
        perror("bpf_object__open");
        return EXIT_FAILURE;
    }

    // Load our program into the kernel
    // Executes the bpf(2) syscall, which requires CAP_BPF
    // as well as CAP_PERFMON for our perf_event BPF program
    int load_err = bpf_object__load(bpf_obj);
    if (load_err != 0) {
        // Fail here with the real cause instead of letting a later ioctl
        // fail on an invalid program FD.
        fprintf(
            stderr,
            "bpf_object__load: %s\n",
            strerror(load_err < 0 ? -load_err : load_err)
        );
        return EXIT_FAILURE;
    }

    drop_capabilities();

    // Get a FD for the program, so that we can link it to our perf event
    struct bpf_program * bpf_prog = bpf_object__next_program(bpf_obj, NULL);
    if (bpf_prog == NULL) {
        fprintf(stderr, "bpf_object__next_program: no program in BPF object\n");
        return EXIT_FAILURE;
    }
    int bpf_fd = bpf_program__fd(bpf_prog);
    if (bpf_fd < 0) {
        fprintf(stderr, "bpf_program__fd: %s\n", strerror(-bpf_fd));
        return EXIT_FAILURE;
    }

    // Preserve across exec, just in case the program forks and still wants to
    // use our pre-loaded BPF. This is a convenience only, so a failure here
    // is reported but does not stop the launch.
    int fd_flags = fcntl(bpf_fd, F_GETFD, 0);
    if (fd_flags == -1
        || fcntl(bpf_fd, F_SETFD, fd_flags & ~FD_CLOEXEC) == -1
    ) {
        perror("fcntl (clear FD_CLOEXEC)");
    }

    // This could theoretically be done by the program itself. It doesn't
    // involve any privileged operations
    struct perf_event_attr pe;
    memset(&pe, 0, sizeof(pe));
    pe.type = PERF_TYPE_SOFTWARE;
    pe.size = sizeof(pe);
    pe.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
    pe.sample_type = PERF_SAMPLE_REGS_USER;
    pe.sample_period = 1; // sample every context switch
    // Register mask with only bit 5 set; the shift is done in 64 bits to
    // match the width of the field.
    // NOTE(review): bit 5 should be PERF_REG_X86_DI (first syscall argument)
    // in the x86 perf register numbering -- confirm against asm/perf_regs.h.
    pe.sample_regs_user = 1ULL << 5;

    int fd = (int)perf_event_open(
        &pe,
        0, // this thread
        -1, // all cpus
        -1, // no group
        0 // no flags
    );
    if (fd == -1) {
        perror("perf_event_open");
        return EXIT_FAILURE;
    }

    if (-1 == ioctl(fd, PERF_EVENT_IOC_SET_BPF, bpf_fd)) {
        perror("ioctl (bpf)");
        return EXIT_FAILURE;
    }
    if (-1 == ioctl(fd, PERF_EVENT_IOC_ENABLE, 0)) {
        perror("ioctl (enable)");
        return EXIT_FAILURE;
    }

    // Launch the wrapped process. execvp() only returns on failure.
    execvp(argv[1], argv + 1);
    perror("execvp");
    exit(EXIT_FAILURE);
}

As you can see, all processing happens in-kernel, so you can (for instance) exec*(3) into another process and have the syscall filtering continue to work. I also really like that the only performance hit occurs when the application is already on the slow path; there’s no fastpath involvement here.

The net result is a Python that definitely cannot REPL, since that involves sleeping within the pselect6() syscall. It also can’t time.sleep() or socket.connect():

$ eventloop-guardian python3
Python 3.12.3 (main, Nov  6 2024, 18:32:19) [GCC 13.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> Aborted (core dumped)
$ eventloop-guardian python3 -c 'import time; time.sleep(0.1)'
Aborted (core dumped)
$ eventloop-guardian python3 -c 'from socket import socket; s = socket(); s.connect(("example.com", 80))'
Aborted (core dumped)

But it can do things asynchronously:

$ eventloop-guardian python3 -c 'import asyncio; asyncio.run(asyncio.sleep(1))'
$ eventloop-guardian python3 -c 'import asyncio; asyncio.run(asyncio.open_connection("www.example.com", 80))'
$ eventloop-guardian python3 -c 'import asyncio, time; asyncio.run(asyncio.to_thread(lambda: time.sleep(0.1)))'