wchan, and thinking beneath the syscall
Our previous post looked at a slightly interesting situation: we wanted to use standard Linux tools to solve a performance problem, but the arguments to the calls themselves didn't carry enough information for us to make meaningful distinctions. The solution I gave in that post still discriminates by syscall, with the small wrinkle that it is triggered by scheduler actions rather than by the syscalls themselves.
But an earlier version of the same program took a rather different tack: it searched the kernel stack traces for specific symbols (functions) that would be expected to call into the scheduler. Specifically:
- ep_poll, which calls the scheduler during epoll_wait
- futex_wait_queue, which implements futex
- n_tty_write, which outputs data to terminals
This approach was directly inspired by Linux’s wchan facility, which reports the reason that a sleeping task is sleeping. It is implemented quite simply: walk the task’s stack from the innermost frame outward. The moment you have exited the scheduler, resolve your instruction pointer to a function name, and that is your task’s WCHAN.
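You can watch that facility work without any special tooling: each task's wchan is exposed in procfs as a bare symbol name (or empty/0 if the task isn't blocked). A minimal sketch of reading it, in the spirit of ps -o wchan:
// Print the wchan of the pid given as argv[1] (defaults to "self",
// which will generally report nothing, since a running task isn't sleeping).
#include <stdio.h>
int main(int argc, char **argv) {
    char path[64], wchan[128];
    snprintf(path, sizeof(path), "/proc/%s/wchan", argc > 1 ? argv[1] : "self");
    FILE *f = fopen(path, "r");
    if (!f) { perror("fopen"); return 1; }
    size_t n = fread(wchan, 1, sizeof(wchan) - 1, f);
    wchan[n] = '\0';
    printf("wchan: %s\n", wchan[0] ? wchan : "(empty)");
    fclose(f);
    return 0;
}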
My program did basically the same thing, but backwards: it used /proc/kallsyms [1] to resolve interesting symbols into interesting instruction pointer address ranges, and then the BPF program just asked whether any of the stack frames it saw fell within those ranges. I decided not to ship it in that form because it was more complicated and brittle: syscalls are a public interface that is stable over time, whereas internal symbol names aren't.
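For reference, each kallsyms line is an address, a one-character symbol type, and a name, sorted by address, so a symbol's range runs from its own address up to the address of the next symbol in the file. A couple of illustrative lines (the addresses are made up, and read as all zeros without sufficient privilege):
ffffffff8130a480 t ep_poll
ffffffff8130a770 t ep_poll_callback
Here ep_poll would resolve to the range [0xffffffff8130a480, 0xffffffff8130a770).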
Ironically though, the reverse is potentially true, at least in some situations
where new syscalls are added faster than the internal implementations change.
With syscalls, epoll_wait_old and epoll_wait both funnel into the same private ep_poll symbol. Same deal with write/writev/pwritev/pwritev2, which all call n_tty_write (when a TTY is involved). And of course there are at least three different futex waiting syscalls. I'm sure that io_uring has some comparable functionality as well that I haven't explored.
I have included the code below, in case it is useful to anyone.
BPF:
#define __TARGET_ARCH_x86
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <asm/unistd.h>
#define TASK_INTERRUPTIBLE 0x0001
#define STACK_MAX_DEPTH 10
#define STACK_SKIP_FRAMES 4
#define MAX_SYMBOL_ENTRIES 10
#define STDIN_FILENO 0
#define STDOUT_FILENO 1
#define STDERR_FILENO 2
// Dual license GPLv2 + Apache 2
char _license[] SEC("license") = "GPL and additional rights";
void *bpf_cast_to_kern_ctx(void *) __ksym;
struct symbol_range {
    __u64 start;
    __u64 end;
};
struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __uint(max_entries, MAX_SYMBOL_ENTRIES);
    __type(key, __u32);
    __type(value, struct symbol_range);
} symbols_map SEC(".maps");
SEC("perf_event")
int check_and_signal(struct bpf_perf_event_data *ctx)
{
struct bpf_perf_event_data_kern *kctx = bpf_cast_to_kern_ctx(ctx);
struct task_struct *task = (struct task_struct *)bpf_get_current_task();
__u64 state;
// Are we going to sleep, or context switching for some other reason?
bpf_probe_read_kernel(&state, sizeof(task->__state), &task->__state);
if ((state & TASK_INTERRUPTIBLE) == 0)
return 0;
__u64 stack[STACK_MAX_DEPTH];
int depth = bpf_get_stack(ctx, stack, sizeof(stack), STACK_SKIP_FRAMES);
if (depth < 0)
return 0;
if (depth > STACK_MAX_DEPTH)
depth = STACK_MAX_DEPTH;
for(__u32 symbol_idx = 0; symbol_idx < MAX_SYMBOL_ENTRIES; symbol_idx++) {
__u32 tmp = symbol_idx;
// If we pass symbol_idx into bpf_map_lookup_elem directly, the
// verifier will assume that it could have been mutated by the helper
// function, and our program will fail verification.
struct symbol_range *entry = bpf_map_lookup_elem(&symbols_map, &tmp);
if (!entry || entry->start == 0) break;
for(__u32 i = 0; i < depth; i++) {
__u64 ip = stack[i];
if(ip >= entry->start && ip < entry->end) return 0;
}
}
bpf_send_signal_thread(6);
return 0;
}
Utility:
// Dual licensed GPLv2 + Apache 2
#include <bpf/libbpf.h> // for bpf_map__update_elem, bpf_object__load
#include <fcntl.h> // for fcntl
#include <linux/perf_event.h> // for perf_event_attr, perf_event_attr::(ano...
#include <stdint.h> // for uint64_t, uint32_t
#include <stdio.h> // for perror, size_t, NULL, fclose, fgets
#include <stdlib.h> // for exit, EXIT_FAILURE, calloc
#include <string.h> // for memset, strcmp
#include <sys/capability.h> // for _cap_struct, cap_free, cap_init, cap_s...
#include <sys/ioctl.h> // for ioctl
#include <sys/syscall.h> // for __NR_perf_event_open
#include <unistd.h> // for execvp, syscall, pid_t
#define MAX_LINE_LEN 256
static long perf_event_open(
    struct perf_event_attr *hw_event,
    pid_t pid,
    int cpu,
    int group_fd,
    unsigned long flags
) {
    return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}
typedef struct symbol_range {
    uint64_t start;
    uint64_t end;
} symbol_range;
static size_t find_symbols(
    const char **wanted_symbols,
    symbol_range *ranges,
    size_t num_symbols
) {
    FILE *file = fopen("/proc/kallsyms", "r");
    if (!file) {
        perror("Error opening /proc/kallsyms");
        exit(EXIT_FAILURE);
    }
    size_t found_count = 0;
    char line[MAX_LINE_LEN];
    while (fgets(line, MAX_LINE_LEN, file)) {
        unsigned long address;
        char type;
        char sym_name[MAX_LINE_LEN];
        if (sscanf(line, "%lx %c %s", &address, &type, sym_name) != 3) {
            continue;
        }
        // A symbol's range ends where the next symbol in the file begins
        if (ranges[found_count].start && !ranges[found_count].end) {
            ranges[found_count].end = address;
            if (++found_count == num_symbols)
                break; // all wanted ranges resolved
        }
        for (size_t i = 0; i < num_symbols; i++) {
            if (strcmp(sym_name, wanted_symbols[i]) != 0) {
                continue;
            }
            ranges[found_count].start = address;
            if (address != 0) {
                continue;
            }
            // kallsyms shows all-zero addresses when we lack privilege
            printf("%s\n", line);
            fprintf(stderr, "Can't read addresses from kallsyms!\n");
            exit(1);
        }
    }
    fclose(file);
    return found_count;
}
void drop_capabilities() {
    // cap_init() returns an empty capability set; installing it with
    // cap_set_proc() drops every capability we hold
    cap_t caps = cap_init();
    if (caps == NULL) {
        perror("cap_init");
        exit(EXIT_FAILURE);
    }
    if (cap_set_proc(caps) == -1) {
        perror("cap_set_proc");
        cap_free(caps);
        exit(EXIT_FAILURE);
    }
    cap_free(caps);
}
// The following kernel symbols will be allowed to block the program
const char * wanted_symbols[] = {
"ep_poll",
"futex_wait_queue",
"n_tty_write",
};
const size_t wanted_symbols_count = sizeof(wanted_symbols) / sizeof(char *);
extern const char bpf_program[];
extern const char bpf_program_end[];
int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <command to run>\n", argv[0]);
        exit(EXIT_FAILURE);
    }
    symbol_range *symbols = calloc(wanted_symbols_count, sizeof(symbol_range));
    size_t found_symbols_count = find_symbols(
        wanted_symbols,
        symbols,
        wanted_symbols_count
    );
    if (found_symbols_count != wanted_symbols_count) {
        fprintf(
            stderr,
            "Wanted %zu symbols, found %zu\n",
            wanted_symbols_count,
            found_symbols_count
        );
        exit(EXIT_FAILURE);
    }
    struct bpf_object *bpf_obj = bpf_object__open_mem(
        bpf_program,
        bpf_program_end - bpf_program,
        NULL
    );
    if (bpf_obj == NULL) {
        perror("bpf_object__open");
        return -1;
    }
    // Load our program into the kernel
    // Executes the bpf(2) syscall, which requires CAP_BPF
    // as well as CAP_PERFMON for our perf_event BPF program
    if (bpf_object__load(bpf_obj) != 0) {
        perror("bpf_object__load");
        return -1;
    }
    drop_capabilities();
    // Load the symbol resolutions we did into the map
    struct bpf_map *bpf_map = bpf_object__next_map(bpf_obj, NULL);
    for (uint32_t key = 0; key < found_symbols_count; key++) {
        if (0 != bpf_map__update_elem(
            bpf_map,
            (void *) &key, sizeof(uint32_t),
            (void *) &symbols[key], sizeof(struct symbol_range),
            0
        )) {
            perror("bpf_map__update_elem");
            return -1;
        }
    }
    // Get a FD for the program, so that we can link it to our perf event
    struct bpf_program *bpf_prog = bpf_object__next_program(bpf_obj, NULL);
    int bpf_fd = bpf_program__fd(bpf_prog);
    // Preserve across exec, just in case the program forks and still wants to
    // use our pre-loaded BPF
    int flags = fcntl(bpf_fd, F_GETFD, 0);
    fcntl(bpf_fd, F_SETFD, flags & ~FD_CLOEXEC);
    // This could theoretically be done by the program itself. It doesn't
    // involve any privileged operations
    struct perf_event_attr pe;
    memset(&pe, 0, sizeof(struct perf_event_attr));
    pe.type = PERF_TYPE_SOFTWARE;
    pe.size = sizeof(struct perf_event_attr);
    pe.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
    pe.sample_period = 1;
    int fd = perf_event_open(
        &pe,
        0,  // this thread
        -1, // all cpus
        -1, // no group
        0   // no flags
    );
    if (fd == -1) {
        perror("perf_event_open");
        return -1;
    }
    if (-1 == ioctl(fd, PERF_EVENT_IOC_SET_BPF, bpf_fd)) {
        perror("ioctl (bpf)");
        return -1;
    }
    if (-1 == ioctl(fd, PERF_EVENT_IOC_ENABLE, 0)) {
        perror("ioctl (enable)");
        return -1;
    }
    execvp(argv[1], argv + 1);
}
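One loose end: main expects the compiled BPF object to be linked into the binary between the bpf_program and bpf_program_end symbols, and the build step that produces them isn't shown here. A sketch of one way to do it, assuming the compiled BPF object is named bpf.o (the filename is my invention), is the assembler's .incbin directive in a small companion C file:
// Hypothetical companion file (not from the original build): embeds the
// BPF object ("bpf.o" is an assumed name) so that bpf_program and
// bpf_program_end bracket its bytes in .rodata.
__asm__(
    ".section .rodata\n"
    ".global bpf_program\n"
    "bpf_program:\n"
    ".incbin \"bpf.o\"\n"
    ".global bpf_program_end\n"
    "bpf_program_end:\n"
    ".previous\n"
);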
[1] Reading kallsyms also required another capability (CAP_SYSLOG), which I was mildly interested in avoiding.