From ea2e7ce5d0fc878463ba39deb46cf2ab20398fd2 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 1 Sep 2016 18:37:21 -0700 Subject: [PATCH 1/6] bpf: support 8-byte metafield access The verifier supported only 4-byte metafields in struct __sk_buff and struct xdp_md. The metafields in upcoming struct bpf_perf_event are 8-byte to match register width in struct pt_regs. Teach verifier to recognize 8-byte metafield access. The patch doesn't affect safety of sockets and xdp programs. They check for 4-byte only ctx access before these conditions are hit. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index abb61f3f6900..c1c9e441f0f5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2333,7 +2333,8 @@ static int do_check(struct verifier_env *env) if (err) return err; - if (BPF_SIZE(insn->code) != BPF_W) { + if (BPF_SIZE(insn->code) != BPF_W && + BPF_SIZE(insn->code) != BPF_DW) { insn_idx++; continue; } @@ -2642,9 +2643,11 @@ static int convert_ctx_accesses(struct verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { u32 insn_delta, cnt; - if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) + if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || + insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) type = BPF_READ; - else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) + else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) || + insn->code == (BPF_STX | BPF_MEM | BPF_DW)) type = BPF_WRITE; else continue; From 0515e5999a466dfe6e1924f460da599bb6821487 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 1 Sep 2016 18:37:22 -0700 Subject: [PATCH 2/6] bpf: introduce BPF_PROG_TYPE_PERF_EVENT program type Introduce BPF_PROG_TYPE_PERF_EVENT programs that can be attached to HW and SW perf events (PERF_TYPE_HARDWARE and PERF_TYPE_SOFTWARE correspondingly in uapi/linux/perf_event.h) The program visible context meta structure is struct bpf_perf_event_data { struct pt_regs regs; __u64 sample_period; }; which is accessible directly from the program: int bpf_prog(struct bpf_perf_event_data *ctx) { ... ctx->sample_period ... ... ctx->regs.ip ... } The bpf verifier rewrites the accesses into kernel internal struct bpf_perf_event_data_kern which allows changing struct perf_sample_data without affecting bpf programs. New fields can be added to the end of struct bpf_perf_event_data in the future. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/perf_event.h | 5 +++ include/uapi/linux/Kbuild | 1 + include/uapi/linux/bpf.h | 1 + include/uapi/linux/bpf_perf_event.h | 18 +++++++++ kernel/trace/bpf_trace.c | 61 +++++++++++++++++++++++++++++ 5 files changed, 86 insertions(+) create mode 100644 include/uapi/linux/bpf_perf_event.h diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2b6b43cc0dd5..97bfe62f30d7 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -788,6 +788,11 @@ struct perf_output_handle { int page; }; +struct bpf_perf_event_data_kern { + struct pt_regs *regs; + struct perf_sample_data *data; +}; + #ifdef CONFIG_CGROUP_PERF /* diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 185f8ea2702f..d0352a971ebd 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -71,6 +71,7 @@ header-y += binfmts.h header-y += blkpg.h header-y += blktrace_api.h header-y += bpf_common.h +header-y += bpf_perf_event.h header-y += bpf.h header-y += bpqether.h header-y += bsg.h diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e4c5a1baa993..f896dfac4ac0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -95,6 +95,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SCHED_ACT, BPF_PROG_TYPE_TRACEPOINT, BPF_PROG_TYPE_XDP, + BPF_PROG_TYPE_PERF_EVENT, }; #define BPF_PSEUDO_MAP_FD 1 diff --git a/include/uapi/linux/bpf_perf_event.h b/include/uapi/linux/bpf_perf_event.h new file mode 100644 index 000000000000..067427259820 --- /dev/null +++ b/include/uapi/linux/bpf_perf_event.h @@ -0,0 +1,18 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef _UAPI__LINUX_BPF_PERF_EVENT_H__ +#define _UAPI__LINUX_BPF_PERF_EVENT_H__ + +#include +#include + +struct bpf_perf_event_data { + struct pt_regs regs; + __u64 sample_period; +}; + +#endif /* _UAPI__LINUX_BPF_PERF_EVENT_H__ */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ad35213b8405..d3869b03d9fe 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -8,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -552,10 +554,69 @@ static struct bpf_prog_type_list tracepoint_tl = { .type = BPF_PROG_TYPE_TRACEPOINT, }; +static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + if (off == offsetof(struct bpf_perf_event_data, sample_period)) { + if (size != sizeof(u64)) + return false; + } else { + if (size != sizeof(long)) + return false; + } + return true; +} + +static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct bpf_perf_event_data, sample_period): + BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, data)), + dst_reg, src_reg, + offsetof(struct bpf_perf_event_data_kern, data)); + *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, + offsetof(struct perf_sample_data, period)); + break; + default: + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs)), + dst_reg, src_reg, + offsetof(struct bpf_perf_event_data_kern, regs)); + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(long)), + dst_reg, dst_reg, ctx_off); + break; + } + + return insn - insn_buf; +} + +static const struct bpf_verifier_ops perf_event_prog_ops = { + .get_func_proto = tp_prog_func_proto, + .is_valid_access = pe_prog_is_valid_access, + .convert_ctx_access = pe_prog_convert_ctx_access, +}; + +static struct bpf_prog_type_list perf_event_tl = { + .ops = &perf_event_prog_ops, + .type = BPF_PROG_TYPE_PERF_EVENT, +}; + static int __init register_kprobe_prog_ops(void) { bpf_register_prog_type(&kprobe_tl); bpf_register_prog_type(&tracepoint_tl); + bpf_register_prog_type(&perf_event_tl); return 0; } late_initcall(register_kprobe_prog_ops); From fdc15d388d600d5a1599e14c700af105a5b60761 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 1 Sep 2016 18:37:23 -0700 Subject: [PATCH 3/6] bpf: perf_event progs should only use preallocated maps Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use preallocated hash maps, since doing memory allocation in overflow_handler can crash depending on where nmi got triggered. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c1c9e441f0f5..48c2705db22c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2511,6 +2511,20 @@ process_bpf_exit: return 0; } +static int check_map_prog_compatibility(struct bpf_map *map, + struct bpf_prog *prog) + +{ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT && + (map->map_type == BPF_MAP_TYPE_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && + (map->map_flags & BPF_F_NO_PREALLOC)) { + verbose("perf_event programs can only use preallocated hash map\n"); + return -EINVAL; + } + return 0; +} + /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ @@ -2518,7 +2532,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; - int i, j; + int i, j, err; for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && @@ -2562,6 +2576,12 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) return PTR_ERR(map); } + err = check_map_prog_compatibility(map, env->prog); + if (err) { + fdput(f); + return err; + } + /* store map pointer inside BPF_LD_IMM64 instruction */ insn[0].imm = (u32) (unsigned long) map; insn[1].imm = ((u64) (unsigned long) map) >> 32; From aa6a5f3cb2b2edc5b9aab0b4fdfdfa9c3b5096a8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 1 Sep 2016 18:37:24 -0700 Subject: [PATCH 4/6] perf, bpf: add perf events core support for BPF_PROG_TYPE_PERF_EVENT programs Allow attaching BPF_PROG_TYPE_PERF_EVENT programs to sw and hw perf events via overflow_handler mechanism. When program is attached the overflow_handlers become stacked. The program acts as a filter. Returning zero from the program means that the normal perf_event_output handler will not be called and sampling event won't be stored in the ring buffer. The overflow_handler_context==NULL is an additional safety check to make sure programs are not attached to hw breakpoints and watchdog in case other checks (that prevent that now anyway) get accidentally relaxed in the future. The program refcnt is incremented in case perf_events are inhereted when target task is forked. Similar to kprobe and tracepoint programs there is no ioctl to detach the program or swap already attached program. The user space expected to close(perf_event_fd) like it does right now for kprobe+bpf. That restriction simplifies the code quite a bit. The invocation of overflow_handler in __perf_event_overflow() is now done via READ_ONCE, since that pointer can be replaced when the program is attached while perf_event itself could have been active already. There is no need to do similar treatment for event->prog, since it's assigned only once before it's accessed. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 4 ++ include/linux/perf_event.h | 4 ++ kernel/events/core.c | 89 +++++++++++++++++++++++++++++++++++++- 3 files changed, 96 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 11134238417d..9a904f63f8c1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -297,6 +297,10 @@ static inline struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) static inline void bpf_prog_put(struct bpf_prog *prog) { } +static inline struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ + return ERR_PTR(-EOPNOTSUPP); +} #endif /* CONFIG_BPF_SYSCALL */ /* verifier prototypes for helper functions called from eBPF programs */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 97bfe62f30d7..ccb73a58113d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -679,6 +679,10 @@ struct perf_event { u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; +#ifdef CONFIG_BPF_SYSCALL + perf_overflow_handler_t orig_overflow_handler; + struct bpf_prog *prog; +#endif #ifdef CONFIG_EVENT_TRACING struct trace_event_call *tp_event; diff --git a/kernel/events/core.c b/kernel/events/core.c index 3cfabdf7b942..85bf4c37911f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7022,7 +7022,7 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(&event->pending); } - event->overflow_handler(event, data, regs); + READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; @@ -7637,11 +7637,83 @@ static void perf_event_free_filter(struct perf_event *event) ftrace_profile_free_filter(event); } +#ifdef CONFIG_BPF_SYSCALL +static void bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct bpf_perf_event_data_kern ctx = { + .data = data, + .regs = regs, + }; + int ret = 0; + + preempt_disable(); + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) + goto out; + rcu_read_lock(); + ret = BPF_PROG_RUN(event->prog, (void *)&ctx); + rcu_read_unlock(); +out: + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + if (!ret) + return; + + event->orig_overflow_handler(event, data, regs); +} + +static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +{ + struct bpf_prog *prog; + + if (event->overflow_handler_context) + /* hw breakpoint or kernel counter */ + return -EINVAL; + + if (event->prog) + return -EEXIST; + + prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + event->prog = prog; + event->orig_overflow_handler = READ_ONCE(event->overflow_handler); + WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); + return 0; +} + +static void perf_event_free_bpf_handler(struct perf_event *event) +{ + struct bpf_prog *prog = event->prog; + + if (!prog) + return; + + WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); + event->prog = NULL; + bpf_prog_put(prog); +} +#else +static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +{ + return -EOPNOTSUPP; +} +static void perf_event_free_bpf_handler(struct perf_event *event) +{ +} +#endif + static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint; struct bpf_prog *prog; + if (event->attr.type == PERF_TYPE_HARDWARE || + event->attr.type == PERF_TYPE_SOFTWARE) + return perf_event_set_bpf_handler(event, prog_fd); + if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL; @@ -7682,6 +7754,8 @@ static void perf_event_free_bpf_prog(struct perf_event *event) { struct bpf_prog *prog; + perf_event_free_bpf_handler(event); + if (!event->tp_event) return; @@ -8998,6 +9072,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; +#ifdef CONFIG_BPF_SYSCALL + if (overflow_handler == bpf_overflow_handler) { + struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); + + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto err_ns; + } + event->prog = prog; + event->orig_overflow_handler = + parent_event->orig_overflow_handler; + } +#endif } if (overflow_handler) { From 1c47910ef80135ac89e4d0b471d123572cee5535 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 1 Sep 2016 18:37:25 -0700 Subject: [PATCH 5/6] samples/bpf: add perf_event+bpf example The bpf program is called 50 times a second and does hashmap[kern&user_stackid]++ It's primary purpose to check that key bpf helpers like map lookup, update, get_stackid, trace_printk and ctx access are all working. It checks: - PERF_COUNT_HW_CPU_CYCLES on all cpus - PERF_COUNT_HW_CPU_CYCLES for current process and inherited perf_events to children - PERF_COUNT_SW_CPU_CLOCK on all cpus - PERF_COUNT_SW_CPU_CLOCK for current process Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- samples/bpf/Makefile | 4 + samples/bpf/bpf_helpers.h | 2 + samples/bpf/bpf_load.c | 7 +- samples/bpf/trace_event_kern.c | 65 ++++++++++ samples/bpf/trace_event_user.c | 213 +++++++++++++++++++++++++++++++++ 5 files changed, 290 insertions(+), 1 deletion(-) create mode 100644 samples/bpf/trace_event_kern.c create mode 100644 samples/bpf/trace_event_user.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index db3cb061bfcd..a69cf9045285 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -25,6 +25,7 @@ hostprogs-y += test_cgrp2_array_pin hostprogs-y += xdp1 hostprogs-y += xdp2 hostprogs-y += test_current_task_under_cgroup +hostprogs-y += trace_event test_verifier-objs := test_verifier.o libbpf.o test_maps-objs := test_maps.o libbpf.o @@ -52,6 +53,7 @@ xdp1-objs := bpf_load.o libbpf.o xdp1_user.o xdp2-objs := bpf_load.o libbpf.o xdp1_user.o test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \ test_current_task_under_cgroup_user.o +trace_event-objs := bpf_load.o libbpf.o trace_event_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -79,6 +81,7 @@ always += test_cgrp2_tc_kern.o always += xdp1_kern.o always += xdp2_kern.o always += test_current_task_under_cgroup_kern.o +always += trace_event_kern.o HOSTCFLAGS += -I$(objtree)/usr/include @@ -103,6 +106,7 @@ HOSTLOADLIBES_test_overhead += -lelf -lrt HOSTLOADLIBES_xdp1 += -lelf HOSTLOADLIBES_xdp2 += -lelf HOSTLOADLIBES_test_current_task_under_cgroup += -lelf +HOSTLOADLIBES_trace_event += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index bbdf62a1e45e..90f44bd2045e 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -55,6 +55,8 @@ static int (*bpf_skb_get_tunnel_opt)(void *ctx, void *md, int size) = (void *) BPF_FUNC_skb_get_tunnel_opt; static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) = (void *) BPF_FUNC_skb_set_tunnel_opt; +static unsigned long long (*bpf_get_prandom_u32)(void) = + (void *) BPF_FUNC_get_prandom_u32; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 0cfda2320320..97913e109b14 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -51,6 +51,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; bool is_xdp = strncmp(event, "xdp", 3) == 0; + bool is_perf_event = strncmp(event, "perf_event", 10) == 0; enum bpf_prog_type prog_type; char buf[256]; int fd, efd, err, id; @@ -69,6 +70,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_TRACEPOINT; } else if (is_xdp) { prog_type = BPF_PROG_TYPE_XDP; + } else if (is_perf_event) { + prog_type = BPF_PROG_TYPE_PERF_EVENT; } else { printf("Unknown event '%s'\n", event); return -1; @@ -82,7 +85,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_fd[prog_cnt++] = fd; - if (is_xdp) + if (is_xdp || is_perf_event) return 0; if (is_socket) { @@ -326,6 +329,7 @@ int load_bpf_file(char *path) memcmp(shname_prog, "kretprobe/", 10) == 0 || memcmp(shname_prog, "tracepoint/", 11) == 0 || memcmp(shname_prog, "xdp", 3) == 0 || + memcmp(shname_prog, "perf_event", 10) == 0 || memcmp(shname_prog, "socket", 6) == 0) load_and_attach(shname_prog, insns, data_prog->d_size); } @@ -344,6 +348,7 @@ int load_bpf_file(char *path) memcmp(shname, "kretprobe/", 10) == 0 || memcmp(shname, "tracepoint/", 11) == 0 || memcmp(shname, "xdp", 3) == 0 || + memcmp(shname, "perf_event", 10) == 0 || memcmp(shname, "socket", 6) == 0) load_and_attach(shname, data->d_buf, data->d_size); } diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c new file mode 100644 index 000000000000..71a8ed32823e --- /dev/null +++ b/samples/bpf/trace_event_kern.c @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include "bpf_helpers.h" + +struct key_t { + char comm[TASK_COMM_LEN]; + u32 kernstack; + u32 userstack; +}; + +struct bpf_map_def SEC("maps") counts = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(struct key_t), + .value_size = sizeof(u64), + .max_entries = 10000, +}; + +struct bpf_map_def SEC("maps") stackmap = { + .type = BPF_MAP_TYPE_STACK_TRACE, + .key_size = sizeof(u32), + .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64), + .max_entries = 10000, +}; + +#define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) +#define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK) + +SEC("perf_event") +int bpf_prog1(struct bpf_perf_event_data *ctx) +{ + char fmt[] = "CPU-%d period %lld ip %llx"; + u32 cpu = bpf_get_smp_processor_id(); + struct key_t key; + u64 *val, one = 1; + + if (ctx->sample_period < 10000) + /* ignore warmup */ + return 0; + bpf_get_current_comm(&key.comm, sizeof(key.comm)); + key.kernstack = bpf_get_stackid(ctx, &stackmap, KERN_STACKID_FLAGS); + key.userstack = bpf_get_stackid(ctx, &stackmap, USER_STACKID_FLAGS); + if ((int)key.kernstack < 0 && (int)key.userstack < 0) { + bpf_trace_printk(fmt, sizeof(fmt), cpu, ctx->sample_period, + ctx->regs.ip); + return 0; + } + + val = bpf_map_lookup_elem(&counts, &key); + if (val) + (*val)++; + else + bpf_map_update_elem(&counts, &key, &one, BPF_NOEXIST); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c new file mode 100644 index 000000000000..9a130d31ecf2 --- /dev/null +++ b/samples/bpf/trace_event_user.c @@ -0,0 +1,213 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libbpf.h" +#include "bpf_load.h" + +#define SAMPLE_FREQ 50 + +static bool sys_read_seen, sys_write_seen; + +static void print_ksym(__u64 addr) +{ + struct ksym *sym; + + if (!addr) + return; + sym = ksym_search(addr); + printf("%s;", sym->name); + if (!strcmp(sym->name, "sys_read")) + sys_read_seen = true; + else if (!strcmp(sym->name, "sys_write")) + sys_write_seen = true; +} + +static void print_addr(__u64 addr) +{ + if (!addr) + return; + printf("%llx;", addr); +} + +#define TASK_COMM_LEN 16 + +struct key_t { + char comm[TASK_COMM_LEN]; + __u32 kernstack; + __u32 userstack; +}; + +static void print_stack(struct key_t *key, __u64 count) +{ + __u64 ip[PERF_MAX_STACK_DEPTH] = {}; + static bool warned; + int i; + + printf("%3lld %s;", count, key->comm); + if (bpf_lookup_elem(map_fd[1], &key->kernstack, ip) != 0) { + printf("---;"); + } else { + for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--) + print_ksym(ip[i]); + } + printf("-;"); + if (bpf_lookup_elem(map_fd[1], &key->userstack, ip) != 0) { + printf("---;"); + } else { + for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--) + print_addr(ip[i]); + } + printf("\n"); + + if (key->kernstack == -EEXIST && !warned) { + printf("stackmap collisions seen. Consider increasing size\n"); + warned = true; + } else if ((int)key->kernstack < 0 && (int)key->userstack < 0) { + printf("err stackid %d %d\n", key->kernstack, key->userstack); + } +} + +static void int_exit(int sig) +{ + kill(0, SIGKILL); + exit(0); +} + +static void print_stacks(void) +{ + struct key_t key = {}, next_key; + __u64 value; + __u32 stackid = 0, next_id; + int fd = map_fd[0], stack_map = map_fd[1]; + + sys_read_seen = sys_write_seen = false; + while (bpf_get_next_key(fd, &key, &next_key) == 0) { + bpf_lookup_elem(fd, &next_key, &value); + print_stack(&next_key, value); + bpf_delete_elem(fd, &next_key); + key = next_key; + } + + if (!sys_read_seen || !sys_write_seen) { + printf("BUG kernel stack doesn't contain sys_read() and sys_write()\n"); + int_exit(0); + } + + /* clear stack map */ + while (bpf_get_next_key(stack_map, &stackid, &next_id) == 0) { + bpf_delete_elem(stack_map, &next_id); + stackid = next_id; + } +} + +static void test_perf_event_all_cpu(struct perf_event_attr *attr) +{ + int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + int *pmu_fd = malloc(nr_cpus * sizeof(int)); + int i; + + /* open perf_event on all cpus */ + for (i = 0; i < nr_cpus; i++) { + pmu_fd[i] = perf_event_open(attr, -1, i, -1, 0); + if (pmu_fd[i] < 0) { + printf("perf_event_open failed\n"); + goto all_cpu_err; + } + assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF, prog_fd[0]) == 0); + assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0) == 0); + } + system("dd if=/dev/zero of=/dev/null count=5000k"); + print_stacks(); +all_cpu_err: + for (i--; i >= 0; i--) + close(pmu_fd[i]); + free(pmu_fd); +} + +static void test_perf_event_task(struct perf_event_attr *attr) +{ + int pmu_fd; + + /* open task bound event */ + pmu_fd = perf_event_open(attr, 0, -1, -1, 0); + if (pmu_fd < 0) { + printf("perf_event_open failed\n"); + return; + } + assert(ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) == 0); + assert(ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0) == 0); + system("dd if=/dev/zero of=/dev/null count=5000k"); + print_stacks(); + close(pmu_fd); +} + +static void test_bpf_perf_event(void) +{ + struct perf_event_attr attr_type_hw = { + .sample_freq = SAMPLE_FREQ, + .freq = 1, + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .inherit = 1, + }; + struct perf_event_attr attr_type_sw = { + .sample_freq = SAMPLE_FREQ, + .freq = 1, + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_CPU_CLOCK, + .inherit = 1, + }; + + test_perf_event_all_cpu(&attr_type_hw); + test_perf_event_task(&attr_type_hw); + test_perf_event_all_cpu(&attr_type_sw); + test_perf_event_task(&attr_type_sw); +} + + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + char filename[256]; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + setrlimit(RLIMIT_MEMLOCK, &r); + + signal(SIGINT, int_exit); + + if (load_kallsyms()) { + printf("failed to process /proc/kallsyms\n"); + return 1; + } + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 2; + } + + if (fork() == 0) { + read_trace_pipe(); + return 0; + } + test_bpf_perf_event(); + + int_exit(0); + return 0; +} From 72874418e4b9e2673c26a810b0ae9f418b573ee3 Mon Sep 17 00:00:00 2001 From: Brendan Gregg Date: Thu, 1 Sep 2016 18:37:26 -0700 Subject: [PATCH 6/6] samples/bpf: add sampleip example sample instruction pointer and frequency count in a BPF map Signed-off-by: Brendan Gregg Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- samples/bpf/Makefile | 4 + samples/bpf/sampleip_kern.c | 38 +++++++ samples/bpf/sampleip_user.c | 196 ++++++++++++++++++++++++++++++++++++ 3 files changed, 238 insertions(+) create mode 100644 samples/bpf/sampleip_kern.c create mode 100644 samples/bpf/sampleip_user.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index a69cf9045285..12b7304d55dc 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -26,6 +26,7 @@ hostprogs-y += xdp1 hostprogs-y += xdp2 hostprogs-y += test_current_task_under_cgroup hostprogs-y += trace_event +hostprogs-y += sampleip test_verifier-objs := test_verifier.o libbpf.o test_maps-objs := test_maps.o libbpf.o @@ -54,6 +55,7 @@ xdp2-objs := bpf_load.o libbpf.o xdp1_user.o test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \ test_current_task_under_cgroup_user.o trace_event-objs := bpf_load.o libbpf.o trace_event_user.o +sampleip-objs := bpf_load.o libbpf.o sampleip_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -82,6 +84,7 @@ always += xdp1_kern.o always += xdp2_kern.o always += test_current_task_under_cgroup_kern.o always += trace_event_kern.o +always += sampleip_kern.o HOSTCFLAGS += -I$(objtree)/usr/include @@ -107,6 +110,7 @@ HOSTLOADLIBES_xdp1 += -lelf HOSTLOADLIBES_xdp2 += -lelf HOSTLOADLIBES_test_current_task_under_cgroup += -lelf HOSTLOADLIBES_trace_event += -lelf +HOSTLOADLIBES_sampleip += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/sampleip_kern.c b/samples/bpf/sampleip_kern.c new file mode 100644 index 000000000000..774a681f374a --- /dev/null +++ b/samples/bpf/sampleip_kern.c @@ -0,0 +1,38 @@ +/* Copyright 2016 Netflix, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include "bpf_helpers.h" + +#define MAX_IPS 8192 + +struct bpf_map_def SEC("maps") ip_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u64), + .value_size = sizeof(u32), + .max_entries = MAX_IPS, +}; + +SEC("perf_event") +int do_sample(struct bpf_perf_event_data *ctx) +{ + u64 ip; + u32 *value, init_val = 1; + + ip = ctx->regs.ip; + value = bpf_map_lookup_elem(&ip_map, &ip); + if (value) + *value += 1; + else + /* E2BIG not tested for this example only */ + bpf_map_update_elem(&ip_map, &ip, &init_val, BPF_NOEXIST); + + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c new file mode 100644 index 000000000000..260a6bdd6413 --- /dev/null +++ b/samples/bpf/sampleip_user.c @@ -0,0 +1,196 @@ +/* + * sampleip: sample instruction pointer and frequency count in a BPF map. + * + * Copyright 2016 Netflix, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libbpf.h" +#include "bpf_load.h" + +#define DEFAULT_FREQ 99 +#define DEFAULT_SECS 5 +#define MAX_IPS 8192 +#define PAGE_OFFSET 0xffff880000000000 + +static int nr_cpus; + +static void usage(void) +{ + printf("USAGE: sampleip [-F freq] [duration]\n"); + printf(" -F freq # sample frequency (Hertz), default 99\n"); + printf(" duration # sampling duration (seconds), default 5\n"); +} + +static int sampling_start(int *pmu_fd, int freq) +{ + int i; + + struct perf_event_attr pe_sample_attr = { + .type = PERF_TYPE_SOFTWARE, + .freq = 1, + .sample_period = freq, + .config = PERF_COUNT_SW_CPU_CLOCK, + .inherit = 1, + }; + + for (i = 0; i < nr_cpus; i++) { + pmu_fd[i] = perf_event_open(&pe_sample_attr, -1 /* pid */, i, + -1 /* group_fd */, 0 /* flags */); + if (pmu_fd[i] < 0) { + fprintf(stderr, "ERROR: Initializing perf sampling\n"); + return 1; + } + assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF, + prog_fd[0]) == 0); + assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0) == 0); + } + + return 0; +} + +static void sampling_end(int *pmu_fd) +{ + int i; + + for (i = 0; i < nr_cpus; i++) + close(pmu_fd[i]); +} + +struct ipcount { + __u64 ip; + __u32 count; +}; + +/* used for sorting */ +struct ipcount counts[MAX_IPS]; + +static int count_cmp(const void *p1, const void *p2) +{ + return ((struct ipcount *)p1)->count - ((struct ipcount *)p2)->count; +} + +static void print_ip_map(int fd) +{ + struct ksym *sym; + __u64 key, next_key; + __u32 value; + int i, max; + + printf("%-19s %-32s %s\n", "ADDR", "KSYM", "COUNT"); + + /* fetch IPs and counts */ + key = 0, i = 0; + while (bpf_get_next_key(fd, &key, &next_key) == 0) { + bpf_lookup_elem(fd, &next_key, &value); + counts[i].ip = next_key; + counts[i++].count = value; + key = next_key; + } + max = i; + + /* sort and print */ + qsort(counts, max, sizeof(struct ipcount), count_cmp); + for (i = 0; i < max; i++) { + if (counts[i].ip > PAGE_OFFSET) { + sym = ksym_search(counts[i].ip); + printf("0x%-17llx %-32s %u\n", counts[i].ip, sym->name, + counts[i].count); + } else { + printf("0x%-17llx %-32s %u\n", counts[i].ip, "(user)", + counts[i].count); + } + } + + if (max == MAX_IPS) { + printf("WARNING: IP hash was full (max %d entries); ", max); + printf("may have dropped samples\n"); + } +} + +static void int_exit(int sig) +{ + printf("\n"); + print_ip_map(map_fd[0]); + exit(0); +} + +int main(int argc, char **argv) +{ + char filename[256]; + int *pmu_fd, opt, freq = DEFAULT_FREQ, secs = DEFAULT_SECS; + + /* process arguments */ + while ((opt = getopt(argc, argv, "F:h")) != -1) { + switch (opt) { + case 'F': + freq = atoi(optarg); + break; + case 'h': + default: + usage(); + return 0; + } + } + if (argc - optind == 1) + secs = atoi(argv[optind]); + if (freq == 0 || secs == 0) { + usage(); + return 1; + } + + /* initialize kernel symbol translation */ + if (load_kallsyms()) { + fprintf(stderr, "ERROR: loading /proc/kallsyms\n"); + return 2; + } + + /* create perf FDs for each CPU */ + nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + pmu_fd = malloc(nr_cpus * sizeof(int)); + if (pmu_fd == NULL) { + fprintf(stderr, "ERROR: malloc of pmu_fd\n"); + return 1; + } + + /* load BPF program */ + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + if (load_bpf_file(filename)) { + fprintf(stderr, "ERROR: loading BPF program (errno %d):\n", + errno); + if (strcmp(bpf_log_buf, "") == 0) + fprintf(stderr, "Try: ulimit -l unlimited\n"); + else + fprintf(stderr, "%s", bpf_log_buf); + return 1; + } + signal(SIGINT, int_exit); + + /* do sampling */ + printf("Sampling at %d Hertz for %d seconds. Ctrl-C also ends.\n", + freq, secs); + if (sampling_start(pmu_fd, freq) != 0) + return 1; + sleep(secs); + sampling_end(pmu_fd); + free(pmu_fd); + + /* output sample counts */ + print_ip_map(map_fd[0]); + + return 0; +}