From 2872e9ac33a4440173418147351ed4f93177e763 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 19 Jun 2020 14:11:44 -0700 Subject: bpf: Set map_btf_{name, id} for all map types Set map_btf_name and map_btf_id for all map types so that map fields can be accessed by bpf programs. Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/a825f808f22af52b018dbe82f1c7d29dab5fc978.1592600985.git.rdna@fb.com --- kernel/bpf/stackmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/bpf/stackmap.c') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 599488f25e40..27dc9b1b08a5 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -613,6 +613,7 @@ static void stack_map_free(struct bpf_map *map) put_callchain_buffers(); } +static int stack_trace_map_btf_id; const struct bpf_map_ops stack_trace_map_ops = { .map_alloc = stack_map_alloc, .map_free = stack_map_free, @@ -621,6 +622,8 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_stack_map", + .map_btf_id = &stack_trace_map_btf_id, }; static int __init stack_map_init(void) -- cgit v1.2.3 From bba1dc0b55ac462d24ed1228ad49800c238cd6d7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 29 Jun 2020 21:33:39 -0700 Subject: bpf: Remove redundant synchronize_rcu. bpf_free_used_maps() or close(map_fd) will trigger map_free callback. bpf_free_used_maps() is called after bpf prog is no longer executing: bpf_prog_put->call_rcu->bpf_prog_free->bpf_free_used_maps. Hence there is no need to call synchronize_rcu() to protect map elements. Note that hash_of_maps and array_of_maps update/delete inner maps via sys_bpf() that calls maybe_wait_bpf_programs() and synchronize_rcu(). Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Paul E. McKenney Link: https://lore.kernel.org/bpf/20200630043343.53195-2-alexei.starovoitov@gmail.com --- kernel/bpf/arraymap.c | 9 --------- kernel/bpf/hashtab.c | 8 +++----- kernel/bpf/lpm_trie.c | 5 ----- kernel/bpf/queue_stack_maps.c | 7 ------- kernel/bpf/reuseport_array.c | 2 -- kernel/bpf/ringbuf.c | 7 ------- kernel/bpf/stackmap.c | 3 --- 7 files changed, 3 insertions(+), 38 deletions(-) (limited to 'kernel/bpf/stackmap.c') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index ec5cd11032aa..c66e8273fccd 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -386,13 +386,6 @@ static void array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding programs to complete - * and free the array - */ - synchronize_rcu(); - if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); @@ -546,8 +539,6 @@ static void fd_array_map_free(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - synchronize_rcu(); - /* make sure it's empty */ for (i = 0; i < array->map.max_entries; i++) BUG_ON(array->ptrs[i] != NULL); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index acd06081d81d..d4378d7d442b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1290,12 +1290,10 @@ static void htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete + /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. + * bpf_free_used_maps() is called after bpf prog is no longer executing. + * There is no need to synchronize_rcu() here to protect map elements. */ - synchronize_rcu(); /* some of free_htab_elem() callbacks for elements of this map may * not have executed. Wait for them. diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1abd4e3f906d..44474bf3ab7a 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -589,11 +589,6 @@ static void trie_free(struct bpf_map *map) struct lpm_trie_node __rcu **slot; struct lpm_trie_node *node; - /* Wait for outstanding programs to complete - * update/lookup/delete/get_next_key and free the trie. - */ - synchronize_rcu(); - /* Always start at the root and walk down to a node that has no * children. Then free that node, nullify its reference in the parent * and start over. diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 80c66a6d7c54..44184f82916a 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -101,13 +101,6 @@ static void queue_stack_map_free(struct bpf_map *map) { struct bpf_queue_stack *qs = bpf_queue_stack(map); - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete - */ - synchronize_rcu(); - bpf_map_area_free(qs); } diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index a09922f656e4..3625c4fcc65c 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -96,8 +96,6 @@ static void reuseport_array_free(struct bpf_map *map) struct sock *sk; u32 i; - synchronize_rcu(); - /* * ops->map_*_elem() will not be able to access this * array now. Hence, this function only races with diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index dbf37aff4827..13a8d3967e07 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -215,13 +215,6 @@ static void ringbuf_map_free(struct bpf_map *map) { struct bpf_ringbuf_map *rb_map; - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete - */ - synchronize_rcu(); - rb_map = container_of(map, struct bpf_ringbuf_map, map); bpf_ringbuf_free(rb_map->rb); kfree(rb_map); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 27dc9b1b08a5..071f98d0f7c6 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -604,9 +604,6 @@ static void stack_map_free(struct bpf_map *map) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); - /* wait for bpf programs to complete before freeing stack map */ - synchronize_rcu(); - bpf_map_area_free(smap->elems); pcpu_freelist_destroy(&smap->freelist); bpf_map_area_free(smap); -- cgit v1.2.3 From fa28dcb82a38f8e3993b0fae9106b1a80b59e4f0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 29 Jun 2020 23:28:44 -0700 Subject: bpf: Introduce helper bpf_get_task_stack() Introduce helper bpf_get_task_stack(), which dumps stack trace of given task. This is different to bpf_get_stack(), which gets stack track of current task. One potential use case of bpf_get_task_stack() is to call it from bpf_iter__task and dump all /proc//stack to a seq_file. bpf_get_task_stack() uses stack_trace_save_tsk() instead of get_perf_callchain() for kernel stack. The benefit of this choice is that stack_trace_save_tsk() doesn't require changes in arch/. The downside of using stack_trace_save_tsk() is that stack_trace_save_tsk() dumps the stack trace to unsigned long array. For 32-bit systems, we need to translate it to u64 array. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200630062846.664389-3-songliubraving@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 37 +++++++++++++++++++- kernel/bpf/stackmap.c | 77 +++++++++++++++++++++++++++++++++++++++--- kernel/bpf/verifier.c | 4 ++- kernel/trace/bpf_trace.c | 2 ++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 37 +++++++++++++++++++- 7 files changed, 153 insertions(+), 7 deletions(-) (limited to 'kernel/bpf/stackmap.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3d2ade703a35..0cd7f6884c5c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1627,6 +1627,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; +extern const struct bpf_func_proto bpf_get_task_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0cb8ec948816..da9bf35a26f8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3285,6 +3285,39 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to struct task_struct. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * A non-negative value equal to or less than *size* on success, + * or a negative error in case of failure. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3427,7 +3460,9 @@ union bpf_attr { FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ FN(skc_to_tcp_request_sock), \ - FN(skc_to_udp6_sock), + FN(skc_to_udp6_sock), \ + FN(get_task_stack), \ + /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 071f98d0f7c6..5ad72ab2276b 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -348,6 +348,40 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } } +static struct perf_callchain_entry * +get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) +{ + struct perf_callchain_entry *entry; + int rctx; + + entry = get_callchain_entry(&rctx); + + if (!entry) + return NULL; + + entry->nr = init_nr + + stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr), + sysctl_perf_event_max_stack - init_nr, 0); + + /* stack_trace_save_tsk() works on unsigned long array, while + * perf_callchain_entry uses u64 array. For 32-bit systems, it is + * necessary to fix this mismatch. + */ + if (__BITS_PER_LONG != 64) { + unsigned long *from = (unsigned long *) entry->ip; + u64 *to = entry->ip; + int i; + + /* copy data from the end to avoid using extra buffer */ + for (i = entry->nr - 1; i >= (int)init_nr; i--) + to[i] = (u64)(from[i]); + } + + put_callchain_entry(rctx); + + return entry; +} + BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { @@ -448,8 +482,8 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, - u64, flags) +static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, + void *buf, u32 size, u64 flags) { u32 init_nr, trace_nr, copy_len, elem_size, num_elem; bool user_build_id = flags & BPF_F_USER_BUILD_ID; @@ -471,13 +505,22 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, if (unlikely(size % elem_size)) goto clear; + /* cannot get valid user stack for task without user_mode regs */ + if (task && user && !user_mode(regs)) + goto err_fault; + num_elem = size / elem_size; if (sysctl_perf_event_max_stack < num_elem) init_nr = 0; else init_nr = sysctl_perf_event_max_stack - num_elem; - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); + + if (kernel && task) + trace = get_callchain_entry_for_task(task, init_nr); + else + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, + false, false); if (unlikely(!trace)) goto err_fault; @@ -505,6 +548,12 @@ clear: return err; } +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, + u64, flags) +{ + return __bpf_get_stack(regs, NULL, buf, size, flags); +} + const struct bpf_func_proto bpf_get_stack_proto = { .func = bpf_get_stack, .gpl_only = true, @@ -515,6 +564,26 @@ const struct bpf_func_proto bpf_get_stack_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, + u32, size, u64, flags) +{ + struct pt_regs *regs = task_pt_regs(task); + + return __bpf_get_stack(regs, task, buf, size, flags); +} + +static int bpf_get_task_stack_btf_ids[5]; +const struct bpf_func_proto bpf_get_task_stack_proto = { + .func = bpf_get_task_stack, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .btf_id = bpf_get_task_stack_btf_ids, +}; + /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7de98906ddf4..b608185e1ffd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4864,7 +4864,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; - if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) { + if ((func_id == BPF_FUNC_get_stack || + func_id == BPF_FUNC_get_task_stack) && + !env->prog->has_callchain_buf) { const char *err_str; #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5d59dda5f661..977ba3b6f6c6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1137,6 +1137,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ringbuf_query_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; + case BPF_FUNC_get_task_stack: + return &bpf_get_task_stack_proto; default: return NULL; } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 6bab40ff442e..6843376733df 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -426,6 +426,7 @@ class PrinterHelpers(Printer): 'struct tcp_timewait_sock', 'struct tcp_request_sock', 'struct udp6_sock', + 'struct task_struct', 'struct __sk_buff', 'struct sk_msg_md', @@ -468,6 +469,7 @@ class PrinterHelpers(Printer): 'struct tcp_timewait_sock', 'struct tcp_request_sock', 'struct udp6_sock', + 'struct task_struct', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0cb8ec948816..da9bf35a26f8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3285,6 +3285,39 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to struct task_struct. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * A non-negative value equal to or less than *size* on success, + * or a negative error in case of failure. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3427,7 +3460,9 @@ union bpf_attr { FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ FN(skc_to_tcp_request_sock), \ - FN(skc_to_udp6_sock), + FN(skc_to_udp6_sock), \ + FN(get_task_stack), \ + /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 046cc3dd9a2507b8e111714807ab8bf15ea5bb70 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 2 Jul 2020 19:45:37 -0700 Subject: bpf: Fix build without CONFIG_STACKTRACE Without CONFIG_STACKTRACE stack_trace_save_tsk() is not defined. Let get_callchain_entry_for_task() to always return NULL in such cases. Fixes: fa28dcb82a38 ("bpf: Introduce helper bpf_get_task_stack()") Reported-by: kernel test robot Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200703024537.79971-1-songliubraving@fb.com --- kernel/bpf/stackmap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/bpf/stackmap.c') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 5ad72ab2276b..a6c361ed7937 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -351,6 +351,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, static struct perf_callchain_entry * get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) { +#ifdef CONFIG_STACKTRACE struct perf_callchain_entry *entry; int rctx; @@ -380,6 +381,9 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) put_callchain_entry(rctx); return entry; +#else /* CONFIG_STACKTRACE */ + return NULL; +#endif } BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, -- cgit v1.2.3 From c9a0f3b85e09dd16665b639cb884490410619434 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 11 Jul 2020 23:53:24 +0200 Subject: bpf: Resolve BTF IDs in vmlinux image Using BTF_ID_LIST macro to define lists for several helpers using BTF arguments. And running resolve_btfids on vmlinux elf object during linking, so the .BTF_ids section gets the IDs resolved. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Tested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200711215329.41165-5-jolsa@kernel.org --- Makefile | 3 ++- kernel/bpf/stackmap.c | 5 ++++- kernel/trace/bpf_trace.c | 9 +++++++-- net/core/filter.c | 9 +++++++-- scripts/link-vmlinux.sh | 6 ++++++ 5 files changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel/bpf/stackmap.c') diff --git a/Makefile b/Makefile index 017e775b3288..462f1a75fcb3 100644 --- a/Makefile +++ b/Makefile @@ -448,6 +448,7 @@ OBJSIZE = $(CROSS_COMPILE)size STRIP = $(CROSS_COMPILE)strip endif PAHOLE = pahole +RESOLVE_BTFIDS = $(objtree)/tools/bpf/resolve_btfids/resolve_btfids LEX = flex YACC = bison AWK = awk @@ -510,7 +511,7 @@ GCC_PLUGINS_CFLAGS := CLANG_FLAGS := export ARCH SRCARCH CONFIG_SHELL BASH HOSTCC KBUILD_HOSTCFLAGS CROSS_COMPILE LD CC -export CPP AR NM STRIP OBJCOPY OBJDUMP OBJSIZE READELF PAHOLE LEX YACC AWK INSTALLKERNEL +export CPP AR NM STRIP OBJCOPY OBJDUMP OBJSIZE READELF PAHOLE RESOLVE_BTFIDS LEX YACC AWK INSTALLKERNEL export PERL PYTHON PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX export KGZIP KBZIP2 KLZOP LZMA LZ4 XZ export KBUILD_HOSTCXXFLAGS KBUILD_HOSTLDFLAGS KBUILD_HOSTLDLIBS LDFLAGS_MODULE diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index a6c361ed7937..48d8e739975f 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "percpu_freelist.h" #define STACK_CREATE_FLAG_MASK \ @@ -576,7 +577,9 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, return __bpf_get_stack(regs, task, buf, size, flags); } -static int bpf_get_task_stack_btf_ids[5]; +BTF_ID_LIST(bpf_get_task_stack_btf_ids) +BTF_ID(struct, task_struct) + const struct bpf_func_proto bpf_get_task_stack_proto = { .func = bpf_get_task_stack, .gpl_only = false, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e0b7775039ab..e178e8e32b33 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -710,7 +711,9 @@ out: return err; } -static int bpf_seq_printf_btf_ids[5]; +BTF_ID_LIST(bpf_seq_printf_btf_ids) +BTF_ID(struct, seq_file) + static const struct bpf_func_proto bpf_seq_printf_proto = { .func = bpf_seq_printf, .gpl_only = true, @@ -728,7 +731,9 @@ BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) return seq_write(m, data, len) ? -EOVERFLOW : 0; } -static int bpf_seq_write_btf_ids[5]; +BTF_ID_LIST(bpf_seq_write_btf_ids) +BTF_ID(struct, seq_file) + static const struct bpf_func_proto bpf_seq_write_proto = { .func = bpf_seq_write, .gpl_only = true, diff --git a/net/core/filter.c b/net/core/filter.c index ddcc0d6209e1..4e572441e64a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -75,6 +75,7 @@ #include #include #include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -3779,7 +3780,9 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -static int bpf_skb_output_btf_ids[5]; +BTF_ID_LIST(bpf_skb_output_btf_ids) +BTF_ID(struct, sk_buff) + const struct bpf_func_proto bpf_skb_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, @@ -4173,7 +4176,9 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -static int bpf_xdp_output_btf_ids[5]; +BTF_ID_LIST(bpf_xdp_output_btf_ids) +BTF_ID(struct, xdp_buff) + const struct bpf_func_proto bpf_xdp_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 92dd745906f4..e26f02dbedee 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -336,6 +336,12 @@ fi vmlinux_link vmlinux "${kallsymso}" ${btf_vmlinux_bin_o} +# fill in BTF IDs +if [ -n "${CONFIG_DEBUG_INFO_BTF}" ]; then +info BTFIDS vmlinux +${RESOLVE_BTFIDS} vmlinux +fi + if [ -n "${CONFIG_BUILDTIME_TABLE_SORT}" ]; then info SORTTAB vmlinux if ! sorttable vmlinux; then -- cgit v1.2.3 From 7b04d6d60fcfb5b2200ffebb9cfb90927bdfeec7 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 23 Jul 2020 11:06:44 -0700 Subject: bpf: Separate bpf_get_[stack|stackid] for perf events BPF Calling get_perf_callchain() on perf_events from PEBS entries may cause unwinder errors. To fix this issue, the callchain is fetched early. Such perf_events are marked with __PERF_SAMPLE_CALLCHAIN_EARLY. Similarly, calling bpf_get_[stack|stackid] on perf_events from PEBS may also cause unwinder errors. To fix this, add separate version of these two helpers, bpf_get_[stack|stackid]_pe. These two hepers use callchain in bpf_perf_event_data_kern->data->callchain. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723180648.1429892-2-songliubraving@fb.com --- include/linux/bpf.h | 2 + kernel/bpf/stackmap.c | 184 ++++++++++++++++++++++++++++++++++++++++++----- kernel/trace/bpf_trace.c | 4 +- 3 files changed, 170 insertions(+), 20 deletions(-) (limited to 'kernel/bpf/stackmap.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4175cf1f4665..8357be349133 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1675,6 +1675,8 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_get_task_stack_proto; +extern const struct bpf_func_proto bpf_get_stackid_proto_pe; +extern const struct bpf_func_proto bpf_get_stack_proto_pe; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 48d8e739975f..5beb2f8c23da 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -387,11 +388,10 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) #endif } -BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, - u64, flags) +static long __bpf_get_stackid(struct bpf_map *map, + struct perf_callchain_entry *trace, u64 flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); - struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 max_depth = map->value_size / stack_map_data_size(map); /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ @@ -399,21 +399,9 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; - bool kernel = !user; u64 *ips; bool hash_matches; - if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | - BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) - return -EINVAL; - - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); - - if (unlikely(!trace)) - /* couldn't fetch the stack trace */ - return -EFAULT; - /* get_perf_callchain() guarantees that trace->nr >= init_nr * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth */ @@ -478,6 +466,30 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, return id; } +BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags) +{ + u32 max_depth = map->value_size / stack_map_data_size(map); + /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ + u32 init_nr = sysctl_perf_event_max_stack - max_depth; + bool user = flags & BPF_F_USER_STACK; + struct perf_callchain_entry *trace; + bool kernel = !user; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, false, false); + + if (unlikely(!trace)) + /* couldn't fetch the stack trace */ + return -EFAULT; + + return __bpf_get_stackid(map, trace, flags); +} + const struct bpf_func_proto bpf_get_stackid_proto = { .func = bpf_get_stackid, .gpl_only = true, @@ -487,7 +499,77 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; +static __u64 count_kernel_ip(struct perf_callchain_entry *trace) +{ + __u64 nr_kernel = 0; + + while (nr_kernel < trace->nr) { + if (trace->ip[nr_kernel] == PERF_CONTEXT_USER) + break; + nr_kernel++; + } + return nr_kernel; +} + +BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, + struct bpf_map *, map, u64, flags) +{ + struct perf_event *event = ctx->event; + struct perf_callchain_entry *trace; + bool kernel, user; + __u64 nr_kernel; + int ret; + + /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ + if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + return bpf_get_stackid((unsigned long)(ctx->regs), + (unsigned long) map, flags, 0, 0); + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + user = flags & BPF_F_USER_STACK; + kernel = !user; + + trace = ctx->data->callchain; + if (unlikely(!trace)) + return -EFAULT; + + nr_kernel = count_kernel_ip(trace); + + if (kernel) { + __u64 nr = trace->nr; + + trace->nr = nr_kernel; + ret = __bpf_get_stackid(map, trace, flags); + + /* restore nr */ + trace->nr = nr; + } else { /* user */ + u64 skip = flags & BPF_F_SKIP_FIELD_MASK; + + skip += nr_kernel; + if (skip > BPF_F_SKIP_FIELD_MASK) + return -EFAULT; + + flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; + ret = __bpf_get_stackid(map, trace, flags); + } + return ret; +} + +const struct bpf_func_proto bpf_get_stackid_proto_pe = { + .func = bpf_get_stackid_pe, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, + struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags) { u32 init_nr, trace_nr, copy_len, elem_size, num_elem; @@ -520,7 +602,9 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, else init_nr = sysctl_perf_event_max_stack - num_elem; - if (kernel && task) + if (trace_in) + trace = trace_in; + else if (kernel && task) trace = get_callchain_entry_for_task(task, init_nr); else trace = get_perf_callchain(regs, init_nr, kernel, user, @@ -556,7 +640,7 @@ clear: BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, u64, flags) { - return __bpf_get_stack(regs, NULL, buf, size, flags); + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); } const struct bpf_func_proto bpf_get_stack_proto = { @@ -574,7 +658,7 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, { struct pt_regs *regs = task_pt_regs(task); - return __bpf_get_stack(regs, task, buf, size, flags); + return __bpf_get_stack(regs, task, NULL, buf, size, flags); } BTF_ID_LIST(bpf_get_task_stack_btf_ids) @@ -591,6 +675,70 @@ const struct bpf_func_proto bpf_get_task_stack_proto = { .btf_id = bpf_get_task_stack_btf_ids, }; +BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, + void *, buf, u32, size, u64, flags) +{ + struct perf_event *event = ctx->event; + struct perf_callchain_entry *trace; + bool kernel, user; + int err = -EINVAL; + __u64 nr_kernel; + + if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + return __bpf_get_stack(ctx->regs, NULL, NULL, buf, size, flags); + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_USER_BUILD_ID))) + goto clear; + + user = flags & BPF_F_USER_STACK; + kernel = !user; + + err = -EFAULT; + trace = ctx->data->callchain; + if (unlikely(!trace)) + goto clear; + + nr_kernel = count_kernel_ip(trace); + + if (kernel) { + __u64 nr = trace->nr; + + trace->nr = nr_kernel; + err = __bpf_get_stack(ctx->regs, NULL, trace, buf, + size, flags); + + /* restore nr */ + trace->nr = nr; + } else { /* user */ + u64 skip = flags & BPF_F_SKIP_FIELD_MASK; + + skip += nr_kernel; + if (skip > BPF_F_SKIP_FIELD_MASK) + goto clear; + + flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; + err = __bpf_get_stack(ctx->regs, NULL, trace, buf, + size, flags); + } + return err; + +clear: + memset(buf, 0, size); + return err; + +} + +const struct bpf_func_proto bpf_get_stack_proto_pe = { + .func = bpf_get_stack_pe, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3cc0dcb60ca2..cb91ef902cc4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1411,9 +1411,9 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: - return &bpf_get_stackid_proto_tp; + return &bpf_get_stackid_proto_pe; case BPF_FUNC_get_stack: - return &bpf_get_stack_proto_tp; + return &bpf_get_stack_proto_pe; case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; case BPF_FUNC_read_branch_records: -- cgit v1.2.3 From 2b9b305fcdda1810bdffeb599361174eb2cd0b7c Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 24 Jul 2020 13:05:02 -0700 Subject: bpf: Fix build on architectures with special bpf_user_pt_regs_t Architectures like s390, powerpc, arm64, riscv have speical definition of bpf_user_pt_regs_t. So we need to cast the pointer before passing it to bpf_get_stack(). This is similar to bpf_get_stack_tp(). Fixes: 03d42fd2d83f ("bpf: Separate bpf_get_[stack|stackid] for perf events BPF") Reported-by: kernel test robot Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200724200503.3629591-1-songliubraving@fb.com --- kernel/bpf/stackmap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel/bpf/stackmap.c') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 5beb2f8c23da..4fd830a62be2 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -678,6 +678,7 @@ const struct bpf_func_proto bpf_get_task_stack_proto = { BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, void *, buf, u32, size, u64, flags) { + struct pt_regs *regs = (struct pt_regs *)(ctx->regs); struct perf_event *event = ctx->event; struct perf_callchain_entry *trace; bool kernel, user; @@ -685,7 +686,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, __u64 nr_kernel; if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) - return __bpf_get_stack(ctx->regs, NULL, NULL, buf, size, flags); + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_USER_BUILD_ID))) @@ -705,8 +706,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, __u64 nr = trace->nr; trace->nr = nr_kernel; - err = __bpf_get_stack(ctx->regs, NULL, trace, buf, - size, flags); + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); /* restore nr */ trace->nr = nr; @@ -718,8 +718,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, goto clear; flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; - err = __bpf_get_stack(ctx->regs, NULL, trace, buf, - size, flags); + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); } return err; -- cgit v1.2.3 From b33164f2bd1cedb094c32cb466287116164457ae Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 12 Aug 2020 14:31:02 +0200 Subject: bpf: Iterate through all PT_NOTE sections when looking for build id Currently when we look for build id within bpf_get_stackid helper call, we check the first NOTE section and we fail if build id is not there. However on some system (Fedora) there can be multiple NOTE sections in binaries and build id data is not always the first one, like: $ readelf -a /usr/bin/ls ... [ 2] .note.gnu.propert NOTE 0000000000000338 00000338 0000000000000020 0000000000000000 A 0 0 8358 [ 3] .note.gnu.build-i NOTE 0000000000000358 00000358 0000000000000024 0000000000000000 A 0 0 437c [ 4] .note.ABI-tag NOTE 000000000000037c 0000037c ... So the stack_map_get_build_id function will fail on build id retrieval and fallback to BPF_STACK_BUILD_ID_IP. This patch is changing the stack_map_get_build_id code to iterate through all the NOTE sections and try to get build id data from each of them. When tracing on sched_switch tracepoint that does bpf_get_stackid helper call kernel build, I can see about 60% increase of successful build id retrieval. The rest seems fails on -EFAULT. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200812123102.20032-1-jolsa@kernel.org --- kernel/bpf/stackmap.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel/bpf/stackmap.c') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4fd830a62be2..cfed0ac44d38 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -213,11 +213,13 @@ static int stack_map_get_build_id_32(void *page_addr, phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); - for (i = 0; i < ehdr->e_phnum; ++i) - if (phdr[i].p_type == PT_NOTE) - return stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz); + for (i = 0; i < ehdr->e_phnum; ++i) { + if (phdr[i].p_type == PT_NOTE && + !stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz)) + return 0; + } return -EINVAL; } @@ -236,11 +238,13 @@ static int stack_map_get_build_id_64(void *page_addr, phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); - for (i = 0; i < ehdr->e_phnum; ++i) - if (phdr[i].p_type == PT_NOTE) - return stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz); + for (i = 0; i < ehdr->e_phnum; ++i) { + if (phdr[i].p_type == PT_NOTE && + !stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz)) + return 0; + } return -EINVAL; } -- cgit v1.2.3