author | Tejun Heo <tj@kernel.org> | 2024-09-04 11:41:32 -1000
---|---|---
committer | Tejun Heo <tj@kernel.org> | 2024-09-04 11:41:32 -1000
commit | 649e980dadee36f961738d054627225542d547a2 (patch) |
tree | a67f0dc4ea53f03c85f5e0648f83f3ff0e4877f7 /kernel |
parent | a4103eacc2ab408bb65e9902f0857b219fb489de (diff) |
parent | 2ad6d23f465a4f851e3bcf6d74c315ce7b2c205b (diff) |
Merge branch 'bpf/master' into for-6.12
Pull bpf/master to receive baebe9aaba1e ("bpf: allow passing struct
bpf_iter_<type> as kfunc arguments") and related changes in preparation for
the DSQ iterator patchset.
Signed-off-by: Tejun Heo <tj@kernel.org>
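For context on the change this merge pulls in: baebe9aaba1e teaches the verifier to accept a struct bpf_iter_<type> pointer as an argument to arbitrary kfuncs, keyed off an "__iter" suffix on the parameter name (see btf_check_iter_arg() and is_kfunc_arg_iter() in the diff below). The following is a minimal sketch of what a kernel-side consumer could look like; bpf_iter_num_sum() is a hypothetical name used only for illustration, and the sketch assumes the existing bpf_iter_num state type and bpf_iter_num_next() kfunc.

```c
/*
 * Hypothetical sketch, not part of this merge: a kfunc that takes an
 * already-initialized numbers-iterator state passed in by a BPF program.
 */
#include <linux/bpf.h>
#include <linux/btf.h>

/* The "__iter" suffix on the parameter name tells the verifier (via
 * btf_check_iter_arg()/process_iter_arg()) that the caller must pass a
 * pointer to valid, initialized struct bpf_iter_num stack slots.
 */
__bpf_kfunc int bpf_iter_num_sum(struct bpf_iter_num *it__iter)
{
	int *cur, sum = 0;

	/* bpf_iter_num_next() returns NULL once the iterator is drained */
	while ((cur = bpf_iter_num_next(it__iter)))
		sum += *cur;

	return sum;
}
```

From the BPF program side, such a kfunc would be called between bpf_iter_num_new() and bpf_iter_num_destroy(), exactly like bpf_iter_num_next() is today; the verifier only requires that the iterator state be initialized before the call.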
Diffstat (limited to 'kernel')
54 files changed, 1365 insertions, 778 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 0291eef9ce92..9b9c151b5c82 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -52,9 +52,3 @@ obj-$(CONFIG_BPF_PRELOAD) += preload/ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o - -# Some source files are common to libbpf. -vpath %.c $(srctree)/kernel/bpf:$(srctree)/tools/lib/bpf - -$(obj)/%.o: %.c FORCE - $(call if_changed_rule,cc_o_c) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index feabc0193852..a43e62e2a8bb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -494,7 +494,7 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, if (map->btf_key_type_id) seq_printf(m, "%u: ", *(u32 *)key); btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); - seq_puts(m, "\n"); + seq_putc(m, '\n'); rcu_read_unlock(); } @@ -515,7 +515,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, seq_printf(m, "\tcpu%d: ", cpu); btf_type_seq_show(map->btf, map->btf_value_type_id, per_cpu_ptr(pptr, cpu), m); - seq_puts(m, "\n"); + seq_putc(m, '\n'); } seq_puts(m, "}\n"); @@ -600,7 +600,7 @@ static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos) array = container_of(map, struct bpf_array, map); index = info->index & array->index_mask; if (info->percpu_value_buf) - return array->pptrs[index]; + return (void *)(uintptr_t)array->pptrs[index]; return array_map_elem_ptr(array, index); } @@ -619,7 +619,7 @@ static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) array = container_of(map, struct bpf_array, map); index = info->index & array->index_mask; if (info->percpu_value_buf) - return array->pptrs[index]; + return (void *)(uintptr_t)array->pptrs[index]; return array_map_elem_ptr(array, index); } @@ -632,7 +632,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) struct bpf_iter_meta meta; struct bpf_prog *prog; int off = 0, cpu = 0; - void __percpu **pptr; + void __percpu *pptr; u32 size; meta.seq = seq; @@ -648,7 +648,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) if (!info->percpu_value_buf) { ctx.value = v; } else { - pptr = v; + pptr = (void __percpu *)(uintptr_t)v; size = array->elem_size; for_each_possible_cpu(cpu) { copy_map_value_long(map, info->percpu_value_buf + off, @@ -993,7 +993,7 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, prog_id = prog_fd_array_sys_lookup_elem(ptr); btf_type_seq_show(map->btf, map->btf_value_type_id, &prog_id, m); - seq_puts(m, "\n"); + seq_putc(m, '\n'); } } diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 08a338e1f231..6292ac5f9bd1 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -11,7 +11,6 @@ #include <linux/lsm_hooks.h> #include <linux/bpf_lsm.h> #include <linux/kallsyms.h> -#include <linux/bpf_verifier.h> #include <net/bpf_sk_storage.h> #include <linux/bpf_local_storage.h> #include <linux/btf_ids.h> @@ -36,6 +35,24 @@ BTF_SET_START(bpf_lsm_hooks) #undef LSM_HOOK BTF_SET_END(bpf_lsm_hooks) +BTF_SET_START(bpf_lsm_disabled_hooks) +BTF_ID(func, bpf_lsm_vm_enough_memory) +BTF_ID(func, bpf_lsm_inode_need_killpriv) +BTF_ID(func, bpf_lsm_inode_getsecurity) +BTF_ID(func, bpf_lsm_inode_listsecurity) +BTF_ID(func, bpf_lsm_inode_copy_up_xattr) +BTF_ID(func, bpf_lsm_getselfattr) +BTF_ID(func, bpf_lsm_getprocattr) +BTF_ID(func, bpf_lsm_setprocattr) +#ifdef CONFIG_KEYS +BTF_ID(func, bpf_lsm_key_getsecurity) +#endif +#ifdef 
CONFIG_AUDIT +BTF_ID(func, bpf_lsm_audit_rule_match) +#endif +BTF_ID(func, bpf_lsm_ismaclabel) +BTF_SET_END(bpf_lsm_disabled_hooks) + /* List of LSM hooks that should operate on 'current' cgroup regardless * of function signature. */ @@ -97,15 +114,24 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { + u32 btf_id = prog->aux->attach_btf_id; + const char *func_name = prog->aux->attach_func_name; + if (!prog->gpl_compatible) { bpf_log(vlog, "LSM programs must have a GPL compatible license\n"); return -EINVAL; } - if (!btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id)) { + if (btf_id_set_contains(&bpf_lsm_disabled_hooks, btf_id)) { + bpf_log(vlog, "attach_btf_id %u points to disabled hook %s\n", + btf_id, func_name); + return -EINVAL; + } + + if (!btf_id_set_contains(&bpf_lsm_hooks, btf_id)) { bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n", - prog->aux->attach_btf_id, prog->aux->attach_func_name); + btf_id, func_name); return -EINVAL; } @@ -390,3 +416,36 @@ const struct bpf_verifier_ops lsm_verifier_ops = { .get_func_proto = bpf_lsm_func_proto, .is_valid_access = btf_ctx_access, }; + +/* hooks return 0 or 1 */ +BTF_SET_START(bool_lsm_hooks) +#ifdef CONFIG_SECURITY_NETWORK_XFRM +BTF_ID(func, bpf_lsm_xfrm_state_pol_flow_match) +#endif +#ifdef CONFIG_AUDIT +BTF_ID(func, bpf_lsm_audit_rule_known) +#endif +BTF_ID(func, bpf_lsm_inode_xattr_skipcap) +BTF_SET_END(bool_lsm_hooks) + +int bpf_lsm_get_retval_range(const struct bpf_prog *prog, + struct bpf_retval_range *retval_range) +{ + /* no return value range for void hooks */ + if (!prog->aux->attach_func_proto->type) + return -EINVAL; + + if (btf_id_set_contains(&bool_lsm_hooks, prog->aux->attach_btf_id)) { + retval_range->minval = 0; + retval_range->maxval = 1; + } else { + /* All other available LSM hooks, except task_prctl, return 0 + * on success and negative error code on failure. + * To keep things simple, we only allow bpf progs to return 0 + * or negative errno for task_prctl too. + */ + retval_range->minval = -MAX_ERRNO; + retval_range->maxval = 0; + } + return 0; +} diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 0d515ec57aa5..fda3dd2ee984 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -837,7 +837,7 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, btf_type_seq_show(st_map->btf, map->btf_vmlinux_value_type_id, value, m); - seq_puts(m, "\n"); + seq_putc(m, '\n'); } kfree(value); @@ -1040,6 +1040,13 @@ void bpf_struct_ops_put(const void *kdata) bpf_map_put(&st_map->map); } +int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) +{ + void *func_ptr = *(void **)(st_ops->cfi_stubs + moff); + + return func_ptr ? 
0 : -ENOTSUPP; +} + static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 520f49f422fe..1e29281653c6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -212,7 +212,7 @@ enum btf_kfunc_hook { BTF_KFUNC_HOOK_TRACING, BTF_KFUNC_HOOK_SYSCALL, BTF_KFUNC_HOOK_FMODRET, - BTF_KFUNC_HOOK_CGROUP_SKB, + BTF_KFUNC_HOOK_CGROUP, BTF_KFUNC_HOOK_SCHED_ACT, BTF_KFUNC_HOOK_SK_SKB, BTF_KFUNC_HOOK_SOCKET_FILTER, @@ -790,7 +790,7 @@ const char *btf_str_by_offset(const struct btf *btf, u32 offset) return NULL; } -static bool __btf_name_valid(const struct btf *btf, u32 offset) +static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) { /* offset must be valid */ const char *src = btf_str_by_offset(btf, offset); @@ -811,11 +811,6 @@ static bool __btf_name_valid(const struct btf *btf, u32 offset) return !*src; } -static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) -{ - return __btf_name_valid(btf, offset); -} - /* Allow any printable character in DATASEC names */ static bool btf_name_valid_section(const struct btf *btf, u32 offset) { @@ -3759,6 +3754,7 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t, return -EINVAL; } +/* Callers have to ensure the life cycle of btf if it is program BTF */ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, struct btf_field_info *info) { @@ -3787,7 +3783,6 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, field->kptr.dtor = NULL; id = info->kptr.type_id; kptr_btf = (struct btf *)btf; - btf_get(kptr_btf); goto found_dtor; } if (id < 0) @@ -4629,7 +4624,7 @@ static s32 btf_var_check_meta(struct btf_verifier_env *env, } if (!t->name_off || - !__btf_name_valid(env->btf, t->name_off)) { + !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } @@ -5517,36 +5512,70 @@ static const char *alloc_obj_fields[] = { static struct btf_struct_metas * btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) { - union { - struct btf_id_set set; - struct { - u32 _cnt; - u32 _ids[ARRAY_SIZE(alloc_obj_fields)]; - } _arr; - } aof; struct btf_struct_metas *tab = NULL; + struct btf_id_set *aof; int i, n, id, ret; BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0); BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32)); - memset(&aof, 0, sizeof(aof)); + aof = kmalloc(sizeof(*aof), GFP_KERNEL | __GFP_NOWARN); + if (!aof) + return ERR_PTR(-ENOMEM); + aof->cnt = 0; + for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) { /* Try to find whether this special type exists in user BTF, and * if so remember its ID so we can easily find it among members * of structs that we iterate in the next loop. 
*/ + struct btf_id_set *new_aof; + id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT); if (id < 0) continue; - aof.set.ids[aof.set.cnt++] = id; + + new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]), + GFP_KERNEL | __GFP_NOWARN); + if (!new_aof) { + ret = -ENOMEM; + goto free_aof; + } + aof = new_aof; + aof->ids[aof->cnt++] = id; } - if (!aof.set.cnt) + n = btf_nr_types(btf); + for (i = 1; i < n; i++) { + /* Try to find if there are kptrs in user BTF and remember their ID */ + struct btf_id_set *new_aof; + struct btf_field_info tmp; + const struct btf_type *t; + + t = btf_type_by_id(btf, i); + if (!t) { + ret = -EINVAL; + goto free_aof; + } + + ret = btf_find_kptr(btf, t, 0, 0, &tmp); + if (ret != BTF_FIELD_FOUND) + continue; + + new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]), + GFP_KERNEL | __GFP_NOWARN); + if (!new_aof) { + ret = -ENOMEM; + goto free_aof; + } + aof = new_aof; + aof->ids[aof->cnt++] = i; + } + + if (!aof->cnt) return NULL; - sort(&aof.set.ids, aof.set.cnt, sizeof(aof.set.ids[0]), btf_id_cmp_func, NULL); + sort(&aof->ids, aof->cnt, sizeof(aof->ids[0]), btf_id_cmp_func, NULL); - n = btf_nr_types(btf); for (i = 1; i < n; i++) { struct btf_struct_metas *new_tab; const struct btf_member *member; @@ -5556,17 +5585,13 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) int j, tab_cnt; t = btf_type_by_id(btf, i); - if (!t) { - ret = -EINVAL; - goto free; - } if (!__btf_type_is_struct(t)) continue; cond_resched(); for_each_member(j, t, member) { - if (btf_id_set_contains(&aof.set, member->type)) + if (btf_id_set_contains(aof, member->type)) goto parse; } continue; @@ -5585,7 +5610,8 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) type = &tab->types[tab->cnt]; type->btf_id = i; record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | - BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size); + BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT | + BPF_KPTR, t->size); /* The record cannot be unset, treat it as an error if so */ if (IS_ERR_OR_NULL(record)) { ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT; @@ -5594,9 +5620,12 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) type->record = record; tab->cnt++; } + kfree(aof); return tab; free: btf_struct_metas_free(tab); +free_aof: + kfree(aof); return ERR_PTR(ret); } @@ -6243,12 +6272,11 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, btf->kernel_btf = true; snprintf(btf->name, sizeof(btf->name), "%s", module_name); - btf->data = kvmalloc(data_size, GFP_KERNEL | __GFP_NOWARN); + btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN); if (!btf->data) { err = -ENOMEM; goto errout; } - memcpy(btf->data, data, data_size); btf->data_size = data_size; err = btf_parse_hdr(env); @@ -6416,8 +6444,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (arg == nr_args) { switch (prog->expected_attach_type) { - case BPF_LSM_CGROUP: case BPF_LSM_MAC: + /* mark we are accessing the return value */ + info->is_retval = true; + fallthrough; + case BPF_LSM_CGROUP: case BPF_TRACE_FEXIT: /* When LSM programs are attached to void LSM hooks * they use FEXIT trampolines and when attached to @@ -8049,15 +8080,44 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE) BTF_TRACING_TYPE_xxx #undef BTF_TRACING_TYPE +/* Validate well-formedness of iter argument type. + * On success, return positive BTF ID of iter state's STRUCT type. 
+ * On error, negative error is returned. + */ +int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx) +{ + const struct btf_param *arg; + const struct btf_type *t; + const char *name; + int btf_id; + + if (btf_type_vlen(func) <= arg_idx) + return -EINVAL; + + arg = &btf_params(func)[arg_idx]; + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!t || !btf_type_is_ptr(t)) + return -EINVAL; + t = btf_type_skip_modifiers(btf, t->type, &btf_id); + if (!t || !__btf_type_is_struct(t)) + return -EINVAL; + + name = btf_name_by_offset(btf, t->name_off); + if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1)) + return -EINVAL; + + return btf_id; +} + static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name, const struct btf_type *func, u32 func_flags) { u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY); - const char *name, *sfx, *iter_name; - const struct btf_param *arg; + const char *sfx, *iter_name; const struct btf_type *t; char exp_name[128]; u32 nr_args; + int btf_id; /* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */ if (!flags || (flags & (flags - 1))) @@ -8068,28 +8128,21 @@ static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name, if (nr_args < 1) return -EINVAL; - arg = &btf_params(func)[0]; - t = btf_type_skip_modifiers(btf, arg->type, NULL); - if (!t || !btf_type_is_ptr(t)) - return -EINVAL; - t = btf_type_skip_modifiers(btf, t->type, NULL); - if (!t || !__btf_type_is_struct(t)) - return -EINVAL; - - name = btf_name_by_offset(btf, t->name_off); - if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1)) - return -EINVAL; + btf_id = btf_check_iter_arg(btf, func, 0); + if (btf_id < 0) + return btf_id; /* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to * fit nicely in stack slots */ + t = btf_type_by_id(btf, btf_id); if (t->size == 0 || (t->size % 8)) return -EINVAL; /* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *) * naming pattern */ - iter_name = name + sizeof(ITER_PREFIX) - 1; + iter_name = btf_name_by_offset(btf, t->name_off) + sizeof(ITER_PREFIX) - 1; if (flags & KF_ITER_NEW) sfx = "new"; else if (flags & KF_ITER_NEXT) @@ -8309,8 +8362,12 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_SYSCALL: return BTF_KFUNC_HOOK_SYSCALL; case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - return BTF_KFUNC_HOOK_CGROUP_SKB; + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + return BTF_KFUNC_HOOK_CGROUP; case BPF_PROG_TYPE_SCHED_ACT: return BTF_KFUNC_HOOK_SCHED_ACT; case BPF_PROG_TYPE_SK_SKB: @@ -8886,6 +8943,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, struct bpf_core_cand_list cands = {}; struct bpf_core_relo_res targ_res; struct bpf_core_spec *specs; + const struct btf_type *type; int err; /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5" @@ -8895,6 +8953,13 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, if (!specs) return -ENOMEM; + type = btf_type_by_id(ctx->btf, relo->type_id); + if (!type) { + bpf_log(ctx->log, "relo #%u: bad type id %u\n", + relo_idx, relo->type_id); + return -EINVAL; + } + if (need_cands) { struct bpf_cand_cache *cc; int i; diff --git a/kernel/bpf/btf_iter.c b/kernel/bpf/btf_iter.c new file mode 100644 index 000000000000..0e2c66a52df9 --- /dev/null +++ b/kernel/bpf/btf_iter.c @@ -0,0 
+1,2 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#include "../../tools/lib/bpf/btf_iter.c" diff --git a/kernel/bpf/btf_relocate.c b/kernel/bpf/btf_relocate.c new file mode 100644 index 000000000000..c12ccbf66507 --- /dev/null +++ b/kernel/bpf/btf_relocate.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#include "../../tools/lib/bpf/btf_relocate.c" diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8ba73042a239..e7113d700b87 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2581,6 +2581,8 @@ cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; #endif + case BPF_FUNC_current_task_under_cgroup: + return &bpf_current_task_under_cgroup_proto; default: return NULL; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7ee62e38faf0..4e07cc057d6f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2302,6 +2302,7 @@ bool bpf_prog_map_compatible(struct bpf_map *map, { enum bpf_prog_type prog_type = resolve_prog_type(fp); bool ret; + struct bpf_prog_aux *aux = fp->aux; if (fp->kprobe_override) return false; @@ -2311,7 +2312,7 @@ bool bpf_prog_map_compatible(struct bpf_map *map, * in the case of devmap and cpumap). Until device checks * are implemented, prohibit adding dev-bound programs to program maps. */ - if (bpf_prog_is_dev_bound(fp->aux)) + if (bpf_prog_is_dev_bound(aux)) return false; spin_lock(&map->owner.lock); @@ -2321,12 +2322,26 @@ bool bpf_prog_map_compatible(struct bpf_map *map, */ map->owner.type = prog_type; map->owner.jited = fp->jited; - map->owner.xdp_has_frags = fp->aux->xdp_has_frags; + map->owner.xdp_has_frags = aux->xdp_has_frags; + map->owner.attach_func_proto = aux->attach_func_proto; ret = true; } else { ret = map->owner.type == prog_type && map->owner.jited == fp->jited && - map->owner.xdp_has_frags == fp->aux->xdp_has_frags; + map->owner.xdp_has_frags == aux->xdp_has_frags; + if (ret && + map->owner.attach_func_proto != aux->attach_func_proto) { + switch (prog_type) { + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_STRUCT_OPS: + ret = false; + break; + default: + break; + } + } } spin_unlock(&map->owner.lock); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 06115f8728e8..45c7195b65ba 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1049,14 +1049,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, pptr = htab_elem_get_ptr(l_new, key_size); } else { /* alloc_percpu zero-fills */ - pptr = bpf_mem_cache_alloc(&htab->pcpu_ma); - if (!pptr) { + void *ptr = bpf_mem_cache_alloc(&htab->pcpu_ma); + + if (!ptr) { bpf_mem_cache_free(&htab->ma, l_new); l_new = ERR_PTR(-ENOMEM); goto dec_count; } - l_new->ptr_to_pptr = pptr; - pptr = *(void **)pptr; + l_new->ptr_to_pptr = ptr; + pptr = *(void __percpu **)ptr; } pcpu_init_value(htab, pptr, value, onallcpus); @@ -1586,7 +1587,7 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); seq_puts(m, ": "); btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); - seq_puts(m, "\n"); + seq_putc(m, '\n'); rcu_read_unlock(); } @@ -2450,7 +2451,7 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, seq_printf(m, "\tcpu%d: ", cpu); btf_type_seq_show(map->btf, map->btf_value_type_id, per_cpu_ptr(pptr, cpu), m); - seq_puts(m, "\n"); + seq_putc(m, '\n'); } 
seq_puts(m, "}\n"); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b5f0adae8293..3956be5d6440 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -158,6 +158,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .func = bpf_get_smp_processor_id, .gpl_only = false, .ret_type = RET_INTEGER, + .allow_fastcall = true, }; BPF_CALL_0(bpf_get_numa_node_id) @@ -714,7 +715,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) if (cpu >= nr_cpu_ids) return (unsigned long)NULL; - return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu); + return (unsigned long)per_cpu_ptr((const void __percpu *)(const uintptr_t)ptr, cpu); } const struct bpf_func_proto bpf_per_cpu_ptr_proto = { @@ -727,7 +728,7 @@ const struct bpf_func_proto bpf_per_cpu_ptr_proto = { BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) { - return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr); + return (unsigned long)this_cpu_ptr((const void __percpu *)(const uintptr_t)percpu_ptr); } const struct bpf_func_proto bpf_this_cpu_ptr_proto = { @@ -1618,9 +1619,9 @@ void bpf_wq_cancel_and_free(void *val) schedule_work(&work->delete_work); } -BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) +BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr) { - unsigned long *kptr = map_value; + unsigned long *kptr = dst; /* This helper may be inlined by verifier. */ return xchg(kptr, (unsigned long)ptr); @@ -1635,7 +1636,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .ret_btf_id = BPF_PTR_POISON, - .arg1_type = ARG_PTR_TO_KPTR, + .arg1_type = ARG_KPTR_XCHG_DEST, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE, .arg2_btf_id = BPF_PTR_POISON, }; @@ -2033,6 +2034,7 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return NULL; } } +EXPORT_SYMBOL_GPL(bpf_base_func_proto); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock) @@ -2457,6 +2459,29 @@ __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task, return ret; } +BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct cgroup *cgrp; + + if (unlikely(idx >= array->map.max_entries)) + return -E2BIG; + + cgrp = READ_ONCE(array->ptrs[idx]); + if (unlikely(!cgrp)) + return -EAGAIN; + + return task_under_cgroup_hierarchy(current, cgrp); +} + +const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { + .func = bpf_current_task_under_cgroup, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + /** * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its @@ -2938,6 +2963,47 @@ __bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it) bpf_mem_free(&bpf_global_ma, kit->bits); } +/** + * bpf_copy_from_user_str() - Copy a string from an unsafe user address + * @dst: Destination address, in kernel space. This buffer must be + * at least @dst__sz bytes long. + * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL. + * @unsafe_ptr__ign: Source address, in user space. + * @flags: The only supported flag is BPF_F_PAD_ZEROS + * + * Copies a NUL-terminated string from userspace to BPF space. 
If user string is + * too long this will still ensure zero termination in the dst buffer unless + * buffer size is 0. + * + * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst to 0 on success and + * memset all of @dst on failure. + */ +__bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, u64 flags) +{ + int ret; + + if (unlikely(flags & ~BPF_F_PAD_ZEROS)) + return -EINVAL; + + if (unlikely(!dst__sz)) + return 0; + + ret = strncpy_from_user(dst, unsafe_ptr__ign, dst__sz - 1); + if (ret < 0) { + if (flags & BPF_F_PAD_ZEROS) + memset((char *)dst, 0, dst__sz); + + return ret; + } + + if (flags & BPF_F_PAD_ZEROS) + memset((char *)dst + ret, 0, dst__sz - ret); + else + ((char *)dst)[ret] = '\0'; + + return ret + 1; +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -3023,6 +3089,7 @@ BTF_ID_FLAGS(func, bpf_preempt_enable) BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { @@ -3051,6 +3118,7 @@ static int __init kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &generic_kfunc_set); ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors, ARRAY_SIZE(generic_dtors), THIS_MODULE); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index af5d2ffadd70..d8fc5eba529d 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -709,10 +709,10 @@ static void seq_print_delegate_opts(struct seq_file *m, msk = 1ULL << e->val; if (delegate_msk & msk) { /* emit lower-case name without prefix */ - seq_printf(m, "%c", first ? '=' : ':'); + seq_putc(m, first ? 
'=' : ':'); name += pfx_len; while (*name) { - seq_printf(m, "%c", tolower(*name)); + seq_putc(m, tolower(*name)); name++; } diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index a04f505aefe9..3969eb0382af 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -431,7 +431,7 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, seq_puts(m, ": "); btf_type_seq_show(map->btf, map->btf_value_type_id, &READ_ONCE(storage->buf)->data[0], m); - seq_puts(m, "\n"); + seq_putc(m, '\n'); } else { seq_puts(m, ": {\n"); for_each_possible_cpu(cpu) { @@ -439,7 +439,7 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, btf_type_seq_show(map->btf, map->btf_value_type_id, per_cpu_ptr(storage->percpu_buf, cpu), m); - seq_puts(m, "\n"); + seq_putc(m, '\n'); } seq_puts(m, "}\n"); } diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index dec892ded031..b3858a76e0b3 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -138,8 +138,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head) static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags) { if (c->percpu_size) { - void **obj = kmalloc_node(c->percpu_size, flags, node); - void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); + void __percpu **obj = kmalloc_node(c->percpu_size, flags, node); + void __percpu *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); if (!obj || !pptr) { free_percpu(pptr); @@ -253,7 +253,7 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic) static void free_one(void *obj, bool percpu) { if (percpu) { - free_percpu(((void **)obj)[1]); + free_percpu(((void __percpu **)obj)[1]); kfree(obj); return; } @@ -509,8 +509,8 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) { - struct bpf_mem_caches *cc, __percpu *pcc; - struct bpf_mem_cache *c, __percpu *pc; + struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc; + struct bpf_mem_cache *c; struct bpf_mem_cache __percpu *pc; struct obj_cgroup *objcg = NULL; int cpu, i, unit_size, percpu_size = 0; @@ -591,7 +591,7 @@ int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size) { - struct bpf_mem_caches *cc, __percpu *pcc; + struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc; int cpu, i, unit_size, percpu_size; struct obj_cgroup *objcg; struct bpf_mem_cache *c; diff --git a/kernel/bpf/relo_core.c b/kernel/bpf/relo_core.c new file mode 100644 index 000000000000..aa822c9fcfde --- /dev/null +++ b/kernel/bpf/relo_core.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#include "../../tools/lib/bpf/relo_core.c" diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 4b4f9670f1a9..49b8e5a0c6b4 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -308,7 +308,7 @@ put_file_unlock: spin_unlock_bh(&reuseport_lock); put_file: - fput(socket->file); + sockfd_put(socket); return err; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bf6c5f685ea2..fc62f5c4faf9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -550,7 +550,8 @@ void btf_record_free(struct btf_record *rec) case BPF_KPTR_PERCPU: if (rec->fields[i].kptr.module) module_put(rec->fields[i].kptr.module); - btf_put(rec->fields[i].kptr.btf); + if 
(btf_is_kernel(rec->fields[i].kptr.btf)) + btf_put(rec->fields[i].kptr.btf); break; case BPF_LIST_HEAD: case BPF_LIST_NODE: @@ -596,7 +597,8 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: - btf_get(fields[i].kptr.btf); + if (btf_is_kernel(fields[i].kptr.btf)) + btf_get(fields[i].kptr.btf); if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { ret = -ENXIO; goto free; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4cb5441ad75f..217eb0eafa2a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2182,6 +2182,44 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) reg->smin_value = max_t(s64, reg->smin_value, new_smin); reg->smax_value = min_t(s64, reg->smax_value, new_smax); } + + /* Here we would like to handle a special case after sign extending load, + * when upper bits for a 64-bit range are all 1s or all 0s. + * + * Upper bits are all 1s when register is in a range: + * [0xffff_ffff_0000_0000, 0xffff_ffff_ffff_ffff] + * Upper bits are all 0s when register is in a range: + * [0x0000_0000_0000_0000, 0x0000_0000_ffff_ffff] + * Together this forms are continuous range: + * [0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff] + * + * Now, suppose that register range is in fact tighter: + * [0xffff_ffff_8000_0000, 0x0000_0000_ffff_ffff] (R) + * Also suppose that it's 32-bit range is positive, + * meaning that lower 32-bits of the full 64-bit register + * are in the range: + * [0x0000_0000, 0x7fff_ffff] (W) + * + * If this happens, then any value in a range: + * [0xffff_ffff_0000_0000, 0xffff_ffff_7fff_ffff] + * is smaller than a lowest bound of the range (R): + * 0xffff_ffff_8000_0000 + * which means that upper bits of the full 64-bit register + * can't be all 1s, when lower bits are in range (W). + * + * Note that: + * - 0xffff_ffff_8000_0000 == (s64)S32_MIN + * - 0x0000_0000_7fff_ffff == (s64)S32_MAX + * These relations are used in the conditions below. 
+ */ + if (reg->s32_min_value >= 0 && reg->smin_value >= S32_MIN && reg->smax_value <= S32_MAX) { + reg->smin_value = reg->s32_min_value; + reg->smax_value = reg->s32_max_value; + reg->umin_value = reg->s32_min_value; + reg->umax_value = reg->s32_max_value; + reg->var_off = tnum_intersect(reg->var_off, + tnum_range(reg->smin_value, reg->smax_value)); + } } static void __reg_deduce_bounds(struct bpf_reg_state *reg) @@ -2334,6 +2372,25 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, __mark_reg_unknown(env, regs + regno); } +static int __mark_reg_s32_range(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, + u32 regno, + s32 s32_min, + s32 s32_max) +{ + struct bpf_reg_state *reg = regs + regno; + + reg->s32_min_value = max_t(s32, reg->s32_min_value, s32_min); + reg->s32_max_value = min_t(s32, reg->s32_max_value, s32_max); + + reg->smin_value = max_t(s64, reg->smin_value, s32_min); + reg->smax_value = min_t(s64, reg->smax_value, s32_max); + + reg_bounds_sync(reg); + + return reg_bounds_sanity_check(env, reg, "s32_range"); +} + static void __mark_reg_not_init(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { @@ -3335,9 +3392,87 @@ static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx) return env->insn_aux_data[insn_idx].jmp_point; } +#define LR_FRAMENO_BITS 3 +#define LR_SPI_BITS 6 +#define LR_ENTRY_BITS (LR_SPI_BITS + LR_FRAMENO_BITS + 1) +#define LR_SIZE_BITS 4 +#define LR_FRAMENO_MASK ((1ull << LR_FRAMENO_BITS) - 1) +#define LR_SPI_MASK ((1ull << LR_SPI_BITS) - 1) +#define LR_SIZE_MASK ((1ull << LR_SIZE_BITS) - 1) +#define LR_SPI_OFF LR_FRAMENO_BITS +#define LR_IS_REG_OFF (LR_SPI_BITS + LR_FRAMENO_BITS) +#define LINKED_REGS_MAX 6 + +struct linked_reg { + u8 frameno; + union { + u8 spi; + u8 regno; + }; + bool is_reg; +}; + +struct linked_regs { + int cnt; + struct linked_reg entries[LINKED_REGS_MAX]; +}; + +static struct linked_reg *linked_regs_push(struct linked_regs *s) +{ + if (s->cnt < LINKED_REGS_MAX) + return &s->entries[s->cnt++]; + + return NULL; +} + +/* Use u64 as a vector of 6 10-bit values, use first 4-bits to track + * number of elements currently in stack. + * Pack one history entry for linked registers as 10 bits in the following format: + * - 3-bits frameno + * - 6-bits spi_or_reg + * - 1-bit is_reg + */ +static u64 linked_regs_pack(struct linked_regs *s) +{ + u64 val = 0; + int i; + + for (i = 0; i < s->cnt; ++i) { + struct linked_reg *e = &s->entries[i]; + u64 tmp = 0; + + tmp |= e->frameno; + tmp |= e->spi << LR_SPI_OFF; + tmp |= (e->is_reg ? 
1 : 0) << LR_IS_REG_OFF; + + val <<= LR_ENTRY_BITS; + val |= tmp; + } + val <<= LR_SIZE_BITS; + val |= s->cnt; + return val; +} + +static void linked_regs_unpack(u64 val, struct linked_regs *s) +{ + int i; + + s->cnt = val & LR_SIZE_MASK; + val >>= LR_SIZE_BITS; + + for (i = 0; i < s->cnt; ++i) { + struct linked_reg *e = &s->entries[i]; + + e->frameno = val & LR_FRAMENO_MASK; + e->spi = (val >> LR_SPI_OFF) & LR_SPI_MASK; + e->is_reg = (val >> LR_IS_REG_OFF) & 0x1; + val >>= LR_ENTRY_BITS; + } +} + /* for any branch, call, exit record the history of jmps in the given state */ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags) + int insn_flags, u64 linked_regs) { u32 cnt = cur->jmp_history_cnt; struct bpf_jmp_history_entry *p; @@ -3353,6 +3488,10 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n", env->insn_idx, env->cur_hist_ent->flags, insn_flags); env->cur_hist_ent->flags |= insn_flags; + WARN_ONCE(env->cur_hist_ent->linked_regs != 0, + "verifier insn history bug: insn_idx %d linked_regs != 0: %#llx\n", + env->insn_idx, env->cur_hist_ent->linked_regs); + env->cur_hist_ent->linked_regs = linked_regs; return 0; } @@ -3367,6 +3506,7 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; + p->linked_regs = linked_regs; cur->jmp_history_cnt = cnt; env->cur_hist_ent = p; @@ -3532,6 +3672,11 @@ static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) return bt->reg_masks[bt->frame] & (1 << reg); } +static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg) +{ + return bt->reg_masks[frame] & (1 << reg); +} + static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot) { return bt->stack_masks[frame] & (1ull << slot); @@ -3576,6 +3721,42 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) } } +/* If any register R in hist->linked_regs is marked as precise in bt, + * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs. + */ +static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist) +{ + struct linked_regs linked_regs; + bool some_precise = false; + int i; + + if (!hist || hist->linked_regs == 0) + return; + + linked_regs_unpack(hist->linked_regs, &linked_regs); + for (i = 0; i < linked_regs.cnt; ++i) { + struct linked_reg *e = &linked_regs.entries[i]; + + if ((e->is_reg && bt_is_frame_reg_set(bt, e->frameno, e->regno)) || + (!e->is_reg && bt_is_frame_slot_set(bt, e->frameno, e->spi))) { + some_precise = true; + break; + } + } + + if (!some_precise) + return; + + for (i = 0; i < linked_regs.cnt; ++i) { + struct linked_reg *e = &linked_regs.entries[i]; + + if (e->is_reg) + bt_set_frame_reg(bt, e->frameno, e->regno); + else + bt_set_frame_slot(bt, e->frameno, e->spi); + } +} + static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); /* For given verifier state backtrack_insn() is called from the last insn to @@ -3615,6 +3796,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } + /* If there is a history record that some registers gained range at this insn, + * propagate precision marks to those registers, so that bt_is_reg_set() + * accounts for these registers. 
+ */ + bt_sync_linked_regs(bt, hist); + if (class == BPF_ALU || class == BPF_ALU64) { if (!bt_is_reg_set(bt, dreg)) return 0; @@ -3844,7 +4031,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, */ bt_set_reg(bt, dreg); bt_set_reg(bt, sreg); - /* else dreg <cond> K + } else if (BPF_SRC(insn->code) == BPF_K) { + /* dreg <cond> K * Only dreg still needs precision before * this insn, so for the K-based conditional * there is nothing new to be marked. @@ -3862,6 +4050,10 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, /* to be analyzed */ return -ENOTSUPP; } + /* Propagate precision marks to linked registers, to account for + * registers marked as precise in this function. + */ + bt_sync_linked_regs(bt, hist); return 0; } @@ -3989,96 +4181,6 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ } } -static bool idset_contains(struct bpf_idset *s, u32 id) -{ - u32 i; - - for (i = 0; i < s->count; ++i) - if (s->ids[i] == (id & ~BPF_ADD_CONST)) - return true; - - return false; -} - -static int idset_push(struct bpf_idset *s, u32 id) -{ - if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids))) - return -EFAULT; - s->ids[s->count++] = id & ~BPF_ADD_CONST; - return 0; -} - -static void idset_reset(struct bpf_idset *s) -{ - s->count = 0; -} - -/* Collect a set of IDs for all registers currently marked as precise in env->bt. - * Mark all registers with these IDs as precise. - */ -static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st) -{ - struct bpf_idset *precise_ids = &env->idset_scratch; - struct backtrack_state *bt = &env->bt; - struct bpf_func_state *func; - struct bpf_reg_state *reg; - DECLARE_BITMAP(mask, 64); - int i, fr; - - idset_reset(precise_ids); - - for (fr = bt->frame; fr >= 0; fr--) { - func = st->frame[fr]; - - bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr)); - for_each_set_bit(i, mask, 32) { - reg = &func->regs[i]; - if (!reg->id || reg->type != SCALAR_VALUE) - continue; - if (idset_push(precise_ids, reg->id)) - return -EFAULT; - } - - bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); - for_each_set_bit(i, mask, 64) { - if (i >= func->allocated_stack / BPF_REG_SIZE) - break; - if (!is_spilled_scalar_reg(&func->stack[i])) - continue; - reg = &func->stack[i].spilled_ptr; - if (!reg->id) - continue; - if (idset_push(precise_ids, reg->id)) - return -EFAULT; - } - } - - for (fr = 0; fr <= st->curframe; ++fr) { - func = st->frame[fr]; - - for (i = BPF_REG_0; i < BPF_REG_10; ++i) { - reg = &func->regs[i]; - if (!reg->id) - continue; - if (!idset_contains(precise_ids, reg->id)) - continue; - bt_set_frame_reg(bt, fr, i); - } - for (i = 0; i < func->allocated_stack / BPF_REG_SIZE; ++i) { - if (!is_spilled_scalar_reg(&func->stack[i])) - continue; - reg = &func->stack[i].spilled_ptr; - if (!reg->id) - continue; - if (!idset_contains(precise_ids, reg->id)) - continue; - bt_set_frame_slot(bt, fr, i); - } - } - - return 0; -} - /* * __mark_chain_precision() backtracks BPF program instruction sequence and * chain of verifier states making sure that register *regno* (if regno >= 0) @@ -4211,31 +4313,6 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bt->frame, last_idx, first_idx, subseq_idx); } - /* If some register with scalar ID is marked as precise, - * make sure that all registers sharing this ID are also precise. - * This is needed to estimate effect of find_equal_scalars(). 
- * Do this at the last instruction of each state, - * bpf_reg_state::id fields are valid for these instructions. - * - * Allows to track precision in situation like below: - * - * r2 = unknown value - * ... - * --- state #0 --- - * ... - * r1 = r2 // r1 and r2 now share the same ID - * ... - * --- state #1 {r1.id = A, r2.id = A} --- - * ... - * if (r2 > 10) goto exit; // find_equal_scalars() assigns range to r1 - * ... - * --- state #2 {r1.id = A, r2.id = A} --- - * r3 = r10 - * r3 += r1 // need to mark both r1 and r2 - */ - if (mark_precise_scalar_ids(env, st)) - return -EFAULT; - if (last_idx < 0) { /* we are at the entry into subprog, which * is expected for global funcs, but only if @@ -4456,7 +4533,7 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, if (!src_reg->id && !tnum_is_const(src_reg->var_off)) /* Ensure that src_reg has a valid ID that will be copied to - * dst_reg and then will be used by find_equal_scalars() to + * dst_reg and then will be used by sync_linked_regs() to * propagate min/max range. */ src_reg->id = ++env->id_gen; @@ -4502,6 +4579,31 @@ static int get_reg_width(struct bpf_reg_state *reg) return fls64(reg->umax_value); } +/* See comment for mark_fastcall_pattern_for_call() */ +static void check_fastcall_stack_contract(struct bpf_verifier_env *env, + struct bpf_func_state *state, int insn_idx, int off) +{ + struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno]; + struct bpf_insn_aux_data *aux = env->insn_aux_data; + int i; + + if (subprog->fastcall_stack_off <= off || aux[insn_idx].fastcall_pattern) + return; + /* access to the region [max_stack_depth .. fastcall_stack_off) + * from something that is not a part of the fastcall pattern, + * disable fastcall rewrites for current subprogram by setting + * fastcall_stack_off to a value smaller than any possible offset. + */ + subprog->fastcall_stack_off = S16_MIN; + /* reset fastcall aux flags within subprogram, + * happens at most once per subprogram + */ + for (i = subprog->start; i < (subprog + 1)->start; ++i) { + aux[i].fastcall_spills_num = 0; + aux[i].fastcall_pattern = 0; + } +} + /* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -4550,6 +4652,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, if (err) return err; + check_fastcall_stack_contract(env, state, insn_idx, off); mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) { bool reg_value_fits; @@ -4625,7 +4728,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags); + return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -4684,6 +4787,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, return err; } + check_fastcall_stack_contract(env, state, insn_idx, min_off); /* Variable offset writes destroy any spilled pointers in range. 
*/ for (i = min_off; i < max_off; i++) { u8 new_type, *stype; @@ -4822,6 +4926,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, reg = ®_state->stack[spi].spilled_ptr; mark_stack_slot_scratched(env, spi); + check_fastcall_stack_contract(env, state, env->insn_idx, off); if (is_spilled_reg(®_state->stack[spi])) { u8 spill_size = 1; @@ -4930,7 +5035,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags); + return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -4982,6 +5087,7 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, min_off = reg->smin_value + off; max_off = reg->smax_value + off; mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno); + check_fastcall_stack_contract(env, ptr_state, env->insn_idx, min_off); return 0; } @@ -5587,11 +5693,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type, - struct btf **btf, u32 *btf_id) + struct btf **btf, u32 *btf_id, bool *is_retval, bool is_ldsx) { struct bpf_insn_access_aux info = { .reg_type = *reg_type, .log = &env->log, + .is_retval = false, + .is_ldsx = is_ldsx, }; if (env->ops->is_valid_access && @@ -5604,6 +5712,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, * type of narrower access. */ *reg_type = info.reg_type; + *is_retval = info.is_retval; if (base_type(*reg_type) == PTR_TO_BTF_ID) { *btf = info.btf; @@ -6692,10 +6801,20 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, struct bpf_func_state *state, enum bpf_access_type t) { - int min_valid_off; + struct bpf_insn_aux_data *aux = &env->insn_aux_data[env->insn_idx]; + int min_valid_off, max_bpf_stack; + + /* If accessing instruction is a spill/fill from bpf_fastcall pattern, + * add room for all caller saved registers below MAX_BPF_STACK. + * In case if bpf_fastcall rewrite won't happen maximal stack depth + * would be checked by check_max_stack_depth_subprog(). 
+ */ + max_bpf_stack = MAX_BPF_STACK; + if (aux->fastcall_pattern) + max_bpf_stack += CALLER_SAVED_REGS * BPF_REG_SIZE; if (t == BPF_WRITE || env->allow_uninit_stack) - min_valid_off = -MAX_BPF_STACK; + min_valid_off = -max_bpf_stack; else min_valid_off = -state->allocated_stack; @@ -6772,6 +6891,17 @@ static int check_stack_access_within_bounds( return grow_stack_state(env, state, -min_off /* size */); } +static bool get_func_retval_range(struct bpf_prog *prog, + struct bpf_retval_range *range) +{ + if (prog->type == BPF_PROG_TYPE_LSM && + prog->expected_attach_type == BPF_LSM_MAC && + !bpf_lsm_get_retval_range(prog, range)) { + return true; + } + return false; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -6876,6 +7006,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { + bool is_retval = false; + struct bpf_retval_range range; enum bpf_reg_type reg_type = SCALAR_VALUE; struct btf *btf = NULL; u32 btf_id = 0; @@ -6891,7 +7023,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, - &btf_id); + &btf_id, &is_retval, is_ldsx); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { @@ -6900,7 +7032,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * case, we know the offset is zero. */ if (reg_type == SCALAR_VALUE) { - mark_reg_unknown(env, regs, value_regno); + if (is_retval && get_func_retval_range(env->prog, &range)) { + err = __mark_reg_s32_range(env, regs, value_regno, + range.minval, range.maxval); + if (err) + return err; + } else { + mark_reg_unknown(env, regs, value_regno); + } } else { mark_reg_known_zero(env, regs, value_regno); @@ -7664,29 +7803,38 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - struct bpf_map *map_ptr = reg->map_ptr; struct btf_field *kptr_field; + struct bpf_map *map_ptr; + struct btf_record *rec; u32 kptr_off; + if (type_is_ptr_alloc_obj(reg->type)) { + rec = reg_btf_record(reg); + } else { /* PTR_TO_MAP_VALUE */ + map_ptr = reg->map_ptr; + if (!map_ptr->btf) { + verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n", + map_ptr->name); + return -EINVAL; + } + rec = map_ptr->record; + meta->map_ptr = map_ptr; + } + if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d doesn't have constant offset. 
kptr has to be at the constant offset\n", regno); return -EINVAL; } - if (!map_ptr->btf) { - verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n", - map_ptr->name); - return -EINVAL; - } - if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) { - verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); + + if (!btf_record_has_field(rec, BPF_KPTR)) { + verbose(env, "R%d has no valid kptr\n", regno); return -EINVAL; } - meta->map_ptr = map_ptr; kptr_off = reg->off + reg->var_off.value; - kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR); + kptr_field = btf_record_find(rec, kptr_off, BPF_KPTR); if (!kptr_field) { verbose(env, "off=%d doesn't point to kptr\n", kptr_off); return -EACCES; @@ -7831,12 +7979,17 @@ static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_ITER_DESTROY; } -static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg) +static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx, + const struct btf_param *arg) { /* btf_check_iter_kfuncs() guarantees that first argument of any iter * kfunc is iter state pointer */ - return arg == 0 && is_iter_kfunc(meta); + if (is_iter_kfunc(meta)) + return arg_idx == 0; + + /* iter passed as an argument to a generic kfunc */ + return btf_param_match_suffix(meta->btf, arg, "__iter"); } static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx, @@ -7844,14 +7997,20 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; const struct btf_type *t; - const struct btf_param *arg; - int spi, err, i, nr_slots; - u32 btf_id; + int spi, err, i, nr_slots, btf_id; - /* btf_check_iter_kfuncs() ensures we don't need to validate anything here */ - arg = &btf_params(meta->func_proto)[0]; - t = btf_type_skip_modifiers(meta->btf, arg->type, NULL); /* PTR */ - t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id); /* STRUCT */ + /* For iter_{new,next,destroy} functions, btf_check_iter_kfuncs() + * ensures struct convention, so we wouldn't need to do any BTF + * validation here. But given iter state can be passed as a parameter + * to any kfunc, if arg has "__iter" suffix, we need to be a bit more + * conservative here. + */ + btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1); + if (btf_id < 0) { + verbose(env, "expected valid iter pointer as arg #%d\n", regno); + return -EINVAL; + } + t = btf_type_by_id(meta->btf, btf_id); nr_slots = t->size / BPF_REG_SIZE; if (is_iter_new_kfunc(meta)) { @@ -7873,7 +8032,9 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id if (err) return err; } else { - /* iter_next() or iter_destroy() expect initialized iter state*/ + /* iter_next() or iter_destroy(), as well as any kfunc + * accepting iter argument, expect initialized iter state + */ err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots); switch (err) { case 0: @@ -7987,6 +8148,15 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, return 0; } +static struct bpf_reg_state *get_iter_from_state(struct bpf_verifier_state *cur_st, + struct bpf_kfunc_call_arg_meta *meta) +{ + int iter_frameno = meta->iter.frameno; + int iter_spi = meta->iter.spi; + + return &cur_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr; +} + /* process_iter_next_call() is called when verifier gets to iterator's next * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. 
We'll refer * to it as just "iter_next()" in comments below. @@ -8071,12 +8241,10 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st; struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr; struct bpf_reg_state *cur_iter, *queued_iter; - int iter_frameno = meta->iter.frameno; - int iter_spi = meta->iter.spi; BTF_TYPE_EMIT(struct bpf_iter); - cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr; + cur_iter = get_iter_from_state(cur_st, meta); if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE && cur_iter->iter.state != BPF_ITER_STATE_DRAINED) { @@ -8104,7 +8272,7 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, if (!queued_st) return -ENOMEM; - queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr; + queued_iter = get_iter_from_state(queued_st, meta); queued_iter->iter.state = BPF_ITER_STATE_ACTIVE; queued_iter->iter.depth++; if (prev_st) @@ -8260,7 +8428,12 @@ static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; -static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types kptr_xchg_dest_types = { + .types = { + PTR_TO_MAP_VALUE, + PTR_TO_BTF_ID | MEM_ALLOC + } +}; static const struct bpf_reg_types dynptr_types = { .types = { PTR_TO_STACK, @@ -8292,7 +8465,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_STACK] = &stack_ptr_types, [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, - [ARG_PTR_TO_KPTR] = &kptr_types, + [ARG_KPTR_XCHG_DEST] = &kptr_xchg_dest_types, [ARG_PTR_TO_DYNPTR] = &dynptr_types, }; @@ -8331,7 +8504,8 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, if (base_type(arg_type) == ARG_PTR_TO_MEM) type &= ~DYNPTR_TYPE_FLAG_MASK; - if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) { + /* Local kptr types are allowed as the source argument of bpf_kptr_xchg */ + if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && regno == BPF_REG_2) { type &= ~MEM_ALLOC; type &= ~MEM_PERCPU; } @@ -8424,7 +8598,8 @@ found: verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); return -EFAULT; } - if (meta->func_id == BPF_FUNC_kptr_xchg) { + /* Check if local kptr in src arg matches kptr in dst arg */ + if (meta->func_id == BPF_FUNC_kptr_xchg && regno == BPF_REG_2) { if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) return -EACCES; } @@ -8735,7 +8910,7 @@ skip_type_check: meta->release_regno = regno; } - if (reg->ref_obj_id) { + if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) { if (meta->ref_obj_id) { verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", regno, reg->ref_obj_id, @@ -8892,7 +9067,7 @@ skip_type_check: return err; break; } - case ARG_PTR_TO_KPTR: + case ARG_KPTR_XCHG_DEST: err = process_kptr_func(env, regno, meta); if (err) return err; @@ -9923,9 +10098,13 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) return is_rbtree_lock_required_kfunc(kfunc_btf_id); } -static bool retval_range_within(struct bpf_retval_range range, 
const struct bpf_reg_state *reg) +static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg, + bool return_32bit) { - return range.minval <= reg->smin_value && reg->smax_value <= range.maxval; + if (return_32bit) + return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval; + else + return range.minval <= reg->smin_value && reg->smax_value <= range.maxval; } static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) @@ -9962,8 +10141,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) if (err) return err; - /* enforce R0 return value range */ - if (!retval_range_within(callee->callback_ret_range, r0)) { + /* enforce R0 return value range, and bpf_callback_t returns 64bit */ + if (!retval_range_within(callee->callback_ret_range, r0, false)) { verbose_invalid_scalar(env, r0, callee->callback_ret_range, "At callback return", "R0"); return -EINVAL; @@ -10265,6 +10444,19 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno state->callback_subprogno == subprogno); } +static int get_helper_proto(struct bpf_verifier_env *env, int func_id, + const struct bpf_func_proto **ptr) +{ + if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) + return -ERANGE; + + if (!env->ops->get_func_proto) + return -EINVAL; + + *ptr = env->ops->get_func_proto(func_id, env->prog); + return *ptr ? 0 : -EINVAL; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -10281,18 +10473,16 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* find function prototype */ func_id = insn->imm; - if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { - verbose(env, "invalid func %s#%d\n", func_id_name(func_id), - func_id); + err = get_helper_proto(env, insn->imm, &fn); + if (err == -ERANGE) { + verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; } - if (env->ops->get_func_proto) - fn = env->ops->get_func_proto(func_id, env->prog); - if (!fn) { + if (err) { verbose(env, "program of this type cannot use helper %s#%d\n", func_id_name(func_id), func_id); - return -EINVAL; + return err; } /* eBPF programs must be GPL compatible to use GPL-ed functions */ @@ -11228,7 +11418,7 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_dynptr(meta->btf, &args[argno])) return KF_ARG_PTR_TO_DYNPTR; - if (is_kfunc_arg_iter(meta, argno)) + if (is_kfunc_arg_iter(meta, argno, &args[argno])) return KF_ARG_PTR_TO_ITER; if (is_kfunc_arg_list_head(meta->btf, &args[argno])) @@ -11330,8 +11520,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, * btf_struct_ids_match() to walk the struct at the 0th offset, and * resolve types. 
*/ - if (is_kfunc_acquire(meta) || - (is_kfunc_release(meta) && reg->ref_obj_id) || + if ((is_kfunc_release(meta) && reg->ref_obj_id) || btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id)) strict_type_match = true; @@ -12671,6 +12860,17 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].btf = desc_btf; regs[BPF_REG_0].type = PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = ptr_type_id; + + if (is_iter_next_kfunc(&meta)) { + struct bpf_reg_state *cur_iter; + + cur_iter = get_iter_from_state(env->cur_state, &meta); + + if (cur_iter->type & MEM_RCU) /* KF_RCU_PROTECTED */ + regs[BPF_REG_0].type |= MEM_RCU; + else + regs[BPF_REG_0].type |= PTR_TRUSTED; + } } if (is_kfunc_ret_null(&meta)) { @@ -14099,7 +14299,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, u64 val = reg_const_value(src_reg, alu32); if ((dst_reg->id & BPF_ADD_CONST) || - /* prevent overflow in find_equal_scalars() later */ + /* prevent overflow in sync_linked_regs() later */ val > (u32)S32_MAX) { /* * If the register already went through rX += val @@ -14114,7 +14314,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, } else { /* * Make sure ID is cleared otherwise dst_reg min/max could be - * incorrectly propagated into other registers by find_equal_scalars() + * incorrectly propagated into other registers by sync_linked_regs() */ dst_reg->id = 0; } @@ -14264,7 +14464,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) copy_register_state(dst_reg, src_reg); /* Make sure ID is cleared if src_reg is not in u32 * range otherwise dst_reg min/max could be incorrectly - * propagated into src_reg by find_equal_scalars() + * propagated into src_reg by sync_linked_regs() */ if (!is_src_reg_u32) dst_reg->id = 0; @@ -15087,14 +15287,66 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, return true; } -static void find_equal_scalars(struct bpf_verifier_state *vstate, - struct bpf_reg_state *known_reg) +static void __collect_linked_regs(struct linked_regs *reg_set, struct bpf_reg_state *reg, + u32 id, u32 frameno, u32 spi_or_reg, bool is_reg) +{ + struct linked_reg *e; + + if (reg->type != SCALAR_VALUE || (reg->id & ~BPF_ADD_CONST) != id) + return; + + e = linked_regs_push(reg_set); + if (e) { + e->frameno = frameno; + e->is_reg = is_reg; + e->regno = spi_or_reg; + } else { + reg->id = 0; + } +} + +/* For all R being scalar registers or spilled scalar registers + * in verifier state, save R in linked_regs if R->id == id. + * If there are too many Rs sharing same id, reset id for leftover Rs. + */ +static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id, + struct linked_regs *linked_regs) +{ + struct bpf_func_state *func; + struct bpf_reg_state *reg; + int i, j; + + id = id & ~BPF_ADD_CONST; + for (i = vstate->curframe; i >= 0; i--) { + func = vstate->frame[i]; + for (j = 0; j < BPF_REG_FP; j++) { + reg = &func->regs[j]; + __collect_linked_regs(linked_regs, reg, id, i, j, true); + } + for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { + if (!is_spilled_reg(&func->stack[j])) + continue; + reg = &func->stack[j].spilled_ptr; + __collect_linked_regs(linked_regs, reg, id, i, j, false); + } + } +} + +/* For all R in linked_regs, copy known_reg range into R + * if R->id == known_reg->id. 
+ */ +static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg, + struct linked_regs *linked_regs) { struct bpf_reg_state fake_reg; - struct bpf_func_state *state; struct bpf_reg_state *reg; + struct linked_reg *e; + int i; - bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + for (i = 0; i < linked_regs->cnt; ++i) { + e = &linked_regs->entries[i]; + reg = e->is_reg ? &vstate->frame[e->frameno]->regs[e->regno] + : &vstate->frame[e->frameno]->stack[e->spi].spilled_ptr; if (reg->type != SCALAR_VALUE || reg == known_reg) continue; if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST)) @@ -15112,7 +15364,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate, copy_register_state(reg, known_reg); /* * Must preserve off, id and add_const flag, - * otherwise another find_equal_scalars() will be incorrect. + * otherwise another sync_linked_regs() will be incorrect. */ reg->off = saved_off; @@ -15120,7 +15372,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate, scalar_min_max_add(reg, &fake_reg); reg->var_off = tnum_add(reg->var_off, fake_reg.var_off); } - })); + } } static int check_cond_jmp_op(struct bpf_verifier_env *env, @@ -15131,6 +15383,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; struct bpf_reg_state *eq_branch_regs; + struct linked_regs linked_regs = {}; u8 opcode = BPF_OP(insn->code); bool is_jmp32; int pred = -1; @@ -15245,6 +15498,21 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return 0; } + /* Push scalar registers sharing same ID to jump history, + * do this before creating 'other_branch', so that both + * 'this_branch' and 'other_branch' share this history + * if parent state is created. 
+ */ + if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id) + collect_linked_regs(this_branch, src_reg->id, &linked_regs); + if (dst_reg->type == SCALAR_VALUE && dst_reg->id) + collect_linked_regs(this_branch, dst_reg->id, &linked_regs); + if (linked_regs.cnt > 1) { + err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); + if (err) + return err; + } + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, false); if (!other_branch) @@ -15275,13 +15543,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id && !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { - find_equal_scalars(this_branch, src_reg); - find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); + sync_linked_regs(this_branch, src_reg, &linked_regs); + sync_linked_regs(other_branch, &other_branch_regs[insn->src_reg], &linked_regs); } if (dst_reg->type == SCALAR_VALUE && dst_reg->id && !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) { - find_equal_scalars(this_branch, dst_reg); - find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]); + sync_linked_regs(this_branch, dst_reg, &linked_regs); + sync_linked_regs(other_branch, &other_branch_regs[insn->dst_reg], &linked_regs); } /* if one pointer register is compared to another pointer @@ -15569,6 +15837,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char int err; struct bpf_func_state *frame = env->cur_state->frame[0]; const bool is_subprog = frame->subprogno; + bool return_32bit = false; /* LSM and struct_ops func-ptr's return type could be "void" */ if (!is_subprog || frame->in_exception_callback_fn) { @@ -15674,12 +15943,14 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char case BPF_PROG_TYPE_LSM: if (env->prog->expected_attach_type != BPF_LSM_CGROUP) { - /* Regular BPF_PROG_TYPE_LSM programs can return - * any value. - */ - return 0; - } - if (!env->prog->aux->attach_func_proto->type) { + /* no range found, any return value is allowed */ + if (!get_func_retval_range(env->prog, &range)) + return 0; + /* no restricted range, any return value is allowed */ + if (range.minval == S32_MIN && range.maxval == S32_MAX) + return 0; + return_32bit = true; + } else if (!env->prog->aux->attach_func_proto->type) { /* Make sure programs that attach to void * hooks don't try to modify return value. */ @@ -15709,7 +15980,7 @@ enforce_retval: if (err) return err; - if (!retval_range_within(range, reg)) { + if (!retval_range_within(range, reg, return_32bit)) { verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name); if (!is_subprog && prog->expected_attach_type == BPF_LSM_CGROUP && @@ -15875,6 +16146,274 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, return ret; } +/* Bitmask with 1s for all caller saved registers */ +#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) + +/* Return a bitmask specifying which caller saved registers are + * clobbered by a call to a helper *as if* this helper follows + * bpf_fastcall contract: + * - includes R0 if function is non-void; + * - includes R1-R5 if corresponding parameter has is described + * in the function prototype. 
+ */ +static u32 helper_fastcall_clobber_mask(const struct bpf_func_proto *fn) +{ + u32 mask; + int i; + + mask = 0; + if (fn->ret_type != RET_VOID) + mask |= BIT(BPF_REG_0); + for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) + if (fn->arg_type[i] != ARG_DONTCARE) + mask |= BIT(BPF_REG_1 + i); + return mask; +} + +/* True if do_misc_fixups() replaces calls to helper number 'imm', + * replacement patch is presumed to follow bpf_fastcall contract + * (see mark_fastcall_pattern_for_call() below). + */ +static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) +{ + switch (imm) { +#ifdef CONFIG_X86_64 + case BPF_FUNC_get_smp_processor_id: + return env->prog->jit_requested && bpf_jit_supports_percpu_insn(); +#endif + default: + return false; + } +} + +/* Same as helper_fastcall_clobber_mask() but for kfuncs, see comment above */ +static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta) +{ + u32 vlen, i, mask; + + vlen = btf_type_vlen(meta->func_proto); + mask = 0; + if (!btf_type_is_void(btf_type_by_id(meta->btf, meta->func_proto->type))) + mask |= BIT(BPF_REG_0); + for (i = 0; i < vlen; ++i) + mask |= BIT(BPF_REG_1 + i); + return mask; +} + +/* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */ +static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta) +{ + if (meta->btf == btf_vmlinux) + return meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || + meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast]; + return false; +} + +/* LLVM define a bpf_fastcall function attribute. + * This attribute means that function scratches only some of + * the caller saved registers defined by ABI. + * For BPF the set of such registers could be defined as follows: + * - R0 is scratched only if function is non-void; + * - R1-R5 are scratched only if corresponding parameter type is defined + * in the function prototype. + * + * The contract between kernel and clang allows to simultaneously use + * such functions and maintain backwards compatibility with old + * kernels that don't understand bpf_fastcall calls: + * + * - for bpf_fastcall calls clang allocates registers as-if relevant r0-r5 + * registers are not scratched by the call; + * + * - as a post-processing step, clang visits each bpf_fastcall call and adds + * spill/fill for every live r0-r5; + * + * - stack offsets used for the spill/fill are allocated as lowest + * stack offsets in whole function and are not used for any other + * purposes; + * + * - when kernel loads a program, it looks for such patterns + * (bpf_fastcall function surrounded by spills/fills) and checks if + * spill/fill stack offsets are used exclusively in fastcall patterns; + * + * - if so, and if verifier or current JIT inlines the call to the + * bpf_fastcall function (e.g. a helper call), kernel removes unnecessary + * spill/fill pairs; + * + * - when old kernel loads a program, presence of spill/fill pairs + * keeps BPF program valid, albeit slightly less efficient. 
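A concrete instance of the clobber mask described earlier in this hunk, using bpf_get_smp_processor_id(), the helper that verifier_inlines_helper_call() above already special-cases. This is an illustrative aside, not part of the patch:

	/* proto: u32 bpf_get_smp_processor_id(void)                         */
	/* helper_fastcall_clobber_mask() returns BIT(BPF_REG_0): the helper */
	/* is non-void and takes no arguments, so only r0 is clobbered.      */
	/* Under the contract above, clang spills/fills any live r1-r5       */
	/* around the call; a kernel that inlines the helper can drop those  */
	/* spill/fill pairs again, while an older kernel simply keeps them.  */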
+ * + * For example: + * + * r1 = 1; + * r2 = 2; + * *(u64 *)(r10 - 8) = r1; r1 = 1; + * *(u64 *)(r10 - 16) = r2; r2 = 2; + * call %[to_be_inlined] --> call %[to_be_inlined] + * r2 = *(u64 *)(r10 - 16); r0 = r1; + * r1 = *(u64 *)(r10 - 8); r0 += r2; + * r0 = r1; exit; + * r0 += r2; + * exit; + * + * The purpose of mark_fastcall_pattern_for_call is to: + * - look for such patterns; + * - mark spill and fill instructions in env->insn_aux_data[*].fastcall_pattern; + * - mark set env->insn_aux_data[*].fastcall_spills_num for call instruction; + * - update env->subprog_info[*]->fastcall_stack_off to find an offset + * at which bpf_fastcall spill/fill stack slots start; + * - update env->subprog_info[*]->keep_fastcall_stack. + * + * The .fastcall_pattern and .fastcall_stack_off are used by + * check_fastcall_stack_contract() to check if every stack access to + * fastcall spill/fill stack slot originates from spill/fill + * instructions, members of fastcall patterns. + * + * If such condition holds true for a subprogram, fastcall patterns could + * be rewritten by remove_fastcall_spills_fills(). + * Otherwise bpf_fastcall patterns are not changed in the subprogram + * (code, presumably, generated by an older clang version). + * + * For example, it is *not* safe to remove spill/fill below: + * + * r1 = 1; + * *(u64 *)(r10 - 8) = r1; r1 = 1; + * call %[to_be_inlined] --> call %[to_be_inlined] + * r1 = *(u64 *)(r10 - 8); r0 = *(u64 *)(r10 - 8); <---- wrong !!! + * r0 = *(u64 *)(r10 - 8); r0 += r1; + * r0 += r1; exit; + * exit; + */ +static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, + struct bpf_subprog_info *subprog, + int insn_idx, s16 lowest_off) +{ + struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx; + struct bpf_insn *call = &env->prog->insnsi[insn_idx]; + const struct bpf_func_proto *fn; + u32 clobbered_regs_mask = ALL_CALLER_SAVED_REGS; + u32 expected_regs_mask; + bool can_be_inlined = false; + s16 off; + int i; + + if (bpf_helper_call(call)) { + if (get_helper_proto(env, call->imm, &fn) < 0) + /* error would be reported later */ + return; + clobbered_regs_mask = helper_fastcall_clobber_mask(fn); + can_be_inlined = fn->allow_fastcall && + (verifier_inlines_helper_call(env, call->imm) || + bpf_jit_inlines_helper_call(call->imm)); + } + + if (bpf_pseudo_kfunc_call(call)) { + struct bpf_kfunc_call_arg_meta meta; + int err; + + err = fetch_kfunc_meta(env, call, &meta, NULL); + if (err < 0) + /* error would be reported later */ + return; + + clobbered_regs_mask = kfunc_fastcall_clobber_mask(&meta); + can_be_inlined = is_fastcall_kfunc_call(&meta); + } + + if (clobbered_regs_mask == ALL_CALLER_SAVED_REGS) + return; + + /* e.g. if helper call clobbers r{0,1}, expect r{2,3,4,5} in the pattern */ + expected_regs_mask = ~clobbered_regs_mask & ALL_CALLER_SAVED_REGS; + + /* match pairs of form: + * + * *(u64 *)(r10 - Y) = rX (where Y % 8 == 0) + * ... + * call %[to_be_inlined] + * ... 
+ * rX = *(u64 *)(r10 - Y) + */ + for (i = 1, off = lowest_off; i <= ARRAY_SIZE(caller_saved); ++i, off += BPF_REG_SIZE) { + if (insn_idx - i < 0 || insn_idx + i >= env->prog->len) + break; + stx = &insns[insn_idx - i]; + ldx = &insns[insn_idx + i]; + /* must be a stack spill/fill pair */ + if (stx->code != (BPF_STX | BPF_MEM | BPF_DW) || + ldx->code != (BPF_LDX | BPF_MEM | BPF_DW) || + stx->dst_reg != BPF_REG_10 || + ldx->src_reg != BPF_REG_10) + break; + /* must be a spill/fill for the same reg */ + if (stx->src_reg != ldx->dst_reg) + break; + /* must be one of the previously unseen registers */ + if ((BIT(stx->src_reg) & expected_regs_mask) == 0) + break; + /* must be a spill/fill for the same expected offset, + * no need to check offset alignment, BPF_DW stack access + * is always 8-byte aligned. + */ + if (stx->off != off || ldx->off != off) + break; + expected_regs_mask &= ~BIT(stx->src_reg); + env->insn_aux_data[insn_idx - i].fastcall_pattern = 1; + env->insn_aux_data[insn_idx + i].fastcall_pattern = 1; + } + if (i == 1) + return; + + /* Conditionally set 'fastcall_spills_num' to allow forward + * compatibility when more helper functions are marked as + * bpf_fastcall at compile time than current kernel supports, e.g: + * + * 1: *(u64 *)(r10 - 8) = r1 + * 2: call A ;; assume A is bpf_fastcall for current kernel + * 3: r1 = *(u64 *)(r10 - 8) + * 4: *(u64 *)(r10 - 8) = r1 + * 5: call B ;; assume B is not bpf_fastcall for current kernel + * 6: r1 = *(u64 *)(r10 - 8) + * + * There is no need to block bpf_fastcall rewrite for such program. + * Set 'fastcall_pattern' for both calls to keep check_fastcall_stack_contract() happy, + * don't set 'fastcall_spills_num' for call B so that remove_fastcall_spills_fills() + * does not remove spill/fill pair {4,6}. + */ + if (can_be_inlined) + env->insn_aux_data[insn_idx].fastcall_spills_num = i - 1; + else + subprog->keep_fastcall_stack = 1; + subprog->fastcall_stack_off = min(subprog->fastcall_stack_off, off); +} + +static int mark_fastcall_patterns(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *subprog = env->subprog_info; + struct bpf_insn *insn; + s16 lowest_off; + int s, i; + + for (s = 0; s < env->subprog_cnt; ++s, ++subprog) { + /* find lowest stack spill offset used in this subprog */ + lowest_off = 0; + for (i = subprog->start; i < (subprog + 1)->start; ++i) { + insn = env->prog->insnsi + i; + if (insn->code != (BPF_STX | BPF_MEM | BPF_DW) || + insn->dst_reg != BPF_REG_10) + continue; + lowest_off = min(lowest_off, insn->off); + } + /* use this offset to find fastcall patterns */ + for (i = subprog->start; i < (subprog + 1)->start; ++i) { + insn = env->prog->insnsi + i; + if (insn->code != (BPF_JMP | BPF_CALL)) + continue; + mark_fastcall_pattern_for_call(env, subprog, i, lowest_off); + } + } + return 0; +} + /* Visits the instruction at index t and returns one of the following: * < 0 - an error occurred * DONE_EXPLORING - the instruction was fully explored @@ -16770,7 +17309,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, * * First verification path is [1-6]: * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7; - * - at (5) r6 would be marked <= X, find_equal_scalars() would also mark + * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark * r7 <= X, because r6 and r7 share same id. * Next verification path is [1-4, 6]. 
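To make the linked-register reasoning above easier to follow, here is a minimal sketch in the same pseudo-assembly style as the surrounding verifier comments (registers, offsets and constants are illustrative, not taken from the patch):

	r6 = *(u32 *)(r1 + 0);   /* unknown scalar                                 */
	r7 = r6;                 /* r6 and r7 now share one verifier id            */
	if r6 > 0x10 goto out;   /* the fall-through path proves r6 <= 0x10        */
	r0 = *(u64 *)(r2 + r7);  /* sync_linked_regs() copies the learned bound    */
	                         /* to r7, so the check on r6 also covers this use */

collect_linked_regs() records r6 and r7 in linked_regs before the branch is split, and sync_linked_regs() then applies the new bound in both the taken and the fall-through state.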
* @@ -16884,8 +17423,9 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, spi = i / BPF_REG_SIZE; if (exact != NOT_EXACT && - old->stack[spi].slot_type[i % BPF_REG_SIZE] != - cur->stack[spi].slot_type[i % BPF_REG_SIZE]) + (i >= cur->allocated_stack || + old->stack[spi].slot_type[i % BPF_REG_SIZE] != + cur->stack[spi].slot_type[i % BPF_REG_SIZE])) return false; if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) @@ -17563,7 +18103,7 @@ hit: * the current state. */ if (is_jmp_point(env, env->insn_idx)) - err = err ? : push_jmp_history(env, cur, 0); + err = err ? : push_jmp_history(env, cur, 0, 0); err = err ? : propagate_precision(env, &sl->state); if (err) return err; @@ -17831,7 +18371,7 @@ static int do_check(struct bpf_verifier_env *env) } if (is_jmp_point(env, env->insn_idx)) { - err = push_jmp_history(env, state, 0); + err = push_jmp_history(env, state, 0, 0); if (err) return err; } @@ -18764,6 +19304,9 @@ static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta) for (i = 0; i < insn_cnt; i++, insn++) { u8 code = insn->code; + if (tgt_idx <= i && i < tgt_idx + delta) + continue; + if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) || BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT) continue; @@ -19023,9 +19566,11 @@ static int opt_remove_dead_code(struct bpf_verifier_env *env) return 0; } +static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0); + static int opt_remove_nops(struct bpf_verifier_env *env) { - const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); + const struct bpf_insn ja = NOP; struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; int i, err; @@ -19150,14 +19695,39 @@ apply_patch_buffer: */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { + struct bpf_subprog_info *subprogs = env->subprog_info; const struct bpf_verifier_ops *ops = env->ops; - int i, cnt, size, ctx_field_size, delta = 0; + int i, cnt, size, ctx_field_size, delta = 0, epilogue_cnt = 0; const int insn_cnt = env->prog->len; - struct bpf_insn insn_buf[16], *insn; + struct bpf_insn *epilogue_buf = env->epilogue_buf; + struct bpf_insn *insn_buf = env->insn_buf; + struct bpf_insn *insn; u32 target_size, size_default, off; struct bpf_prog *new_prog; enum bpf_access_type type; bool is_narrower_load; + int epilogue_idx = 0; + + if (ops->gen_epilogue) { + epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog, + -(subprogs[0].stack_depth + 8)); + if (epilogue_cnt >= INSN_BUF_SIZE) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } else if (epilogue_cnt) { + /* Save the ARG_PTR_TO_CTX for the epilogue to use */ + cnt = 0; + subprogs[0].stack_depth += 8; + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1, + -subprogs[0].stack_depth); + insn_buf[cnt++] = env->prog->insnsi[0]; + new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + delta += cnt - 1; + } + } if (ops->gen_prologue || env->seen_direct_write) { if (!ops->gen_prologue) { @@ -19166,7 +19736,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); - if (cnt >= ARRAY_SIZE(insn_buf)) { + if (cnt >= INSN_BUF_SIZE) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } else if (cnt) { @@ -19179,6 +19749,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } } + if (delta) + WARN_ON(adjust_jmp_off(env->prog, 0, delta)); + if 
(bpf_prog_is_offloaded(env->prog->aux)) return 0; @@ -19211,6 +19784,25 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); env->prog->aux->num_exentries++; continue; + } else if (insn->code == (BPF_JMP | BPF_EXIT) && + epilogue_cnt && + i + delta < subprogs[1].start) { + /* Generate epilogue for the main prog */ + if (epilogue_idx) { + /* jump back to the earlier generated epilogue */ + insn_buf[0] = BPF_JMP32_A(epilogue_idx - i - delta - 1); + cnt = 1; + } else { + memcpy(insn_buf, epilogue_buf, + epilogue_cnt * sizeof(*epilogue_buf)); + cnt = epilogue_cnt; + /* epilogue_idx cannot be 0. It must have at + * least one ctx ptr saving insn before the + * epilogue. + */ + epilogue_idx = i + delta; + } + goto patch_insn_buf; } else { continue; } @@ -19313,7 +19905,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) target_size = 0; cnt = convert_ctx_access(type, insn, insn_buf, env->prog, &target_size); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || + if (cnt == 0 || cnt >= INSN_BUF_SIZE || (ctx_field_size && !target_size)) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; @@ -19322,7 +19914,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (is_narrower_load && size < target_size) { u8 shift = bpf_ctx_narrow_access_offset( off, size, size_default) * 8; - if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) { + if (shift && cnt + 1 >= INSN_BUF_SIZE) { verbose(env, "bpf verifier narrow ctx load misconfigured\n"); return -EINVAL; } @@ -19347,6 +19939,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->dst_reg, insn->dst_reg, size * 8, 0); +patch_insn_buf: new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; @@ -19867,7 +20460,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) const int insn_cnt = prog->len; const struct bpf_map_ops *ops; struct bpf_insn_aux_data *aux; - struct bpf_insn insn_buf[16]; + struct bpf_insn *insn_buf = env->insn_buf; struct bpf_prog *new_prog; struct bpf_map *map_ptr; int i, ret, cnt, delta = 0, cur_subprog = 0; @@ -19986,7 +20579,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) (BPF_MODE(insn->code) == BPF_ABS || BPF_MODE(insn->code) == BPF_IND)) { cnt = env->ops->gen_ld_abs(insn, insn_buf); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + if (cnt == 0 || cnt >= INSN_BUF_SIZE) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -20279,7 +20872,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) cnt = ops->map_gen_lookup(map_ptr, insn_buf); if (cnt == -EOPNOTSUPP) goto patch_map_ops_generic; - if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) { + if (cnt <= 0 || cnt >= INSN_BUF_SIZE) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -20381,7 +20974,7 @@ patch_map_ops_generic: #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) /* Implement bpf_get_smp_processor_id() inline. */ if (insn->imm == BPF_FUNC_get_smp_processor_id && - prog->jit_requested && bpf_jit_supports_percpu_insn()) { + verifier_inlines_helper_call(env, insn->imm)) { /* BPF_FUNC_get_smp_processor_id inlining is an * optimization, so if pcpu_hot.cpu_number is ever * changed in some incompatible and hard to support @@ -20771,6 +21364,40 @@ static int optimize_bpf_loop(struct bpf_verifier_env *env) return 0; } +/* Remove unnecessary spill/fill pairs, members of fastcall pattern, + * adjust subprograms stack depth when possible. 
+ */ +static int remove_fastcall_spills_fills(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *subprog = env->subprog_info; + struct bpf_insn_aux_data *aux = env->insn_aux_data; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + u32 spills_num; + bool modified = false; + int i, j; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (aux[i].fastcall_spills_num > 0) { + spills_num = aux[i].fastcall_spills_num; + /* NOPs would be removed by opt_remove_nops() */ + for (j = 1; j <= spills_num; ++j) { + *(insn - j) = NOP; + *(insn + j) = NOP; + } + modified = true; + } + if ((subprog + 1)->start == i + 1) { + if (modified && !subprog->keep_fastcall_stack) + subprog->stack_depth = -subprog->fastcall_stack_off; + subprog++; + modified = false; + } + } + + return 0; +} + static void free_states(struct bpf_verifier_env *env) { struct bpf_verifier_state_list *sl, *sln; @@ -21044,6 +21671,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) u32 btf_id, member_idx; struct btf *btf; const char *mname; + int err; if (!prog->gpl_compatible) { verbose(env, "struct ops programs must have a GPL compatible license\n"); @@ -21091,8 +21719,15 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) return -EINVAL; } + err = bpf_struct_ops_supported(st_ops, __btf_member_bit_offset(t, member) / 8); + if (err) { + verbose(env, "attach to unsupported member %s of struct %s\n", + mname, st_ops->name); + return err; + } + if (st_ops->check_member) { - int err = st_ops->check_member(t, member, prog); + err = st_ops->check_member(t, member, prog); if (err) { verbose(env, "attach to unsupported member %s of struct %s\n", @@ -21677,6 +22312,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; + ret = mark_fastcall_patterns(env); + if (ret < 0) + goto skip_full_check; + ret = do_check_main(env); ret = ret ?: do_check_subprogs(env); @@ -21686,6 +22325,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 skip_full_check: kvfree(env->explored_states); + /* might decrease stack depth, keep it before passes that + * allocate additional slots. 
+ */ + if (ret == 0) + ret = remove_fastcall_spills_fills(env); + if (ret == 0) ret = check_max_stack_depth(env); diff --git a/kernel/cpu.c b/kernel/cpu.c index 1209ddaec026..b1fd2a3db91a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2689,6 +2689,16 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) return ret; } +/** + * Check if the core a CPU belongs to is online + */ +#if !defined(topology_is_core_online) +static inline bool topology_is_core_online(unsigned int cpu) +{ + return true; +} +#endif + int cpuhp_smt_enable(void) { int cpu, ret = 0; @@ -2699,7 +2709,7 @@ int cpuhp_smt_enable(void) /* Skip online CPUs and CPUs on offline nodes */ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) continue; - if (!cpu_smt_thread_allowed(cpu)) + if (!cpu_smt_thread_allowed(cpu) || !topology_is_core_online(cpu)) continue; ret = _cpu_up(cpu, 0, CPUHP_ONLINE); if (ret) diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index d3b4cd12bdd1..64d44a52c011 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -423,7 +423,8 @@ retry: if (high && search_end == CRASH_ADDR_HIGH_MAX) { search_end = CRASH_ADDR_LOW_MAX; search_base = 0; - goto retry; + if (search_end != CRASH_ADDR_HIGH_MAX) + goto retry; } pr_warn("cannot allocate crashkernel (size:0x%llx)\n", crash_size); diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index a6e3792b15f8..d570535342cb 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -416,8 +416,11 @@ static unsigned long long phys_addr(struct dma_debug_entry *entry) * dma_active_cacheline entry to track per event. dma_map_sg(), on the * other hand, consumes a single dma_debug_entry, but inserts 'nents' * entries into the tree. + * + * Use __GFP_NOWARN because the printk from an OOM, to netconsole, could end + * up right back in the DMA debugging code, leading to a deadlock. */ -static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC); +static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC | __GFP_NOWARN); static DEFINE_SPINLOCK(radix_lock); #define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1) #define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT) diff --git a/kernel/events/core.c b/kernel/events/core.c index aa3450bdc227..c973e3c11e03 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9706,7 +9706,8 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); - if (event->prog && !bpf_overflow_handler(event, data, regs)) + if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT && + !bpf_overflow_handler(event, data, regs)) return ret; /* diff --git a/kernel/fork.c b/kernel/fork.c index 41771bde2ce7..e33f87b2cdde 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2055,11 +2055,24 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { - bool thread = flags & PIDFD_THREAD; - - if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) + if (!pid) return -EINVAL; + scoped_guard(rcu) { + struct task_struct *tsk; + + if (flags & PIDFD_THREAD) + tsk = pid_task(pid, PIDTYPE_PID); + else + tsk = pid_task(pid, PIDTYPE_TGID); + if (!tsk) + return -EINVAL; + + /* Don't create pidfds for kernel threads for now. 
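One user-visible effect of the PF_KTHREAD check being added here is that pidfd creation is refused for kernel threads. A small userspace sketch, under the assumptions that the libc headers provide SYS_pidfd_open and that PID 2 is kthreadd on the test machine:

#include <errno.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* PID 2 is normally kthreadd, a kernel thread. */
	long fd = syscall(SYS_pidfd_open, 2, 0);

	if (fd < 0 && errno == EINVAL)
		printf("pidfd_open() on a kernel thread is rejected\n");
	else if (fd >= 0)
		printf("kernel predates this change, got pidfd %ld\n", fd);
	return 0;
}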
*/ + if (tsk->flags & PF_KTHREAD) + return -EINVAL; + } + return __pidfd_prepare(pid, flags, ret); } @@ -2405,6 +2418,12 @@ __latent_entropy struct task_struct *copy_process( if (clone_flags & CLONE_PIDFD) { int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; + /* Don't create pidfds for kernel threads for now. */ + if (args->kthread) { + retval = -EINVAL; + goto bad_fork_free_pid; + } + /* Note that no task has been attached to @pid yet. */ retval = __pidfd_prepare(pid, flags, &pidfile); if (retval < 0) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 07e99c936ba5..1dee88ba0ae4 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -530,6 +530,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, flags = IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN; } + flags |= IRQD_AFFINITY_SET; mask = &affinity->mask; node = cpu_to_node(cpumask_first(mask)); affinity++; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 4ad5ed8adf96..6dc76b590703 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -236,7 +236,7 @@ void static_key_disable_cpuslocked(struct static_key *key) } jump_label_lock(); - if (atomic_cmpxchg(&key->enabled, 1, 0)) + if (atomic_cmpxchg(&key->enabled, 1, 0) == 1) jump_label_update(key); jump_label_unlock(); } @@ -289,7 +289,7 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key) return; guard(mutex)(&jump_label_mutex); - if (atomic_cmpxchg(&key->enabled, 1, 0)) + if (atomic_cmpxchg(&key->enabled, 1, 0) == 1) jump_label_update(key); else WARN_ON_ONCE(!static_key_slow_try_dec(key)); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fb2c77368d18..a9a0ca605d4a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -160,38 +160,6 @@ unsigned long kallsyms_sym_address(int idx) return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; } -static void cleanup_symbol_name(char *s) -{ - char *res; - - if (!IS_ENABLED(CONFIG_LTO_CLANG)) - return; - - /* - * LLVM appends various suffixes for local functions and variables that - * must be promoted to global scope as part of LTO. This can break - * hooking of static functions with kprobes. '.' is not a valid - * character in an identifier in C. Suffixes only in LLVM LTO observed: - * - foo.llvm.[0-9a-f]+ - */ - res = strstr(s, ".llvm."); - if (res) - *res = '\0'; - - return; -} - -static int compare_symbol_name(const char *name, char *namebuf) -{ - /* The kallsyms_seqs_of_names is sorted based on names after - * cleanup_symbol_name() (see scripts/kallsyms.c) if clang lto is enabled. - * To ensure correct bisection in kallsyms_lookup_names(), do - * cleanup_symbol_name(namebuf) before comparing name and namebuf. 
- */ - cleanup_symbol_name(namebuf); - return strcmp(name, namebuf); -} - static unsigned int get_symbol_seq(int index) { unsigned int i, seq = 0; @@ -219,7 +187,7 @@ static int kallsyms_lookup_names(const char *name, seq = get_symbol_seq(mid); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - ret = compare_symbol_name(name, namebuf); + ret = strcmp(name, namebuf); if (ret > 0) low = mid + 1; else if (ret < 0) @@ -236,7 +204,7 @@ static int kallsyms_lookup_names(const char *name, seq = get_symbol_seq(low - 1); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - if (compare_symbol_name(name, namebuf)) + if (strcmp(name, namebuf)) break; low--; } @@ -248,7 +216,7 @@ static int kallsyms_lookup_names(const char *name, seq = get_symbol_seq(high + 1); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - if (compare_symbol_name(name, namebuf)) + if (strcmp(name, namebuf)) break; high++; } @@ -407,8 +375,7 @@ static int kallsyms_lookup_buildid(unsigned long addr, if (modbuildid) *modbuildid = NULL; - ret = strlen(namebuf); - goto found; + return strlen(namebuf); } /* See if it's in a module or a BPF JITed image. */ @@ -422,8 +389,6 @@ static int kallsyms_lookup_buildid(unsigned long addr, ret = ftrace_mod_address_lookup(addr, symbolsize, offset, modname, namebuf); -found: - cleanup_symbol_name(namebuf); return ret; } @@ -450,8 +415,6 @@ const char *kallsyms_lookup(unsigned long addr, int lookup_symbol_name(unsigned long addr, char *symname) { - int res; - symname[0] = '\0'; symname[KSYM_NAME_LEN - 1] = '\0'; @@ -462,16 +425,10 @@ int lookup_symbol_name(unsigned long addr, char *symname) /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), symname, KSYM_NAME_LEN); - goto found; + return 0; } /* See if it's in a module. */ - res = lookup_module_symbol_name(addr, symname); - if (res) - return res; - -found: - cleanup_symbol_name(symname); - return 0; + return lookup_module_symbol_name(addr, symname); } /* Look up a kernel symbol and return it in a text buffer. */ diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index 2f84896a7bcb..873f7c445488 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -187,31 +187,11 @@ static void test_perf_kallsyms_lookup_name(void) stat.min, stat.max, div_u64(stat.sum, stat.real_cnt)); } -static bool match_cleanup_name(const char *s, const char *name) -{ - char *p; - int len; - - if (!IS_ENABLED(CONFIG_LTO_CLANG)) - return false; - - p = strstr(s, ".llvm."); - if (!p) - return false; - - len = strlen(name); - if (p - s != len) - return false; - - return !strncmp(s, name, len); -} - static int find_symbol(void *data, const char *name, unsigned long addr) { struct test_stat *stat = (struct test_stat *)data; - if (strcmp(name, stat->name) == 0 || - (!stat->perf && match_cleanup_name(name, stat->name))) { + if (!strcmp(name, stat->name)) { stat->real_cnt++; stat->addr = addr; diff --git a/kernel/kcov.c b/kernel/kcov.c index f0a69d402066..274b6b7c718d 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -161,6 +161,15 @@ static void kcov_remote_area_put(struct kcov_remote_area *area, kmsan_unpoison_memory(&area->list, sizeof(area->list)); } +/* + * Unlike in_serving_softirq(), this function returns false when called during + * a hardirq or an NMI that happened in the softirq context. 
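Spelling out the case the new helper below filters out (inferred from the comment above; illustrative only): a hardirq that interrupts a softirq section still sees in_serving_softirq() return true, so the extra checks are what keep its coverage out of the softirq's kcov section.

	/* hardirq arrives while a remote softirq section is active:      */
	/*   in_serving_softirq()  -> true   (softirq is still "serving") */
	/*   in_hardirq()          -> true   (we run in the hardirq)      */
	/*   in_softirq_really()   -> false  (kcov ignores this context)  */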
+ */ +static inline bool in_softirq_really(void) +{ + return in_serving_softirq() && !in_hardirq() && !in_nmi(); +} + static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) { unsigned int mode; @@ -170,7 +179,7 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru * so we ignore code executed in interrupts, unless we are in a remote * coverage collection section in a softirq. */ - if (!in_task() && !(in_serving_softirq() && t->kcov_softirq)) + if (!in_task() && !(in_softirq_really() && t->kcov_softirq)) return false; mode = READ_ONCE(t->kcov_mode); /* @@ -849,7 +858,7 @@ void kcov_remote_start(u64 handle) if (WARN_ON(!kcov_check_handle(handle, true, true, true))) return; - if (!in_task() && !in_serving_softirq()) + if (!in_task() && !in_softirq_really()) return; local_lock_irqsave(&kcov_percpu_data.lock, flags); @@ -991,7 +1000,7 @@ void kcov_remote_stop(void) int sequence; unsigned long flags; - if (!in_task() && !in_serving_softirq()) + if (!in_task() && !in_softirq_really()) return; local_lock_irqsave(&kcov_percpu_data.lock, flags); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e85de37d9e1e..da59c68df841 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1557,8 +1557,8 @@ static bool is_cfi_preamble_symbol(unsigned long addr) if (lookup_symbol_name(addr, symbuf)) return false; - return str_has_prefix("__cfi_", symbuf) || - str_has_prefix("__pfx_", symbuf); + return str_has_prefix(symbuf, "__cfi_") || + str_has_prefix(symbuf, "__pfx_"); } static int check_kprobe_address_safe(struct kprobe *p, diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 07fb5987b42b..1bab21b4718f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -92,7 +92,14 @@ static ssize_t profiling_store(struct kobject *kobj, const char *buf, size_t count) { int ret; + static DEFINE_MUTEX(lock); + /* + * We need serialization, for profile_setup() initializes prof_on + * value and profile_init() must not reallocate prof_buffer after + * once allocated. + */ + guard(mutex)(&lock); if (prof_on) return -EEXIST; /* diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 58c88220a478..0349f957e672 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5936,6 +5936,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; + if (unlikely(lock->key == &__lockdep_no_track__)) + return; + hlock = find_held_lock(curr, lock, depth, &i); if (!hlock) { print_lock_contention_bug(curr, lock, ip); @@ -5978,6 +5981,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; + if (unlikely(lock->key == &__lockdep_no_track__)) + return; + hlock = find_held_lock(curr, lock, depth, &i); if (!hlock) { print_lock_contention_bug(curr, lock, _RET_IP_); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index f5a36e67b593..ac2e22502741 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -357,7 +357,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - enum vcpu_state old = vcpu_halted; + u8 old = vcpu_halted; /* * If the vCPU is indeed halted, advance its state to match that of * pv_wait_node(). 
If OTOH this fails, the vCPU was running and will diff --git a/kernel/module/main.c b/kernel/module/main.c index d9592195c5bb..71396e297499 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3104,7 +3104,7 @@ static bool idempotent(struct idempotent *u, const void *cookie) struct idempotent *existing; bool first; - u->ret = 0; + u->ret = -EINTR; u->cookie = cookie; init_completion(&u->complete); @@ -3140,7 +3140,7 @@ static int idempotent_complete(struct idempotent *u, int ret) hlist_for_each_entry_safe(pos, next, head, entry) { if (pos->cookie != cookie) continue; - hlist_del(&pos->entry); + hlist_del_init(&pos->entry); pos->ret = ret; complete(&pos->complete); } @@ -3148,6 +3148,28 @@ static int idempotent_complete(struct idempotent *u, int ret) return ret; } +/* + * Wait for the idempotent worker. + * + * If we get interrupted, we need to remove ourselves from the + * the idempotent list, and the completion may still come in. + * + * The 'idem_lock' protects against the race, and 'idem.ret' was + * initialized to -EINTR and is thus always the right return + * value even if the idempotent work then completes between + * the wait_for_completion and the cleanup. + */ +static int idempotent_wait_for_completion(struct idempotent *u) +{ + if (wait_for_completion_interruptible(&u->complete)) { + spin_lock(&idem_lock); + if (!hlist_unhashed(&u->entry)) + hlist_del(&u->entry); + spin_unlock(&idem_lock); + } + return u->ret; +} + static int init_module_from_file(struct file *f, const char __user * uargs, int flags) { struct load_info info = { }; @@ -3183,15 +3205,16 @@ static int idempotent_init_module(struct file *f, const char __user * uargs, int if (!f || !(f->f_mode & FMODE_READ)) return -EBADF; - /* See if somebody else is doing the operation? */ - if (idempotent(&idem, file_inode(f))) { - wait_for_completion(&idem.complete); - return idem.ret; + /* Are we the winners of the race and get to do this? */ + if (!idempotent(&idem, file_inode(f))) { + int ret = init_module_from_file(f, uargs, flags); + return idempotent_complete(&idem, ret); } - /* Otherwise, we'll do it and complete others */ - return idempotent_complete(&idem, - init_module_from_file(f, uargs, flags)); + /* + * Somebody else won the race and is loading the module. + */ + return idempotent_wait_for_completion(&idem); } SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) diff --git a/kernel/padata.c b/kernel/padata.c index 53f4bc912712..0fa6c2895460 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -517,6 +517,13 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) ps.chunk_size = max(ps.chunk_size, job->min_chunk); ps.chunk_size = roundup(ps.chunk_size, job->align); + /* + * chunk_size can be 0 if the caller sets min_chunk to 0. 
So force it + * to at least 1 to prevent divide-by-0 panic in padata_mt_helper().` + */ + if (!ps.chunk_size) + ps.chunk_size = 1U; + list_for_each_entry(pw, &works, pw_list) if (job->numa_aware) { int old_node = atomic_read(&last_used_nid); diff --git a/kernel/panic.c b/kernel/panic.c index f861bedc1925..2a0449144f82 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -64,6 +64,8 @@ unsigned long panic_on_taint; bool panic_on_taint_nousertaint = false; static unsigned int warn_limit __read_mostly; +bool panic_triggering_all_cpu_backtrace; + int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -253,8 +255,12 @@ void check_panic_on_warn(const char *origin) */ static void panic_other_cpus_shutdown(bool crash_kexec) { - if (panic_print & PANIC_PRINT_ALL_CPU_BT) + if (panic_print & PANIC_PRINT_ALL_CPU_BT) { + /* Temporary allow non-panic CPUs to write their backtraces. */ + panic_triggering_all_cpu_backtrace = true; trigger_all_cpu_backtrace(); + panic_triggering_all_cpu_backtrace = false; + } /* * Note that smp_send_stop() is the usual SMP shutdown function, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 054c0e7784fd..c22b07049c38 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2316,7 +2316,7 @@ asmlinkage int vprintk_emit(int facility, int level, * non-panic CPUs are generating any messages, they will be * silently dropped. */ - if (other_cpu_in_panic()) + if (other_cpu_in_panic() && !panic_triggering_all_cpu_backtrace) return 0; if (level == LOGLEVEL_SCHED) { diff --git a/kernel/profile.c b/kernel/profile.c index 2b775cc5c28f..1fcf1adcf4eb 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -47,30 +47,14 @@ static unsigned short int prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); -static cpumask_var_t prof_cpu_mask; -#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) -static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); -static DEFINE_PER_CPU(int, cpu_profile_flip); -static DEFINE_MUTEX(profile_flip_mutex); -#endif /* CONFIG_SMP */ - int profile_setup(char *str) { static const char schedstr[] = "schedule"; - static const char sleepstr[] = "sleep"; static const char kvmstr[] = "kvm"; const char *select = NULL; int par; - if (!strncmp(str, sleepstr, strlen(sleepstr))) { -#ifdef CONFIG_SCHEDSTATS - force_schedstat_enabled(); - prof_on = SLEEP_PROFILING; - select = sleepstr; -#else - pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); -#endif /* CONFIG_SCHEDSTATS */ - } else if (!strncmp(str, schedstr, strlen(schedstr))) { + if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; select = schedstr; } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { @@ -114,11 +98,6 @@ int __ref profile_init(void) buffer_bytes = prof_len*sizeof(atomic_t); - if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) - return -ENOMEM; - - cpumask_copy(prof_cpu_mask, cpu_possible_mask); - prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); if (prof_buffer) return 0; @@ -132,195 +111,16 @@ int __ref profile_init(void) if (prof_buffer) return 0; - free_cpumask_var(prof_cpu_mask); return -ENOMEM; } -#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) -/* - * Each cpu has a pair of open-addressed hashtables for pending - * profile hits. read_profile() IPI's all cpus to request them - * to flip buffers and flushes their contents to prof_buffer itself. - * Flip requests are serialized by the profile_flip_mutex. 
The sole - * use of having a second hashtable is for avoiding cacheline - * contention that would otherwise happen during flushes of pending - * profile hits required for the accuracy of reported profile hits - * and so resurrect the interrupt livelock issue. - * - * The open-addressed hashtables are indexed by profile buffer slot - * and hold the number of pending hits to that profile buffer slot on - * a cpu in an entry. When the hashtable overflows, all pending hits - * are accounted to their corresponding profile buffer slots with - * atomic_add() and the hashtable emptied. As numerous pending hits - * may be accounted to a profile buffer slot in a hashtable entry, - * this amortizes a number of atomic profile buffer increments likely - * to be far larger than the number of entries in the hashtable, - * particularly given that the number of distinct profile buffer - * positions to which hits are accounted during short intervals (e.g. - * several seconds) is usually very small. Exclusion from buffer - * flipping is provided by interrupt disablement (note that for - * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from - * process context). - * The hash function is meant to be lightweight as opposed to strong, - * and was vaguely inspired by ppc64 firmware-supported inverted - * pagetable hash functions, but uses a full hashtable full of finite - * collision chains, not just pairs of them. - * - * -- nyc - */ -static void __profile_flip_buffers(void *unused) -{ - int cpu = smp_processor_id(); - - per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu); -} - -static void profile_flip_buffers(void) -{ - int i, j, cpu; - - mutex_lock(&profile_flip_mutex); - j = per_cpu(cpu_profile_flip, get_cpu()); - put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 1); - for_each_online_cpu(cpu) { - struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; - for (i = 0; i < NR_PROFILE_HIT; ++i) { - if (!hits[i].hits) { - if (hits[i].pc) - hits[i].pc = 0; - continue; - } - atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); - hits[i].hits = hits[i].pc = 0; - } - } - mutex_unlock(&profile_flip_mutex); -} - -static void profile_discard_flip_buffers(void) -{ - int i, cpu; - - mutex_lock(&profile_flip_mutex); - i = per_cpu(cpu_profile_flip, get_cpu()); - put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 1); - for_each_online_cpu(cpu) { - struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; - memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); - } - mutex_unlock(&profile_flip_mutex); -} - -static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) -{ - unsigned long primary, secondary, flags, pc = (unsigned long)__pc; - int i, j, cpu; - struct profile_hit *hits; - - pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); - i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; - secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; - cpu = get_cpu(); - hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)]; - if (!hits) { - put_cpu(); - return; - } - /* - * We buffer the global profiler buffer into a per-CPU - * queue and thus reduce the number of global (and possibly - * NUMA-alien) accesses. 
The write-queue is self-coalescing: - */ - local_irq_save(flags); - do { - for (j = 0; j < PROFILE_GRPSZ; ++j) { - if (hits[i + j].pc == pc) { - hits[i + j].hits += nr_hits; - goto out; - } else if (!hits[i + j].hits) { - hits[i + j].pc = pc; - hits[i + j].hits = nr_hits; - goto out; - } - } - i = (i + secondary) & (NR_PROFILE_HIT - 1); - } while (i != primary); - - /* - * Add the current hit(s) and flush the write-queue out - * to the global buffer: - */ - atomic_add(nr_hits, &prof_buffer[pc]); - for (i = 0; i < NR_PROFILE_HIT; ++i) { - atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); - hits[i].pc = hits[i].hits = 0; - } -out: - local_irq_restore(flags); - put_cpu(); -} - -static int profile_dead_cpu(unsigned int cpu) -{ - struct page *page; - int i; - - if (cpumask_available(prof_cpu_mask)) - cpumask_clear_cpu(cpu, prof_cpu_mask); - - for (i = 0; i < 2; i++) { - if (per_cpu(cpu_profile_hits, cpu)[i]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]); - per_cpu(cpu_profile_hits, cpu)[i] = NULL; - __free_page(page); - } - } - return 0; -} - -static int profile_prepare_cpu(unsigned int cpu) -{ - int i, node = cpu_to_mem(cpu); - struct page *page; - - per_cpu(cpu_profile_flip, cpu) = 0; - - for (i = 0; i < 2; i++) { - if (per_cpu(cpu_profile_hits, cpu)[i]) - continue; - - page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); - if (!page) { - profile_dead_cpu(cpu); - return -ENOMEM; - } - per_cpu(cpu_profile_hits, cpu)[i] = page_address(page); - - } - return 0; -} - -static int profile_online_cpu(unsigned int cpu) -{ - if (cpumask_available(prof_cpu_mask)) - cpumask_set_cpu(cpu, prof_cpu_mask); - - return 0; -} - -#else /* !CONFIG_SMP */ -#define profile_flip_buffers() do { } while (0) -#define profile_discard_flip_buffers() do { } while (0) - static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long pc; pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; - atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); + if (pc < prof_len) + atomic_add(nr_hits, &prof_buffer[pc]); } -#endif /* !CONFIG_SMP */ void profile_hits(int type, void *__pc, unsigned int nr_hits) { @@ -334,8 +134,8 @@ void profile_tick(int type) { struct pt_regs *regs = get_irq_regs(); - if (!user_mode(regs) && cpumask_available(prof_cpu_mask) && - cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) + /* This is the old kernel-only legacy profiling */ + if (!user_mode(regs)) profile_hit(type, (void *)profile_pc(regs)); } @@ -358,7 +158,6 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) char *pnt; unsigned long sample_step = 1UL << prof_shift; - profile_flip_buffers(); if (p >= (prof_len+1)*sizeof(unsigned int)) return 0; if (count > (prof_len+1)*sizeof(unsigned int) - p) @@ -404,7 +203,6 @@ static ssize_t write_profile(struct file *file, const char __user *buf, return -EINVAL; } #endif - profile_discard_flip_buffers(); memset(prof_buffer, 0, prof_len * sizeof(atomic_t)); return count; } @@ -418,40 +216,14 @@ static const struct proc_ops profile_proc_ops = { int __ref create_proc_profile(void) { struct proc_dir_entry *entry; -#ifdef CONFIG_SMP - enum cpuhp_state online_state; -#endif - int err = 0; if (!prof_on) return 0; -#ifdef CONFIG_SMP - err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE", - profile_prepare_cpu, profile_dead_cpu); - if (err) - return err; - - err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE", - profile_online_cpu, NULL); - if (err < 0) - goto err_state_prep; - online_state = err; - 
err = 0; -#endif entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &profile_proc_ops); - if (!entry) - goto err_state_onl; - proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); - - return err; -err_state_onl: -#ifdef CONFIG_SMP - cpuhp_remove_state(online_state); -err_state_prep: - cpuhp_remove_state(CPUHP_PROFILE_PREPARE); -#endif + if (entry) + proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); return err; } subsys_initcall(create_proc_profile); diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 78e48f5426ee..eb0cdcd4d921 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -92,16 +92,6 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, trace_sched_stat_blocked(p, delta); - /* - * Blocking time is in units of nanosecs, so shift by - * 20 to get a milliseconds-range estimation of the - * amount of time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - profile_hits(SLEEP_PROFILING, - (void *)get_wchan(p), - delta >> 20); - } account_scheduler_latency(p, delta >> 10, 0); } } diff --git a/kernel/task_work.c b/kernel/task_work.c index 5c2daa7ad3f9..5d14d639ac71 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -6,12 +6,14 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ +#ifdef CONFIG_IRQ_WORK static void task_work_set_notify_irq(struct irq_work *entry) { test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); } static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = IRQ_WORK_INIT_HARD(task_work_set_notify_irq); +#endif /** * task_work_add - ask the @task to execute @work->func() @@ -57,6 +59,8 @@ int task_work_add(struct task_struct *task, struct callback_head *work, if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return -EINVAL; + if (!IS_ENABLED(CONFIG_IRQ_WORK)) + return -EINVAL; } else { /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack(work); @@ -81,9 +85,11 @@ int task_work_add(struct task_struct *task, struct callback_head *work, case TWA_SIGNAL_NO_IPI: __set_notify_signal(task); break; +#ifdef CONFIG_IRQ_WORK case TWA_NMI_CURRENT: irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); break; +#endif default: WARN_ON_ONCE(1); break; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d25ba49e313c..d0538a75f4c6 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -246,7 +246,7 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end); if (wd_delay <= WATCHDOG_MAX_SKEW) { - if (nretries > 1 || nretries >= max_retries) { + if (nretries > 1 && nretries >= max_retries) { pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", smp_processor_id(), watchdog->name, nretries); } diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 406dccb79c2b..8d2dd214ec68 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -727,17 +727,16 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, } if (txc->modes & ADJ_MAXERROR) - time_maxerror = txc->maxerror; + time_maxerror = clamp(txc->maxerror, 0, NTP_PHASE_LIMIT); if (txc->modes & ADJ_ESTERROR) - time_esterror = txc->esterror; + time_esterror = clamp(txc->esterror, 0, NTP_PHASE_LIMIT); if (txc->modes & ADJ_TIMECONST) { - time_constant = txc->constant; + time_constant = clamp(txc->constant, 0, MAXTC); if (!(time_status & STA_NANO)) time_constant += 4; - time_constant = 
min(time_constant, (long)MAXTC); - time_constant = max(time_constant, 0l); + time_constant = clamp(time_constant, 0, MAXTC); } if (txc->modes & ADJ_TAI && diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index b4843099a8da..ed58eebb4e8f 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -1141,7 +1141,6 @@ void tick_broadcast_switch_to_oneshot(void) #ifdef CONFIG_HOTPLUG_CPU void hotplug_cpu__broadcast_tick_pull(int deadcpu) { - struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *bc; unsigned long flags; @@ -1167,6 +1166,8 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) * device to avoid the starvation. */ if (tick_check_broadcast_expired()) { + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + cpumask_clear_cpu(smp_processor_id(), tick_broadcast_force_mask); tick_program_event(td->evtdev->next_event, 1); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2fa87dcfeda9..5391e4167d60 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2606,7 +2606,7 @@ int do_adjtimex(struct __kernel_timex *txc) clock_set |= timekeeping_advance(TK_ADV_FREQ); if (clock_set) - clock_was_set(CLOCK_REALTIME); + clock_was_set(CLOCK_SET_WALL); ntp_notify_cmos_timer(); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cd098846e251..b69a39316c0c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -24,7 +24,6 @@ #include <linux/key.h> #include <linux/verification.h> #include <linux/namei.h> -#include <linux/fileattr.h> #include <net/bpf_sk_storage.h> @@ -798,29 +797,6 @@ const struct bpf_func_proto bpf_task_pt_regs_proto = { .ret_btf_id = &bpf_task_pt_regs_ids[0], }; -BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) -{ - struct bpf_array *array = container_of(map, struct bpf_array, map); - struct cgroup *cgrp; - - if (unlikely(idx >= array->map.max_entries)) - return -E2BIG; - - cgrp = READ_ONCE(array->ptrs[idx]); - if (unlikely(!cgrp)) - return -EAGAIN; - - return task_under_cgroup_hierarchy(current, cgrp); -} - -static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { - .func = bpf_current_task_under_cgroup, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_ANYTHING, -}; - struct send_signal_irq_work { struct irq_work irq_work; struct task_struct *task; @@ -1439,73 +1415,6 @@ static int __init bpf_key_sig_kfuncs_init(void) late_initcall(bpf_key_sig_kfuncs_init); #endif /* CONFIG_KEYS */ -/* filesystem kfuncs */ -__bpf_kfunc_start_defs(); - -/** - * bpf_get_file_xattr - get xattr of a file - * @file: file to get xattr from - * @name__str: name of the xattr - * @value_p: output buffer of the xattr value - * - * Get xattr *name__str* of *file* and store the output in *value_ptr*. - * - * For security reasons, only *name__str* with prefix "user." is allowed. - * - * Return: 0 on success, a negative value on error. 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index cd098846e251..b69a39316c0c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -24,7 +24,6 @@
 #include <linux/key.h>
 #include <linux/verification.h>
 #include <linux/namei.h>
-#include <linux/fileattr.h>
 #include <net/bpf_sk_storage.h>
@@ -798,29 +797,6 @@ const struct bpf_func_proto bpf_task_pt_regs_proto = {
 	.ret_btf_id = &bpf_task_pt_regs_ids[0],
 };
-BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
-{
-	struct bpf_array *array = container_of(map, struct bpf_array, map);
-	struct cgroup *cgrp;
-
-	if (unlikely(idx >= array->map.max_entries))
-		return -E2BIG;
-
-	cgrp = READ_ONCE(array->ptrs[idx]);
-	if (unlikely(!cgrp))
-		return -EAGAIN;
-
-	return task_under_cgroup_hierarchy(current, cgrp);
-}
-
-static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
-	.func		= bpf_current_task_under_cgroup,
-	.gpl_only	= false,
-	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_CONST_MAP_PTR,
-	.arg2_type	= ARG_ANYTHING,
-};
-
 struct send_signal_irq_work {
 	struct irq_work irq_work;
 	struct task_struct *task;
@@ -1439,73 +1415,6 @@ static int __init bpf_key_sig_kfuncs_init(void)
 late_initcall(bpf_key_sig_kfuncs_init);
 #endif /* CONFIG_KEYS */
-/* filesystem kfuncs */
-__bpf_kfunc_start_defs();
-
-/**
- * bpf_get_file_xattr - get xattr of a file
- * @file: file to get xattr from
- * @name__str: name of the xattr
- * @value_p: output buffer of the xattr value
- *
- * Get xattr *name__str* of *file* and store the output in *value_ptr*.
- *
- * For security reasons, only *name__str* with prefix "user." is allowed.
- *
- * Return: 0 on success, a negative value on error.
- */
-__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
-				   struct bpf_dynptr *value_p)
-{
-	struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p;
-	struct dentry *dentry;
-	u32 value_len;
-	void *value;
-	int ret;
-
-	if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
-		return -EPERM;
-
-	value_len = __bpf_dynptr_size(value_ptr);
-	value = __bpf_dynptr_data_rw(value_ptr, value_len);
-	if (!value)
-		return -EINVAL;
-
-	dentry = file_dentry(file);
-	ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ);
-	if (ret)
-		return ret;
-	return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len);
-}
-
-__bpf_kfunc_end_defs();
-
-BTF_KFUNCS_START(fs_kfunc_set_ids)
-BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
-BTF_KFUNCS_END(fs_kfunc_set_ids)
-
-static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id)
-{
-	if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id))
-		return 0;
-
-	/* Only allow to attach from LSM hooks, to avoid recursion */
-	return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0;
-}
-
-static const struct btf_kfunc_id_set bpf_fs_kfunc_set = {
-	.owner = THIS_MODULE,
-	.set = &fs_kfunc_set_ids,
-	.filter = bpf_get_file_xattr_filter,
-};
-
-static int __init bpf_fs_kfuncs_init(void)
-{
-	return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set);
-}
-
-late_initcall(bpf_fs_kfuncs_init);
-
 static const struct bpf_func_proto *
 bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -1548,8 +1457,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_numa_node_id_proto;
 	case BPF_FUNC_perf_event_read:
 		return &bpf_perf_event_read_proto;
-	case BPF_FUNC_current_task_under_cgroup:
-		return &bpf_current_task_under_cgroup_proto;
 	case BPF_FUNC_get_prandom_u32:
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_probe_write_user:
@@ -1578,6 +1485,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_cgrp_storage_get_proto;
 	case BPF_FUNC_cgrp_storage_delete:
 		return &bpf_cgrp_storage_delete_proto;
+	case BPF_FUNC_current_task_under_cgroup:
+		return &bpf_current_task_under_cgroup_proto;
 #endif
 	case BPF_FUNC_send_signal:
 		return &bpf_send_signal_proto;
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index fc205ad167a9..d1d5ea2d0a1b 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -902,7 +902,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
 	i = *idx ? : task->curr_ret_stack;
 	while (i > 0) {
-		ret_stack = get_ret_stack(current, i, &i);
+		ret_stack = get_ret_stack(task, i, &i);
 		if (!ret_stack)
 			break;
 		/*
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index cb0871fbdb07..314ffc143039 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -34,8 +34,6 @@ MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on");
 static struct completion done;
-#define MIN(x, y) ((x) < (y) ? (x) : (y))
-
 static void busy_wait(ulong time)
 {
 	u64 start, end;
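The preemptirq_delay_test.c hunk drops a file-local MIN() macro in favour of the shared helpers in linux/minmax.h. A trivial sketch of the replacement pattern (demo_chunk() and the 1024 cap are made up for illustration):

#include <linux/minmax.h>

/* Illustrative: take at most 1024 units per iteration without a local macro. */
static unsigned long demo_chunk(unsigned long remaining)
{
	return min(remaining, 1024UL);
}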
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 28853966aa9a..cebd879a30cb 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -693,18 +693,6 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
 }
 /**
- * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
- * @buffer: The ring_buffer to get the number of pages from
- * @cpu: The cpu of the ring_buffer to get the number of pages from
- *
- * Returns the number of pages used by a per_cpu buffer of the ring buffer.
- */
-size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
-{
-	return buffer->buffers[cpu]->nr_pages;
-}
-
-/**
  * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer
  * @buffer: The ring_buffer to get the number of pages from
  * @cpu: The cpu of the ring_buffer to get the number of pages from
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 10cd38bce2f1..ebe7ce2f5f4a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7956,7 +7956,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 	trace_access_unlock(iter->cpu_file);
 	if (ret < 0) {
-		if (trace_empty(iter)) {
+		if (trace_empty(iter) && !iter->closed) {
 			if ((filp->f_flags & O_NONBLOCK))
 				return -EAGAIN;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8783bebd0562..bd3e3069300e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1634,6 +1634,29 @@ static inline void *event_file_data(struct file *filp)
 extern struct mutex event_mutex;
 extern struct list_head ftrace_events;
+/*
+ * When the trace_event_file is the filp->i_private pointer,
+ * it must be taken under the event_mutex lock, and then checked
+ * if the EVENT_FILE_FL_FREED flag is set. If it is, then the
+ * data pointed to by the trace_event_file can not be trusted.
+ *
+ * Use the event_file_file() to access the trace_event_file from
+ * the filp the first time under the event_mutex and check for
+ * NULL. If it is needed to be retrieved again and the event_mutex
+ * is still held, then the event_file_data() can be used and it
+ * is guaranteed to be valid.
+ */
+static inline struct trace_event_file *event_file_file(struct file *filp)
+{
+	struct trace_event_file *file;
+
+	lockdep_assert_held(&event_mutex);
+	file = READ_ONCE(file_inode(filp)->i_private);
+	if (!file || file->flags & EVENT_FILE_FL_FREED)
+		return NULL;
+	return file;
+}
+
 extern const struct file_operations event_trigger_fops;
 extern const struct file_operations event_hist_fops;
 extern const struct file_operations event_hist_debug_fops;
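The comment added to trace.h above spells out the expected calling pattern: look the file up once under event_mutex via event_file_file(), bail out on NULL, and only trust the pointer while the mutex is held. A hedged sketch of a caller following that pattern (the handler name and the flags copy are illustrative, not part of the patch):

static ssize_t demo_event_read(struct file *filp, char __user *ubuf,
			       size_t cnt, loff_t *ppos)
{
	struct trace_event_file *file;
	unsigned long flags = 0;

	mutex_lock(&event_mutex);
	file = event_file_file(filp);	/* NULL if the event was freed */
	if (file)
		flags = file->flags;
	mutex_unlock(&event_mutex);

	if (!file)
		return -ENODEV;

	/* ... format 'flags' into ubuf without touching 'file' again ... */
	return 0;
}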
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6ef29eba90ce..7266ec2a4eea 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -992,18 +992,18 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
 void event_file_get(struct trace_event_file *file)
 {
-	atomic_inc(&file->ref);
+	refcount_inc(&file->ref);
 }
 void event_file_put(struct trace_event_file *file)
 {
-	if (WARN_ON_ONCE(!atomic_read(&file->ref))) {
+	if (WARN_ON_ONCE(!refcount_read(&file->ref))) {
 		if (file->flags & EVENT_FILE_FL_FREED)
 			kmem_cache_free(file_cachep, file);
 		return;
 	}
-	if (atomic_dec_and_test(&file->ref)) {
+	if (refcount_dec_and_test(&file->ref)) {
 		/* Count should only go to zero when it is freed */
 		if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED)))
 			return;
@@ -1386,12 +1386,12 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	char buf[4] = "0";
 	mutex_lock(&event_mutex);
-	file = event_file_data(filp);
+	file = event_file_file(filp);
 	if (likely(file))
 		flags = file->flags;
 	mutex_unlock(&event_mutex);
-	if (!file || flags & EVENT_FILE_FL_FREED)
+	if (!file)
 		return -ENODEV;
 	if (flags & EVENT_FILE_FL_ENABLED &&
@@ -1424,8 +1424,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	case 1:
 		ret = -ENODEV;
 		mutex_lock(&event_mutex);
-		file = event_file_data(filp);
-		if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) {
+		file = event_file_file(filp);
+		if (likely(file)) {
 			ret = tracing_update_buffers(file->tr);
 			if (ret < 0) {
 				mutex_unlock(&event_mutex);
@@ -1540,7 +1540,8 @@ enum {
 static void *f_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct trace_event_call *call = event_file_data(m->private);
+	struct trace_event_file *file = event_file_data(m->private);
+	struct trace_event_call *call = file->event_call;
 	struct list_head *common_head = &ftrace_common_fields;
 	struct list_head *head = trace_get_fields(call);
 	struct list_head *node = v;
@@ -1572,7 +1573,8 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
 static int f_show(struct seq_file *m, void *v)
 {
-	struct trace_event_call *call = event_file_data(m->private);
+	struct trace_event_file *file = event_file_data(m->private);
+	struct trace_event_call *call = file->event_call;
 	struct ftrace_event_field *field;
 	const char *array_descriptor;
@@ -1627,12 +1629,14 @@ static int f_show(struct seq_file *m, void *v)
 static void *f_start(struct seq_file *m, loff_t *pos)
 {
+	struct trace_event_file *file;
 	void *p = (void *)FORMAT_HEADER;
 	loff_t l = 0;
 	/* ->stop() is called even if ->start() fails */
 	mutex_lock(&event_mutex);
-	if (!event_file_data(m->private))
+	file = event_file_file(m->private);
+	if (!file)
 		return ERR_PTR(-ENODEV);
 	while (l < *pos && p)
@@ -1706,8 +1710,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 	trace_seq_init(s);
 	mutex_lock(&event_mutex);
-	file = event_file_data(filp);
-	if (file && !(file->flags & EVENT_FILE_FL_FREED))
+	file = event_file_file(filp);
+	if (file)
 		print_event_filter(file, s);
 	mutex_unlock(&event_mutex);
@@ -1736,9 +1740,13 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		return PTR_ERR(buf);
 	mutex_lock(&event_mutex);
-	file = event_file_data(filp);
-	if (file)
-		err = apply_event_filter(file, buf);
+	file = event_file_file(filp);
+	if (file) {
+		if (file->flags & EVENT_FILE_FL_FREED)
+			err = -ENODEV;
+		else
+			err = apply_event_filter(file, buf);
+	}
 	mutex_unlock(&event_mutex);
 	kfree(buf);
@@ -2485,7 +2493,6 @@ static int event_callback(const char *name, umode_t *mode, void **data,
 	if (strcmp(name, "format") == 0) {
 		*mode = TRACE_MODE_READ;
 		*fops = &ftrace_event_format_fops;
-		*data = call;
 		return 1;
 	}
@@ -2996,7 +3003,7 @@ trace_create_new_event(struct trace_event_call *call,
 	atomic_set(&file->tm_ref, 0);
 	INIT_LIST_HEAD(&file->triggers);
 	list_add(&file->list, &tr->events);
-	event_file_get(file);
+	refcount_set(&file->ref, 1);
 	return file;
 }
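The trace_events.c hunks above convert file->ref from atomic_t to refcount_t and make trace_create_new_event() set the initial count explicitly. The general shape of that pattern, sketched on a made-up object (demo_obj and its helpers are not kernel types):

#include <linux/refcount.h>
#include <linux/slab.h>

struct demo_obj {
	refcount_t ref;
};

static struct demo_obj *demo_obj_create(void)
{
	struct demo_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (obj)
		refcount_set(&obj->ref, 1);	/* creator owns the first reference */
	return obj;
}

static void demo_obj_get(struct demo_obj *obj)
{
	refcount_inc(&obj->ref);	/* saturates and warns instead of wrapping */
}

static void demo_obj_put(struct demo_obj *obj)
{
	if (refcount_dec_and_test(&obj->ref))
		kfree(obj);
}

Unlike a bare atomic_t, refcount_t traps increment-from-zero and overflow, which is the usual motivation for this kind of conversion.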
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 6ece1308d36a..5f9119eb7c67 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -5601,7 +5601,7 @@ static int hist_show(struct seq_file *m, void *v)
 	mutex_lock(&event_mutex);
-	event_file = event_file_data(m->private);
+	event_file = event_file_file(m->private);
 	if (unlikely(!event_file)) {
 		ret = -ENODEV;
 		goto out_unlock;
@@ -5880,7 +5880,7 @@ static int hist_debug_show(struct seq_file *m, void *v)
 	mutex_lock(&event_mutex);
-	event_file = event_file_data(m->private);
+	event_file = event_file_file(m->private);
 	if (unlikely(!event_file)) {
 		ret = -ENODEV;
 		goto out_unlock;
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index 8650562bdaa9..a8f076809db4 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -299,7 +299,7 @@ event_inject_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	strim(buf);
 	mutex_lock(&event_mutex);
-	file = event_file_data(filp);
+	file = event_file_file(filp);
 	if (file) {
 		call = file->event_call;
 		size = parse_entry(buf, call, &entry);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 4bec043c8690..a5e3d6acf1e1 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -159,7 +159,7 @@ static void *trigger_start(struct seq_file *m, loff_t *pos)
 	/* ->stop() is called even if ->start() fails */
 	mutex_lock(&event_mutex);
-	event_file = event_file_data(m->private);
+	event_file = event_file_file(m->private);
 	if (unlikely(!event_file))
 		return ERR_PTR(-ENODEV);
@@ -213,7 +213,7 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
 	mutex_lock(&event_mutex);
-	if (unlikely(!event_file_data(file))) {
+	if (unlikely(!event_file_file(file))) {
 		mutex_unlock(&event_mutex);
 		return -ENODEV;
 	}
@@ -293,7 +293,7 @@ static ssize_t event_trigger_regex_write(struct file *file,
 	strim(buf);
 	mutex_lock(&event_mutex);
-	event_file = event_file_data(file);
+	event_file = event_file_file(file);
 	if (unlikely(!event_file)) {
 		mutex_unlock(&event_mutex);
 		kfree(buf);
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index a4dcf0f24352..3a56e7c8aa4f 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -454,7 +454,7 @@ static struct tracing_map_elt *get_free_elt(struct tracing_map *map)
 	struct tracing_map_elt *elt = NULL;
 	int idx;
-	idx = atomic_inc_return(&map->next_elt);
+	idx = atomic_fetch_add_unless(&map->next_elt, 1, map->max_elts);
 	if (idx < map->max_elts) {
 		elt = *(TRACING_MAP_ELT(map->elts, idx));
 		if (map->ops && map->ops->elt_init)
@@ -699,7 +699,7 @@ void tracing_map_clear(struct tracing_map *map)
 {
 	unsigned int i;
-	atomic_set(&map->next_elt, -1);
+	atomic_set(&map->next_elt, 0);
 	atomic64_set(&map->hits, 0);
 	atomic64_set(&map->drops, 0);
@@ -783,7 +783,7 @@ struct tracing_map *tracing_map_create(unsigned int map_bits,
 	map->map_bits = map_bits;
 	map->max_elts = (1 << map_bits);
-	atomic_set(&map->next_elt, -1);
+	atomic_set(&map->next_elt, 0);
 	map->map_size = (1 << (map_bits + 1));
 	map->ops = ops;
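The tracing_map.c hunks switch next_elt to start at 0 and allocate slots with atomic_fetch_add_unless(), so the counter stops at max_elts instead of being bumped on every failed allocation. A small sketch of that allocation step, assuming a made-up demo_take_slot() wrapper:

#include <linux/atomic.h>

/* Return the next free slot index, or -1 once 'max' slots have been handed out. */
static int demo_take_slot(atomic_t *next, int max)
{
	int idx = atomic_fetch_add_unless(next, 1, max);

	return idx < max ? idx : -1;
}

Because the add is skipped once the counter equals max, repeated lookups on a full map cannot push the counter past the limit and back into apparently valid range.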