From c8e2ee1f3df05dc4caa746c062c6b5791c745c79 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Tue, 3 Dec 2024 19:03:57 -0800
Subject: bpf: Introduce support for bpf_local_irq_{save,restore}

Teach the verifier about IRQ-disabled sections through the introduction of
two new kfuncs: bpf_local_irq_save, to save the IRQ state and disable IRQs,
and bpf_local_irq_restore, to restore the IRQ state and enable them back
again.

For the purposes of tracking the saved IRQ state, the verifier is taught
about a new special object on the stack of type STACK_IRQ_FLAG. This is an
8-byte value which saves the IRQ flags that are to be passed back to the IRQ
restore kfunc.

Renumber the enums for REF_TYPE_* to simplify the check in find_lock_state;
filtering out non-lock types will become cumbersome as they grow, and is
unnecessary.

To track a dynamic number of IRQ-disabled regions and their associated saved
states, a new resource type RES_TYPE_IRQ is introduced, along with its state
management functions acquire_irq_state and release_irq_state, taking
advantage of the refactoring and clean-ups made in earlier commits.

One notable requirement of the kernel's IRQ save and restore API is that the
calls cannot happen out of order. For this purpose, when releasing a
reference we keep track of the prev_id we saw with REF_TYPE_IRQ. Since
reference states are inserted in increasing order of the index, this is used
to remember the ordering of acquisitions of IRQ saved states, so that we
maintain a logical stack of resource identities in acquisition order and can
enforce LIFO ordering when restoring IRQ state. The top of the stack is
maintained using bpf_verifier_state's active_irq_id.

To maintain the stack property when releasing reference states, we need to
modify release_reference_state to shift the remaining array left using
memmove, instead of swapping the deleted element with the last one, which
might break the ordering. A selftest to exercise this subtle behavior is
added in later patches.

The logic to detect initialized and uninitialized IRQ flag slots, and to mark
and unmark them, is similar to how it's done for iterators.

No additional checks are needed in refsafe for REF_TYPE_IRQ, apart from the
usual check_id satisfiability check on the ref[i].id. We have to perform the
same check_ids check on state->active_irq_id as well.

To ensure we don't get REF_TYPE_PTR assigned by default after
acquire_reference_state if someone forgets to assign the type, let's also
renumber the enum ref_state_type. This way any unassigned types get caught by
refsafe's default switch statement instead of being assumed to be
REF_TYPE_PTR.

The kfuncs themselves are plain wrappers over the local_irq_save and
local_irq_restore macros.

Acked-by: Eduard Zingerman
Signed-off-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/20241204030400.208005-5-memxor@gmail.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/helpers.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'kernel/bpf/helpers.c')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 751c150f9e1cd..532ea74d48508 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -3057,6 +3057,21 @@ __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user
 	return ret + 1;
 }
 
+/* Keep unsinged long in prototype so that kfunc is usable when emitted to
+ * vmlinux.h in BPF programs directly, but note that while in BPF prog, the
+ * unsigned long always points to 8-byte region on stack, the kernel may only
+ * read and write the 4-bytes on 32-bit.
+ */
+__bpf_kfunc void bpf_local_irq_save(unsigned long *flags__irq_flag)
+{
+	local_irq_save(*flags__irq_flag);
+}
+
+__bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag)
+{
+	local_irq_restore(*flags__irq_flag);
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -3149,6 +3164,8 @@ BTF_ID_FLAGS(func, bpf_get_kmem_cache)
 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_local_irq_save)
+BTF_ID_FLAGS(func, bpf_local_irq_restore)
 BTF_KFUNCS_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
-- cgit v1.2.3

From 00a5acdbf39816ad23b8db3255c366bbc77e69af Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh
Date: Fri, 13 Dec 2024 00:00:30 +0100
Subject: bpf: Fix configuration-dependent BTF function references
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These BTF functions are not available unconditionally; only reference them
when they are available.

Avoid the following build warnings:

  BTF     .tmp_vmlinux1.btf.o
btf_encoder__tag_kfunc: failed to find kfunc 'bpf_send_signal_task' in BTF
btf_encoder__tag_kfuncs: failed to tag kfunc 'bpf_send_signal_task'
  NM      .tmp_vmlinux1.syms
  KSYMS   .tmp_vmlinux1.kallsyms.S
  AS      .tmp_vmlinux1.kallsyms.o
  LD      .tmp_vmlinux2
  NM      .tmp_vmlinux2.syms
  KSYMS   .tmp_vmlinux2.kallsyms.S
  AS      .tmp_vmlinux2.kallsyms.o
  LD      vmlinux
  BTFIDS  vmlinux
WARN: resolve_btfids: unresolved symbol prog_test_ref_kfunc
WARN: resolve_btfids: unresolved symbol bpf_crypto_ctx
WARN: resolve_btfids: unresolved symbol bpf_send_signal_task
WARN: resolve_btfids: unresolved symbol bpf_modify_return_test_tp
WARN: resolve_btfids: unresolved symbol bpf_dynptr_from_xdp
WARN: resolve_btfids: unresolved symbol bpf_dynptr_from_skb

Signed-off-by: Thomas Weißschuh
Signed-off-by: Andrii Nakryiko
Acked-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20241213-bpf-cond-ids-v1-1-881849997219@weissschuh.net
---
 kernel/bpf/helpers.c  | 4 ++++
 kernel/bpf/verifier.c | 8 ++++++++
 2 files changed, 12 insertions(+)

(limited to 'kernel/bpf/helpers.c')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 532ea74d48508..cd5f9884d85bf 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -3104,7 +3104,9 @@ BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_throw)
+#ifdef CONFIG_BPF_EVENTS
 BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS)
+#endif
 BTF_KFUNCS_END(generic_btf_ids)
 
 static const struct btf_kfunc_id_set generic_kfunc_set = {
@@ -3150,7 +3152,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
 BTF_ID_FLAGS(func, bpf_dynptr_size)
 BTF_ID_FLAGS(func, bpf_dynptr_clone)
+#ifdef CONFIG_NET
 BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
+#endif
 BTF_ID_FLAGS(func, bpf_wq_init)
 BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)
 BTF_ID_FLAGS(func, bpf_wq_start)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c0e772996c52c..8a9433a940073 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5681,7 +5681,9 @@ static bool in_rcu_cs(struct bpf_verifier_env *env)
 
 /* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
 BTF_SET_START(rcu_protected_types)
+#ifdef CONFIG_NET
 BTF_ID(struct, prog_test_ref_kfunc)
+#endif
 #ifdef CONFIG_CGROUPS
 BTF_ID(struct, cgroup)
 #endif
@@ -5689,7 +5691,9 @@ BTF_ID(struct, cgroup)
 BTF_ID(struct, bpf_cpumask)
 #endif
 BTF_ID(struct, task_struct)
+#ifdef CONFIG_CRYPTO
 BTF_ID(struct, bpf_crypto_ctx)
+#endif
 BTF_SET_END(rcu_protected_types)
 
 static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
@@ -11722,8 +11726,10 @@ BTF_ID(func, bpf_rdonly_cast)
 BTF_ID(func, bpf_rbtree_remove)
 BTF_ID(func, bpf_rbtree_add_impl)
 BTF_ID(func, bpf_rbtree_first)
+#ifdef CONFIG_NET
 BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
+#endif
 BTF_ID(func, bpf_dynptr_slice)
 BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_ID(func, bpf_dynptr_clone)
@@ -11751,8 +11757,10 @@ BTF_ID(func, bpf_rcu_read_unlock)
 BTF_ID(func, bpf_rbtree_remove)
 BTF_ID(func, bpf_rbtree_add_impl)
 BTF_ID(func, bpf_rbtree_first)
+#ifdef CONFIG_NET
 BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
+#endif
 BTF_ID(func, bpf_dynptr_slice)
 BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_ID(func, bpf_dynptr_clone)
-- cgit v1.2.3

From 1d2dbe7120e89090ed4f6be03d6fbadfbfff59bf Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Wed, 8 Jan 2025 09:07:23 +0800
Subject: bpf: Remove migrate_{disable|enable} in bpf_obj_free_fields()

The callers of bpf_obj_free_fields() have already guaranteed that migration
is disabled; therefore, there is no need to invoke the migrate_{disable,enable}
pair in bpf_obj_free_fields()'s underlying implementation.

This patch removes the unnecessary migrate_{disable|enable} pairs from
bpf_obj_free_fields() and its callees: bpf_list_head_free() and
bpf_rb_root_free().

Signed-off-by: Hou Tao
Link: https://lore.kernel.org/r/20250108010728.207536-12-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/helpers.c | 4 ----
 kernel/bpf/syscall.c | 2 --
 2 files changed, 6 deletions(-)

(limited to 'kernel/bpf/helpers.c')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index cd5f9884d85bf..bcda671feafd9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2066,9 +2066,7 @@ unlock:
 		/* The contained type can also have resources, including a
 		 * bpf_list_head which needs to be freed.
 		 */
-		migrate_disable();
 		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
-		migrate_enable();
 	}
 }
 
@@ -2105,9 +2103,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 
 		obj -= field->graph_root.node_offset;
 
-		migrate_disable();
 		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
-		migrate_enable();
 	}
 }
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5d24204929461..0daf098e32074 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -796,11 +796,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 			if (!btf_is_kernel(field->kptr.btf)) {
 				pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
 									   field->kptr.btf_id);
-				migrate_disable();
 				__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
 								    pointee_struct_meta->record : NULL,
 						    fields[i].type == BPF_KPTR_PERCPU);
-				migrate_enable();
 			} else {
 				field->kptr.dtor(xchgd_field);
 			}
-- cgit v1.2.3

From 58f038e6d209d2dd862fcf5de55407855856794d Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Fri, 17 Jan 2025 18:18:15 +0800
Subject: bpf: Cancel the running bpf_timer through kworker for PREEMPT_RT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

During the update procedure, when overwriting an element in a pre-allocated
htab, the freeing of the old_element is protected by the bucket lock. The
reason the bucket lock is necessary is that the old_element has already been
stashed in htab->extra_elems after alloc_htab_elem() returns. If the
old_element is freed after the bucket lock is unlocked, the stashed element
may be reused by a concurrent update procedure, and the freeing of the
old_element will run concurrently with the reuse of the old_element. However,
the invocation of check_and_free_fields() may acquire a spin-lock, which
violates the lockdep rule because its caller already holds a raw-spin-lock
(the bucket lock). The following warning will be reported when such a race
happens:

BUG: scheduling while atomic: test_progs/676/0x00000003
3 locks held by test_progs/676:
#0: ffffffff864b0240 (rcu_read_lock_trace){....}-{0:0}, at: bpf_prog_test_run_syscall+0x2c0/0x830
#1: ffff88810e961188 (&htab->lockdep_key){....}-{2:2}, at: htab_map_update_elem+0x306/0x1500
#2: ffff8881f4eac1b8 (&base->softirq_expiry_lock){....}-{2:2}, at: hrtimer_cancel_wait_running+0xe9/0x1b0
Modules linked in: bpf_testmod(O)
Preemption disabled at:
[] htab_map_update_elem+0x293/0x1500
CPU: 0 UID: 0 PID: 676 Comm: test_progs Tainted: G ... 6.12.0+ #11
Tainted: [W]=WARN, [O]=OOT_MODULE
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)...
Call Trace:
 dump_stack_lvl+0x57/0x70
 dump_stack+0x10/0x20
 __schedule_bug+0x120/0x170
 __schedule+0x300c/0x4800
 schedule_rtlock+0x37/0x60
 rtlock_slowlock_locked+0x6d9/0x54c0
 rt_spin_lock+0x168/0x230
 hrtimer_cancel_wait_running+0xe9/0x1b0
 hrtimer_cancel+0x24/0x30
 bpf_timer_delete_work+0x1d/0x40
 bpf_timer_cancel_and_free+0x5e/0x80
 bpf_obj_free_fields+0x262/0x4a0
 check_and_free_fields+0x1d0/0x280
 htab_map_update_elem+0x7fc/0x1500
 bpf_prog_9f90bc20768e0cb9_overwrite_cb+0x3f/0x43
 bpf_prog_ea601c4649694dbd_overwrite_timer+0x5d/0x7e
 bpf_prog_test_run_syscall+0x322/0x830
 __sys_bpf+0x135d/0x3ca0
 __x64_sys_bpf+0x75/0xb0
 x64_sys_call+0x1b5/0xa10
 do_syscall_64+0x3b/0xc0
 entry_SYSCALL_64_after_hwframe+0x4b/0x53
...

It seems feasible to break the reuse and refill of per-cpu extra_elems into
two independent parts: reuse the per-cpu extra_elems with the bucket lock
held, and refill the old_element into the per-cpu extra_elems after the
bucket lock is unlocked. However, this would make concurrent overwrite
procedures on the same CPU return an unexpected -E2BIG error when the map is
full.

Therefore, the patch fixes the lock problem by breaking the cancelling of the
bpf_timer into two steps for PREEMPT_RT:
1) use hrtimer_try_to_cancel() and check its return value
2) if the timer is running, use hrtimer_cancel() through a kworker to cancel
   it again

Considering that the current implementation of hrtimer_cancel() will try to
acquire an already-held softirq_expiry_lock when the current timer is
running, the steps above are reasonable. However, they also have a downside:
when the timer is running, the cancelling of the timer is delayed when
releasing the last map uref.
The delay is also fixable (e.g., by breaking the cancelling of the bpf_timer
into two parts: one part in locked scope, another in unlocked scope), so it
can be revised later if necessary.

It is a bit hard to decide the right fix tag. One reason is that the problem
depends on PREEMPT_RT, which was enabled in v6.12. Considering that the
softirq_expiry_lock has existed since v5.4 and bpf_timer was introduced in
v5.15, the bpf_timer commit is used in the fixes tag and an extra depends-on
tag is added to state the dependency on PREEMPT_RT.

Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.")
Depends-on: v6.12+ with PREEMPT_RT enabled
Reported-by: Sebastian Andrzej Siewior
Closes: https://lore.kernel.org/bpf/20241106084527.4gPrMnHt@linutronix.de
Signed-off-by: Hou Tao
Reviewed-by: Toke Høiland-Jørgensen
Link: https://lore.kernel.org/r/20250117101816.2101857-5-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/helpers.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'kernel/bpf/helpers.c')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bcda671feafd9..f27ce162427ab 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1593,10 +1593,24 @@ void bpf_timer_cancel_and_free(void *val)
 	 * To avoid these issues, punt to workqueue context when we are in a
 	 * timer callback.
 	 */
-	if (this_cpu_read(hrtimer_running))
+	if (this_cpu_read(hrtimer_running)) {
 		queue_work(system_unbound_wq, &t->cb.delete_work);
-	else
+		return;
+	}
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		/* If the timer is running on other CPU, also use a kworker to
+		 * wait for the completion of the timer instead of trying to
+		 * acquire a sleepable lock in hrtimer_cancel() to wait for its
+		 * completion.
+		 */
+		if (hrtimer_try_to_cancel(&t->timer) >= 0)
+			kfree_rcu(t, cb.rcu);
+		else
+			queue_work(system_unbound_wq, &t->cb.delete_work);
+	} else {
 		bpf_timer_delete_work(&t->cb.delete_work);
+	}
 }
 
 /* This function is called by map_delete/update_elem for individual element and
-- cgit v1.2.3
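
As a usage illustration for the first patch above (bpf_local_irq_{save,restore}),
here is a minimal, untested BPF-C sketch. It assumes the kfunc prototypes as they
would be emitted into a generated vmlinux.h; the program name, section, and the
declarations marked below are illustrative assumptions, not part of any patch.
The flags variables must be 8-byte slots on the BPF stack (tracked as
STACK_IRQ_FLAG), and restores must happen in LIFO order or the verifier rejects
the program.

/* Sketch only: assumes a kernel that carries the bpf_local_irq_* patch and a
 * vmlinux.h generated from it; attach point and names are arbitrary examples.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* kfunc declarations as they are expected to appear for BPF programs */
extern void bpf_local_irq_save(unsigned long *flags__irq_flag) __ksym;
extern void bpf_local_irq_restore(unsigned long *flags__irq_flag) __ksym;

char LICENSE[] SEC("license") = "GPL";

SEC("tc")   /* attach type chosen only for the example */
int irq_save_restore_example(struct __sk_buff *skb)
{
	unsigned long outer_flags;	/* 8-byte stack slot -> STACK_IRQ_FLAG */
	unsigned long inner_flags;

	bpf_local_irq_save(&outer_flags);	/* IRQs off, state saved */
	bpf_local_irq_save(&inner_flags);	/* nested save is allowed */

	/* ... IRQ-disabled work goes here ... */

	/* Restores must be LIFO: inner first, then outer. Swapping the two
	 * calls below would be rejected, since the verifier tracks the
	 * acquisition order of IRQ saved states via active_irq_id.
	 */
	bpf_local_irq_restore(&inner_flags);
	bpf_local_irq_restore(&outer_flags);
	return 0;
}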