From c8e2ee1f3df05dc4caa746c062c6b5791c745c79 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Tue, 3 Dec 2024 19:03:57 -0800
Subject: bpf: Introduce support for bpf_local_irq_{save,restore}

Teach the verifier about IRQ-disabled sections through the introduction of
two new kfuncs: bpf_local_irq_save, to save the IRQ state and disable IRQs,
and bpf_local_irq_restore, to restore the IRQ state and enable them back
again.

For the purposes of tracking the saved IRQ state, the verifier is taught
about a new special object on the stack of type STACK_IRQ_FLAG. This is an
8-byte value which saves the IRQ flags that are to be passed back to the IRQ
restore kfunc.

Renumber the enums for REF_TYPE_* to simplify the check in find_lock_state;
filtering out non-lock types will become cumbersome as they grow, and is
unnecessary.

To track a dynamic number of IRQ-disabled regions and their associated saved
states, a new resource type RES_TYPE_IRQ is introduced, along with its state
management functions acquire_irq_state and release_irq_state, taking
advantage of the refactoring and clean-ups made in earlier commits.

One notable requirement of the kernel's IRQ save and restore API is that the
calls cannot happen out of order. For this purpose, when releasing a
reference we keep track of the prev_id we saw with REF_TYPE_IRQ. Since
reference states are inserted in increasing order of the index, this is used
to remember the ordering of acquisitions of IRQ saved states, so that we
maintain a logical stack of resource identities in acquisition order and can
enforce LIFO ordering when restoring IRQ state. The top of the stack is
maintained using bpf_verifier_state's active_irq_id.

To maintain the stack property when releasing reference states, we need to
modify release_reference_state to shift the remaining array left using
memmove, instead of swapping the deleted element with the last one, which
might break the ordering. A selftest to exercise this subtle behavior is
added in later patches.

The logic to detect initialized and uninitialized IRQ flag slots, and to mark
and unmark them, is similar to how it's done for iterators.

No additional checks are needed in refsafe for REF_TYPE_IRQ, apart from the
usual check_id satisfiability check on the ref[i].id. We have to perform the
same check_ids check on state->active_irq_id as well.

To ensure we don't get REF_TYPE_PTR assigned by default after
acquire_reference_state if someone forgets to assign the type, let's also
renumber the enum ref_state_type. This way any unassigned types get caught by
refsafe's default switch statement instead of being assumed to be
REF_TYPE_PTR.

The kfuncs themselves are plain wrappers over the local_irq_save and
local_irq_restore macros.

Acked-by: Eduard Zingerman
Signed-off-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/20241204030400.208005-5-memxor@gmail.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/helpers.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'kernel/bpf/helpers.c')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 751c150f9e1cd..532ea74d48508 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -3057,6 +3057,21 @@ __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user
 	return ret + 1;
 }
 
+/* Keep unsinged long in prototype so that kfunc is usable when emitted to
+ * vmlinux.h in BPF programs directly, but note that while in BPF prog, the
+ * unsigned long always points to 8-byte region on stack, the kernel may only
+ * read and write the 4-bytes on 32-bit.
+ */
+__bpf_kfunc void bpf_local_irq_save(unsigned long *flags__irq_flag)
+{
+	local_irq_save(*flags__irq_flag);
+}
+
+__bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag)
+{
+	local_irq_restore(*flags__irq_flag);
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -3149,6 +3164,8 @@ BTF_ID_FLAGS(func, bpf_get_kmem_cache)
 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_local_irq_save)
+BTF_ID_FLAGS(func, bpf_local_irq_restore)
 BTF_KFUNCS_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
-- cgit v1.2.3

From 00a5acdbf39816ad23b8db3255c366bbc77e69af Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh
Date: Fri, 13 Dec 2024 00:00:30 +0100
Subject: bpf: Fix configuration-dependent BTF function references
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These BTF functions are not available unconditionally; only reference them
when they are available.

Avoid the following build warnings:

  BTF     .tmp_vmlinux1.btf.o
btf_encoder__tag_kfunc: failed to find kfunc 'bpf_send_signal_task' in BTF
btf_encoder__tag_kfuncs: failed to tag kfunc 'bpf_send_signal_task'
  NM      .tmp_vmlinux1.syms
  KSYMS   .tmp_vmlinux1.kallsyms.S
  AS      .tmp_vmlinux1.kallsyms.o
  LD      .tmp_vmlinux2
  NM      .tmp_vmlinux2.syms
  KSYMS   .tmp_vmlinux2.kallsyms.S
  AS      .tmp_vmlinux2.kallsyms.o
  LD      vmlinux
  BTFIDS  vmlinux
WARN: resolve_btfids: unresolved symbol prog_test_ref_kfunc
WARN: resolve_btfids: unresolved symbol bpf_crypto_ctx
WARN: resolve_btfids: unresolved symbol bpf_send_signal_task
WARN: resolve_btfids: unresolved symbol bpf_modify_return_test_tp
WARN: resolve_btfids: unresolved symbol bpf_dynptr_from_xdp
WARN: resolve_btfids: unresolved symbol bpf_dynptr_from_skb

Signed-off-by: Thomas Weißschuh
Signed-off-by: Andrii Nakryiko
Acked-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20241213-bpf-cond-ids-v1-1-881849997219@weissschuh.net
---
 kernel/bpf/helpers.c  | 4 ++++
 kernel/bpf/verifier.c | 8 ++++++++
 2 files changed, 12 insertions(+)

(limited to 'kernel/bpf/helpers.c')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 532ea74d48508..cd5f9884d85bf 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -3104,7 +3104,9 @@ BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_throw)
+#ifdef CONFIG_BPF_EVENTS
 BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS)
+#endif
 BTF_KFUNCS_END(generic_btf_ids)
 
 static const struct btf_kfunc_id_set generic_kfunc_set = {
@@ -3150,7 +3152,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
 BTF_ID_FLAGS(func, bpf_dynptr_size)
 BTF_ID_FLAGS(func, bpf_dynptr_clone)
+#ifdef CONFIG_NET
 BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
+#endif
 BTF_ID_FLAGS(func, bpf_wq_init)
 BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)
 BTF_ID_FLAGS(func, bpf_wq_start)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c0e772996c52c..8a9433a940073 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5681,7 +5681,9 @@ static bool in_rcu_cs(struct bpf_verifier_env *env)
 
 /* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
 BTF_SET_START(rcu_protected_types)
+#ifdef CONFIG_NET
 BTF_ID(struct, prog_test_ref_kfunc)
+#endif
 #ifdef CONFIG_CGROUPS
 BTF_ID(struct, cgroup)
 #endif
@@ -5689,7 +5691,9 @@ BTF_ID(struct, cgroup)
 BTF_ID(struct, bpf_cpumask)
 #endif
 BTF_ID(struct, task_struct)
+#ifdef CONFIG_CRYPTO
 BTF_ID(struct, bpf_crypto_ctx)
+#endif
 BTF_SET_END(rcu_protected_types)
 
 static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
@@ -11722,8 +11726,10 @@ BTF_ID(func, bpf_rdonly_cast)
 BTF_ID(func, bpf_rbtree_remove)
 BTF_ID(func, bpf_rbtree_add_impl)
 BTF_ID(func, bpf_rbtree_first)
+#ifdef CONFIG_NET
 BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
+#endif
 BTF_ID(func, bpf_dynptr_slice)
 BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_ID(func, bpf_dynptr_clone)
@@ -11751,8 +11757,10 @@ BTF_ID(func, bpf_rcu_read_unlock)
 BTF_ID(func, bpf_rbtree_remove)
 BTF_ID(func, bpf_rbtree_add_impl)
 BTF_ID(func, bpf_rbtree_first)
+#ifdef CONFIG_NET
 BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
+#endif
 BTF_ID(func, bpf_dynptr_slice)
 BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_ID(func, bpf_dynptr_clone)
-- cgit v1.2.3

From 1d2dbe7120e89090ed4f6be03d6fbadfbfff59bf Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Wed, 8 Jan 2025 09:07:23 +0800
Subject: bpf: Remove migrate_{disable|enable} in bpf_obj_free_fields()

The callers of bpf_obj_free_fields() have already guaranteed that migration
is disabled; therefore, there is no need to invoke the migrate_{disable,enable}
pair in bpf_obj_free_fields()'s underlying implementation.

This patch removes the unnecessary migrate_{disable|enable} pairs from
bpf_obj_free_fields() and its callees: bpf_list_head_free() and
bpf_rb_root_free().

Signed-off-by: Hou Tao
Link: https://lore.kernel.org/r/20250108010728.207536-12-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/helpers.c | 4 ----
 kernel/bpf/syscall.c | 2 --
 2 files changed, 6 deletions(-)

(limited to 'kernel/bpf/helpers.c')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index cd5f9884d85bf..bcda671feafd9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2066,9 +2066,7 @@ unlock:
 		/* The contained type can also have resources, including a
 		 * bpf_list_head which needs to be freed.
 		 */
-		migrate_disable();
 		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
-		migrate_enable();
 	}
 }
 
@@ -2105,9 +2103,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 
 		obj -= field->graph_root.node_offset;
 
-		migrate_disable();
 		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
-		migrate_enable();
 	}
 }
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5d24204929461..0daf098e32074 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -796,11 +796,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 			if (!btf_is_kernel(field->kptr.btf)) {
 				pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
 									   field->kptr.btf_id);
-				migrate_disable();
 				__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
 								    pointee_struct_meta->record : NULL,
 						    fields[i].type == BPF_KPTR_PERCPU);
-				migrate_enable();
 			} else {
 				field->kptr.dtor(xchgd_field);
 			}
-- cgit v1.2.3

From 58f038e6d209d2dd862fcf5de55407855856794d Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Fri, 17 Jan 2025 18:18:15 +0800
Subject: bpf: Cancel the running bpf_timer through kworker for PREEMPT_RT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

During the update procedure, when overwriting an element in a pre-allocated
htab, the freeing of the old_element is protected by the bucket lock. The
reason the bucket lock is necessary is that the old_element has already been
stashed in htab->extra_elems after alloc_htab_elem() returns. If the
old_element is freed after the bucket lock is unlocked, the stashed element
may be reused by a concurrent update procedure, and the freeing of the
old_element will run concurrently with the reuse of the old_element. However,
the invocation of check_and_free_fields() may acquire a spin-lock, which
violates the lockdep rule because its caller already holds a raw-spin-lock
(the bucket lock). The following warning will be reported when such a race
happens:

BUG: scheduling while atomic: test_progs/676/0x00000003
3 locks held by test_progs/676:
#0: ffffffff864b0240 (rcu_read_lock_trace){....}-{0:0}, at: bpf_prog_test_run_syscall+0x2c0/0x830
#1: ffff88810e961188 (&htab->lockdep_key){....}-{2:2}, at: htab_map_update_elem+0x306/0x1500
#2: ffff8881f4eac1b8 (&base->softirq_expiry_lock){....}-{2:2}, at: hrtimer_cancel_wait_running+0xe9/0x1b0
Modules linked in: bpf_testmod(O)
Preemption disabled at:
[] htab_map_update_elem+0x293/0x1500
CPU: 0 UID: 0 PID: 676 Comm: test_progs Tainted: G ... 6.12.0+ #11
Tainted: [W]=WARN, [O]=OOT_MODULE
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)...
Call Trace:
 dump_stack_lvl+0x57/0x70
 dump_stack+0x10/0x20
 __schedule_bug+0x120/0x170
 __schedule+0x300c/0x4800
 schedule_rtlock+0x37/0x60
 rtlock_slowlock_locked+0x6d9/0x54c0
 rt_spin_lock+0x168/0x230
 hrtimer_cancel_wait_running+0xe9/0x1b0
 hrtimer_cancel+0x24/0x30
 bpf_timer_delete_work+0x1d/0x40
 bpf_timer_cancel_and_free+0x5e/0x80
 bpf_obj_free_fields+0x262/0x4a0
 check_and_free_fields+0x1d0/0x280
 htab_map_update_elem+0x7fc/0x1500
 bpf_prog_9f90bc20768e0cb9_overwrite_cb+0x3f/0x43
 bpf_prog_ea601c4649694dbd_overwrite_timer+0x5d/0x7e
 bpf_prog_test_run_syscall+0x322/0x830
 __sys_bpf+0x135d/0x3ca0
 __x64_sys_bpf+0x75/0xb0
 x64_sys_call+0x1b5/0xa10
 do_syscall_64+0x3b/0xc0
 entry_SYSCALL_64_after_hwframe+0x4b/0x53
...

It seems feasible to break the reuse and refill of per-cpu extra_elems into
two independent parts: reuse the per-cpu extra_elems with the bucket lock
held, and refill the old_element into the per-cpu extra_elems after the
bucket lock is unlocked. However, this would make concurrent overwrite
procedures on the same CPU return an unexpected -E2BIG error when the map is
full.

Therefore, the patch fixes the lock problem by breaking the cancelling of the
bpf_timer into two steps for PREEMPT_RT:
1) use hrtimer_try_to_cancel() and check its return value
2) if the timer is running, use hrtimer_cancel() through a kworker to cancel
   it again

Considering that the current implementation of hrtimer_cancel() will try to
acquire an already-held softirq_expiry_lock when the current timer is
running, the steps above are reasonable. However, they also have a downside:
when the timer is running, the cancelling of the timer is delayed when
releasing the last map uref.
The delay is also fixable (e.g., by breaking the cancelling of the bpf_timer
into two parts: one part in locked scope, another in unlocked scope), so it
can be revised later if necessary.

It is a bit hard to decide the right fix tag. One reason is that the problem
depends on PREEMPT_RT, which was enabled in v6.12. Considering that the
softirq_expiry_lock has existed since v5.4 and bpf_timer was introduced in
v5.15, the bpf_timer commit is used in the fixes tag and an extra depends-on
tag is added to state the dependency on PREEMPT_RT.

Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.")
Depends-on: v6.12+ with PREEMPT_RT enabled
Reported-by: Sebastian Andrzej Siewior
Closes: https://lore.kernel.org/bpf/20241106084527.4gPrMnHt@linutronix.de
Signed-off-by: Hou Tao
Reviewed-by: Toke Høiland-Jørgensen
Link: https://lore.kernel.org/r/20250117101816.2101857-5-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/helpers.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'kernel/bpf/helpers.c')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bcda671feafd9..f27ce162427ab 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1593,10 +1593,24 @@ void bpf_timer_cancel_and_free(void *val)
 	 * To avoid these issues, punt to workqueue context when we are in a
 	 * timer callback.
 	 */
-	if (this_cpu_read(hrtimer_running))
+	if (this_cpu_read(hrtimer_running)) {
 		queue_work(system_unbound_wq, &t->cb.delete_work);
-	else
+		return;
+	}
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		/* If the timer is running on other CPU, also use a kworker to
+		 * wait for the completion of the timer instead of trying to
+		 * acquire a sleepable lock in hrtimer_cancel() to wait for its
+		 * completion.
+		 */
+		if (hrtimer_try_to_cancel(&t->timer) >= 0)
+			kfree_rcu(t, cb.rcu);
+		else
+			queue_work(system_unbound_wq, &t->cb.delete_work);
+	} else {
 		bpf_timer_delete_work(&t->cb.delete_work);
+	}
 }
 
 /* This function is called by map_delete/update_elem for individual element and
-- cgit v1.2.3
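
As a usage illustration for the first patch above (bpf_local_irq_{save,restore}),
here is a minimal, untested BPF-C sketch. It assumes the kfunc prototypes as they
would be emitted into a generated vmlinux.h; the program name, section, and the
declarations marked below are illustrative assumptions, not part of any patch.
The flags variables must be 8-byte slots on the BPF stack (tracked as
STACK_IRQ_FLAG), and restores must happen in LIFO order or the verifier rejects
the program.

/* Sketch only: assumes a kernel that carries the bpf_local_irq_* patch and a
 * vmlinux.h generated from it; attach point and names are arbitrary examples.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* kfunc declarations as they are expected to appear for BPF programs */
extern void bpf_local_irq_save(unsigned long *flags__irq_flag) __ksym;
extern void bpf_local_irq_restore(unsigned long *flags__irq_flag) __ksym;

char LICENSE[] SEC("license") = "GPL";

SEC("tc")   /* attach type chosen only for the example */
int irq_save_restore_example(struct __sk_buff *skb)
{
	unsigned long outer_flags;	/* 8-byte stack slot -> STACK_IRQ_FLAG */
	unsigned long inner_flags;

	bpf_local_irq_save(&outer_flags);	/* IRQs off, state saved */
	bpf_local_irq_save(&inner_flags);	/* nested save is allowed */

	/* ... IRQ-disabled work goes here ... */

	/* Restores must be LIFO: inner first, then outer. Swapping the two
	 * calls below would be rejected, since the verifier tracks the
	 * acquisition order of IRQ saved states via active_irq_id.
	 */
	bpf_local_irq_restore(&inner_flags);
	bpf_local_irq_restore(&outer_flags);
	return 0;
}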