Diffstat (limited to 'tools/testing/selftests/bpf/progs')
36 files changed, 2737 insertions, 73 deletions
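The largest addition below is bpf_arena_spin_lock.h, whose API is documented only through its header comments. As orientation before the raw diff, here is a rough sketch of how a selftest program might take and release the new lock. The arena map layout, section name, and globals are assumptions modeled on the existing arena selftests and are not part of this patch; only arena_spin_lock(), arena_spin_unlock(), and their zero-on-success return convention come from the header itself.

	/* Hypothetical caller of bpf_arena_spin_lock.h; names are illustrative, not from this patch. */
	#include <vmlinux.h>
	#include <bpf/bpf_helpers.h>
	#include "bpf_arena_spin_lock.h"

	struct {
		__uint(type, BPF_MAP_TYPE_ARENA);
		__uint(map_flags, BPF_F_MMAPABLE);
		__uint(max_entries, 100); /* pages; must also hold the header's qnodes array */
	} arena SEC(".maps");

	#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)
	arena_spinlock_t __arena lock;
	int counter;

	SEC("tc")
	int arena_lock_user(struct __sk_buff *skb)
	{
		/* Per the header comment, a non-zero return means the lock was not taken. */
		if (arena_spin_lock(&lock))
			return 0;
		counter++;
		arena_spin_unlock(&lock);
		return 0;
	}
	#endif

	char _license[] SEC("license") = "GPL";

The header also provides arena_spin_lock_irqsave()/arena_spin_unlock_irqrestore() wrappers for callers that must run with local IRQs disabled.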
| diff --git a/tools/testing/selftests/bpf/progs/bench_sockmap_prog.c b/tools/testing/selftests/bpf/progs/bench_sockmap_prog.c new file mode 100644 index 000000000000..079bf3794b3a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bench_sockmap_prog.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +long process_byte = 0; +int  verdict_dir = 0; +int  dropped = 0; +int  pkt_size = 0; +struct { +	__uint(type, BPF_MAP_TYPE_SOCKMAP); +	__uint(max_entries, 20); +	__type(key, int); +	__type(value, int); +} sock_map_rx SEC(".maps"); + +struct { +	__uint(type, BPF_MAP_TYPE_SOCKMAP); +	__uint(max_entries, 20); +	__type(key, int); +	__type(value, int); +} sock_map_tx SEC(".maps"); + +SEC("sk_skb/stream_parser") +int prog_skb_parser(struct __sk_buff *skb) +{ +	return pkt_size; +} + +SEC("sk_skb/stream_verdict") +int prog_skb_verdict(struct __sk_buff *skb) +{ +	int one = 1; +	int ret =  bpf_sk_redirect_map(skb, &sock_map_rx, one, verdict_dir); + +	if (ret == SK_DROP) +		dropped++; +	__sync_fetch_and_add(&process_byte, skb->len); +	return ret; +} + +SEC("sk_skb/stream_verdict") +int prog_skb_pass(struct __sk_buff *skb) +{ +	__sync_fetch_and_add(&process_byte, skb->len); +	return SK_PASS; +} + +SEC("sk_msg") +int prog_skmsg_verdict(struct sk_msg_md *msg) +{ +	int one = 1; + +	__sync_fetch_and_add(&process_byte, msg->size); +	return bpf_msg_redirect_map(msg, &sock_map_tx, one, verdict_dir); +} + +SEC("sk_msg") +int prog_skmsg_pass(struct sk_msg_md *msg) +{ +	__sync_fetch_and_add(&process_byte, msg->size); +	return SK_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h new file mode 100644 index 000000000000..d67466c1ff77 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h @@ -0,0 +1,542 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#ifndef BPF_ARENA_SPIN_LOCK_H +#define BPF_ARENA_SPIN_LOCK_H + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include "bpf_atomic.h" + +#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label) +#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1) + +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) + +#define EBUSY 16 +#define EOPNOTSUPP 95 +#define ETIMEDOUT 110 + +#ifndef __arena +#define __arena __attribute__((address_space(1))) +#endif + +extern unsigned long CONFIG_NR_CPUS __kconfig; + +/* + * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but + * PowerPC overrides the definition to define lock->val as u32 instead of + * atomic_t, leading to compilation errors.  Import a local definition below so + * that we don't depend on the vmlinux.h version. 
+ */ + +struct __qspinlock { +	union { +		atomic_t val; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +		struct { +			u8 locked; +			u8 pending; +		}; +		struct { +			u16 locked_pending; +			u16 tail; +		}; +#else +		struct { +			u16 tail; +			u16 locked_pending; +		}; +		struct { +			u8 reserved[2]; +			u8 pending; +			u8 locked; +		}; +#endif +	}; +}; + +#define arena_spinlock_t struct __qspinlock +/* FIXME: Using typedef causes CO-RE relocation error */ +/* typedef struct qspinlock arena_spinlock_t; */ + +struct arena_mcs_spinlock { +	struct arena_mcs_spinlock __arena *next; +	int locked; +	int count; +}; + +struct arena_qnode { +	struct arena_mcs_spinlock mcs; +}; + +#define _Q_MAX_NODES		4 +#define _Q_PENDING_LOOPS	1 + +/* + * Bitfields in the atomic value: + * + *  0- 7: locked byte + *     8: pending + *  9-15: not used + * 16-17: tail index + * 18-31: tail cpu (+1) + */ +#define _Q_MAX_CPUS		1024 + +#define	_Q_SET_MASK(type)	(((1U << _Q_ ## type ## _BITS) - 1)\ +				      << _Q_ ## type ## _OFFSET) +#define _Q_LOCKED_OFFSET	0 +#define _Q_LOCKED_BITS		8 +#define _Q_LOCKED_MASK		_Q_SET_MASK(LOCKED) + +#define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) +#define _Q_PENDING_BITS		8 +#define _Q_PENDING_MASK		_Q_SET_MASK(PENDING) + +#define _Q_TAIL_IDX_OFFSET	(_Q_PENDING_OFFSET + _Q_PENDING_BITS) +#define _Q_TAIL_IDX_BITS	2 +#define _Q_TAIL_IDX_MASK	_Q_SET_MASK(TAIL_IDX) + +#define _Q_TAIL_CPU_OFFSET	(_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS) +#define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET) +#define _Q_TAIL_CPU_MASK	_Q_SET_MASK(TAIL_CPU) + +#define _Q_TAIL_OFFSET		_Q_TAIL_IDX_OFFSET +#define _Q_TAIL_MASK		(_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) + +#define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET) +#define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET) + +struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES]; + +static inline u32 encode_tail(int cpu, int idx) +{ +	u32 tail; + +	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET; +	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ + +	return tail; +} + +static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail) +{ +	u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; +	u32 idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + +	return &qnodes[cpu][idx].mcs; +} + +static inline +struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx) +{ +	return &((struct arena_qnode __arena *)base + idx)->mcs; +} + +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) + +/** + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail) +{ +	u32 old, new; + +	old = atomic_read(&lock->val); +	do { +		new = (old & _Q_LOCKED_PENDING_MASK) | tail; +		/* +		 * We can use relaxed semantics since the caller ensures that +		 * the MCS node is properly initialized before updating the +		 * tail. +		 */ +		/* These loops are not expected to stall, but we still need to +		 * prove to the verifier they will terminate eventually. +		 */ +		cond_break_label(out); +	} while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); + +	return old; +out: +	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); +	return old; +} + +/** + * clear_pending - clear the pending bit. 
+ * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(arena_spinlock_t __arena *lock) +{ +	WRITE_ONCE(lock->pending, 0); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + * + * Lock stealing is not allowed if this function is used. + */ +static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock) +{ +	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); +} + +/** + * set_locked - Set the lock bit and own the lock + * @lock: Pointer to queued spinlock structure + * + * *,*,0 -> *,0,1 + */ +static __always_inline void set_locked(arena_spinlock_t __arena *lock) +{ +	WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); +} + +static __always_inline +u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock) +{ +	u32 old, new; + +	old = atomic_read(&lock->val); +	do { +		new = old | _Q_PENDING_VAL; +		/* +		 * These loops are not expected to stall, but we still need to +		 * prove to the verifier they will terminate eventually. +		 */ +		cond_break_label(out); +	} while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new)); + +	return old; +out: +	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); +	return old; +} + +/** + * arena_spin_trylock - try to acquire the queued spinlock + * @lock : Pointer to queued spinlock structure + * Return: 1 if lock acquired, 0 if failed + */ +static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock) +{ +	int val = atomic_read(&lock->val); + +	if (unlikely(val)) +		return 0; + +	return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)); +} + +__noinline +int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val) +{ +	struct arena_mcs_spinlock __arena *prev, *next, *node0, *node; +	int ret = -ETIMEDOUT; +	u32 old, tail; +	int idx; + +	/* +	 * Wait for in-progress pending->locked hand-overs with a bounded +	 * number of spins so that we guarantee forward progress. +	 * +	 * 0,1,0 -> 0,0,1 +	 */ +	if (val == _Q_PENDING_VAL) { +		int cnt = _Q_PENDING_LOOPS; +		val = atomic_cond_read_relaxed_label(&lock->val, +						     (VAL != _Q_PENDING_VAL) || !cnt--, +						     release_err); +	} + +	/* +	 * If we observe any contention; queue. +	 */ +	if (val & ~_Q_LOCKED_MASK) +		goto queue; + +	/* +	 * trylock || pending +	 * +	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock +	 */ +	val = arena_fetch_set_pending_acquire(lock); + +	/* +	 * If we observe contention, there is a concurrent locker. +	 * +	 * Undo and queue; our setting of PENDING might have made the +	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting +	 * on @next to become !NULL. +	 */ +	if (unlikely(val & ~_Q_LOCKED_MASK)) { + +		/* Undo PENDING if we set it. */ +		if (!(val & _Q_PENDING_MASK)) +			clear_pending(lock); + +		goto queue; +	} + +	/* +	 * We're pending, wait for the owner to go away. +	 * +	 * 0,1,1 -> *,1,0 +	 * +	 * this wait loop must be a load-acquire such that we match the +	 * store-release that clears the locked bit and create lock +	 * sequentiality; this is because not all +	 * clear_pending_set_locked() implementations imply full +	 * barriers. +	 */ +	if (val & _Q_LOCKED_MASK) +		smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); + +	/* +	 * take ownership and clear the pending bit. 
+	 * +	 * 0,1,0 -> 0,0,1 +	 */ +	clear_pending_set_locked(lock); +	return 0; + +	/* +	 * End of pending bit optimistic spinning and beginning of MCS +	 * queuing. +	 */ +queue: +	node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs; +	idx = node0->count++; +	tail = encode_tail(bpf_get_smp_processor_id(), idx); + +	/* +	 * 4 nodes are allocated based on the assumption that there will not be +	 * nested NMIs taking spinlocks. That may not be true in some +	 * architectures even though the chance of needing more than 4 nodes +	 * will still be extremely unlikely. When that happens, we simply return +	 * an error. Original qspinlock has a trylock fallback in this case. +	 */ +	if (unlikely(idx >= _Q_MAX_NODES)) { +		ret = -EBUSY; +		goto release_node_err; +	} + +	node = grab_mcs_node(node0, idx); + +	/* +	 * Ensure that we increment the head node->count before initialising +	 * the actual node. If the compiler is kind enough to reorder these +	 * stores, then an IRQ could overwrite our assignments. +	 */ +	barrier(); + +	node->locked = 0; +	node->next = NULL; + +	/* +	 * We touched a (possibly) cold cacheline in the per-cpu queue node; +	 * attempt the trylock once more in the hope someone let go while we +	 * weren't watching. +	 */ +	if (arena_spin_trylock(lock)) +		goto release; + +	/* +	 * Ensure that the initialisation of @node is complete before we +	 * publish the updated tail via xchg_tail() and potentially link +	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. +	 */ +	smp_wmb(); + +	/* +	 * Publish the updated tail. +	 * We have already touched the queueing cacheline; don't bother with +	 * pending stuff. +	 * +	 * p,*,* -> n,*,* +	 */ +	old = xchg_tail(lock, tail); +	next = NULL; + +	/* +	 * if there was a previous node; link it and wait until reaching the +	 * head of the waitqueue. +	 */ +	if (old & _Q_TAIL_MASK) { +		prev = decode_tail(old); + +		/* Link @node into the waitqueue. */ +		WRITE_ONCE(prev->next, node); + +		arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); + +		/* +		 * While waiting for the MCS lock, the next pointer may have +		 * been set by another lock waiter. We cannot prefetch here +		 * due to lack of equivalent instruction in BPF ISA. +		 */ +		next = READ_ONCE(node->next); +	} + +	/* +	 * we're at the head of the waitqueue, wait for the owner & pending to +	 * go away. +	 * +	 * *,x,y -> *,0,0 +	 * +	 * this wait loop must use a load-acquire such that we match the +	 * store-release that clears the locked bit and create lock +	 * sequentiality; this is because the set_locked() function below +	 * does not imply a full barrier. +	 */ +	val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK), +					     release_node_err); + +	/* +	 * claim the lock: +	 * +	 * n,0,0 -> 0,0,1 : lock, uncontended +	 * *,*,0 -> *,*,1 : lock, contended +	 * +	 * If the queue head is the only one in the queue (lock value == tail) +	 * and nobody is pending, clear the tail code and grab the lock. +	 * Otherwise, we only need to grab the lock. +	 */ + +	/* +	 * In the PV case we might already have _Q_LOCKED_VAL set, because +	 * of lock stealing; therefore we must also allow: +	 * +	 * n,0,1 -> 0,0,1 +	 * +	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the +	 *       above wait condition, therefore any concurrent setting of +	 *       PENDING will make the uncontended transition fail. 
+	 */ +	if ((val & _Q_TAIL_MASK) == tail) { +		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) +			goto release; /* No contention */ +	} + +	/* +	 * Either somebody is queued behind us or _Q_PENDING_VAL got set +	 * which will then detect the remaining tail and queue behind us +	 * ensuring we'll see a @next. +	 */ +	set_locked(lock); + +	/* +	 * contended path; wait for next if not observed yet, release. +	 */ +	if (!next) +		next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err); + +	arch_mcs_spin_unlock_contended(&next->locked); + +release:; +	/* +	 * release the node +	 * +	 * Doing a normal dec vs this_cpu_dec is fine. An upper context always +	 * decrements count it incremented before returning, thus we're fine. +	 * For contexts interrupting us, they either observe our dec or not. +	 * Just ensure the compiler doesn't reorder this statement, as a +	 * this_cpu_dec implicitly implied that. +	 */ +	barrier(); +	node0->count--; +	return 0; +release_node_err: +	barrier(); +	node0->count--; +	goto release_err; +release_err: +	return ret; +} + +/** + * arena_spin_lock - acquire a queued spinlock + * @lock: Pointer to queued spinlock structure + * + * On error, returned value will be negative. + * On success, zero is returned. + * + * The return value _must_ be tested against zero for success, + * instead of checking it against negative, for passing the + * BPF verifier. + * + * The user should do: + *	if (arena_spin_lock(...) != 0) // failure + *		or + *	if (arena_spin_lock(...) == 0) // success + *		or + *	if (arena_spin_lock(...)) // failure + *		or + *	if (!arena_spin_lock(...)) // success + * instead of: + *	if (arena_spin_lock(...) < 0) // failure + * + * The return value can still be inspected later. + */ +static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock) +{ +	int val = 0; + +	if (CONFIG_NR_CPUS > 1024) +		return -EOPNOTSUPP; + +	bpf_preempt_disable(); +	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) +		return 0; + +	val = arena_spin_lock_slowpath(lock, val); +	/* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. 
*/ +	if (val) +		bpf_preempt_enable(); +	return val; +} + +/** + * arena_spin_unlock - release a queued spinlock + * @lock : Pointer to queued spinlock structure + */ +static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock) +{ +	/* +	 * unlock() needs release semantics: +	 */ +	smp_store_release(&lock->locked, 0); +	bpf_preempt_enable(); +} + +#define arena_spin_lock_irqsave(lock, flags)             \ +	({                                               \ +		int __ret;                               \ +		bpf_local_irq_save(&(flags));            \ +		__ret = arena_spin_lock((lock));         \ +		if (__ret)                               \ +			bpf_local_irq_restore(&(flags)); \ +		(__ret);                                 \ +	}) + +#define arena_spin_unlock_irqrestore(lock, flags) \ +	({                                        \ +		arena_spin_unlock((lock));        \ +		bpf_local_irq_restore(&(flags));  \ +	}) + +#endif + +#endif /* BPF_ARENA_SPIN_LOCK_H */ diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 863df7c0fdd0..6e208e24ba3b 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -225,8 +225,9 @@  #define CAN_USE_BPF_ST  #endif -#if __clang_major__ >= 18 && defined(ENABLE_ATOMICS_TESTS) && \ -	(defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) +#if __clang_major__ >= 18 && defined(ENABLE_ATOMICS_TESTS) &&		\ +	(defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) ||	\ +	 (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64))  #define CAN_USE_LOAD_ACQ_STORE_REL  #endif diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_common.h b/tools/testing/selftests/bpf/progs/bpf_qdisc_common.h new file mode 100644 index 000000000000..3754f581b328 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_common.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BPF_QDISC_COMMON_H +#define _BPF_QDISC_COMMON_H + +#define NET_XMIT_SUCCESS        0x00 +#define NET_XMIT_DROP           0x01    /* skb dropped                  */ +#define NET_XMIT_CN             0x02    /* congestion notification      */ + +#define TC_PRIO_CONTROL  7 +#define TC_PRIO_MAX      15 + +#define private(name) SEC(".data." 
#name) __hidden __attribute__((aligned(8))) + +struct bpf_sk_buff_ptr; + +static struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb) +{ +	return (struct qdisc_skb_cb *)skb->cb; +} + +static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb) +{ +	return qdisc_skb_cb(skb)->pkt_len; +} + +#endif diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__incompl_ops.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__incompl_ops.c new file mode 100644 index 000000000000..f188062ed730 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__incompl_ops.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include "bpf_experimental.h" +#include "bpf_qdisc_common.h" + +char _license[] SEC("license") = "GPL"; + +SEC("struct_ops") +int BPF_PROG(bpf_qdisc_test_enqueue, struct sk_buff *skb, struct Qdisc *sch, +	     struct bpf_sk_buff_ptr *to_free) +{ +	bpf_qdisc_skb_drop(skb, to_free); +	return NET_XMIT_DROP; +} + +SEC("struct_ops") +struct sk_buff *BPF_PROG(bpf_qdisc_test_dequeue, struct Qdisc *sch) +{ +	return NULL; +} + +SEC("struct_ops") +void BPF_PROG(bpf_qdisc_test_reset, struct Qdisc *sch) +{ +} + +SEC("struct_ops") +void BPF_PROG(bpf_qdisc_test_destroy, struct Qdisc *sch) +{ +} + +SEC(".struct_ops") +struct Qdisc_ops test = { +	.enqueue   = (void *)bpf_qdisc_test_enqueue, +	.dequeue   = (void *)bpf_qdisc_test_dequeue, +	.reset     = (void *)bpf_qdisc_test_reset, +	.destroy   = (void *)bpf_qdisc_test_destroy, +	.id        = "bpf_qdisc_test", +}; + diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c new file mode 100644 index 000000000000..1de2be3e370b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include "bpf_experimental.h" +#include "bpf_qdisc_common.h" + +char _license[] SEC("license") = "GPL"; + +struct skb_node { +	struct sk_buff __kptr * skb; +	struct bpf_list_node node; +}; + +private(A) struct bpf_spin_lock q_fifo_lock; +private(A) struct bpf_list_head q_fifo __contains(skb_node, node); + +bool init_called; + +SEC("struct_ops/bpf_fifo_enqueue") +int BPF_PROG(bpf_fifo_enqueue, struct sk_buff *skb, struct Qdisc *sch, +	     struct bpf_sk_buff_ptr *to_free) +{ +	struct skb_node *skbn; +	u32 pkt_len; + +	if (sch->q.qlen == sch->limit) +		goto drop; + +	skbn = bpf_obj_new(typeof(*skbn)); +	if (!skbn) +		goto drop; + +	pkt_len = qdisc_pkt_len(skb); + +	sch->q.qlen++; +	skb = bpf_kptr_xchg(&skbn->skb, skb); +	if (skb) +		bpf_qdisc_skb_drop(skb, to_free); + +	bpf_spin_lock(&q_fifo_lock); +	bpf_list_push_back(&q_fifo, &skbn->node); +	bpf_spin_unlock(&q_fifo_lock); + +	sch->qstats.backlog += pkt_len; +	return NET_XMIT_SUCCESS; +drop: +	bpf_qdisc_skb_drop(skb, to_free); +	return NET_XMIT_DROP; +} + +SEC("struct_ops/bpf_fifo_dequeue") +struct sk_buff *BPF_PROG(bpf_fifo_dequeue, struct Qdisc *sch) +{ +	struct bpf_list_node *node; +	struct sk_buff *skb = NULL; +	struct skb_node *skbn; + +	bpf_spin_lock(&q_fifo_lock); +	node = bpf_list_pop_front(&q_fifo); +	bpf_spin_unlock(&q_fifo_lock); +	if (!node) +		return NULL; + +	skbn = container_of(node, struct skb_node, node); +	skb = bpf_kptr_xchg(&skbn->skb, skb); +	bpf_obj_drop(skbn); +	if (!skb) +		return NULL; + +	sch->qstats.backlog -= qdisc_pkt_len(skb); +	bpf_qdisc_bstats_update(sch, skb); +	sch->q.qlen--; + +	return skb; +} + +SEC("struct_ops/bpf_fifo_init") +int BPF_PROG(bpf_fifo_init, struct Qdisc *sch, struct 
nlattr *opt, +	     struct netlink_ext_ack *extack) +{ +	sch->limit = 1000; +	init_called = true; +	return 0; +} + +SEC("struct_ops/bpf_fifo_reset") +void BPF_PROG(bpf_fifo_reset, struct Qdisc *sch) +{ +	struct bpf_list_node *node; +	struct skb_node *skbn; +	int i; + +	bpf_for(i, 0, sch->q.qlen) { +		struct sk_buff *skb = NULL; + +		bpf_spin_lock(&q_fifo_lock); +		node = bpf_list_pop_front(&q_fifo); +		bpf_spin_unlock(&q_fifo_lock); + +		if (!node) +			break; + +		skbn = container_of(node, struct skb_node, node); +		skb = bpf_kptr_xchg(&skbn->skb, skb); +		if (skb) +			bpf_kfree_skb(skb); +		bpf_obj_drop(skbn); +	} +	sch->q.qlen = 0; +} + +SEC("struct_ops") +void BPF_PROG(bpf_fifo_destroy, struct Qdisc *sch) +{ +} + +SEC(".struct_ops") +struct Qdisc_ops fifo = { +	.enqueue   = (void *)bpf_fifo_enqueue, +	.dequeue   = (void *)bpf_fifo_dequeue, +	.init      = (void *)bpf_fifo_init, +	.reset     = (void *)bpf_fifo_reset, +	.destroy   = (void *)bpf_fifo_destroy, +	.id        = "bpf_fifo", +}; + diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c new file mode 100644 index 000000000000..1a3233a275c7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c @@ -0,0 +1,756 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* bpf_fq is intended for testing the bpf qdisc infrastructure and not a direct + * copy of sch_fq. bpf_fq implements the scheduling algorithm of sch_fq before + * 29f834aa326e ("net_sched: sch_fq: add 3 bands and WRR scheduling") was + * introduced. It gives each flow a fair chance to transmit packets in a + * round-robin fashion. Note that for flow pacing, bpf_fq currently only + * respects skb->tstamp but not skb->sk->sk_pacing_rate. In addition, if there + * are multiple bpf_fq instances, they will have a shared view of flows and + * configuration since some key data structure such as fq_prio_flows, + * fq_nonprio_flows, and fq_bpf_data are global. + * + * To use bpf_fq alone without running selftests, use the following commands. + * + * 1. Register bpf_fq to the kernel + *     bpftool struct_ops register bpf_qdisc_fq.bpf.o /sys/fs/bpf + * 2. Add bpf_fq to an interface + *     tc qdisc add dev <interface name> root handle <handle> bpf_fq + * 3. Delete bpf_fq attached to the interface + *     tc qdisc delete dev <interface name> root + * 4. Unregister bpf_fq + *     bpftool struct_ops unregister name fq + * + * The qdisc name, bpf_fq, used in tc commands is defined by Qdisc_ops.id. + * The struct_ops_map_name, fq, used in the bpftool command is the name of the + * Qdisc_ops. + * + * SEC(".struct_ops") + * struct Qdisc_ops fq = { + *         ... 
+ *         .id        = "bpf_fq", + * }; + */ + +#include <vmlinux.h> +#include <errno.h> +#include <bpf/bpf_helpers.h> +#include "bpf_experimental.h" +#include "bpf_qdisc_common.h" + +char _license[] SEC("license") = "GPL"; + +#define NSEC_PER_USEC 1000L +#define NSEC_PER_SEC 1000000000L + +#define NUM_QUEUE (1 << 20) + +struct fq_bpf_data { +	u32 quantum; +	u32 initial_quantum; +	u32 flow_refill_delay; +	u32 flow_plimit; +	u64 horizon; +	u32 orphan_mask; +	u32 timer_slack; +	u64 time_next_delayed_flow; +	u64 unthrottle_latency_ns; +	u8 horizon_drop; +	u32 new_flow_cnt; +	u32 old_flow_cnt; +	u64 ktime_cache; +}; + +enum { +	CLS_RET_PRIO	= 0, +	CLS_RET_NONPRIO = 1, +	CLS_RET_ERR	= 2, +}; + +struct skb_node { +	u64 tstamp; +	struct sk_buff __kptr * skb; +	struct bpf_rb_node node; +}; + +struct fq_flow_node { +	int credit; +	u32 qlen; +	u64 age; +	u64 time_next_packet; +	struct bpf_list_node list_node; +	struct bpf_rb_node rb_node; +	struct bpf_rb_root queue __contains(skb_node, node); +	struct bpf_spin_lock lock; +	struct bpf_refcount refcount; +}; + +struct dequeue_nonprio_ctx { +	bool stop_iter; +	u64 expire; +	u64 now; +}; + +struct remove_flows_ctx { +	bool gc_only; +	u32 reset_cnt; +	u32 reset_max; +}; + +struct unset_throttled_flows_ctx { +	bool unset_all; +	u64 now; +}; + +struct fq_stashed_flow { +	struct fq_flow_node __kptr * flow; +}; + +struct { +	__uint(type, BPF_MAP_TYPE_HASH); +	__type(key, __u64); +	__type(value, struct fq_stashed_flow); +	__uint(max_entries, NUM_QUEUE); +} fq_nonprio_flows SEC(".maps"); + +struct { +	__uint(type, BPF_MAP_TYPE_HASH); +	__type(key, __u64); +	__type(value, struct fq_stashed_flow); +	__uint(max_entries, 1); +} fq_prio_flows SEC(".maps"); + +private(A) struct bpf_spin_lock fq_delayed_lock; +private(A) struct bpf_rb_root fq_delayed __contains(fq_flow_node, rb_node); + +private(B) struct bpf_spin_lock fq_new_flows_lock; +private(B) struct bpf_list_head fq_new_flows __contains(fq_flow_node, list_node); + +private(C) struct bpf_spin_lock fq_old_flows_lock; +private(C) struct bpf_list_head fq_old_flows __contains(fq_flow_node, list_node); + +private(D) struct fq_bpf_data q; + +/* Wrapper for bpf_kptr_xchg that expects NULL dst */ +static void bpf_kptr_xchg_back(void *map_val, void *ptr) +{ +	void *ret; + +	ret = bpf_kptr_xchg(map_val, ptr); +	if (ret) +		bpf_obj_drop(ret); +} + +static bool skbn_tstamp_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) +{ +	struct skb_node *skbn_a; +	struct skb_node *skbn_b; + +	skbn_a = container_of(a, struct skb_node, node); +	skbn_b = container_of(b, struct skb_node, node); + +	return skbn_a->tstamp < skbn_b->tstamp; +} + +static bool fn_time_next_packet_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) +{ +	struct fq_flow_node *flow_a; +	struct fq_flow_node *flow_b; + +	flow_a = container_of(a, struct fq_flow_node, rb_node); +	flow_b = container_of(b, struct fq_flow_node, rb_node); + +	return flow_a->time_next_packet < flow_b->time_next_packet; +} + +static void +fq_flows_add_head(struct bpf_list_head *head, struct bpf_spin_lock *lock, +		  struct fq_flow_node *flow, u32 *flow_cnt) +{ +	bpf_spin_lock(lock); +	bpf_list_push_front(head, &flow->list_node); +	bpf_spin_unlock(lock); +	*flow_cnt += 1; +} + +static void +fq_flows_add_tail(struct bpf_list_head *head, struct bpf_spin_lock *lock, +		  struct fq_flow_node *flow, u32 *flow_cnt) +{ +	bpf_spin_lock(lock); +	bpf_list_push_back(head, &flow->list_node); +	bpf_spin_unlock(lock); +	*flow_cnt += 1; +} + +static void +fq_flows_remove_front(struct 
bpf_list_head *head, struct bpf_spin_lock *lock, +		      struct bpf_list_node **node, u32 *flow_cnt) +{ +	bpf_spin_lock(lock); +	*node = bpf_list_pop_front(head); +	bpf_spin_unlock(lock); +	*flow_cnt -= 1; +} + +static bool +fq_flows_is_empty(struct bpf_list_head *head, struct bpf_spin_lock *lock) +{ +	struct bpf_list_node *node; + +	bpf_spin_lock(lock); +	node = bpf_list_pop_front(head); +	if (node) { +		bpf_list_push_front(head, node); +		bpf_spin_unlock(lock); +		return false; +	} +	bpf_spin_unlock(lock); + +	return true; +} + +/* flow->age is used to denote the state of the flow (not-detached, detached, throttled) + * as well as the timestamp when the flow is detached. + * + * 0: not-detached + * 1 - (~0ULL-1): detached + * ~0ULL: throttled + */ +static void fq_flow_set_detached(struct fq_flow_node *flow) +{ +	flow->age = bpf_jiffies64(); +} + +static bool fq_flow_is_detached(struct fq_flow_node *flow) +{ +	return flow->age != 0 && flow->age != ~0ULL; +} + +static bool sk_listener(struct sock *sk) +{ +	return (1 << sk->__sk_common.skc_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); +} + +static void fq_gc(void); + +static int fq_new_flow(void *flow_map, struct fq_stashed_flow **sflow, u64 hash) +{ +	struct fq_stashed_flow tmp = {}; +	struct fq_flow_node *flow; +	int ret; + +	flow = bpf_obj_new(typeof(*flow)); +	if (!flow) +		return -ENOMEM; + +	flow->credit = q.initial_quantum, +	flow->qlen = 0, +	flow->age = 1, +	flow->time_next_packet = 0, + +	ret = bpf_map_update_elem(flow_map, &hash, &tmp, 0); +	if (ret == -ENOMEM || ret == -E2BIG) { +		fq_gc(); +		bpf_map_update_elem(&fq_nonprio_flows, &hash, &tmp, 0); +	} + +	*sflow = bpf_map_lookup_elem(flow_map, &hash); +	if (!*sflow) { +		bpf_obj_drop(flow); +		return -ENOMEM; +	} + +	bpf_kptr_xchg_back(&(*sflow)->flow, flow); +	return 0; +} + +static int +fq_classify(struct sk_buff *skb, struct fq_stashed_flow **sflow) +{ +	struct sock *sk = skb->sk; +	int ret = CLS_RET_NONPRIO; +	u64 hash = 0; + +	if ((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL) { +		*sflow = bpf_map_lookup_elem(&fq_prio_flows, &hash); +		ret = CLS_RET_PRIO; +	} else { +		if (!sk || sk_listener(sk)) { +			hash = bpf_skb_get_hash(skb) & q.orphan_mask; +			/* Avoid collision with an existing flow hash, which +			 * only uses the lower 32 bits of hash, by setting the +			 * upper half of hash to 1. +			 */ +			hash |= (1ULL << 32); +		} else if (sk->__sk_common.skc_state == TCP_CLOSE) { +			hash = bpf_skb_get_hash(skb) & q.orphan_mask; +			hash |= (1ULL << 32); +		} else { +			hash = sk->__sk_common.skc_hash; +		} +		*sflow = bpf_map_lookup_elem(&fq_nonprio_flows, &hash); +	} + +	if (!*sflow) +		ret = fq_new_flow(&fq_nonprio_flows, sflow, hash) < 0 ? 
+		      CLS_RET_ERR : CLS_RET_NONPRIO; + +	return ret; +} + +static bool fq_packet_beyond_horizon(struct sk_buff *skb) +{ +	return (s64)skb->tstamp > (s64)(q.ktime_cache + q.horizon); +} + +SEC("struct_ops/bpf_fq_enqueue") +int BPF_PROG(bpf_fq_enqueue, struct sk_buff *skb, struct Qdisc *sch, +	     struct bpf_sk_buff_ptr *to_free) +{ +	struct fq_flow_node *flow = NULL, *flow_copy; +	struct fq_stashed_flow *sflow; +	u64 time_to_send, jiffies; +	struct skb_node *skbn; +	int ret; + +	if (sch->q.qlen >= sch->limit) +		goto drop; + +	if (!skb->tstamp) { +		time_to_send = q.ktime_cache = bpf_ktime_get_ns(); +	} else { +		if (fq_packet_beyond_horizon(skb)) { +			q.ktime_cache = bpf_ktime_get_ns(); +			if (fq_packet_beyond_horizon(skb)) { +				if (q.horizon_drop) +					goto drop; + +				skb->tstamp = q.ktime_cache + q.horizon; +			} +		} +		time_to_send = skb->tstamp; +	} + +	ret = fq_classify(skb, &sflow); +	if (ret == CLS_RET_ERR) +		goto drop; + +	flow = bpf_kptr_xchg(&sflow->flow, flow); +	if (!flow) +		goto drop; + +	if (ret == CLS_RET_NONPRIO) { +		if (flow->qlen >= q.flow_plimit) { +			bpf_kptr_xchg_back(&sflow->flow, flow); +			goto drop; +		} + +		if (fq_flow_is_detached(flow)) { +			flow_copy = bpf_refcount_acquire(flow); + +			jiffies = bpf_jiffies64(); +			if ((s64)(jiffies - (flow_copy->age + q.flow_refill_delay)) > 0) { +				if (flow_copy->credit < q.quantum) +					flow_copy->credit = q.quantum; +			} +			flow_copy->age = 0; +			fq_flows_add_tail(&fq_new_flows, &fq_new_flows_lock, flow_copy, +					  &q.new_flow_cnt); +		} +	} + +	skbn = bpf_obj_new(typeof(*skbn)); +	if (!skbn) { +		bpf_kptr_xchg_back(&sflow->flow, flow); +		goto drop; +	} + +	skbn->tstamp = skb->tstamp = time_to_send; + +	sch->qstats.backlog += qdisc_pkt_len(skb); + +	skb = bpf_kptr_xchg(&skbn->skb, skb); +	if (skb) +		bpf_qdisc_skb_drop(skb, to_free); + +	bpf_spin_lock(&flow->lock); +	bpf_rbtree_add(&flow->queue, &skbn->node, skbn_tstamp_less); +	bpf_spin_unlock(&flow->lock); + +	flow->qlen++; +	bpf_kptr_xchg_back(&sflow->flow, flow); + +	sch->q.qlen++; +	return NET_XMIT_SUCCESS; + +drop: +	bpf_qdisc_skb_drop(skb, to_free); +	sch->qstats.drops++; +	return NET_XMIT_DROP; +} + +static int fq_unset_throttled_flows(u32 index, struct unset_throttled_flows_ctx *ctx) +{ +	struct bpf_rb_node *node = NULL; +	struct fq_flow_node *flow; + +	bpf_spin_lock(&fq_delayed_lock); + +	node = bpf_rbtree_first(&fq_delayed); +	if (!node) { +		bpf_spin_unlock(&fq_delayed_lock); +		return 1; +	} + +	flow = container_of(node, struct fq_flow_node, rb_node); +	if (!ctx->unset_all && flow->time_next_packet > ctx->now) { +		q.time_next_delayed_flow = flow->time_next_packet; +		bpf_spin_unlock(&fq_delayed_lock); +		return 1; +	} + +	node = bpf_rbtree_remove(&fq_delayed, &flow->rb_node); + +	bpf_spin_unlock(&fq_delayed_lock); + +	if (!node) +		return 1; + +	flow = container_of(node, struct fq_flow_node, rb_node); +	flow->age = 0; +	fq_flows_add_tail(&fq_old_flows, &fq_old_flows_lock, flow, &q.old_flow_cnt); + +	return 0; +} + +static void fq_flow_set_throttled(struct fq_flow_node *flow) +{ +	flow->age = ~0ULL; + +	if (q.time_next_delayed_flow > flow->time_next_packet) +		q.time_next_delayed_flow = flow->time_next_packet; + +	bpf_spin_lock(&fq_delayed_lock); +	bpf_rbtree_add(&fq_delayed, &flow->rb_node, fn_time_next_packet_less); +	bpf_spin_unlock(&fq_delayed_lock); +} + +static void fq_check_throttled(u64 now) +{ +	struct unset_throttled_flows_ctx ctx = { +		.unset_all = false, +		.now = now, +	}; +	unsigned long sample; + +	if 
(q.time_next_delayed_flow > now) +		return; + +	sample = (unsigned long)(now - q.time_next_delayed_flow); +	q.unthrottle_latency_ns -= q.unthrottle_latency_ns >> 3; +	q.unthrottle_latency_ns += sample >> 3; + +	q.time_next_delayed_flow = ~0ULL; +	bpf_loop(NUM_QUEUE, fq_unset_throttled_flows, &ctx, 0); +} + +static struct sk_buff* +fq_dequeue_nonprio_flows(u32 index, struct dequeue_nonprio_ctx *ctx) +{ +	u64 time_next_packet, time_to_send; +	struct bpf_rb_node *rb_node; +	struct sk_buff *skb = NULL; +	struct bpf_list_head *head; +	struct bpf_list_node *node; +	struct bpf_spin_lock *lock; +	struct fq_flow_node *flow; +	struct skb_node *skbn; +	bool is_empty; +	u32 *cnt; + +	if (q.new_flow_cnt) { +		head = &fq_new_flows; +		lock = &fq_new_flows_lock; +		cnt = &q.new_flow_cnt; +	} else if (q.old_flow_cnt) { +		head = &fq_old_flows; +		lock = &fq_old_flows_lock; +		cnt = &q.old_flow_cnt; +	} else { +		if (q.time_next_delayed_flow != ~0ULL) +			ctx->expire = q.time_next_delayed_flow; +		goto break_loop; +	} + +	fq_flows_remove_front(head, lock, &node, cnt); +	if (!node) +		goto break_loop; + +	flow = container_of(node, struct fq_flow_node, list_node); +	if (flow->credit <= 0) { +		flow->credit += q.quantum; +		fq_flows_add_tail(&fq_old_flows, &fq_old_flows_lock, flow, &q.old_flow_cnt); +		return NULL; +	} + +	bpf_spin_lock(&flow->lock); +	rb_node = bpf_rbtree_first(&flow->queue); +	if (!rb_node) { +		bpf_spin_unlock(&flow->lock); +		is_empty = fq_flows_is_empty(&fq_old_flows, &fq_old_flows_lock); +		if (head == &fq_new_flows && !is_empty) { +			fq_flows_add_tail(&fq_old_flows, &fq_old_flows_lock, flow, &q.old_flow_cnt); +		} else { +			fq_flow_set_detached(flow); +			bpf_obj_drop(flow); +		} +		return NULL; +	} + +	skbn = container_of(rb_node, struct skb_node, node); +	time_to_send = skbn->tstamp; + +	time_next_packet = (time_to_send > flow->time_next_packet) ? 
+		time_to_send : flow->time_next_packet; +	if (ctx->now < time_next_packet) { +		bpf_spin_unlock(&flow->lock); +		flow->time_next_packet = time_next_packet; +		fq_flow_set_throttled(flow); +		return NULL; +	} + +	rb_node = bpf_rbtree_remove(&flow->queue, rb_node); +	bpf_spin_unlock(&flow->lock); + +	if (!rb_node) +		goto add_flow_and_break; + +	skbn = container_of(rb_node, struct skb_node, node); +	skb = bpf_kptr_xchg(&skbn->skb, skb); +	bpf_obj_drop(skbn); + +	if (!skb) +		goto add_flow_and_break; + +	flow->credit -= qdisc_skb_cb(skb)->pkt_len; +	flow->qlen--; + +add_flow_and_break: +	fq_flows_add_head(head, lock, flow, cnt); + +break_loop: +	ctx->stop_iter = true; +	return skb; +} + +static struct sk_buff *fq_dequeue_prio(void) +{ +	struct fq_flow_node *flow = NULL; +	struct fq_stashed_flow *sflow; +	struct bpf_rb_node *rb_node; +	struct sk_buff *skb = NULL; +	struct skb_node *skbn; +	u64 hash = 0; + +	sflow = bpf_map_lookup_elem(&fq_prio_flows, &hash); +	if (!sflow) +		return NULL; + +	flow = bpf_kptr_xchg(&sflow->flow, flow); +	if (!flow) +		return NULL; + +	bpf_spin_lock(&flow->lock); +	rb_node = bpf_rbtree_first(&flow->queue); +	if (!rb_node) { +		bpf_spin_unlock(&flow->lock); +		goto out; +	} + +	skbn = container_of(rb_node, struct skb_node, node); +	rb_node = bpf_rbtree_remove(&flow->queue, &skbn->node); +	bpf_spin_unlock(&flow->lock); + +	if (!rb_node) +		goto out; + +	skbn = container_of(rb_node, struct skb_node, node); +	skb = bpf_kptr_xchg(&skbn->skb, skb); +	bpf_obj_drop(skbn); + +out: +	bpf_kptr_xchg_back(&sflow->flow, flow); + +	return skb; +} + +SEC("struct_ops/bpf_fq_dequeue") +struct sk_buff *BPF_PROG(bpf_fq_dequeue, struct Qdisc *sch) +{ +	struct dequeue_nonprio_ctx cb_ctx = {}; +	struct sk_buff *skb = NULL; +	int i; + +	if (!sch->q.qlen) +		goto out; + +	skb = fq_dequeue_prio(); +	if (skb) +		goto dequeue; + +	q.ktime_cache = cb_ctx.now = bpf_ktime_get_ns(); +	fq_check_throttled(q.ktime_cache); +	bpf_for(i, 0, sch->limit) { +		skb = fq_dequeue_nonprio_flows(i, &cb_ctx); +		if (cb_ctx.stop_iter) +			break; +	}; + +	if (skb) { +dequeue: +		sch->q.qlen--; +		sch->qstats.backlog -= qdisc_pkt_len(skb); +		bpf_qdisc_bstats_update(sch, skb); +		return skb; +	} + +	if (cb_ctx.expire) +		bpf_qdisc_watchdog_schedule(sch, cb_ctx.expire, q.timer_slack); +out: +	return NULL; +} + +static int fq_remove_flows_in_list(u32 index, void *ctx) +{ +	struct bpf_list_node *node; +	struct fq_flow_node *flow; + +	bpf_spin_lock(&fq_new_flows_lock); +	node = bpf_list_pop_front(&fq_new_flows); +	bpf_spin_unlock(&fq_new_flows_lock); +	if (!node) { +		bpf_spin_lock(&fq_old_flows_lock); +		node = bpf_list_pop_front(&fq_old_flows); +		bpf_spin_unlock(&fq_old_flows_lock); +		if (!node) +			return 1; +	} + +	flow = container_of(node, struct fq_flow_node, list_node); +	bpf_obj_drop(flow); + +	return 0; +} + +extern unsigned CONFIG_HZ __kconfig; + +/* limit number of collected flows per round */ +#define FQ_GC_MAX 8 +#define FQ_GC_AGE (3*CONFIG_HZ) + +static bool fq_gc_candidate(struct fq_flow_node *flow) +{ +	u64 jiffies = bpf_jiffies64(); + +	return fq_flow_is_detached(flow) && +	       ((s64)(jiffies - (flow->age + FQ_GC_AGE)) > 0); +} + +static int +fq_remove_flows(struct bpf_map *flow_map, u64 *hash, +		struct fq_stashed_flow *sflow, struct remove_flows_ctx *ctx) +{ +	if (sflow->flow && +	    (!ctx->gc_only || fq_gc_candidate(sflow->flow))) { +		bpf_map_delete_elem(flow_map, hash); +		ctx->reset_cnt++; +	} + +	return ctx->reset_cnt < ctx->reset_max ? 
0 : 1; +} + +static void fq_gc(void) +{ +	struct remove_flows_ctx cb_ctx = { +		.gc_only = true, +		.reset_cnt = 0, +		.reset_max = FQ_GC_MAX, +	}; + +	bpf_for_each_map_elem(&fq_nonprio_flows, fq_remove_flows, &cb_ctx, 0); +} + +SEC("struct_ops/bpf_fq_reset") +void BPF_PROG(bpf_fq_reset, struct Qdisc *sch) +{ +	struct unset_throttled_flows_ctx utf_ctx = { +		.unset_all = true, +	}; +	struct remove_flows_ctx rf_ctx = { +		.gc_only = false, +		.reset_cnt = 0, +		.reset_max = NUM_QUEUE, +	}; +	struct fq_stashed_flow *sflow; +	u64 hash = 0; + +	sch->q.qlen = 0; +	sch->qstats.backlog = 0; + +	bpf_for_each_map_elem(&fq_nonprio_flows, fq_remove_flows, &rf_ctx, 0); + +	rf_ctx.reset_cnt = 0; +	bpf_for_each_map_elem(&fq_prio_flows, fq_remove_flows, &rf_ctx, 0); +	fq_new_flow(&fq_prio_flows, &sflow, hash); + +	bpf_loop(NUM_QUEUE, fq_remove_flows_in_list, NULL, 0); +	q.new_flow_cnt = 0; +	q.old_flow_cnt = 0; + +	bpf_loop(NUM_QUEUE, fq_unset_throttled_flows, &utf_ctx, 0); +} + +SEC("struct_ops/bpf_fq_init") +int BPF_PROG(bpf_fq_init, struct Qdisc *sch, struct nlattr *opt, +	     struct netlink_ext_ack *extack) +{ +	struct net_device *dev = sch->dev_queue->dev; +	u32 psched_mtu = dev->mtu + dev->hard_header_len; +	struct fq_stashed_flow *sflow; +	u64 hash = 0; + +	if (fq_new_flow(&fq_prio_flows, &sflow, hash) < 0) +		return -ENOMEM; + +	sch->limit = 10000; +	q.initial_quantum = 10 * psched_mtu; +	q.quantum = 2 * psched_mtu; +	q.flow_refill_delay = 40; +	q.flow_plimit = 100; +	q.horizon = 10ULL * NSEC_PER_SEC; +	q.horizon_drop = 1; +	q.orphan_mask = 1024 - 1; +	q.timer_slack = 10 * NSEC_PER_USEC; +	q.time_next_delayed_flow = ~0ULL; +	q.unthrottle_latency_ns = 0ULL; +	q.new_flow_cnt = 0; +	q.old_flow_cnt = 0; + +	return 0; +} + +SEC("struct_ops") +void BPF_PROG(bpf_fq_destroy, struct Qdisc *sch) +{ +} + +SEC(".struct_ops") +struct Qdisc_ops fq = { +	.enqueue   = (void *)bpf_fq_enqueue, +	.dequeue   = (void *)bpf_fq_dequeue, +	.reset     = (void *)bpf_fq_reset, +	.init      = (void *)bpf_fq_init, +	.destroy   = (void *)bpf_fq_destroy, +	.id        = "bpf_fq", +}; diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 659694162739..17db400f0e0d 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -128,6 +128,7 @@  #define sk_refcnt		__sk_common.skc_refcnt  #define sk_state		__sk_common.skc_state  #define sk_net			__sk_common.skc_net +#define sk_rcv_saddr		__sk_common.skc_rcv_saddr  #define sk_v6_daddr		__sk_common.skc_v6_daddr  #define sk_v6_rcv_saddr		__sk_common.skc_v6_rcv_saddr  #define sk_flags		__sk_common.skc_flags diff --git a/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c b/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c index 38f78d9345de..d93f68024cc6 100644 --- a/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c +++ b/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c @@ -30,36 +30,42 @@ int BPF_PROG(test_percpu2, struct bpf_testmod_btf_type_tag_2 *arg)  /* trace_cgroup_mkdir(struct cgroup *cgrp, const char *path)   * - * struct cgroup_rstat_cpu { + * struct css_rstat_cpu {   *   ... - *   struct cgroup *updated_children; + *   struct cgroup_subsys_state *updated_children;   *   ...   * };   * - * struct cgroup { + * struct cgroup_subsys_state { + *   ... + *   struct css_rstat_cpu __percpu *rstat_cpu;   *   ... 
- *   struct cgroup_rstat_cpu __percpu *rstat_cpu; + * }; + * + * struct cgroup { + *   struct cgroup_subsys_state self;   *   ...   * };   */  SEC("tp_btf/cgroup_mkdir")  int BPF_PROG(test_percpu_load, struct cgroup *cgrp, const char *path)  { -	g = (__u64)cgrp->rstat_cpu->updated_children; +	g = (__u64)cgrp->self.rstat_cpu->updated_children;  	return 0;  }  SEC("tp_btf/cgroup_mkdir")  int BPF_PROG(test_percpu_helper, struct cgroup *cgrp, const char *path)  { -	struct cgroup_rstat_cpu *rstat; +	struct css_rstat_cpu *rstat;  	__u32 cpu;  	cpu = bpf_get_smp_processor_id(); -	rstat = (struct cgroup_rstat_cpu *)bpf_per_cpu_ptr(cgrp->rstat_cpu, cpu); +	rstat = (struct css_rstat_cpu *)bpf_per_cpu_ptr( +			cgrp->self.rstat_cpu, cpu);  	if (rstat) {  		/* READ_ONCE */ -		*(volatile int *)rstat; +		*(volatile long *)rstat;  	}  	return 0; diff --git a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c index c74362854948..ff189a736ad8 100644 --- a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c +++ b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c @@ -37,8 +37,9 @@ struct {  	__type(value, struct attach_counter);  } attach_counters SEC(".maps"); -extern void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) __ksym; -extern void cgroup_rstat_flush(struct cgroup *cgrp) __ksym; +extern void css_rstat_updated( +		struct cgroup_subsys_state *css, int cpu) __ksym; +extern void css_rstat_flush(struct cgroup_subsys_state *css) __ksym;  static uint64_t cgroup_id(struct cgroup *cgrp)  { @@ -75,7 +76,7 @@ int BPF_PROG(counter, struct cgroup *dst_cgrp, struct task_struct *leader,  	else if (create_percpu_attach_counter(cg_id, 1))  		return 0; -	cgroup_rstat_updated(dst_cgrp, bpf_get_smp_processor_id()); +	css_rstat_updated(&dst_cgrp->self, bpf_get_smp_processor_id());  	return 0;  } @@ -141,7 +142,7 @@ int BPF_PROG(dumper, struct bpf_iter_meta *meta, struct cgroup *cgrp)  		return 1;  	/* Flush the stats to make sure we get the most updated numbers */ -	cgroup_rstat_flush(cgrp); +	css_rstat_flush(&cgrp->self);  	total_counter = bpf_map_lookup_elem(&attach_counters, &cg_id);  	if (!total_counter) { diff --git a/tools/testing/selftests/bpf/progs/dmabuf_iter.c b/tools/testing/selftests/bpf/progs/dmabuf_iter.c new file mode 100644 index 000000000000..13cdb11fdeb2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/dmabuf_iter.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Google LLC */ +#include <vmlinux.h> +#include <bpf/bpf_core_read.h> +#include <bpf/bpf_helpers.h> + +/* From uapi/linux/dma-buf.h */ +#define DMA_BUF_NAME_LEN 32 + +char _license[] SEC("license") = "GPL"; + +struct { +	__uint(type, BPF_MAP_TYPE_HASH); +	__uint(key_size, DMA_BUF_NAME_LEN); +	__type(value, bool); +	__uint(max_entries, 5); +} testbuf_hash SEC(".maps"); + +/* + * Fields output by this iterator are delimited by newlines. Convert any + * newlines in user-provided printed strings to spaces. 
+ */ +static void sanitize_string(char *src, size_t size) +{ +	for (char *c = src; (size_t)(c - src) < size && *c; ++c) +		if (*c == '\n') +			*c = ' '; +} + +SEC("iter/dmabuf") +int dmabuf_collector(struct bpf_iter__dmabuf *ctx) +{ +	const struct dma_buf *dmabuf = ctx->dmabuf; +	struct seq_file *seq = ctx->meta->seq; +	unsigned long inode = 0; +	size_t size; +	const char *pname, *exporter; +	char name[DMA_BUF_NAME_LEN] = {'\0'}; + +	if (!dmabuf) +		return 0; + +	if (BPF_CORE_READ_INTO(&inode, dmabuf, file, f_inode, i_ino) || +	    bpf_core_read(&size, sizeof(size), &dmabuf->size) || +	    bpf_core_read(&pname, sizeof(pname), &dmabuf->name) || +	    bpf_core_read(&exporter, sizeof(exporter), &dmabuf->exp_name)) +		return 1; + +	/* Buffers are not required to be named */ +	if (pname) { +		if (bpf_probe_read_kernel(name, sizeof(name), pname)) +			return 1; + +		/* Name strings can be provided by userspace */ +		sanitize_string(name, sizeof(name)); +	} + +	BPF_SEQ_PRINTF(seq, "%lu\n%llu\n%s\n%s\n", inode, size, name, exporter); +	return 0; +} + +SEC("syscall") +int iter_dmabuf_for_each(const void *ctx) +{ +	struct dma_buf *d; + +	bpf_for_each(dmabuf, d) { +		char name[DMA_BUF_NAME_LEN]; +		const char *pname; +		bool *found; +		long len; +		int i; + +		if (bpf_core_read(&pname, sizeof(pname), &d->name)) +			return 1; + +		/* Buffers are not required to be named */ +		if (!pname) +			continue; + +		len = bpf_probe_read_kernel_str(name, sizeof(name), pname); +		if (len < 0) +			return 1; + +		/* +		 * The entire name buffer is used as a map key. +		 * Zeroize any uninitialized trailing bytes after the NUL. +		 */ +		bpf_for(i, len, DMA_BUF_NAME_LEN) +			name[i] = 0; + +		found = bpf_map_lookup_elem(&testbuf_hash, name); +		if (found) { +			bool t = true; + +			bpf_map_update_elem(&testbuf_hash, name, &t, BPF_EXIST); +		} +	} + +	return 0; +} diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index e1fba28e4a86..a0391f9da2d4 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -680,3 +680,233 @@ out:  	bpf_ringbuf_discard_dynptr(&ptr_buf, 0);  	return XDP_DROP;  } + +void *user_ptr; +/* Contains the copy of the data pointed by user_ptr. + * Size 384 to make it not fit into a single kernel chunk when copying + * but less than the maximum bpf stack size (512). + */ +char expected_str[384]; +__u32 test_len[7] = {0/* placeholder */, 0, 1, 2, 255, 256, 257}; + +typedef int (*bpf_read_dynptr_fn_t)(struct bpf_dynptr *dptr, u32 off, +				    u32 size, const void *unsafe_ptr); + +/* Returns the offset just before the end of the maximum sized xdp fragment. + * Any write larger than 32 bytes will be split between 2 fragments. + */ +__u32 xdp_near_frag_end_offset(void) +{ +	const __u32 headroom = 256; +	const __u32 max_frag_size =  __PAGE_SIZE - headroom - sizeof(struct skb_shared_info); + +	/* 32 bytes before the approximate end of the fragment */ +	return max_frag_size - 32; +} + +/* Use __always_inline on test_dynptr_probe[_str][_xdp]() and callbacks + * of type bpf_read_dynptr_fn_t to prevent compiler from generating + * indirect calls that make program fail to load with "unknown opcode" error. 
+ */ +static __always_inline void test_dynptr_probe(void *ptr, bpf_read_dynptr_fn_t bpf_read_dynptr_fn) +{ +	char buf[sizeof(expected_str)]; +	struct bpf_dynptr ptr_buf; +	int i; + +	if (bpf_get_current_pid_tgid() >> 32 != pid) +		return; + +	err = bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(buf), 0, &ptr_buf); + +	bpf_for(i, 0, ARRAY_SIZE(test_len)) { +		__u32 len = test_len[i]; + +		err = err ?: bpf_read_dynptr_fn(&ptr_buf, 0, test_len[i], ptr); +		if (len > sizeof(buf)) +			break; +		err = err ?: bpf_dynptr_read(&buf, len, &ptr_buf, 0, 0); + +		if (err || bpf_memcmp(expected_str, buf, len)) +			err = 1; + +		/* Reset buffer and dynptr */ +		__builtin_memset(buf, 0, sizeof(buf)); +		err = err ?: bpf_dynptr_write(&ptr_buf, 0, buf, len, 0); +	} +	bpf_ringbuf_discard_dynptr(&ptr_buf, 0); +} + +static __always_inline void test_dynptr_probe_str(void *ptr, +						  bpf_read_dynptr_fn_t bpf_read_dynptr_fn) +{ +	char buf[sizeof(expected_str)]; +	struct bpf_dynptr ptr_buf; +	__u32 cnt, i; + +	if (bpf_get_current_pid_tgid() >> 32 != pid) +		return; + +	bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(buf), 0, &ptr_buf); + +	bpf_for(i, 0, ARRAY_SIZE(test_len)) { +		__u32 len = test_len[i]; + +		cnt = bpf_read_dynptr_fn(&ptr_buf, 0, len, ptr); +		if (cnt != len) +			err = 1; + +		if (len > sizeof(buf)) +			continue; +		err = err ?: bpf_dynptr_read(&buf, len, &ptr_buf, 0, 0); +		if (!len) +			continue; +		if (err || bpf_memcmp(expected_str, buf, len - 1) || buf[len - 1] != '\0') +			err = 1; +	} +	bpf_ringbuf_discard_dynptr(&ptr_buf, 0); +} + +static __always_inline void test_dynptr_probe_xdp(struct xdp_md *xdp, void *ptr, +						  bpf_read_dynptr_fn_t bpf_read_dynptr_fn) +{ +	struct bpf_dynptr ptr_xdp; +	char buf[sizeof(expected_str)]; +	__u32 off, i; + +	if (bpf_get_current_pid_tgid() >> 32 != pid) +		return; + +	off = xdp_near_frag_end_offset(); +	err = bpf_dynptr_from_xdp(xdp, 0, &ptr_xdp); + +	bpf_for(i, 0, ARRAY_SIZE(test_len)) { +		__u32 len = test_len[i]; + +		err = err ?: bpf_read_dynptr_fn(&ptr_xdp, off, len, ptr); +		if (len > sizeof(buf)) +			continue; +		err = err ?: bpf_dynptr_read(&buf, len, &ptr_xdp, off, 0); +		if (err || bpf_memcmp(expected_str, buf, len)) +			err = 1; +		/* Reset buffer and dynptr */ +		__builtin_memset(buf, 0, sizeof(buf)); +		err = err ?: bpf_dynptr_write(&ptr_xdp, off, buf, len, 0); +	} +} + +static __always_inline void test_dynptr_probe_str_xdp(struct xdp_md *xdp, void *ptr, +						      bpf_read_dynptr_fn_t bpf_read_dynptr_fn) +{ +	struct bpf_dynptr ptr_xdp; +	char buf[sizeof(expected_str)]; +	__u32 cnt, off, i; + +	if (bpf_get_current_pid_tgid() >> 32 != pid) +		return; + +	off = xdp_near_frag_end_offset(); +	err = bpf_dynptr_from_xdp(xdp, 0, &ptr_xdp); +	if (err) +		return; + +	bpf_for(i, 0, ARRAY_SIZE(test_len)) { +		__u32 len = test_len[i]; + +		cnt = bpf_read_dynptr_fn(&ptr_xdp, off, len, ptr); +		if (cnt != len) +			err = 1; + +		if (len > sizeof(buf)) +			continue; +		err = err ?: bpf_dynptr_read(&buf, len, &ptr_xdp, off, 0); + +		if (!len) +			continue; +		if (err || bpf_memcmp(expected_str, buf, len - 1) || buf[len - 1] != '\0') +			err = 1; + +		__builtin_memset(buf, 0, sizeof(buf)); +		err = err ?: bpf_dynptr_write(&ptr_xdp, off, buf, len, 0); +	} +} + +SEC("xdp") +int test_probe_read_user_dynptr(struct xdp_md *xdp) +{ +	test_dynptr_probe(user_ptr, bpf_probe_read_user_dynptr); +	if (!err) +		test_dynptr_probe_xdp(xdp, user_ptr, bpf_probe_read_user_dynptr); +	return XDP_PASS; +} + +SEC("xdp") +int test_probe_read_kernel_dynptr(struct xdp_md *xdp) +{ +	
test_dynptr_probe(expected_str, bpf_probe_read_kernel_dynptr); +	if (!err) +		test_dynptr_probe_xdp(xdp, expected_str, bpf_probe_read_kernel_dynptr); +	return XDP_PASS; +} + +SEC("xdp") +int test_probe_read_user_str_dynptr(struct xdp_md *xdp) +{ +	test_dynptr_probe_str(user_ptr, bpf_probe_read_user_str_dynptr); +	if (!err) +		test_dynptr_probe_str_xdp(xdp, user_ptr, bpf_probe_read_user_str_dynptr); +	return XDP_PASS; +} + +SEC("xdp") +int test_probe_read_kernel_str_dynptr(struct xdp_md *xdp) +{ +	test_dynptr_probe_str(expected_str, bpf_probe_read_kernel_str_dynptr); +	if (!err) +		test_dynptr_probe_str_xdp(xdp, expected_str, bpf_probe_read_kernel_str_dynptr); +	return XDP_PASS; +} + +SEC("fentry.s/" SYS_PREFIX "sys_nanosleep") +int test_copy_from_user_dynptr(void *ctx) +{ +	test_dynptr_probe(user_ptr, bpf_copy_from_user_dynptr); +	return 0; +} + +SEC("fentry.s/" SYS_PREFIX "sys_nanosleep") +int test_copy_from_user_str_dynptr(void *ctx) +{ +	test_dynptr_probe_str(user_ptr, bpf_copy_from_user_str_dynptr); +	return 0; +} + +static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u32 off, +					u32 size, const void *unsafe_ptr) +{ +	struct task_struct *task = bpf_get_current_task_btf(); + +	return bpf_copy_from_user_task_dynptr(dptr, off, size, unsafe_ptr, task); +} + +static int bpf_copy_data_from_user_task_str(struct bpf_dynptr *dptr, u32 off, +					    u32 size, const void *unsafe_ptr) +{ +	struct task_struct *task = bpf_get_current_task_btf(); + +	return bpf_copy_from_user_task_str_dynptr(dptr, off, size, unsafe_ptr, task); +} + +SEC("fentry.s/" SYS_PREFIX "sys_nanosleep") +int test_copy_from_user_task_dynptr(void *ctx) +{ +	test_dynptr_probe(user_ptr, bpf_copy_data_from_user_task); +	return 0; +} + +SEC("fentry.s/" SYS_PREFIX "sys_nanosleep") +int test_copy_from_user_task_str_dynptr(void *ctx) +{ +	test_dynptr_probe_str(user_ptr, bpf_copy_data_from_user_task_str); +	return 0; +} diff --git a/tools/testing/selftests/bpf/progs/fd_htab_lookup.c b/tools/testing/selftests/bpf/progs/fd_htab_lookup.c new file mode 100644 index 000000000000..a4a9e1db626f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/fd_htab_lookup.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025. Huawei Technologies Co., Ltd */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct inner_map_type { +	__uint(type, BPF_MAP_TYPE_ARRAY); +	__uint(key_size, 4); +	__uint(value_size, 4); +	__uint(max_entries, 1); +} inner_map SEC(".maps"); + +struct { +	__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); +	__uint(max_entries, 64); +	__type(key, int); +	__type(value, int); +	__array(values, struct inner_map_type); +} outer_map SEC(".maps") = { +	.values = { +		[0] = &inner_map, +	}, +}; diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 427b72954b87..76adf4a8f2da 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -7,8 +7,6 @@  #include "bpf_misc.h"  #include "bpf_compiler.h" -#define unlikely(x)	__builtin_expect(!!(x), 0) -  static volatile int zero = 0;  int my_pid; diff --git a/tools/testing/selftests/bpf/progs/linked_list_peek.c b/tools/testing/selftests/bpf/progs/linked_list_peek.c new file mode 100644 index 000000000000..264e81bfb287 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_list_peek.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" +#include "bpf_experimental.h" + +struct node_data { +	struct bpf_list_node l; +	int key; +}; + +#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8))) +private(A) struct bpf_spin_lock glock; +private(A) struct bpf_list_head ghead __contains(node_data, l); + +#define list_entry(ptr, type, member) container_of(ptr, type, member) +#define NR_NODES 16 + +int zero = 0; + +SEC("syscall") +__retval(0) +long list_peek(void *ctx) +{ +	struct bpf_list_node *l_n; +	struct node_data *n; +	int i, err = 0; + +	bpf_spin_lock(&glock); +	l_n = bpf_list_front(&ghead); +	bpf_spin_unlock(&glock); +	if (l_n) +		return __LINE__; + +	bpf_spin_lock(&glock); +	l_n = bpf_list_back(&ghead); +	bpf_spin_unlock(&glock); +	if (l_n) +		return __LINE__; + +	for (i = zero; i < NR_NODES && can_loop; i++) { +		n = bpf_obj_new(typeof(*n)); +		if (!n) +			return __LINE__; +		n->key = i; +		bpf_spin_lock(&glock); +		bpf_list_push_back(&ghead, &n->l); +		bpf_spin_unlock(&glock); +	} + +	bpf_spin_lock(&glock); + +	l_n = bpf_list_front(&ghead); +	if (!l_n) { +		err = __LINE__; +		goto done; +	} + +	n = list_entry(l_n, struct node_data, l); +	if (n->key != 0) { +		err = __LINE__; +		goto done; +	} + +	l_n = bpf_list_back(&ghead); +	if (!l_n) { +		err = __LINE__; +		goto done; +	} + +	n = list_entry(l_n, struct node_data, l); +	if (n->key != NR_NODES - 1) { +		err = __LINE__; +		goto done; +	} + +done: +	bpf_spin_unlock(&glock); +	return err; +} + +#define TEST_FB(op, dolock)					\ +SEC("syscall")							\ +__failure __msg(MSG)						\ +long test_##op##_spinlock_##dolock(void *ctx)			\ +{								\ +	struct bpf_list_node *l_n;				\ +	__u64 jiffies = 0;					\ +								\ +	if (dolock)						\ +		bpf_spin_lock(&glock);				\ +	l_n = bpf_list_##op(&ghead);				\ +	if (l_n)						\ +		jiffies = bpf_jiffies64();			\ +	if (dolock)						\ +		bpf_spin_unlock(&glock);			\ +								\ +	return !!jiffies;					\ +} + +#define MSG "call bpf_list_{{(front|back).+}}; R0{{(_w)?}}=ptr_or_null_node_data(id={{[0-9]+}},non_own_ref" +TEST_FB(front, true) +TEST_FB(back, true) +#undef MSG + +#define MSG "bpf_spin_lock at off=0 must be held for bpf_list_head" +TEST_FB(front, false) +TEST_FB(back, false) +#undef MSG + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/prepare.c b/tools/testing/selftests/bpf/progs/prepare.c index 1f1dd547e4ee..cfc1f48e0d28 100644 --- a/tools/testing/selftests/bpf/progs/prepare.c +++ b/tools/testing/selftests/bpf/progs/prepare.c @@ -2,7 +2,6 @@  /* Copyright (c) 2025 Meta */  #include <vmlinux.h>  #include <bpf/bpf_helpers.h> -//#include <bpf/bpf_tracing.h>  char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/raw_tp_null.c b/tools/testing/selftests/bpf/progs/raw_tp_null.c index 5927054b6dd9..efa416f53968 100644 --- a/tools/testing/selftests/bpf/progs/raw_tp_null.c +++ b/tools/testing/selftests/bpf/progs/raw_tp_null.c @@ -10,7 +10,7 @@ char _license[] SEC("license") = "GPL";  int tid;  int i; -SEC("tp_btf/bpf_testmod_test_raw_tp_null") +SEC("tp_btf/bpf_testmod_test_raw_tp_null_tp")  int BPF_PROG(test_raw_tp_null, struct sk_buff *skb)  {  	struct task_struct *task = bpf_get_current_task_btf(); diff --git a/tools/testing/selftests/bpf/progs/raw_tp_null_fail.c b/tools/testing/selftests/bpf/progs/raw_tp_null_fail.c index 38d669957bf1..0d58114a4955 100644 --- a/tools/testing/selftests/bpf/progs/raw_tp_null_fail.c +++ 
b/tools/testing/selftests/bpf/progs/raw_tp_null_fail.c @@ -8,7 +8,7 @@  char _license[] SEC("license") = "GPL";  /* Ensure module parameter has PTR_MAYBE_NULL */ -SEC("tp_btf/bpf_testmod_test_raw_tp_null") +SEC("tp_btf/bpf_testmod_test_raw_tp_null_tp")  __failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'")  int test_raw_tp_null_bpf_testmod_test_raw_tp_null_arg_1(void *ctx) {      asm volatile("r1 = *(u64 *)(r1 +0); r1 = *(u64 *)(r1 +0);" ::: __clobber_all); diff --git a/tools/testing/selftests/bpf/progs/rbtree_fail.c b/tools/testing/selftests/bpf/progs/rbtree_fail.c index dbd5eee8e25e..4acb6af2dfe3 100644 --- a/tools/testing/selftests/bpf/progs/rbtree_fail.c +++ b/tools/testing/selftests/bpf/progs/rbtree_fail.c @@ -69,11 +69,11 @@ long rbtree_api_nolock_first(void *ctx)  }  SEC("?tc") -__failure __msg("rbtree_remove node input must be non-owning ref") +__retval(0)  long rbtree_api_remove_unadded_node(void *ctx)  {  	struct node_data *n, *m; -	struct bpf_rb_node *res; +	struct bpf_rb_node *res_n, *res_m;  	n = bpf_obj_new(typeof(*n));  	if (!n) @@ -88,19 +88,20 @@ long rbtree_api_remove_unadded_node(void *ctx)  	bpf_spin_lock(&glock);  	bpf_rbtree_add(&groot, &n->node, less); -	/* This remove should pass verifier */ -	res = bpf_rbtree_remove(&groot, &n->node); -	n = container_of(res, struct node_data, node); +	res_n = bpf_rbtree_remove(&groot, &n->node); -	/* This remove shouldn't, m isn't in an rbtree */ -	res = bpf_rbtree_remove(&groot, &m->node); -	m = container_of(res, struct node_data, node); +	res_m = bpf_rbtree_remove(&groot, &m->node);  	bpf_spin_unlock(&glock); -	if (n) -		bpf_obj_drop(n); -	if (m) -		bpf_obj_drop(m); +	bpf_obj_drop(m); +	if (res_n) +		bpf_obj_drop(container_of(res_n, struct node_data, node)); +	if (res_m) { +		bpf_obj_drop(container_of(res_m, struct node_data, node)); +		/* m was not added to the rbtree */ +		return 2; +	} +  	return 0;  } @@ -178,7 +179,7 @@ err_out:  }  SEC("?tc") -__failure __msg("rbtree_remove node input must be non-owning ref") +__failure __msg("bpf_rbtree_remove can only take non-owning or refcounted bpf_rb_node pointer")  long rbtree_api_add_release_unlock_escape(void *ctx)  {  	struct node_data *n; @@ -202,7 +203,7 @@ long rbtree_api_add_release_unlock_escape(void *ctx)  }  SEC("?tc") -__failure __msg("rbtree_remove node input must be non-owning ref") +__failure __msg("bpf_rbtree_remove can only take non-owning or refcounted bpf_rb_node pointer")  long rbtree_api_first_release_unlock_escape(void *ctx)  {  	struct bpf_rb_node *res; diff --git a/tools/testing/selftests/bpf/progs/rbtree_search.c b/tools/testing/selftests/bpf/progs/rbtree_search.c new file mode 100644 index 000000000000..098ef970fac1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/rbtree_search.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" +#include "bpf_experimental.h" + +struct node_data { +	struct bpf_refcount ref; +	struct bpf_rb_node r0; +	struct bpf_rb_node r1; +	int key0; +	int key1; +}; + +#define private(name) SEC(".data." 
#name) __hidden __attribute__((aligned(8))) +private(A) struct bpf_spin_lock glock0; +private(A) struct bpf_rb_root groot0 __contains(node_data, r0); + +private(B) struct bpf_spin_lock glock1; +private(B) struct bpf_rb_root groot1 __contains(node_data, r1); + +#define rb_entry(ptr, type, member) container_of(ptr, type, member) +#define NR_NODES 16 + +int zero = 0; + +static bool less0(struct bpf_rb_node *a, const struct bpf_rb_node *b) +{ +	struct node_data *node_a; +	struct node_data *node_b; + +	node_a = rb_entry(a, struct node_data, r0); +	node_b = rb_entry(b, struct node_data, r0); + +	return node_a->key0 < node_b->key0; +} + +static bool less1(struct bpf_rb_node *a, const struct bpf_rb_node *b) +{ +	struct node_data *node_a; +	struct node_data *node_b; + +	node_a = rb_entry(a, struct node_data, r1); +	node_b = rb_entry(b, struct node_data, r1); + +	return node_a->key1 < node_b->key1; +} + +SEC("syscall") +__retval(0) +long rbtree_search(void *ctx) +{ +	struct bpf_rb_node *rb_n, *rb_m, *gc_ns[NR_NODES]; +	long lookup_key = NR_NODES / 2; +	struct node_data *n, *m; +	int i, nr_gc = 0; + +	for (i = zero; i < NR_NODES && can_loop; i++) { +		n = bpf_obj_new(typeof(*n)); +		if (!n) +			return __LINE__; + +		m = bpf_refcount_acquire(n); + +		n->key0 = i; +		m->key1 = i; + +		bpf_spin_lock(&glock0); +		bpf_rbtree_add(&groot0, &n->r0, less0); +		bpf_spin_unlock(&glock0); + +		bpf_spin_lock(&glock1); +		bpf_rbtree_add(&groot1, &m->r1, less1); +		bpf_spin_unlock(&glock1); +	} + +	n = NULL; +	bpf_spin_lock(&glock0); +	rb_n = bpf_rbtree_root(&groot0); +	while (can_loop) { +		if (!rb_n) { +			bpf_spin_unlock(&glock0); +			return __LINE__; +		} + +		n = rb_entry(rb_n, struct node_data, r0); +		if (lookup_key == n->key0) +			break; +		if (nr_gc < NR_NODES) +			gc_ns[nr_gc++] = rb_n; +		if (lookup_key < n->key0) +			rb_n = bpf_rbtree_left(&groot0, rb_n); +		else +			rb_n = bpf_rbtree_right(&groot0, rb_n); +	} + +	if (!n || lookup_key != n->key0) { +		bpf_spin_unlock(&glock0); +		return __LINE__; +	} + +	for (i = 0; i < nr_gc; i++) { +		rb_n = gc_ns[i]; +		gc_ns[i] = bpf_rbtree_remove(&groot0, rb_n); +	} + +	m = bpf_refcount_acquire(n); +	bpf_spin_unlock(&glock0); + +	for (i = 0; i < nr_gc; i++) { +		rb_n = gc_ns[i]; +		if (rb_n) { +			n = rb_entry(rb_n, struct node_data, r0); +			bpf_obj_drop(n); +		} +	} + +	if (!m) +		return __LINE__; + +	bpf_spin_lock(&glock1); +	rb_m = bpf_rbtree_remove(&groot1, &m->r1); +	bpf_spin_unlock(&glock1); +	bpf_obj_drop(m); +	if (!rb_m) +		return __LINE__; +	bpf_obj_drop(rb_entry(rb_m, struct node_data, r1)); + +	return 0; +} + +#define TEST_ROOT(dolock)				\ +SEC("syscall")						\ +__failure __msg(MSG)					\ +long test_root_spinlock_##dolock(void *ctx)		\ +{							\ +	struct bpf_rb_node *rb_n;			\ +	__u64 jiffies = 0;				\ +							\ +	if (dolock)					\ +		bpf_spin_lock(&glock0);			\ +	rb_n = bpf_rbtree_root(&groot0);		\ +	if (rb_n)					\ +		jiffies = bpf_jiffies64();		\ +	if (dolock)					\ +		bpf_spin_unlock(&glock0);		\ +							\ +	return !!jiffies;				\ +} + +#define TEST_LR(op, dolock)				\ +SEC("syscall")						\ +__failure __msg(MSG)					\ +long test_##op##_spinlock_##dolock(void *ctx)		\ +{							\ +	struct bpf_rb_node *rb_n;			\ +	struct node_data *n;				\ +	__u64 jiffies = 0;				\ +							\ +	bpf_spin_lock(&glock0);				\ +	rb_n = bpf_rbtree_root(&groot0);		\ +	if (!rb_n) {					\ +		bpf_spin_unlock(&glock0);		\ +		return 1;				\ +	}						\ +	n = rb_entry(rb_n, struct node_data, r0);	\ +	n = bpf_refcount_acquire(n);			\ +	bpf_spin_unlock(&glock0);			\ +	if (!n)						
\ +		return 1;				\ +							\ +	if (dolock)					\ +		bpf_spin_lock(&glock0);			\ +	rb_n = bpf_rbtree_##op(&groot0, &n->r0);	\ +	if (rb_n)					\ +		jiffies = bpf_jiffies64();		\ +	if (dolock)					\ +		bpf_spin_unlock(&glock0);		\ +							\ +	return !!jiffies;				\ +} + +/* + * Use a separate MSG macro instead of passing to TEST_XXX(..., MSG) + * to ensure the message itself is not in the bpf prog lineinfo + * which the verifier includes in its log. + * Otherwise, the test_loader will incorrectly match the prog lineinfo + * instead of the log generated by the verifier. + */ +#define MSG "call bpf_rbtree_root{{.+}}; R0{{(_w)?}}=rcu_ptr_or_null_node_data(id={{[0-9]+}},non_own_ref" +TEST_ROOT(true) +#undef MSG +#define MSG "call bpf_rbtree_{{(left|right).+}}; R0{{(_w)?}}=rcu_ptr_or_null_node_data(id={{[0-9]+}},non_own_ref" +TEST_LR(left,  true) +TEST_LR(right, true) +#undef MSG + +#define MSG "bpf_spin_lock at off=0 must be held for bpf_rb_root" +TEST_ROOT(false) +TEST_LR(left, false) +TEST_LR(right, false) +#undef MSG + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/set_global_vars.c b/tools/testing/selftests/bpf/progs/set_global_vars.c index 9adb5ba4cd4d..90f5656c3991 100644 --- a/tools/testing/selftests/bpf/progs/set_global_vars.c +++ b/tools/testing/selftests/bpf/progs/set_global_vars.c @@ -24,6 +24,44 @@ const volatile enum Enumu64 var_eb = EB1;  const volatile enum Enums64 var_ec = EC1;  const volatile bool var_b = false; +struct Struct { +	int:16; +	__u16 filler; +	struct { +		const __u16 filler2; +	}; +	struct Struct2 { +		__u16 filler; +		volatile struct { +			const int:1; +			union { +				const volatile __u8 var_u8; +				const volatile __s16 filler3; +				const int:1; +			} u; +		}; +	} struct2; +}; + +const volatile __u32 stru = 0; /* same prefix as below */ +const volatile struct Struct struct1 = {.struct2 = {.u = {.var_u8 = 1}}}; + +union Union { +	__u16 var_u16; +	struct Struct3 { +		struct { +			__u8 var_u8_l; +		}; +		struct { +			struct { +				__u8 var_u8_h; +			}; +		}; +	} struct3; +}; + +const volatile union Union union1 = {.var_u16 = -1}; +  char arr[4] = {0};  SEC("socket") @@ -43,5 +81,8 @@ int test_set_globals(void *ctx)  	a = var_eb;  	a = var_ec;  	a = var_b; +	a = struct1.struct2.u.var_u8; +	a = union1.var_u16; +  	return a;  } diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c index 0107a24b7522..d330b1511979 100644 --- a/tools/testing/selftests/bpf/progs/setget_sockopt.c +++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c @@ -83,6 +83,14 @@ struct loop_ctx {  	struct sock *sk;  }; +static bool sk_is_tcp(struct sock *sk) +{ +	return (sk->__sk_common.skc_family == AF_INET || +		sk->__sk_common.skc_family == AF_INET6) && +		sk->sk_type == SOCK_STREAM && +		sk->sk_protocol == IPPROTO_TCP; +} +  static int bpf_test_sockopt_flip(void *ctx, struct sock *sk,  				 const struct sockopt_test *t,  				 int level) @@ -91,6 +99,9 @@ static int bpf_test_sockopt_flip(void *ctx, struct sock *sk,  	opt = t->opt; +	if (opt == SO_TXREHASH && !sk_is_tcp(sk)) +		return 0; +  	if (bpf_getsockopt(ctx, level, opt, &old, sizeof(old)))  		return 1;  	/* kernel initialized txrehash to 255 */ diff --git a/tools/testing/selftests/bpf/progs/sock_iter_batch.c b/tools/testing/selftests/bpf/progs/sock_iter_batch.c index 96531b0d9d55..8f483337e103 --- a/tools/testing/selftests/bpf/progs/sock_iter_batch.c +++ b/tools/testing/selftests/bpf/progs/sock_iter_batch.c @@ -17,6 +17,12 @@ 
static bool ipv6_addr_loopback(const struct in6_addr *a)  		a->s6_addr32[2] | (a->s6_addr32[3] ^ bpf_htonl(1))) == 0;  } +static bool ipv4_addr_loopback(__be32 a) +{ +	return a == bpf_ntohl(0x7f000001); +} + +volatile const unsigned int sf;  volatile const __u16 ports[2];  unsigned int bucket[2]; @@ -26,16 +32,20 @@ int iter_tcp_soreuse(struct bpf_iter__tcp *ctx)  	struct sock *sk = (struct sock *)ctx->sk_common;  	struct inet_hashinfo *hinfo;  	unsigned int hash; +	__u64 sock_cookie;  	struct net *net;  	int idx;  	if (!sk)  		return 0; +	sock_cookie = bpf_get_socket_cookie(sk);  	sk = bpf_core_cast(sk, struct sock); -	if (sk->sk_family != AF_INET6 || +	if (sk->sk_family != sf ||  	    sk->sk_state != TCP_LISTEN || -	    !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr)) +	    sk->sk_family == AF_INET6 ? +	    !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr) : +	    !ipv4_addr_loopback(sk->sk_rcv_saddr))  		return 0;  	if (sk->sk_num == ports[0]) @@ -52,6 +62,7 @@ int iter_tcp_soreuse(struct bpf_iter__tcp *ctx)  	hinfo = net->ipv4.tcp_death_row.hashinfo;  	bucket[idx] = hash & hinfo->lhash2_mask;  	bpf_seq_write(ctx->meta->seq, &idx, sizeof(idx)); +	bpf_seq_write(ctx->meta->seq, &sock_cookie, sizeof(sock_cookie));  	return 0;  } @@ -63,14 +74,18 @@ int iter_udp_soreuse(struct bpf_iter__udp *ctx)  {  	struct sock *sk = (struct sock *)ctx->udp_sk;  	struct udp_table *udptable; +	__u64 sock_cookie;  	int idx;  	if (!sk)  		return 0; +	sock_cookie = bpf_get_socket_cookie(sk);  	sk = bpf_core_cast(sk, struct sock); -	if (sk->sk_family != AF_INET6 || -	    !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr)) +	if (sk->sk_family != sf || +	    sk->sk_family == AF_INET6 ? +	    !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr) : +	    !ipv4_addr_loopback(sk->sk_rcv_saddr))  		return 0;  	if (sk->sk_num == ports[0]) @@ -84,6 +99,7 @@ int iter_udp_soreuse(struct bpf_iter__udp *ctx)  	udptable = sk->sk_net.net->ipv4.udp_table;  	bucket[idx] = udp_sk(sk)->udp_portaddr_hash & udptable->mask;  	bpf_seq_write(ctx->meta->seq, &idx, sizeof(idx)); +	bpf_seq_write(ctx->meta->seq, &sock_cookie, sizeof(sock_cookie));  	return 0;  } diff --git a/tools/testing/selftests/bpf/progs/test_btf_ext.c b/tools/testing/selftests/bpf/progs/test_btf_ext.c new file mode 100644 index 000000000000..cdf20331db04 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_btf_ext.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Meta Platforms Inc. 
*/ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +__noinline static void f0(void) +{ +	__u64 a = 1; + +	__sink(a); +} + +SEC("xdp") +__u64 global_func(struct xdp_md *xdp) +{ +	f0(); +	return XDP_DROP; +} diff --git a/tools/testing/selftests/bpf/progs/test_module_attach.c b/tools/testing/selftests/bpf/progs/test_module_attach.c index 7f3c233943b3..03d7f89787a1 100644 --- a/tools/testing/selftests/bpf/progs/test_module_attach.c +++ b/tools/testing/selftests/bpf/progs/test_module_attach.c @@ -19,7 +19,7 @@ int BPF_PROG(handle_raw_tp,  __u32 raw_tp_bare_write_sz = 0; -SEC("raw_tp/bpf_testmod_test_write_bare") +SEC("raw_tp/bpf_testmod_test_write_bare_tp")  int BPF_PROG(handle_raw_tp_bare,  	     struct task_struct *task, struct bpf_testmod_test_write_ctx *write_ctx)  { @@ -31,7 +31,7 @@ int raw_tp_writable_bare_in_val = 0;  int raw_tp_writable_bare_early_ret = 0;  int raw_tp_writable_bare_out_val = 0; -SEC("raw_tp.w/bpf_testmod_test_writable_bare") +SEC("raw_tp.w/bpf_testmod_test_writable_bare_tp")  int BPF_PROG(handle_raw_tp_writable_bare,  	     struct bpf_testmod_test_writable_ctx *writable)  { diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_ktls.c b/tools/testing/selftests/bpf/progs/test_sockmap_ktls.c new file mode 100644 index 000000000000..8bdb9987c0c7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sockmap_ktls.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +int cork_byte; +int push_start; +int push_end; +int apply_bytes; + +struct { +	__uint(type, BPF_MAP_TYPE_SOCKMAP); +	__uint(max_entries, 20); +	__type(key, int); +	__type(value, int); +} sock_map SEC(".maps"); + +SEC("sk_msg") +int prog_sk_policy(struct sk_msg_md *msg) +{ +	if (cork_byte > 0) +		bpf_msg_cork_bytes(msg, cork_byte); +	if (push_start > 0 && push_end > 0) +		bpf_msg_push_data(msg, push_start, push_end, 0); + +	return SK_PASS; +} + +SEC("sk_msg") +int prog_sk_policy_redir(struct sk_msg_md *msg) +{ +	int two = 2; + +	bpf_msg_apply_bytes(msg, apply_bytes); +	return bpf_msg_redirect_map(msg, &sock_map, two, 0); +} diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_redir.c b/tools/testing/selftests/bpf/progs/test_sockmap_redir.c new file mode 100644 index 000000000000..34d9f4f2f0a2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sockmap_redir.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +SEC(".maps") struct { +	__uint(type, BPF_MAP_TYPE_SOCKMAP); +	__uint(max_entries, 1); +	__type(key, __u32); +	__type(value, __u64); +} nop_map, sock_map; + +SEC(".maps") struct { +	__uint(type, BPF_MAP_TYPE_SOCKHASH); +	__uint(max_entries, 1); +	__type(key, __u32); +	__type(value, __u64); +} nop_hash, sock_hash; + +SEC(".maps") struct { +	__uint(type, BPF_MAP_TYPE_ARRAY); +	__uint(max_entries, 2); +	__type(key, int); +	__type(value, unsigned int); +} verdict_map; + +/* Set by user space */ +int redirect_type; +int redirect_flags; + +#define redirect_map(__data)                                                   \ +	_Generic((__data),                                                     \ +		 struct __sk_buff * : bpf_sk_redirect_map,                     \ +		 struct sk_msg_md * : bpf_msg_redirect_map                     \ +	)((__data), &sock_map, (__u32){0}, redirect_flags) + +#define redirect_hash(__data)                                       
           \ +	_Generic((__data),                                                     \ +		 struct __sk_buff * : bpf_sk_redirect_hash,                    \ +		 struct sk_msg_md * : bpf_msg_redirect_hash                    \ +	)((__data), &sock_hash, &(__u32){0}, redirect_flags) + +#define DEFINE_PROG(__type, __param)                                           \ +SEC("sk_" XSTR(__type))                                                        \ +int prog_ ## __type ## _verdict(__param data)                                  \ +{                                                                              \ +	unsigned int *count;                                                   \ +	int verdict;                                                           \ +									       \ +	if (redirect_type == BPF_MAP_TYPE_SOCKMAP)                             \ +		verdict = redirect_map(data);                                  \ +	else if (redirect_type == BPF_MAP_TYPE_SOCKHASH)                       \ +		verdict = redirect_hash(data);                                 \ +	else                                                                   \ +		verdict = redirect_type - __MAX_BPF_MAP_TYPE;                  \ +									       \ +	count = bpf_map_lookup_elem(&verdict_map, &verdict);                   \ +	if (count)                                                             \ +		(*count)++;                                                    \ +									       \ +	return verdict;                                                        \ +} + +DEFINE_PROG(skb, struct __sk_buff *); +DEFINE_PROG(msg, struct sk_msg_md *); + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c b/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c index eb5cca1fce16..7d5293de1952 100644 --- a/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c +++ b/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c @@ -294,7 +294,9 @@ static int tcp_validate_sysctl(struct tcp_syncookie *ctx)  	    (ctx->ipv6 && ctx->attrs.mss != MSS_LOCAL_IPV6))  		goto err; -	if (!ctx->attrs.wscale_ok || ctx->attrs.snd_wscale != 7) +	if (!ctx->attrs.wscale_ok || +	    !ctx->attrs.snd_wscale || +	    ctx->attrs.snd_wscale >= BPF_SYNCOOKIE_WSCALE_MASK)  		goto err;  	if (!ctx->attrs.tstamp_ok) diff --git a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c index 39ff06f2c834..cf0547a613ff 100644 --- a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c +++ b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c @@ -6,14 +6,14 @@  #include "../test_kmods/bpf_testmod.h"  #include "bpf_misc.h" -SEC("tp_btf/bpf_testmod_test_nullable_bare") +SEC("tp_btf/bpf_testmod_test_nullable_bare_tp")  __failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'")  int BPF_PROG(handle_tp_btf_nullable_bare1, struct bpf_testmod_test_read_ctx *nullable_ctx)  {  	return nullable_ctx->len;  } -SEC("tp_btf/bpf_testmod_test_nullable_bare") +SEC("tp_btf/bpf_testmod_test_nullable_bare_tp")  int BPF_PROG(handle_tp_btf_nullable_bare2, struct bpf_testmod_test_read_ctx *nullable_ctx)  {  	if (nullable_ctx) diff --git a/tools/testing/selftests/bpf/progs/verifier_bpf_trap.c b/tools/testing/selftests/bpf/progs/verifier_bpf_trap.c new file mode 100644 index 000000000000..35e2cdc00a01 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_bpf_trap.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta 
Platforms, Inc. and affiliates. */ +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +#if __clang_major__ >= 21 && 0 +SEC("socket") +__description("__builtin_trap with simple c code") +__failure __msg("unexpected __bpf_trap() due to uninitialized variable?") +void bpf_builtin_trap_with_simple_c(void) +{ +	__builtin_trap(); +} +#endif + +SEC("socket") +__description("__bpf_trap with simple c code") +__failure __msg("unexpected __bpf_trap() due to uninitialized variable?") +void bpf_trap_with_simple_c(void) +{ +	__bpf_trap(); +} + +SEC("socket") +__description("__bpf_trap as the second-from-last insn") +__failure __msg("unexpected __bpf_trap() due to uninitialized variable?") +__naked void bpf_trap_at_func_end(void) +{ +	asm volatile ( +	"r0 = 0;" +	"call %[__bpf_trap];" +	"exit;" +	: +	: __imm(__bpf_trap) +	: __clobber_all); +} + +SEC("socket") +__description("dead code __bpf_trap in the middle of code") +__success +__naked void dead_bpf_trap_in_middle(void) +{ +	asm volatile ( +	"r0 = 0;" +	"if r0 == 0 goto +1;" +	"call %[__bpf_trap];" +	"r0 = 2;" +	"exit;" +	: +	: __imm(__bpf_trap) +	: __clobber_all); +} + +SEC("socket") +__description("reachable __bpf_trap in the middle of code") +__failure __msg("unexpected __bpf_trap() due to uninitialized variable?") +__naked void live_bpf_trap_in_middle(void) +{ +	asm volatile ( +	"r0 = 0;" +	"if r0 == 1 goto +1;" +	"call %[__bpf_trap];" +	"r0 = 2;" +	"exit;" +	: +	: __imm(__bpf_trap) +	: __clobber_all); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c b/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c index 28b939572cda..03942cec07e5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c @@ -65,4 +65,16 @@ __naked void ctx_access_u32_pointer_reject_8(void)  "	::: __clobber_all);  } +SEC("fentry/bpf_fentry_test10") +__description("btf_ctx_access const void pointer accept") +__success __retval(0) +__naked void ctx_access_const_void_pointer_accept(void) +{ +	asm volatile ("					\ +	r2 = *(u64 *)(r1 + 0);		/* load 1st argument value (const void pointer) */\ +	r0 = 0;						\ +	exit;						\ +"	::: __clobber_all); +} +  char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_load_acquire.c b/tools/testing/selftests/bpf/progs/verifier_load_acquire.c index 77698d5a19e4..74f4f19c10b8 100644 --- a/tools/testing/selftests/bpf/progs/verifier_load_acquire.c +++ b/tools/testing/selftests/bpf/progs/verifier_load_acquire.c @@ -10,65 +10,81 @@  SEC("socket")  __description("load-acquire, 8-bit") -__success __success_unpriv __retval(0x12) +__success __success_unpriv __retval(0)  __naked void load_acquire_8(void)  {  	asm volatile ( -	"w1 = 0x12;" +	"r0 = 0;" +	"w1 = 0xfe;"  	"*(u8 *)(r10 - 1) = w1;" -	".8byte %[load_acquire_insn];" // w0 = load_acquire((u8 *)(r10 - 1)); +	".8byte %[load_acquire_insn];" // w2 = load_acquire((u8 *)(r10 - 1)); +	"if r2 == r1 goto 1f;" +	"r0 = 1;" +"1:"  	"exit;"  	:  	: __imm_insn(load_acquire_insn, -		     BPF_ATOMIC_OP(BPF_B, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_10, -1)) +		     BPF_ATOMIC_OP(BPF_B, BPF_LOAD_ACQ, BPF_REG_2, BPF_REG_10, -1))  	: __clobber_all);  }  SEC("socket")  __description("load-acquire, 16-bit") -__success __success_unpriv __retval(0x1234) +__success __success_unpriv __retval(0)  __naked void load_acquire_16(void)  {  	asm volatile ( -	"w1 = 0x1234;" +	"r0 = 0;" +	"w1 = 0xfedc;"  	"*(u16 
*)(r10 - 2) = w1;" -	".8byte %[load_acquire_insn];" // w0 = load_acquire((u16 *)(r10 - 2)); +	".8byte %[load_acquire_insn];" // w2 = load_acquire((u16 *)(r10 - 2)); +	"if r2 == r1 goto 1f;" +	"r0 = 1;" +"1:"  	"exit;"  	:  	: __imm_insn(load_acquire_insn, -		     BPF_ATOMIC_OP(BPF_H, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_10, -2)) +		     BPF_ATOMIC_OP(BPF_H, BPF_LOAD_ACQ, BPF_REG_2, BPF_REG_10, -2))  	: __clobber_all);  }  SEC("socket")  __description("load-acquire, 32-bit") -__success __success_unpriv __retval(0x12345678) +__success __success_unpriv __retval(0)  __naked void load_acquire_32(void)  {  	asm volatile ( -	"w1 = 0x12345678;" +	"r0 = 0;" +	"w1 = 0xfedcba09;"  	"*(u32 *)(r10 - 4) = w1;" -	".8byte %[load_acquire_insn];" // w0 = load_acquire((u32 *)(r10 - 4)); +	".8byte %[load_acquire_insn];" // w2 = load_acquire((u32 *)(r10 - 4)); +	"if r2 == r1 goto 1f;" +	"r0 = 1;" +"1:"  	"exit;"  	:  	: __imm_insn(load_acquire_insn, -		     BPF_ATOMIC_OP(BPF_W, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_10, -4)) +		     BPF_ATOMIC_OP(BPF_W, BPF_LOAD_ACQ, BPF_REG_2, BPF_REG_10, -4))  	: __clobber_all);  }  SEC("socket")  __description("load-acquire, 64-bit") -__success __success_unpriv __retval(0x1234567890abcdef) +__success __success_unpriv __retval(0)  __naked void load_acquire_64(void)  {  	asm volatile ( -	"r1 = 0x1234567890abcdef ll;" +	"r0 = 0;" +	"r1 = 0xfedcba0987654321 ll;"  	"*(u64 *)(r10 - 8) = r1;" -	".8byte %[load_acquire_insn];" // r0 = load_acquire((u64 *)(r10 - 8)); +	".8byte %[load_acquire_insn];" // r2 = load_acquire((u64 *)(r10 - 8)); +	"if r2 == r1 goto 1f;" +	"r0 = 1;" +"1:"  	"exit;"  	:  	: __imm_insn(load_acquire_insn, -		     BPF_ATOMIC_OP(BPF_DW, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_10, -8)) +		     BPF_ATOMIC_OP(BPF_DW, BPF_LOAD_ACQ, BPF_REG_2, BPF_REG_10, -8))  	: __clobber_all);  } diff --git a/tools/testing/selftests/bpf/progs/verifier_precision.c b/tools/testing/selftests/bpf/progs/verifier_precision.c index 6662d4b39969..9fe5d255ee37 100644 --- a/tools/testing/selftests/bpf/progs/verifier_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_precision.c @@ -91,8 +91,7 @@ __naked int bpf_end_bswap(void)  		::: __clobber_all);  } -#if defined(ENABLE_ATOMICS_TESTS) && \ -	(defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) +#ifdef CAN_USE_LOAD_ACQ_STORE_REL  SEC("?raw_tp")  __success __log_level(2) @@ -138,7 +137,7 @@ __naked int bpf_store_release(void)  	: __clobber_all);  } -#endif /* load-acquire, store-release */ +#endif /* CAN_USE_LOAD_ACQ_STORE_REL */  #endif /* v4 instruction */  SEC("?raw_tp") @@ -179,4 +178,57 @@ __naked int state_loop_first_last_equal(void)  	);  } +__used __naked static void __bpf_cond_op_r10(void) +{ +	asm volatile ( +	"r2 = 2314885393468386424 ll;" +	"goto +0;" +	"if r2 <= r10 goto +3;" +	"if r1 >= -1835016 goto +0;" +	"if r2 <= 8 goto +0;" +	"if r3 <= 0 goto +0;" +	"exit;" +	::: __clobber_all); +} + +SEC("?raw_tp") +__success __log_level(2) +__msg("8: (bd) if r2 <= r10 goto pc+3") +__msg("9: (35) if r1 >= 0xffe3fff8 goto pc+0") +__msg("10: (b5) if r2 <= 0x8 goto pc+0") +__msg("mark_precise: frame1: last_idx 10 first_idx 0 subseq_idx -1") +__msg("mark_precise: frame1: regs=r2 stack= before 9: (35) if r1 >= 0xffe3fff8 goto pc+0") +__msg("mark_precise: frame1: regs=r2 stack= before 8: (bd) if r2 <= r10 goto pc+3") +__msg("mark_precise: frame1: regs=r2 stack= before 7: (05) goto pc+0") +__naked void bpf_cond_op_r10(void) +{ +	asm volatile ( +	"r3 = 0 ll;" +	"call __bpf_cond_op_r10;" +	"r0 = 0;" +	"exit;" +	::: __clobber_all); +} + 
+SEC("?raw_tp") +__success __log_level(2) +__msg("3: (bf) r3 = r10") +__msg("4: (bd) if r3 <= r2 goto pc+1") +__msg("5: (b5) if r2 <= 0x8 goto pc+2") +__msg("mark_precise: frame0: last_idx 5 first_idx 0 subseq_idx -1") +__msg("mark_precise: frame0: regs=r2 stack= before 4: (bd) if r3 <= r2 goto pc+1") +__msg("mark_precise: frame0: regs=r2 stack= before 3: (bf) r3 = r10") +__naked void bpf_cond_op_not_r10(void) +{ +	asm volatile ( +	"r0 = 0;" +	"r2 = 2314885393468386424 ll;" +	"r3 = r10;" +	"if r3 <= r2 goto +1;" +	"if r2 <= 8 goto +2;" +	"r0 = 2 ll;" +	"exit;" +	::: __clobber_all); +} +  char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_store_release.c b/tools/testing/selftests/bpf/progs/verifier_store_release.c index c0442d5bb049..72f1eb006074 100644 --- a/tools/testing/selftests/bpf/progs/verifier_store_release.c +++ b/tools/testing/selftests/bpf/progs/verifier_store_release.c @@ -6,18 +6,21 @@  #include "../../../include/linux/filter.h"  #include "bpf_misc.h" -#if __clang_major__ >= 18 && defined(ENABLE_ATOMICS_TESTS) && \ -	(defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) +#ifdef CAN_USE_LOAD_ACQ_STORE_REL  SEC("socket")  __description("store-release, 8-bit") -__success __success_unpriv __retval(0x12) +__success __success_unpriv __retval(0)  __naked void store_release_8(void)  {  	asm volatile ( +	"r0 = 0;"  	"w1 = 0x12;"  	".8byte %[store_release_insn];" // store_release((u8 *)(r10 - 1), w1); -	"w0 = *(u8 *)(r10 - 1);" +	"w2 = *(u8 *)(r10 - 1);" +	"if r2 == r1 goto 1f;" +	"r0 = 1;" +"1:"  	"exit;"  	:  	: __imm_insn(store_release_insn, @@ -27,13 +30,17 @@ __naked void store_release_8(void)  SEC("socket")  __description("store-release, 16-bit") -__success __success_unpriv __retval(0x1234) +__success __success_unpriv __retval(0)  __naked void store_release_16(void)  {  	asm volatile ( +	"r0 = 0;"  	"w1 = 0x1234;"  	".8byte %[store_release_insn];" // store_release((u16 *)(r10 - 2), w1); -	"w0 = *(u16 *)(r10 - 2);" +	"w2 = *(u16 *)(r10 - 2);" +	"if r2 == r1 goto 1f;" +	"r0 = 1;" +"1:"  	"exit;"  	:  	: __imm_insn(store_release_insn, @@ -43,13 +50,17 @@ __naked void store_release_16(void)  SEC("socket")  __description("store-release, 32-bit") -__success __success_unpriv __retval(0x12345678) +__success __success_unpriv __retval(0)  __naked void store_release_32(void)  {  	asm volatile ( +	"r0 = 0;"  	"w1 = 0x12345678;"  	".8byte %[store_release_insn];" // store_release((u32 *)(r10 - 4), w1); -	"w0 = *(u32 *)(r10 - 4);" +	"w2 = *(u32 *)(r10 - 4);" +	"if r2 == r1 goto 1f;" +	"r0 = 1;" +"1:"  	"exit;"  	:  	: __imm_insn(store_release_insn, @@ -59,13 +70,17 @@ __naked void store_release_32(void)  SEC("socket")  __description("store-release, 64-bit") -__success __success_unpriv __retval(0x1234567890abcdef) +__success __success_unpriv __retval(0)  __naked void store_release_64(void)  {  	asm volatile ( +	"r0 = 0;"  	"r1 = 0x1234567890abcdef ll;"  	".8byte %[store_release_insn];" // store_release((u64 *)(r10 - 8), r1); -	"r0 = *(u64 *)(r10 - 8);" +	"r2 = *(u64 *)(r10 - 8);" +	"if r2 == r1 goto 1f;" +	"r0 = 1;" +"1:"  	"exit;"  	:  	: __imm_insn(store_release_insn, @@ -271,7 +286,7 @@ __naked void store_release_with_invalid_reg(void)  	: __clobber_all);  } -#else +#else /* CAN_USE_LOAD_ACQ_STORE_REL */  SEC("socket")  __description("Clang version < 18, ENABLE_ATOMICS_TESTS not defined, and/or JIT doesn't support store-release, use a dummy test") @@ -281,6 +296,6 @@ int dummy_test(void)  	return 0;  } -#endif +#endif /* 
CAN_USE_LOAD_ACQ_STORE_REL */  char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/xdp_metadata.c b/tools/testing/selftests/bpf/progs/xdp_metadata.c index 31ca229bb3c0..09bb8a038d52 100644 --- a/tools/testing/selftests/bpf/progs/xdp_metadata.c +++ b/tools/testing/selftests/bpf/progs/xdp_metadata.c @@ -19,6 +19,13 @@ struct {  	__type(value, __u32);  } prog_arr SEC(".maps"); +struct { +	__uint(type, BPF_MAP_TYPE_DEVMAP); +	__uint(key_size, sizeof(__u32)); +	__uint(value_size, sizeof(struct bpf_devmap_val)); +	__uint(max_entries, 1); +} dev_map SEC(".maps"); +  extern int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx,  					 __u64 *timestamp) __ksym;  extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, __u32 *hash, @@ -95,4 +102,10 @@ int rx(struct xdp_md *ctx)  	return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS);  } +SEC("xdp") +int redirect(struct xdp_md *ctx) +{ +	return bpf_redirect_map(&dev_map, ctx->rx_queue_index, XDP_PASS); +} +  char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c b/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c index ccde6a4c6319..683306db8594 100644 --- a/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c +++ b/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c @@ -4,6 +4,8 @@  #include <linux/bpf.h>  #include <bpf/bpf_helpers.h>  #include <linux/if_ether.h> +#include <linux/ip.h> +#include <linux/errno.h>  #include "xsk_xdp_common.h"  struct { @@ -14,6 +16,7 @@ struct {  } xsk SEC(".maps");  static unsigned int idx; +int adjust_value = 0;  int count = 0;  SEC("xdp.frags") int xsk_def_prog(struct xdp_md *xdp) @@ -70,4 +73,51 @@ SEC("xdp") int xsk_xdp_shared_umem(struct xdp_md *xdp)  	return bpf_redirect_map(&xsk, idx, XDP_DROP);  } +SEC("xdp.frags") int xsk_xdp_adjust_tail(struct xdp_md *xdp) +{ +	__u32 buff_len, curr_buff_len; +	int ret; + +	buff_len = bpf_xdp_get_buff_len(xdp); +	if (buff_len == 0) +		return XDP_DROP; + +	ret = bpf_xdp_adjust_tail(xdp, adjust_value); +	if (ret < 0) { +		/* Handle unsupported cases */ +		if (ret == -EOPNOTSUPP) { +			/* Set adjust_value to -EOPNOTSUPP to indicate to userspace that this case +			 * is unsupported +			 */ +			adjust_value = -EOPNOTSUPP; +			return bpf_redirect_map(&xsk, 0, XDP_DROP); +		} + +		return XDP_DROP; +	} + +	curr_buff_len = bpf_xdp_get_buff_len(xdp); +	if (curr_buff_len != buff_len + adjust_value) +		return XDP_DROP; + +	if (curr_buff_len > buff_len) { +		__u32 *pkt_data = (void *)(long)xdp->data; +		__u32 len, words_to_end, seq_num; + +		len = curr_buff_len - PKT_HDR_ALIGN; +		words_to_end = len / sizeof(*pkt_data) - 1; +		seq_num = words_to_end; + +		/* Convert sequence number to network byte order. Store this in the last 4 bytes of +		 * the packet. Use 'adjust_value' to determine the position at the end of the +		 * packet for storing the sequence number. +		 */ +		seq_num = __constant_htonl(words_to_end); +		bpf_xdp_store_bytes(xdp, curr_buff_len - sizeof(seq_num), &seq_num, +				    sizeof(seq_num)); +	} + +	return bpf_redirect_map(&xsk, 0, XDP_DROP); +} +  char _license[] SEC("license") = "GPL"; | 
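The redirect_map()/redirect_hash() macros in test_sockmap_redir.c above use C11 _Generic so that a single DEFINE_PROG() body expands to both an sk_skb and an sk_msg verdict program, with the matching redirect helper picked at compile time from the context type. Below is a minimal stand-alone sketch of that dispatch pattern; it is plain user-space C, and skb_ctx/msg_ctx and skb_redirect()/msg_redirect() are illustrative stand-ins, not the real BPF context types or helpers.

/* Build with: cc -std=c11 -o generic_dispatch generic_dispatch.c */
#include <stdio.h>

struct skb_ctx { int len; };   /* stand-in for struct __sk_buff */
struct msg_ctx { int size; };  /* stand-in for struct sk_msg_md */

static int skb_redirect(struct skb_ctx *c) { return printf("skb path, len=%d\n", c->len); }
static int msg_redirect(struct msg_ctx *c) { return printf("msg path, size=%d\n", c->size); }

/* The controlling expression of _Generic is never evaluated; the function
 * is selected purely from the static type of __data, so there is no
 * double evaluation and no runtime branching.
 */
#define redirect(__data)                                \
	_Generic((__data),                              \
		 struct skb_ctx * : skb_redirect,       \
		 struct msg_ctx * : msg_redirect        \
	)(__data)

int main(void)
{
	struct skb_ctx s = { .len = 64 };
	struct msg_ctx m = { .size = 128 };

	redirect(&s);	/* resolves to skb_redirect() */
	redirect(&m);	/* resolves to msg_redirect() */
	return 0;
}

The same compile-time selection is what lets DEFINE_PROG(skb, ...) and DEFINE_PROG(msg, ...) share one verdict body in the selftest while still calling bpf_sk_redirect_map()/bpf_sk_redirect_hash() for skb programs and bpf_msg_redirect_map()/bpf_msg_redirect_hash() for msg programs.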
