Diffstat (limited to 'kernel/bpf')
-rw-r--r--  kernel/bpf/Makefile             |   8
-rw-r--r--  kernel/bpf/arena.c              |  16
-rw-r--r--  kernel/bpf/bpf_local_storage.c  |   4
-rw-r--r--  kernel/bpf/bpf_lsm.c            |   1
-rw-r--r--  kernel/bpf/bpf_struct_ops.c     |  77
-rw-r--r--  kernel/bpf/btf.c                | 509
-rw-r--r--  kernel/bpf/core.c               |  15
-rw-r--r--  kernel/bpf/cpumap.c             |  35
-rw-r--r--  kernel/bpf/crypto.c             |  42
-rw-r--r--  kernel/bpf/devmap.c             |  60
-rw-r--r--  kernel/bpf/helpers.c            | 263
-rw-r--r--  kernel/bpf/log.c                |   6
-rw-r--r--  kernel/bpf/memalloc.c           |   9
-rw-r--r--  kernel/bpf/ringbuf.c            |  31
-rw-r--r--  kernel/bpf/syscall.c            |  55
-rw-r--r--  kernel/bpf/task_iter.c          |   9
-rw-r--r--  kernel/bpf/verifier.c           | 405
17 files changed, 1048 insertions, 497 deletions
| diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 7eb9ad3a3ae6..0291eef9ce92 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -50,5 +50,11 @@ endif  obj-$(CONFIG_BPF_PRELOAD) += preload/  obj-$(CONFIG_BPF_SYSCALL) += relo_core.o -$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE +obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o +obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o + +# Some source files are common to libbpf. +vpath %.c $(srctree)/kernel/bpf:$(srctree)/tools/lib/bpf + +$(obj)/%.o: %.c FORCE  	$(call if_changed_rule,cc_o_c) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 583ee4fe48ef..e52b3ad231b9 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -212,6 +212,7 @@ static u64 arena_map_mem_usage(const struct bpf_map *map)  struct vma_list {  	struct vm_area_struct *vma;  	struct list_head head; +	atomic_t mmap_count;  };  static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) @@ -221,20 +222,30 @@ static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)  	vml = kmalloc(sizeof(*vml), GFP_KERNEL);  	if (!vml)  		return -ENOMEM; +	atomic_set(&vml->mmap_count, 1);  	vma->vm_private_data = vml;  	vml->vma = vma;  	list_add(&vml->head, &arena->vma_list);  	return 0;  } +static void arena_vm_open(struct vm_area_struct *vma) +{ +	struct vma_list *vml = vma->vm_private_data; + +	atomic_inc(&vml->mmap_count); +} +  static void arena_vm_close(struct vm_area_struct *vma)  {  	struct bpf_map *map = vma->vm_file->private_data;  	struct bpf_arena *arena = container_of(map, struct bpf_arena, map); -	struct vma_list *vml; +	struct vma_list *vml = vma->vm_private_data; +	if (!atomic_dec_and_test(&vml->mmap_count)) +		return;  	guard(mutex)(&arena->lock); -	vml = vma->vm_private_data; +	/* update link list under lock */  	list_del(&vml->head);  	vma->vm_private_data = NULL;  	kfree(vml); @@ -287,6 +298,7 @@ out:  }  static const struct vm_operations_struct arena_vm_ops = { +	.open		= arena_vm_open,  	.close		= arena_vm_close,  	.fault          = arena_vm_fault,  }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 976cb258a0ed..c938dea5ddbf 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -782,8 +782,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,  	nbuckets = max_t(u32, 2, nbuckets);  	smap->bucket_log = ilog2(nbuckets); -	smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), -					 nbuckets, GFP_USER | __GFP_NOWARN); +	smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets, +					 sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN);  	if (!smap->buckets) {  		err = -ENOMEM;  		goto free_smap; diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 68240c3c6e7d..08a338e1f231 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -280,6 +280,7 @@ BTF_ID(func, bpf_lsm_cred_prepare)  BTF_ID(func, bpf_lsm_file_ioctl)  BTF_ID(func, bpf_lsm_file_lock)  BTF_ID(func, bpf_lsm_file_open) +BTF_ID(func, bpf_lsm_file_post_open)  BTF_ID(func, bpf_lsm_file_receive)  BTF_ID(func, bpf_lsm_inode_create) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 86c7884abaf8..0d515ec57aa5 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -12,6 +12,7 @@  #include <linux/mutex.h>  #include <linux/btf_ids.h>  #include <linux/rcupdate_wait.h> +#include <linux/poll.h>  struct bpf_struct_ops_value {  	struct bpf_struct_ops_common_value common; @@ -56,6 +57,7 @@ struct bpf_struct_ops_map {  
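For illustration, a minimal user-space sketch of the situation the new arena_vm_open()/mmap_count refcount above guards against: once an arena mapping is split, every resulting VMA shares the same vma_list entry through vm_private_data, so only the last ->close() may free it. The arena_fd and len values are hypothetical and error handling is omitted.

#include <sys/mman.h>

void split_and_unmap(int arena_fd, size_t len)
{
	char *p = mmap(NULL, 2 * len, PROT_READ | PROT_WRITE, MAP_SHARED,
		       arena_fd, 0);

	/* Unmapping half of the range splits the VMA; the kernel calls
	 * ->open() on the new piece (mmap_count becomes 2) and then
	 * ->close() on the unmapped piece (back to 1). */
	munmap(p + len, len);

	/* Only this final ->close() sees mmap_count hit 0 and frees the
	 * vma_list entry under arena->lock. */
	munmap(p, len);
}
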
struct bpf_struct_ops_link {  	struct bpf_link link;  	struct bpf_map __rcu *map; +	wait_queue_head_t wait_hup;  };  static DEFINE_MUTEX(update_mutex); @@ -571,7 +573,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,  	}  	size = arch_prepare_bpf_trampoline(NULL, image + image_off, -					   image + PAGE_SIZE, +					   image + image_off + size,  					   model, flags, tlinks, stub_func);  	if (size <= 0) {  		if (image != *_image) @@ -757,7 +759,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,  		goto unlock;  	} -	err = st_ops->reg(kdata); +	err = st_ops->reg(kdata, NULL);  	if (likely(!err)) {  		/* This refcnt increment on the map here after  		 * 'st_ops->reg()' is secure since the state of the @@ -805,7 +807,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)  			     BPF_STRUCT_OPS_STATE_TOBEFREE);  	switch (prev_state) {  	case BPF_STRUCT_OPS_STATE_INUSE: -		st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data); +		st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, NULL);  		bpf_map_put(map);  		return 0;  	case BPF_STRUCT_OPS_STATE_TOBEFREE: @@ -1057,10 +1059,7 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)  	st_map = (struct bpf_struct_ops_map *)  		rcu_dereference_protected(st_link->map, true);  	if (st_map) { -		/* st_link->map can be NULL if -		 * bpf_struct_ops_link_create() fails to register. -		 */ -		st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data); +		st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);  		bpf_map_put(&st_map->map);  	}  	kfree(st_link); @@ -1075,7 +1074,8 @@ static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,  	st_link = container_of(link, struct bpf_struct_ops_link, link);  	rcu_read_lock();  	map = rcu_dereference(st_link->map); -	seq_printf(seq, "map_id:\t%d\n", map->id); +	if (map) +		seq_printf(seq, "map_id:\t%d\n", map->id);  	rcu_read_unlock();  } @@ -1088,7 +1088,8 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,  	st_link = container_of(link, struct bpf_struct_ops_link, link);  	rcu_read_lock();  	map = rcu_dereference(st_link->map); -	info->struct_ops.map_id = map->id; +	if (map) +		info->struct_ops.map_id = map->id;  	rcu_read_unlock();  	return 0;  } @@ -1113,6 +1114,10 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map  	mutex_lock(&update_mutex);  	old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); +	if (!old_map) { +		err = -ENOLINK; +		goto err_out; +	}  	if (expected_old_map && old_map != expected_old_map) {  		err = -EPERM;  		goto err_out; @@ -1125,7 +1130,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map  		goto err_out;  	} -	err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data); +	err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data, link);  	if (err)  		goto err_out; @@ -1139,11 +1144,53 @@ err_out:  	return err;  } +static int bpf_struct_ops_map_link_detach(struct bpf_link *link) +{ +	struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link); +	struct bpf_struct_ops_map *st_map; +	struct bpf_map *map; + +	mutex_lock(&update_mutex); + +	map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); +	if (!map) { +		mutex_unlock(&update_mutex); +		return 0; +	} +	st_map = container_of(map, struct bpf_struct_ops_map, map); + +	
st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); + +	RCU_INIT_POINTER(st_link->map, NULL); +	/* Pair with bpf_map_get() in bpf_struct_ops_link_create() or +	 * bpf_map_inc() in bpf_struct_ops_map_link_update(). +	 */ +	bpf_map_put(&st_map->map); + +	mutex_unlock(&update_mutex); + +	wake_up_interruptible_poll(&st_link->wait_hup, EPOLLHUP); + +	return 0; +} + +static __poll_t bpf_struct_ops_map_link_poll(struct file *file, +					     struct poll_table_struct *pts) +{ +	struct bpf_struct_ops_link *st_link = file->private_data; + +	poll_wait(file, &st_link->wait_hup, pts); + +	return rcu_access_pointer(st_link->map) ? 0 : EPOLLHUP; +} +  static const struct bpf_link_ops bpf_struct_ops_map_lops = {  	.dealloc = bpf_struct_ops_map_link_dealloc, +	.detach = bpf_struct_ops_map_link_detach,  	.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,  	.fill_link_info = bpf_struct_ops_map_link_fill_link_info,  	.update_map = bpf_struct_ops_map_link_update, +	.poll = bpf_struct_ops_map_link_poll,  };  int bpf_struct_ops_link_create(union bpf_attr *attr) @@ -1176,13 +1223,21 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)  	if (err)  		goto err_out; -	err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data); +	init_waitqueue_head(&link->wait_hup); + +	/* Hold the update_mutex such that the subsystem cannot +	 * do link->ops->detach() before the link is fully initialized. +	 */ +	mutex_lock(&update_mutex); +	err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link);  	if (err) { +		mutex_unlock(&update_mutex);  		bpf_link_cleanup(&link_primer);  		link = NULL;  		goto err_out;  	}  	RCU_INIT_POINTER(link->map, map); +	mutex_unlock(&update_mutex);  	return bpf_link_settle(&link_primer); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 821063660d9f..520f49f422fe 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -274,6 +274,7 @@ struct btf {  	u32 start_str_off; /* first string offset (0 for base BTF) */  	char name[MODULE_NAME_LEN];  	bool kernel_btf; +	__u32 *base_id_map; /* map from distilled base BTF -> vmlinux BTF ids */  };  enum verifier_phase { @@ -414,7 +415,7 @@ const char *btf_type_str(const struct btf_type *t)  struct btf_show {  	u64 flags;  	void *target;	/* target of show operation (seq file, buffer) */ -	void (*showfn)(struct btf_show *show, const char *fmt, va_list args); +	__printf(2, 0) void (*showfn)(struct btf_show *show, const char *fmt, va_list args);  	const struct btf *btf;  	/* below are used during iteration */  	struct { @@ -530,6 +531,11 @@ static bool btf_type_is_decl_tag_target(const struct btf_type *t)  	       btf_type_is_var(t) || btf_type_is_typedef(t);  } +bool btf_is_vmlinux(const struct btf *btf) +{ +	return btf->kernel_btf && !btf->base_btf; +} +  u32 btf_nr_types(const struct btf *btf)  {  	u32 total = 0; @@ -772,7 +778,7 @@ static bool __btf_name_char_ok(char c, bool first)  	return true;  } -static const char *btf_str_by_offset(const struct btf *btf, u32 offset) +const char *btf_str_by_offset(const struct btf *btf, u32 offset)  {  	while (offset < btf->start_str_off)  		btf = btf->base_btf; @@ -1670,14 +1676,8 @@ static void btf_free_kfunc_set_tab(struct btf *btf)  	if (!tab)  		return; -	/* For module BTF, we directly assign the sets being registered, so -	 * there is nothing to free except kfunc_set_tab. 
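To show what the wait_hup queue and the new .detach/.poll callbacks above enable, here is a hedged user-space sketch: a process that attached a struct_ops map through a link can poll() the link fd and is woken with POLLHUP once the subsystem (or bpf_link_detach()) detaches the map. How link_fd is obtained is assumed, for example from bpf_link_create() or libbpf's bpf_map__attach_struct_ops().

#include <poll.h>
#include <stdio.h>

/* Block until the kernel side detaches the struct_ops map from the link. */
static int wait_for_detach(int link_fd)
{
	struct pollfd pfd = { .fd = link_fd, .events = POLLHUP };

	if (poll(&pfd, 1, -1) < 0)
		return -1;
	if (pfd.revents & POLLHUP)
		printf("struct_ops map was detached from the link\n");
	return 0;
}
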
-	 */ -	if (btf_is_module(btf)) -		goto free_tab;  	for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++)  		kfree(tab->sets[hook]); -free_tab:  	kfree(tab);  	btf->kfunc_set_tab = NULL;  } @@ -1735,7 +1735,12 @@ static void btf_free(struct btf *btf)  	kvfree(btf->types);  	kvfree(btf->resolved_sizes);  	kvfree(btf->resolved_ids); -	kvfree(btf->data); +	/* vmlinux does not allocate btf->data, it simply points it at +	 * __start_BTF. +	 */ +	if (!btf_is_vmlinux(btf)) +		kvfree(btf->data); +	kvfree(btf->base_id_map);  	kfree(btf);  } @@ -1764,6 +1769,23 @@ void btf_put(struct btf *btf)  	}  } +struct btf *btf_base_btf(const struct btf *btf) +{ +	return btf->base_btf; +} + +const struct btf_header *btf_header(const struct btf *btf) +{ +	return &btf->hdr; +} + +void btf_set_base_btf(struct btf *btf, const struct btf *base_btf) +{ +	btf->base_btf = (struct btf *)base_btf; +	btf->start_id = btf_nr_types(base_btf); +	btf->start_str_off = base_btf->hdr.str_len; +} +  static int env_resolve_init(struct btf_verifier_env *env)  {  	struct btf *btf = env->btf; @@ -3442,10 +3464,12 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,  		goto end;						\  	} -static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, +static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type, +			      u32 field_mask, u32 *seen_mask,  			      int *align, int *sz)  {  	int type = 0; +	const char *name = __btf_name_by_offset(btf, var_type->name_off);  	if (field_mask & BPF_SPIN_LOCK) {  		if (!strcmp(name, "bpf_spin_lock")) { @@ -3481,7 +3505,7 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,  	field_mask_test_name(BPF_REFCOUNT,  "bpf_refcount");  	/* Only return BPF_KPTR when all other types with matchable names fail */ -	if (field_mask & BPF_KPTR) { +	if (field_mask & BPF_KPTR && !__btf_type_is_struct(var_type)) {  		type = BPF_KPTR_REF;  		goto end;  	} @@ -3494,140 +3518,232 @@ end:  #undef field_mask_test_name +/* Repeat a number of fields for a specified number of times. + * + * Copy the fields starting from the first field and repeat them for + * repeat_cnt times. The fields are repeated by adding the offset of each + * field with + *   (i + 1) * elem_size + * where i is the repeat index and elem_size is the size of an element. + */ +static int btf_repeat_fields(struct btf_field_info *info, +			     u32 field_cnt, u32 repeat_cnt, u32 elem_size) +{ +	u32 i, j; +	u32 cur; + +	/* Ensure not repeating fields that should not be repeated. */ +	for (i = 0; i < field_cnt; i++) { +		switch (info[i].type) { +		case BPF_KPTR_UNREF: +		case BPF_KPTR_REF: +		case BPF_KPTR_PERCPU: +		case BPF_LIST_HEAD: +		case BPF_RB_ROOT: +			break; +		default: +			return -EINVAL; +		} +	} + +	cur = field_cnt; +	for (i = 0; i < repeat_cnt; i++) { +		memcpy(&info[cur], &info[0], field_cnt * sizeof(info[0])); +		for (j = 0; j < field_cnt; j++) +			info[cur++].off += (i + 1) * elem_size; +	} + +	return 0; +} +  static int btf_find_struct_field(const struct btf *btf,  				 const struct btf_type *t, u32 field_mask, -				 struct btf_field_info *info, int info_cnt) +				 struct btf_field_info *info, int info_cnt, +				 u32 level); + +/* Find special fields in the struct type of a field. + * + * This function is used to find fields of special types that is not a + * global variable or a direct field of a struct type. It also handles the + * repetition if it is the element type of an array. 
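As a concrete, hypothetical example of what btf_find_field_one() and btf_repeat_fields() above compute: a map value whose array elements each carry a kptr now yields one btf_field_info per array element, with the repeated copies offset by multiples of the element size. __kptr here is the BTF type tag macro from the selftests' bpf_helpers.h and struct foo stands in for any valid kptr target type.

struct foo;

struct elem {
	struct foo __kptr *ptr;   /* special (kptr) field at element offset 0 */
	long pad;                 /* sizeof(struct elem) == 16                */
};

struct map_value {
	long a;                   /* offset 0 */
	struct elem arr[3];       /* offset 8 */
};

/* btf_find_field_one() walks arr down to struct elem and finds one kptr;
 * btf_repeat_fields() then duplicates that entry at 8 + (i + 1) * 16 for
 * i = 0, 1, so kptr fields are recorded at offsets 8, 24 and 40. */
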
+ */ +static int btf_find_nested_struct(const struct btf *btf, const struct btf_type *t, +				  u32 off, u32 nelems, +				  u32 field_mask, struct btf_field_info *info, +				  int info_cnt, u32 level)  { -	int ret, idx = 0, align, sz, field_type; -	const struct btf_member *member; +	int ret, err, i; + +	level++; +	if (level >= MAX_RESOLVE_DEPTH) +		return -E2BIG; + +	ret = btf_find_struct_field(btf, t, field_mask, info, info_cnt, level); + +	if (ret <= 0) +		return ret; + +	/* Shift the offsets of the nested struct fields to the offsets +	 * related to the container. +	 */ +	for (i = 0; i < ret; i++) +		info[i].off += off; + +	if (nelems > 1) { +		err = btf_repeat_fields(info, ret, nelems - 1, t->size); +		if (err == 0) +			ret *= nelems; +		else +			ret = err; +	} + +	return ret; +} + +static int btf_find_field_one(const struct btf *btf, +			      const struct btf_type *var, +			      const struct btf_type *var_type, +			      int var_idx, +			      u32 off, u32 expected_size, +			      u32 field_mask, u32 *seen_mask, +			      struct btf_field_info *info, int info_cnt, +			      u32 level) +{ +	int ret, align, sz, field_type;  	struct btf_field_info tmp; +	const struct btf_array *array; +	u32 i, nelems = 1; + +	/* Walk into array types to find the element type and the number of +	 * elements in the (flattened) array. +	 */ +	for (i = 0; i < MAX_RESOLVE_DEPTH && btf_type_is_array(var_type); i++) { +		array = btf_array(var_type); +		nelems *= array->nelems; +		var_type = btf_type_by_id(btf, array->type); +	} +	if (i == MAX_RESOLVE_DEPTH) +		return -E2BIG; +	if (nelems == 0) +		return 0; + +	field_type = btf_get_field_type(btf, var_type, +					field_mask, seen_mask, &align, &sz); +	/* Look into variables of struct types */ +	if (!field_type && __btf_type_is_struct(var_type)) { +		sz = var_type->size; +		if (expected_size && expected_size != sz * nelems) +			return 0; +		ret = btf_find_nested_struct(btf, var_type, off, nelems, field_mask, +					     &info[0], info_cnt, level); +		return ret; +	} + +	if (field_type == 0) +		return 0; +	if (field_type < 0) +		return field_type; + +	if (expected_size && expected_size != sz * nelems) +		return 0; +	if (off % align) +		return 0; + +	switch (field_type) { +	case BPF_SPIN_LOCK: +	case BPF_TIMER: +	case BPF_WORKQUEUE: +	case BPF_LIST_NODE: +	case BPF_RB_NODE: +	case BPF_REFCOUNT: +		ret = btf_find_struct(btf, var_type, off, sz, field_type, +				      info_cnt ? &info[0] : &tmp); +		if (ret < 0) +			return ret; +		break; +	case BPF_KPTR_UNREF: +	case BPF_KPTR_REF: +	case BPF_KPTR_PERCPU: +		ret = btf_find_kptr(btf, var_type, off, sz, +				    info_cnt ? &info[0] : &tmp); +		if (ret < 0) +			return ret; +		break; +	case BPF_LIST_HEAD: +	case BPF_RB_ROOT: +		ret = btf_find_graph_root(btf, var, var_type, +					  var_idx, off, sz, +					  info_cnt ? 
&info[0] : &tmp, +					  field_type); +		if (ret < 0) +			return ret; +		break; +	default: +		return -EFAULT; +	} + +	if (ret == BTF_FIELD_IGNORE) +		return 0; +	if (nelems > info_cnt) +		return -E2BIG; +	if (nelems > 1) { +		ret = btf_repeat_fields(info, 1, nelems - 1, sz); +		if (ret < 0) +			return ret; +	} +	return nelems; +} + +static int btf_find_struct_field(const struct btf *btf, +				 const struct btf_type *t, u32 field_mask, +				 struct btf_field_info *info, int info_cnt, +				 u32 level) +{ +	int ret, idx = 0; +	const struct btf_member *member;  	u32 i, off, seen_mask = 0;  	for_each_member(i, t, member) {  		const struct btf_type *member_type = btf_type_by_id(btf,  								    member->type); -		field_type = btf_get_field_type(__btf_name_by_offset(btf, member_type->name_off), -						field_mask, &seen_mask, &align, &sz); -		if (field_type == 0) -			continue; -		if (field_type < 0) -			return field_type; -  		off = __btf_member_bit_offset(t, member);  		if (off % 8)  			/* valid C code cannot generate such BTF */  			return -EINVAL;  		off /= 8; -		if (off % align) -			continue; -		switch (field_type) { -		case BPF_SPIN_LOCK: -		case BPF_TIMER: -		case BPF_WORKQUEUE: -		case BPF_LIST_NODE: -		case BPF_RB_NODE: -		case BPF_REFCOUNT: -			ret = btf_find_struct(btf, member_type, off, sz, field_type, -					      idx < info_cnt ? &info[idx] : &tmp); -			if (ret < 0) -				return ret; -			break; -		case BPF_KPTR_UNREF: -		case BPF_KPTR_REF: -		case BPF_KPTR_PERCPU: -			ret = btf_find_kptr(btf, member_type, off, sz, -					    idx < info_cnt ? &info[idx] : &tmp); -			if (ret < 0) -				return ret; -			break; -		case BPF_LIST_HEAD: -		case BPF_RB_ROOT: -			ret = btf_find_graph_root(btf, t, member_type, -						  i, off, sz, -						  idx < info_cnt ? &info[idx] : &tmp, -						  field_type); -			if (ret < 0) -				return ret; -			break; -		default: -			return -EFAULT; -		} - -		if (ret == BTF_FIELD_IGNORE) -			continue; -		if (idx >= info_cnt) -			return -E2BIG; -		++idx; +		ret = btf_find_field_one(btf, t, member_type, i, +					 off, 0, +					 field_mask, &seen_mask, +					 &info[idx], info_cnt - idx, level); +		if (ret < 0) +			return ret; +		idx += ret;  	}  	return idx;  }  static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,  				u32 field_mask, struct btf_field_info *info, -				int info_cnt) +				int info_cnt, u32 level)  { -	int ret, idx = 0, align, sz, field_type; +	int ret, idx = 0;  	const struct btf_var_secinfo *vsi; -	struct btf_field_info tmp;  	u32 i, off, seen_mask = 0;  	for_each_vsi(i, t, vsi) {  		const struct btf_type *var = btf_type_by_id(btf, vsi->type);  		const struct btf_type *var_type = btf_type_by_id(btf, var->type); -		field_type = btf_get_field_type(__btf_name_by_offset(btf, var_type->name_off), -						field_mask, &seen_mask, &align, &sz); -		if (field_type == 0) -			continue; -		if (field_type < 0) -			return field_type; -  		off = vsi->offset; -		if (vsi->size != sz) -			continue; -		if (off % align) -			continue; - -		switch (field_type) { -		case BPF_SPIN_LOCK: -		case BPF_TIMER: -		case BPF_WORKQUEUE: -		case BPF_LIST_NODE: -		case BPF_RB_NODE: -		case BPF_REFCOUNT: -			ret = btf_find_struct(btf, var_type, off, sz, field_type, -					      idx < info_cnt ? &info[idx] : &tmp); -			if (ret < 0) -				return ret; -			break; -		case BPF_KPTR_UNREF: -		case BPF_KPTR_REF: -		case BPF_KPTR_PERCPU: -			ret = btf_find_kptr(btf, var_type, off, sz, -					    idx < info_cnt ? 
&info[idx] : &tmp); -			if (ret < 0) -				return ret; -			break; -		case BPF_LIST_HEAD: -		case BPF_RB_ROOT: -			ret = btf_find_graph_root(btf, var, var_type, -						  -1, off, sz, -						  idx < info_cnt ? &info[idx] : &tmp, -						  field_type); -			if (ret < 0) -				return ret; -			break; -		default: -			return -EFAULT; -		} - -		if (ret == BTF_FIELD_IGNORE) -			continue; -		if (idx >= info_cnt) -			return -E2BIG; -		++idx; +		ret = btf_find_field_one(btf, var, var_type, -1, off, vsi->size, +					 field_mask, &seen_mask, +					 &info[idx], info_cnt - idx, +					 level); +		if (ret < 0) +			return ret; +		idx += ret;  	}  	return idx;  } @@ -3637,9 +3753,9 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,  			  int info_cnt)  {  	if (__btf_type_is_struct(t)) -		return btf_find_struct_field(btf, t, field_mask, info, info_cnt); +		return btf_find_struct_field(btf, t, field_mask, info, info_cnt, 0);  	else if (btf_type_is_datasec(t)) -		return btf_find_datasec_var(btf, t, field_mask, info, info_cnt); +		return btf_find_datasec_var(btf, t, field_mask, info, info_cnt, 0);  	return -EINVAL;  } @@ -5726,6 +5842,15 @@ static int find_kern_ctx_type_id(enum bpf_prog_type prog_type)  	return ctx_type->type;  } +bool btf_is_projection_of(const char *pname, const char *tname) +{ +	if (strcmp(pname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0) +		return true; +	if (strcmp(pname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0) +		return true; +	return false; +} +  bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,  			  const struct btf_type *t, enum bpf_prog_type prog_type,  			  int arg) @@ -5788,9 +5913,7 @@ again:  	 * int socket_filter_bpf_prog(struct __sk_buff *skb)  	 * { // no fields of skb are ever used }  	 */ -	if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0) -		return true; -	if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0) +	if (btf_is_projection_of(ctx_tname, tname))  		return true;  	if (strcmp(ctx_tname, tname)) {  		/* bpf_user_pt_regs_t is a typedef, so resolve it to @@ -5982,23 +6105,15 @@ int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_ty  BTF_ID_LIST(bpf_ctx_convert_btf_id)  BTF_ID(struct, bpf_ctx_convert) -struct btf *btf_parse_vmlinux(void) +static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name, +				  void *data, unsigned int data_size)  { -	struct btf_verifier_env *env = NULL; -	struct bpf_verifier_log *log;  	struct btf *btf = NULL;  	int err;  	if (!IS_ENABLED(CONFIG_DEBUG_INFO_BTF))  		return ERR_PTR(-ENOENT); -	env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); -	if (!env) -		return ERR_PTR(-ENOMEM); - -	log = &env->log; -	log->level = BPF_LOG_KERNEL; -  	btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);  	if (!btf) {  		err = -ENOMEM; @@ -6006,10 +6121,10 @@ struct btf *btf_parse_vmlinux(void)  	}  	env->btf = btf; -	btf->data = __start_BTF; -	btf->data_size = __stop_BTF - __start_BTF; +	btf->data = data; +	btf->data_size = data_size;  	btf->kernel_btf = true; -	snprintf(btf->name, sizeof(btf->name), "vmlinux"); +	snprintf(btf->name, sizeof(btf->name), "%s", name);  	err = btf_parse_hdr(env);  	if (err) @@ -6029,20 +6144,11 @@ struct btf *btf_parse_vmlinux(void)  	if (err)  		goto errout; -	/* btf_parse_vmlinux() runs under bpf_verifier_lock */ -	bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]); -  	refcount_set(&btf->refcnt, 1); -	err = btf_alloc_id(btf); -	if (err) -		goto 
errout; - -	btf_verifier_env_free(env);  	return btf;  errout: -	btf_verifier_env_free(env);  	if (btf) {  		kvfree(btf->types);  		kfree(btf); @@ -6050,19 +6156,61 @@ errout:  	return ERR_PTR(err);  } +struct btf *btf_parse_vmlinux(void) +{ +	struct btf_verifier_env *env = NULL; +	struct bpf_verifier_log *log; +	struct btf *btf; +	int err; + +	env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); +	if (!env) +		return ERR_PTR(-ENOMEM); + +	log = &env->log; +	log->level = BPF_LOG_KERNEL; +	btf = btf_parse_base(env, "vmlinux", __start_BTF, __stop_BTF - __start_BTF); +	if (IS_ERR(btf)) +		goto err_out; + +	/* btf_parse_vmlinux() runs under bpf_verifier_lock */ +	bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]); +	err = btf_alloc_id(btf); +	if (err) { +		btf_free(btf); +		btf = ERR_PTR(err); +	} +err_out: +	btf_verifier_env_free(env); +	return btf; +} + +/* If .BTF_ids section was created with distilled base BTF, both base and + * split BTF ids will need to be mapped to actual base/split ids for + * BTF now that it has been relocated. + */ +static __u32 btf_relocate_id(const struct btf *btf, __u32 id) +{ +	if (!btf->base_btf || !btf->base_id_map) +		return id; +	return btf->base_id_map[id]; +} +  #ifdef CONFIG_DEBUG_INFO_BTF_MODULES -static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size) +static struct btf *btf_parse_module(const char *module_name, const void *data, +				    unsigned int data_size, void *base_data, +				    unsigned int base_data_size)  { +	struct btf *btf = NULL, *vmlinux_btf, *base_btf = NULL;  	struct btf_verifier_env *env = NULL;  	struct bpf_verifier_log *log; -	struct btf *btf = NULL, *base_btf; -	int err; +	int err = 0; -	base_btf = bpf_get_btf_vmlinux(); -	if (IS_ERR(base_btf)) -		return base_btf; -	if (!base_btf) +	vmlinux_btf = bpf_get_btf_vmlinux(); +	if (IS_ERR(vmlinux_btf)) +		return vmlinux_btf; +	if (!vmlinux_btf)  		return ERR_PTR(-EINVAL);  	env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); @@ -6072,6 +6220,16 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u  	log = &env->log;  	log->level = BPF_LOG_KERNEL; +	if (base_data) { +		base_btf = btf_parse_base(env, ".BTF.base", base_data, base_data_size); +		if (IS_ERR(base_btf)) { +			err = PTR_ERR(base_btf); +			goto errout; +		} +	} else { +		base_btf = vmlinux_btf; +	} +  	btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);  	if (!btf) {  		err = -ENOMEM; @@ -6111,12 +6269,22 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u  	if (err)  		goto errout; +	if (base_btf != vmlinux_btf) { +		err = btf_relocate(btf, vmlinux_btf, &btf->base_id_map); +		if (err) +			goto errout; +		btf_free(base_btf); +		base_btf = vmlinux_btf; +	} +  	btf_verifier_env_free(env);  	refcount_set(&btf->refcnt, 1);  	return btf;  errout:  	btf_verifier_env_free(env); +	if (base_btf != vmlinux_btf) +		btf_free(base_btf);  	if (btf) {  		kvfree(btf->data);  		kvfree(btf->types); @@ -6693,7 +6861,7 @@ int btf_struct_access(struct bpf_verifier_log *log,  		for (i = 0; i < rec->cnt; i++) {  			struct btf_field *field = &rec->fields[i];  			u32 offset = field->offset; -			if (off < offset + btf_field_type_size(field->type) && offset < off + size) { +			if (off < offset + field->size && offset < off + size) {  				bpf_log(log,  					"direct access to %s is disallowed\n",  					btf_field_type_name(field->type)); @@ -7370,8 +7538,8 @@ static void btf_type_show(const struct btf *btf, u32 type_id, void 
*obj,  	btf_type_ops(t)->show(btf, t, type_id, obj, 0, show);  } -static void btf_seq_show(struct btf_show *show, const char *fmt, -			 va_list args) +__printf(2, 0) static void btf_seq_show(struct btf_show *show, const char *fmt, +					va_list args)  {  	seq_vprintf((struct seq_file *)show->target, fmt, args);  } @@ -7404,8 +7572,8 @@ struct btf_show_snprintf {  	int len;		/* length we would have written */  }; -static void btf_snprintf_show(struct btf_show *show, const char *fmt, -			      va_list args) +__printf(2, 0) static void btf_snprintf_show(struct btf_show *show, const char *fmt, +					     va_list args)  {  	struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show;  	int len; @@ -7669,7 +7837,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,  			err = -ENOMEM;  			goto out;  		} -		btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size); +		btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size, +				       mod->btf_base_data, mod->btf_base_data_size);  		if (IS_ERR(btf)) {  			kfree(btf_mod);  			if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) { @@ -7993,7 +8162,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,  	bool add_filter = !!kset->filter;  	struct btf_kfunc_set_tab *tab;  	struct btf_id_set8 *set; -	u32 set_cnt; +	u32 set_cnt, i;  	int ret;  	if (hook >= BTF_KFUNC_HOOK_MAX) { @@ -8039,21 +8208,15 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,  		goto end;  	} -	/* We don't need to allocate, concatenate, and sort module sets, because -	 * only one is allowed per hook. Hence, we can directly assign the -	 * pointer and return. -	 */ -	if (!vmlinux_set) { -		tab->sets[hook] = add_set; -		goto do_add_filter; -	} -  	/* In case of vmlinux sets, there may be more than one set being  	 * registered per hook. To create a unified set, we allocate a new set  	 * and concatenate all individual sets being registered. While each set  	 * is individually sorted, they may become unsorted when concatenated,  	 * hence re-sorting the final set again is required to make binary  	 * searching the set using btf_id_set8_contains function work. +	 * +	 * For module sets, we need to allocate as we may need to relocate +	 * BTF ids.  	 */  	set_cnt = set ? 
set->cnt : 0; @@ -8083,11 +8246,14 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,  	/* Concatenate the two sets */  	memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0])); +	/* Now that the set is copied, update with relocated BTF ids */ +	for (i = set->cnt; i < set->cnt + add_set->cnt; i++) +		set->pairs[i].id = btf_relocate_id(btf, set->pairs[i].id); +  	set->cnt += add_set->cnt;  	sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL); -do_add_filter:  	if (add_filter) {  		hook_filter = &tab->hook_filters[hook];  		hook_filter->filters[hook_filter->nr_filters++] = kset->filter; @@ -8207,7 +8373,7 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,  		return PTR_ERR(btf);  	for (i = 0; i < kset->set->cnt; i++) { -		ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id, +		ret = btf_check_kfunc_protos(btf, btf_relocate_id(btf, kset->set->pairs[i].id),  					     kset->set->pairs[i].flags);  		if (ret)  			goto err_out; @@ -8271,7 +8437,7 @@ static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc  	u32 nr_args, i;  	for (i = 0; i < cnt; i++) { -		dtor_btf_id = dtors[i].kfunc_btf_id; +		dtor_btf_id = btf_relocate_id(btf, dtors[i].kfunc_btf_id);  		dtor_func = btf_type_by_id(btf, dtor_btf_id);  		if (!dtor_func || !btf_type_is_func(dtor_func)) @@ -8306,7 +8472,7 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c  {  	struct btf_id_dtor_kfunc_tab *tab;  	struct btf *btf; -	u32 tab_cnt; +	u32 tab_cnt, i;  	int ret;  	btf = btf_get_module_btf(owner); @@ -8357,6 +8523,13 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c  	btf->dtor_kfunc_tab = tab;  	memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0])); + +	/* remap BTF ids based on BTF relocation (if any) */ +	for (i = tab_cnt; i < tab_cnt + add_cnt; i++) { +		tab->dtors[i].btf_id = btf_relocate_id(btf, tab->dtors[i].btf_id); +		tab->dtors[i].kfunc_btf_id = btf_relocate_id(btf, tab->dtors[i].kfunc_btf_id); +	} +  	tab->cnt += add_cnt;  	sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1a6c3faa6e4a..7ee62e38faf0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -736,11 +736,11 @@ static struct bpf_ksym *bpf_ksym_find(unsigned long addr)  	return n ? container_of(n, struct bpf_ksym, tnode) : NULL;  } -const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, +int __bpf_address_lookup(unsigned long addr, unsigned long *size,  				 unsigned long *off, char *sym)  {  	struct bpf_ksym *ksym; -	char *ret = NULL; +	int ret = 0;  	rcu_read_lock();  	ksym = bpf_ksym_find(addr); @@ -748,9 +748,8 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,  		unsigned long symbol_start = ksym->start;  		unsigned long symbol_end = ksym->end; -		strscpy(sym, ksym->name, KSYM_NAME_LEN); +		ret = strscpy(sym, ksym->name, KSYM_NAME_LEN); -		ret = sym;  		if (size)  			*size = symbol_end - symbol_start;  		if (off) @@ -1174,8 +1173,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,  }  /* Copy JITed text from rw_header to its final location, the ro_header. 
*/ -int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, -				 struct bpf_binary_header *ro_header, +int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header,  				 struct bpf_binary_header *rw_header)  {  	void *ptr; @@ -2743,8 +2741,7 @@ static void bpf_free_used_maps(struct bpf_prog_aux *aux)  	kfree(aux->used_maps);  } -void __bpf_free_used_btfs(struct bpf_prog_aux *aux, -			  struct btf_mod_pair *used_btfs, u32 len) +void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len)  {  #ifdef CONFIG_BPF_SYSCALL  	struct btf_mod_pair *btf_mod; @@ -2761,7 +2758,7 @@ void __bpf_free_used_btfs(struct bpf_prog_aux *aux,  static void bpf_free_used_btfs(struct bpf_prog_aux *aux)  { -	__bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt); +	__bpf_free_used_btfs(aux->used_btfs, aux->used_btf_cnt);  	kfree(aux->used_btfs);  } diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a8e34416e960..fbdf5a1aabfe 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -79,8 +79,6 @@ struct bpf_cpu_map {  	struct bpf_cpu_map_entry __rcu **cpu_map;  }; -static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); -  static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)  {  	u32 value_size = attr->value_size; @@ -240,12 +238,14 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,  				int xdp_n, struct xdp_cpumap_stats *stats,  				struct list_head *list)  { +	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;  	int nframes;  	if (!rcpu->prog)  		return xdp_n;  	rcu_read_lock_bh(); +	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);  	nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats); @@ -255,6 +255,7 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,  	if (unlikely(!list_empty(list)))  		cpu_map_bpf_prog_run_skb(rcpu, list, stats); +	bpf_net_ctx_clear(bpf_net_ctx);  	rcu_read_unlock_bh(); /* resched point, may call do_softirq() */  	return nframes; @@ -706,7 +707,6 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)   */  static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)  { -	struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);  	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);  	if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) @@ -723,8 +723,11 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)  	 */  	bq->q[bq->count++] = xdpf; -	if (!bq->flush_node.prev) +	if (!bq->flush_node.prev) { +		struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list(); +  		list_add(&bq->flush_node, flush_list); +	}  }  int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, @@ -756,9 +759,8 @@ trace:  	return ret;  } -void __cpu_map_flush(void) +void __cpu_map_flush(struct list_head *flush_list)  { -	struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);  	struct xdp_bulk_queue *bq, *tmp;  	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { @@ -768,24 +770,3 @@ void __cpu_map_flush(void)  		wake_up_process(bq->obj->kthread);  	}  } - -#ifdef CONFIG_DEBUG_NET -bool cpu_map_check_flush(void) -{ -	if (list_empty(this_cpu_ptr(&cpu_map_flush_list))) -		return false; -	__cpu_map_flush(); -	return true; -} -#endif - -static int __init cpu_map_init(void) -{ -	int cpu; - -	for_each_possible_cpu(cpu) -		INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu)); -	return 0; -} - -subsys_initcall(cpu_map_init); diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c index 2bee4af91e38..94854cd9c4cc 
100644 --- a/kernel/bpf/crypto.c +++ b/kernel/bpf/crypto.c @@ -275,7 +275,7 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,  	if (__bpf_dynptr_is_rdonly(dst))  		return -EINVAL; -	siv_len = __bpf_dynptr_size(siv); +	siv_len = siv ? __bpf_dynptr_size(siv) : 0;  	src_len = __bpf_dynptr_size(src);  	dst_len = __bpf_dynptr_size(dst);  	if (!src_len || !dst_len) @@ -303,36 +303,44 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,  /**   * bpf_crypto_decrypt() - Decrypt buffer using configured context and IV provided. - * @ctx:	The crypto context being used. The ctx must be a trusted pointer. - * @src:	bpf_dynptr to the encrypted data. Must be a trusted pointer. - * @dst:	bpf_dynptr to the buffer where to store the result. Must be a trusted pointer. - * @siv:	bpf_dynptr to IV data and state data to be used by decryptor. + * @ctx:		The crypto context being used. The ctx must be a trusted pointer. + * @src:		bpf_dynptr to the encrypted data. Must be a trusted pointer. + * @dst:		bpf_dynptr to the buffer where to store the result. Must be a trusted pointer. + * @siv__nullable:	bpf_dynptr to IV data and state data to be used by decryptor. May be NULL.   *   * Decrypts provided buffer using IV data and the crypto context. Crypto context must be configured.   */  __bpf_kfunc int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx, -				   const struct bpf_dynptr_kern *src, -				   const struct bpf_dynptr_kern *dst, -				   const struct bpf_dynptr_kern *siv) +				   const struct bpf_dynptr *src, +				   const struct bpf_dynptr *dst, +				   const struct bpf_dynptr *siv__nullable)  { -	return bpf_crypto_crypt(ctx, src, dst, siv, true); +	const struct bpf_dynptr_kern *src_kern = (struct bpf_dynptr_kern *)src; +	const struct bpf_dynptr_kern *dst_kern = (struct bpf_dynptr_kern *)dst; +	const struct bpf_dynptr_kern *siv_kern = (struct bpf_dynptr_kern *)siv__nullable; + +	return bpf_crypto_crypt(ctx, src_kern, dst_kern, siv_kern, true);  }  /**   * bpf_crypto_encrypt() - Encrypt buffer using configured context and IV provided. - * @ctx:	The crypto context being used. The ctx must be a trusted pointer. - * @src:	bpf_dynptr to the plain data. Must be a trusted pointer. - * @dst:	bpf_dynptr to buffer where to store the result. Must be a trusted pointer. - * @siv:	bpf_dynptr to IV data and state data to be used by decryptor. + * @ctx:		The crypto context being used. The ctx must be a trusted pointer. + * @src:		bpf_dynptr to the plain data. Must be a trusted pointer. + * @dst:		bpf_dynptr to the buffer where to store the result. Must be a trusted pointer. + * @siv__nullable:	bpf_dynptr to IV data and state data to be used by decryptor. May be NULL.   *   * Encrypts provided buffer using IV data and the crypto context. Crypto context must be configured.   
*/  __bpf_kfunc int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx, -				   const struct bpf_dynptr_kern *src, -				   const struct bpf_dynptr_kern *dst, -				   const struct bpf_dynptr_kern *siv) +				   const struct bpf_dynptr *src, +				   const struct bpf_dynptr *dst, +				   const struct bpf_dynptr *siv__nullable)  { -	return bpf_crypto_crypt(ctx, src, dst, siv, false); +	const struct bpf_dynptr_kern *src_kern = (struct bpf_dynptr_kern *)src; +	const struct bpf_dynptr_kern *dst_kern = (struct bpf_dynptr_kern *)dst; +	const struct bpf_dynptr_kern *siv_kern = (struct bpf_dynptr_kern *)siv__nullable; + +	return bpf_crypto_crypt(ctx, src_kern, dst_kern, siv_kern, false);  }  __bpf_kfunc_end_defs(); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 4e2cdbb5629f..9e0e3b0a18e4 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -83,7 +83,6 @@ struct bpf_dtab {  	u32 n_buckets;  }; -static DEFINE_PER_CPU(struct list_head, dev_flush_list);  static DEFINE_SPINLOCK(dev_map_lock);  static LIST_HEAD(dev_map_list); @@ -107,7 +106,7 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,  	return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];  } -static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) +static int dev_map_alloc_check(union bpf_attr *attr)  {  	u32 valsize = attr->value_size; @@ -121,23 +120,28 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)  	    attr->map_flags & ~DEV_CREATE_FLAG_MASK)  		return -EINVAL; +	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { +		/* Hash table size must be power of 2; roundup_pow_of_two() +		 * can overflow into UB on 32-bit arches +		 */ +		if (attr->max_entries > 1UL << 31) +			return -EINVAL; +	} + +	return 0; +} + +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) +{  	/* Lookup returns a pointer straight to dev->ifindex, so make sure the  	 * verifier prevents writes from the BPF side  	 */  	attr->map_flags |= BPF_F_RDONLY_PROG; - -  	bpf_map_init_from_attr(&dtab->map, attr);  	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { -		/* hash table size must be power of 2; roundup_pow_of_two() can -		 * overflow into UB on 32-bit arches, so check that first -		 */ -		if (dtab->map.max_entries > 1UL << 31) -			return -EINVAL; - +		/* Hash table size must be power of 2 */  		dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); -  		dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,  							   dtab->map.numa_node);  		if (!dtab->dev_index_head) @@ -196,7 +200,14 @@ static void dev_map_free(struct bpf_map *map)  	list_del_rcu(&dtab->list);  	spin_unlock(&dev_map_lock); -	bpf_clear_redirect_map(map); +	/* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map() +	 * during NAPI callback and cleared after the XDP redirect. There is no +	 * explicit RCU read section which protects bpf_redirect_info->map but +	 * local_bh_disable() also marks the beginning an RCU section. This +	 * makes the complete softirq callback RCU protected. Thus after +	 * following synchronize_rcu() there no bpf_redirect_info->map == map +	 * assignment. +	 */  	synchronize_rcu();  	/* Make sure prior __dev_map_entry_free() have completed. */ @@ -406,9 +417,8 @@ out:   * driver before returning from its napi->poll() routine. See the comment above   * xdp_do_flush() in filter.c.   
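A hedged BPF-side sketch of the relaxed siv argument above: with the crypto kfuncs now taking plain struct bpf_dynptr pointers and an optional siv__nullable, a cipher that needs no IV can simply pass NULL. The kfunc declaration and the setup of ctx, src and dst are assumed (the selftests declare these kfuncs in their crypto headers; the types come from vmlinux.h).

extern int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx,
			      const struct bpf_dynptr *src,
			      const struct bpf_dynptr *dst,
			      const struct bpf_dynptr *siv__nullable) __ksym;

/* Assumes ctx was created for an IV-less algorithm (e.g. "ecb(aes)") and that
 * the src/dst dynptrs already wrap the plaintext and output buffers. */
static int encrypt_no_iv(struct bpf_crypto_ctx *ctx,
			 const struct bpf_dynptr *src,
			 const struct bpf_dynptr *dst)
{
	return bpf_crypto_encrypt(ctx, src, dst, NULL);
}
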
*/ -void __dev_flush(void) +void __dev_flush(struct list_head *flush_list)  { -	struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);  	struct xdp_dev_bulk_queue *bq, *tmp;  	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { @@ -419,16 +429,6 @@ void __dev_flush(void)  	}  } -#ifdef CONFIG_DEBUG_NET -bool dev_check_flush(void) -{ -	if (list_empty(this_cpu_ptr(&dev_flush_list))) -		return false; -	__dev_flush(); -	return true; -} -#endif -  /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or   * by local_bh_disable() (from XDP calls inside NAPI). The   * rcu_read_lock_bh_held() below makes lockdep accept both. @@ -453,7 +453,6 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)  static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,  		       struct net_device *dev_rx, struct bpf_prog *xdp_prog)  { -	struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);  	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);  	if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) @@ -467,6 +466,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,  	 * are only ever modified together.  	 */  	if (!bq->dev_rx) { +		struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list(); +  		bq->dev_rx = dev_rx;  		bq->xdp_prog = xdp_prog;  		list_add(&bq->flush_node, flush_list); @@ -760,9 +761,6 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,  		for (i = 0; i < dtab->n_buckets; i++) {  			head = dev_map_index_hash(dtab, i);  			hlist_for_each_entry_safe(dst, next, head, index_hlist) { -				if (!dst) -					continue; -  				if (is_ifindex_excluded(excluded_devices, num_excluded,  							dst->dev->ifindex))  					continue; @@ -1043,6 +1041,7 @@ static u64 dev_map_mem_usage(const struct bpf_map *map)  BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)  const struct bpf_map_ops dev_map_ops = {  	.map_meta_equal = bpf_map_meta_equal, +	.map_alloc_check = dev_map_alloc_check,  	.map_alloc = dev_map_alloc,  	.map_free = dev_map_free,  	.map_get_next_key = dev_map_get_next_key, @@ -1057,6 +1056,7 @@ const struct bpf_map_ops dev_map_ops = {  const struct bpf_map_ops dev_map_hash_ops = {  	.map_meta_equal = bpf_map_meta_equal, +	.map_alloc_check = dev_map_alloc_check,  	.map_alloc = dev_map_alloc,  	.map_free = dev_map_free,  	.map_get_next_key = dev_map_hash_get_next_key, @@ -1156,15 +1156,11 @@ static struct notifier_block dev_map_notifier = {  static int __init dev_map_init(void)  { -	int cpu; -  	/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */  	BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=  		     offsetof(struct _bpf_dtab_netdev, dev));  	register_netdevice_notifier(&dev_map_notifier); -	for_each_possible_cpu(cpu) -		INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));  	return 0;  } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2a69a9a36c0f..b5f0adae8293 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1084,7 +1084,10 @@ struct bpf_async_cb {  	struct bpf_prog *prog;  	void __rcu *callback_fn;  	void *value; -	struct rcu_head rcu; +	union { +		struct rcu_head rcu; +		struct work_struct delete_work; +	};  	u64 flags;  }; @@ -1107,6 +1110,7 @@ struct bpf_async_cb {  struct bpf_hrtimer {  	struct bpf_async_cb cb;  	struct hrtimer timer; +	atomic_t cancelling;  };  struct bpf_work { @@ -1219,6 +1223,21 @@ static void bpf_wq_delete_work(struct work_struct *work)  	kfree_rcu(w, cb.rcu);  } +static void bpf_timer_delete_work(struct 
work_struct *work) +{ +	struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work); + +	/* Cancel the timer and wait for callback to complete if it was running. +	 * If hrtimer_cancel() can be safely called it's safe to call +	 * kfree_rcu(t) right after for both preallocated and non-preallocated +	 * maps.  The async->cb = NULL was already done and no code path can see +	 * address 't' anymore. Timer if armed for existing bpf_hrtimer before +	 * bpf_timer_cancel_and_free will have been cancelled. +	 */ +	hrtimer_cancel(&t->timer); +	kfree_rcu(t, cb.rcu); +} +  static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,  			    enum bpf_async_type type)  { @@ -1262,6 +1281,8 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u  		clockid = flags & (MAX_CLOCKS - 1);  		t = (struct bpf_hrtimer *)cb; +		atomic_set(&t->cancelling, 0); +		INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);  		hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);  		t->timer.function = bpf_timer_cb;  		cb->value = (void *)async - map->record->timer_off; @@ -1440,7 +1461,8 @@ static void drop_prog_refcnt(struct bpf_async_cb *async)  BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)  { -	struct bpf_hrtimer *t; +	struct bpf_hrtimer *t, *cur_t; +	bool inc = false;  	int ret = 0;  	if (in_nmi()) @@ -1452,14 +1474,41 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)  		ret = -EINVAL;  		goto out;  	} -	if (this_cpu_read(hrtimer_running) == t) { + +	cur_t = this_cpu_read(hrtimer_running); +	if (cur_t == t) {  		/* If bpf callback_fn is trying to bpf_timer_cancel()  		 * its own timer the hrtimer_cancel() will deadlock -		 * since it waits for callback_fn to finish +		 * since it waits for callback_fn to finish. +		 */ +		ret = -EDEADLK; +		goto out; +	} + +	/* Only account in-flight cancellations when invoked from a timer +	 * callback, since we want to avoid waiting only if other _callbacks_ +	 * are waiting on us, to avoid introducing lockups. Non-callback paths +	 * are ok, since nobody would synchronously wait for their completion. +	 */ +	if (!cur_t) +		goto drop; +	atomic_inc(&t->cancelling); +	/* Need full barrier after relaxed atomic_inc */ +	smp_mb__after_atomic(); +	inc = true; +	if (atomic_read(&cur_t->cancelling)) { +		/* We're cancelling timer t, while some other timer callback is +		 * attempting to cancel us. In such a case, it might be possible +		 * that timer t belongs to the other callback, or some other +		 * callback waiting upon it (creating transitive dependencies +		 * upon us), and we will enter a deadlock if we continue +		 * cancelling and waiting for it synchronously, since it might +		 * do the same. Bail!  		 */  		ret = -EDEADLK;  		goto out;  	} +drop:  	drop_prog_refcnt(&t->cb);  out:  	__bpf_spin_unlock_irqrestore(&timer->lock); @@ -1467,6 +1516,8 @@ out:  	 * if it was running.  	 */  	ret = ret ?: hrtimer_cancel(&t->timer); +	if (inc) +		atomic_dec(&t->cancelling);  	rcu_read_unlock();  	return ret;  } @@ -1512,25 +1563,39 @@ void bpf_timer_cancel_and_free(void *val)  	if (!t)  		return; -	/* Cancel the timer and wait for callback to complete if it was running. -	 * If hrtimer_cancel() can be safely called it's safe to call kfree(t) -	 * right after for both preallocated and non-preallocated maps. -	 * The async->cb = NULL was already done and no code path can -	 * see address 't' anymore. 
-	 * -	 * Check that bpf_map_delete/update_elem() wasn't called from timer -	 * callback_fn. In such case don't call hrtimer_cancel() (since it will -	 * deadlock) and don't call hrtimer_try_to_cancel() (since it will just -	 * return -1). Though callback_fn is still running on this cpu it's +	/* We check that bpf_map_delete/update_elem() was called from timer +	 * callback_fn. In such case we don't call hrtimer_cancel() (since it +	 * will deadlock) and don't call hrtimer_try_to_cancel() (since it will +	 * just return -1). Though callback_fn is still running on this cpu it's  	 * safe to do kfree(t) because bpf_timer_cb() read everything it needed  	 * from 't'. The bpf subprog callback_fn won't be able to access 't',  	 * since async->cb = NULL was already done. The timer will be  	 * effectively cancelled because bpf_timer_cb() will return  	 * HRTIMER_NORESTART. +	 * +	 * However, it is possible the timer callback_fn calling us armed the +	 * timer _before_ calling us, such that failing to cancel it here will +	 * cause it to possibly use struct hrtimer after freeing bpf_hrtimer. +	 * Therefore, we _need_ to cancel any outstanding timers before we do +	 * kfree_rcu, even though no more timers can be armed. +	 * +	 * Moreover, we need to schedule work even if timer does not belong to +	 * the calling callback_fn, as on two different CPUs, we can end up in a +	 * situation where both sides run in parallel, try to cancel one +	 * another, and we end up waiting on both sides in hrtimer_cancel +	 * without making forward progress, since timer1 depends on time2 +	 * callback to finish, and vice versa. +	 * +	 *  CPU 1 (timer1_cb)			CPU 2 (timer2_cb) +	 *  bpf_timer_cancel_and_free(timer2)	bpf_timer_cancel_and_free(timer1) +	 * +	 * To avoid these issues, punt to workqueue context when we are in a +	 * timer callback.  	 */ -	if (this_cpu_read(hrtimer_running) != t) -		hrtimer_cancel(&t->timer); -	kfree_rcu(t, cb.rcu); +	if (this_cpu_read(hrtimer_running)) +		queue_work(system_unbound_wq, &t->cb.delete_work); +	else +		bpf_timer_delete_work(&t->cb.delete_work);  }  /* This function is called by map_delete/update_elem for individual element and @@ -2433,7 +2498,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)  /**   * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. - * @ptr: The dynptr whose data slice to retrieve + * @p: The dynptr whose data slice to retrieve   * @offset: Offset into the dynptr   * @buffer__opt: User-provided buffer to copy contents into.  May be NULL   * @buffer__szk: Size (in bytes) of the buffer if present. This is the @@ -2459,9 +2524,10 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)   * provided buffer, with its contents containing the data, if unable to obtain   * direct pointer)   */ -__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset, +__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,  				   void *buffer__opt, u32 buffer__szk)  { +	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;  	enum bpf_dynptr_type type;  	u32 len = buffer__szk;  	int err; @@ -2503,7 +2569,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset  /**   * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data. - * @ptr: The dynptr whose data slice to retrieve + * @p: The dynptr whose data slice to retrieve   * @offset: Offset into the dynptr   * @buffer__opt: User-provided buffer to copy contents into. 
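To make the -EDEADLK paths above concrete, a hedged fragment of a BPF program showing the pattern they protect against: a timer callback that cancels another element's timer. If that peer's callback is concurrently trying to cancel this one, bpf_timer_cancel() now bails out with -EDEADLK instead of both CPUs waiting on each other in hrtimer_cancel(). Map layout, key scheme and callback signature are illustrative.

struct map_value {
	struct bpf_timer timer;
};

__u64 canceled, deadlocked;

/* Timer callback for one element that tries to cancel a peer's timer. */
static int timer_cb(void *map, int *key, struct map_value *val)
{
	int peer_key = !*key;   /* illustrative: elements 0 and 1 target each other */
	struct map_value *peer = bpf_map_lookup_elem(map, &peer_key);
	long err;

	if (!peer)
		return 0;

	err = bpf_timer_cancel(&peer->timer);
	if (err == -EDEADLK)
		/* The peer's callback is cancelling us at the same time, or we
		 * were asked to cancel our own timer; give up instead of blocking. */
		deadlocked++;
	else
		canceled += err;   /* 1 if the peer timer was queued, else 0 */
	return 0;
}
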
May be NULL   * @buffer__szk: Size (in bytes) of the buffer if present. This is the @@ -2543,9 +2609,11 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset   * provided buffer, with its contents containing the data, if unable to obtain   * direct pointer)   */ -__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset, +__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset,  					void *buffer__opt, u32 buffer__szk)  { +	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; +  	if (!ptr->data || __bpf_dynptr_is_rdonly(ptr))  		return NULL; @@ -2571,11 +2639,12 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o  	 * will be copied out into the buffer and the user will need to call  	 * bpf_dynptr_write() to commit changes.  	 */ -	return bpf_dynptr_slice(ptr, offset, buffer__opt, buffer__szk); +	return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk);  } -__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end) +__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end)  { +	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;  	u32 size;  	if (!ptr->data || start > end) @@ -2592,36 +2661,45 @@ __bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 en  	return 0;  } -__bpf_kfunc bool bpf_dynptr_is_null(struct bpf_dynptr_kern *ptr) +__bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p)  { +	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; +  	return !ptr->data;  } -__bpf_kfunc bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +__bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)  { +	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; +  	if (!ptr->data)  		return false;  	return __bpf_dynptr_is_rdonly(ptr);  } -__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) +__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr *p)  { +	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; +  	if (!ptr->data)  		return -EINVAL;  	return __bpf_dynptr_size(ptr);  } -__bpf_kfunc int bpf_dynptr_clone(struct bpf_dynptr_kern *ptr, -				 struct bpf_dynptr_kern *clone__uninit) +__bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, +				 struct bpf_dynptr *clone__uninit)  { +	struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit; +	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; +  	if (!ptr->data) { -		bpf_dynptr_set_null(clone__uninit); +		bpf_dynptr_set_null(clone);  		return -EINVAL;  	} -	*clone__uninit = *ptr; +	*clone = *ptr;  	return 0;  } @@ -2721,7 +2799,7 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)  }  __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, -					 int (callback_fn)(void *map, int *key, struct bpf_wq *wq), +					 int (callback_fn)(void *map, int *key, void *value),  					 unsigned int flags,  					 void *aux__ign)  { @@ -2744,6 +2822,122 @@ __bpf_kfunc void bpf_preempt_enable(void)  	preempt_enable();  } +struct bpf_iter_bits { +	__u64 __opaque[2]; +} __aligned(8); + +struct bpf_iter_bits_kern { +	union { +		unsigned long *bits; +		unsigned long bits_copy; +	}; +	u32 nr_bits; +	int bit; +} __aligned(8); + +/** + * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area + * @it: The new bpf_iter_bits to be created + * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over + * @nr_words: The size of the 
specified memory area, measured in 8-byte units. + * Due to the limitation of memalloc, it can't be greater than 512. + * + * This function initializes a new bpf_iter_bits structure for iterating over + * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It + * copies the data of the memory area to the newly created bpf_iter_bits @it for + * subsequent iteration operations. + * + * On success, 0 is returned. On failure, ERR is returned. + */ +__bpf_kfunc int +bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words) +{ +	struct bpf_iter_bits_kern *kit = (void *)it; +	u32 nr_bytes = nr_words * sizeof(u64); +	u32 nr_bits = BYTES_TO_BITS(nr_bytes); +	int err; + +	BUILD_BUG_ON(sizeof(struct bpf_iter_bits_kern) != sizeof(struct bpf_iter_bits)); +	BUILD_BUG_ON(__alignof__(struct bpf_iter_bits_kern) != +		     __alignof__(struct bpf_iter_bits)); + +	kit->nr_bits = 0; +	kit->bits_copy = 0; +	kit->bit = -1; + +	if (!unsafe_ptr__ign || !nr_words) +		return -EINVAL; + +	/* Optimization for u64 mask */ +	if (nr_bits == 64) { +		err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign); +		if (err) +			return -EFAULT; + +		kit->nr_bits = nr_bits; +		return 0; +	} + +	/* Fallback to memalloc */ +	kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes); +	if (!kit->bits) +		return -ENOMEM; + +	err = bpf_probe_read_kernel_common(kit->bits, nr_bytes, unsafe_ptr__ign); +	if (err) { +		bpf_mem_free(&bpf_global_ma, kit->bits); +		return err; +	} + +	kit->nr_bits = nr_bits; +	return 0; +} + +/** + * bpf_iter_bits_next() - Get the next bit in a bpf_iter_bits + * @it: The bpf_iter_bits to be checked + * + * This function returns a pointer to a number representing the value of the + * next bit in the bits. + * + * If there are no further bits available, it returns NULL. + */ +__bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it) +{ +	struct bpf_iter_bits_kern *kit = (void *)it; +	u32 nr_bits = kit->nr_bits; +	const unsigned long *bits; +	int bit; + +	if (nr_bits == 0) +		return NULL; + +	bits = nr_bits == 64 ? &kit->bits_copy : kit->bits; +	bit = find_next_bit(bits, nr_bits, kit->bit + 1); +	if (bit >= nr_bits) { +		kit->nr_bits = 0; +		return NULL; +	} + +	kit->bit = bit; +	return &kit->bit; +} + +/** + * bpf_iter_bits_destroy() - Destroy a bpf_iter_bits + * @it: The bpf_iter_bits to be destroyed + * + * Destroy the resource associated with the bpf_iter_bits. + */ +__bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it) +{ +	struct bpf_iter_bits_kern *kit = (void *)it; + +	if (kit->nr_bits <= 64) +		return; +	bpf_mem_free(&bpf_global_ma, kit->bits); +} +  __bpf_kfunc_end_defs();  BTF_KFUNCS_START(generic_btf_ids) @@ -2826,6 +3020,9 @@ BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)  BTF_ID_FLAGS(func, bpf_wq_start)  BTF_ID_FLAGS(func, bpf_preempt_disable)  BTF_ID_FLAGS(func, bpf_preempt_enable) +BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) +BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY)  BTF_KFUNCS_END(common_btf_ids)  static const struct btf_kfunc_id_set common_kfunc_set = { @@ -2867,7 +3064,9 @@ late_initcall(kfunc_init);   */  const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)  { -	return bpf_dynptr_slice(ptr, 0, NULL, len); +	const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr; + +	return bpf_dynptr_slice(p, 0, NULL, len);  }  /* Get a pointer to dynptr data up to len bytes for read write access. 
If diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 4bd8f17a9f24..5aebfc3051e3 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -91,7 +91,7 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,  			goto fail;  	} else {  		u64 new_end, new_start; -		u32 buf_start, buf_end, new_n; +		u32 buf_start, buf_end;  		new_end = log->end_pos + n;  		if (new_end - log->start_pos >= log->len_total) @@ -708,7 +708,9 @@ static void print_reg_state(struct bpf_verifier_env *env,  		verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));  	verbose(env, "(");  	if (reg->id) -		verbose_a("id=%d", reg->id); +		verbose_a("id=%d", reg->id & ~BPF_ADD_CONST); +	if (reg->id & BPF_ADD_CONST) +		verbose(env, "%+d", reg->off);  	if (reg->ref_obj_id)  		verbose_a("ref_obj_id=%d", reg->ref_obj_id);  	if (type_is_non_owning_ref(reg->type)) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index a546aba46d5d..dec892ded031 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -155,12 +155,9 @@ static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)  static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)  { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG  	if (c->objcg)  		return get_mem_cgroup_from_objcg(c->objcg); -#endif - -#ifdef CONFIG_MEMCG  	return root_mem_cgroup;  #else  	return NULL; @@ -534,7 +531,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)  			size += LLIST_NODE_SZ; /* room for llist_node */  		unit_size = size; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG  		if (memcg_bpf_enabled())  			objcg = get_obj_cgroup_from_current();  #endif @@ -556,7 +553,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)  	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);  	if (!pcc)  		return -ENOMEM; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG  	objcg = get_obj_cgroup_from_current();  #endif  	ma->objcg = objcg; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 0ee653a936ea..e20b90c36131 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -51,7 +51,8 @@ struct bpf_ringbuf {  	 * This prevents a user-space application from modifying the  	 * position and ruining in-kernel tracking. The permissions of the  	 * pages depend on who is producing samples: user-space or the -	 * kernel. +	 * kernel. Note that the pending counter is placed in the same +	 * page as the producer, so that it shares the same cache line.  	 
*  	 * Kernel-producer  	 * --------------- @@ -70,6 +71,7 @@ struct bpf_ringbuf {  	 */  	unsigned long consumer_pos __aligned(PAGE_SIZE);  	unsigned long producer_pos __aligned(PAGE_SIZE); +	unsigned long pending_pos;  	char data[] __aligned(PAGE_SIZE);  }; @@ -179,6 +181,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)  	rb->mask = data_sz - 1;  	rb->consumer_pos = 0;  	rb->producer_pos = 0; +	rb->pending_pos = 0;  	return rb;  } @@ -404,9 +407,9 @@ bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)  static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)  { -	unsigned long cons_pos, prod_pos, new_prod_pos, flags; -	u32 len, pg_off; +	unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags;  	struct bpf_ringbuf_hdr *hdr; +	u32 len, pg_off, tmp_size, hdr_len;  	if (unlikely(size > RINGBUF_MAX_RECORD_SZ))  		return NULL; @@ -424,13 +427,29 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)  		spin_lock_irqsave(&rb->spinlock, flags);  	} +	pend_pos = rb->pending_pos;  	prod_pos = rb->producer_pos;  	new_prod_pos = prod_pos + len; -	/* check for out of ringbuf space by ensuring producer position -	 * doesn't advance more than (ringbuf_size - 1) ahead +	while (pend_pos < prod_pos) { +		hdr = (void *)rb->data + (pend_pos & rb->mask); +		hdr_len = READ_ONCE(hdr->len); +		if (hdr_len & BPF_RINGBUF_BUSY_BIT) +			break; +		tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT; +		tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8); +		pend_pos += tmp_size; +	} +	rb->pending_pos = pend_pos; + +	/* check for out of ringbuf space: +	 * - by ensuring producer position doesn't advance more than +	 *   (ringbuf_size - 1) ahead +	 * - by ensuring oldest not yet committed record until newest +	 *   record does not span more than (ringbuf_size - 1)  	 */ -	if (new_prod_pos - cons_pos > rb->mask) { +	if (new_prod_pos - cons_pos > rb->mask || +	    new_prod_pos - pend_pos > rb->mask) {  		spin_unlock_irqrestore(&rb->spinlock, flags);  		return NULL;  	} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2222c3ff88e7..bf6c5f685ea2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -385,7 +385,7 @@ void bpf_map_free_id(struct bpf_map *map)  	spin_unlock_irqrestore(&map_idr_lock, flags);  } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG  static void bpf_map_save_memcg(struct bpf_map *map)  {  	/* Currently if a map is created by a process belonging to the root @@ -486,7 +486,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,  	unsigned long i, j;  	struct page *pg;  	int ret = 0; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG  	struct mem_cgroup *memcg, *old_memcg;  	memcg = bpf_map_get_memcg(map); @@ -505,7 +505,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,  		break;  	} -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG  	set_active_memcg(old_memcg);  	mem_cgroup_put(memcg);  #endif @@ -2998,6 +2998,7 @@ static int bpf_obj_get(const union bpf_attr *attr)  void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,  		   const struct bpf_link_ops *ops, struct bpf_prog *prog)  { +	WARN_ON(ops->dealloc && ops->dealloc_deferred);  	atomic64_set(&link->refcnt, 1);  	link->type = type;  	link->id = 0; @@ -3056,16 +3057,17 @@ static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)  /* bpf_link_free is guaranteed to be called from process context */  static void bpf_link_free(struct bpf_link *link)  { +	const struct bpf_link_ops *ops = link->ops;  	
bool sleepable = false;  	bpf_link_free_id(link->id);  	if (link->prog) {  		sleepable = link->prog->sleepable;  		/* detach BPF program, clean up used resources */ -		link->ops->release(link); +		ops->release(link);  		bpf_prog_put(link->prog);  	} -	if (link->ops->dealloc_deferred) { +	if (ops->dealloc_deferred) {  		/* schedule BPF link deallocation; if underlying BPF program  		 * is sleepable, we need to first wait for RCU tasks trace  		 * sync, then go through "classic" RCU grace period @@ -3074,9 +3076,8 @@ static void bpf_link_free(struct bpf_link *link)  			call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);  		else  			call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); -	} -	if (link->ops->dealloc) -		link->ops->dealloc(link); +	} else if (ops->dealloc) +		ops->dealloc(link);  }  static void bpf_link_put_deferred(struct work_struct *work) @@ -3150,6 +3151,13 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)  }  #endif +static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts) +{ +	struct bpf_link *link = file->private_data; + +	return link->ops->poll(file, pts); +} +  static const struct file_operations bpf_link_fops = {  #ifdef CONFIG_PROC_FS  	.show_fdinfo	= bpf_link_show_fdinfo, @@ -3159,6 +3167,16 @@ static const struct file_operations bpf_link_fops = {  	.write		= bpf_dummy_write,  }; +static const struct file_operations bpf_link_fops_poll = { +#ifdef CONFIG_PROC_FS +	.show_fdinfo	= bpf_link_show_fdinfo, +#endif +	.release	= bpf_link_release, +	.read		= bpf_dummy_read, +	.write		= bpf_dummy_write, +	.poll		= bpf_link_poll, +}; +  static int bpf_link_alloc_id(struct bpf_link *link)  {  	int id; @@ -3201,7 +3219,9 @@ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)  		return id;  	} -	file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); +	file = anon_inode_getfile("bpf_link", +				  link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops, +				  link, O_CLOEXEC);  	if (IS_ERR(file)) {  		bpf_link_free_id(id);  		put_unused_fd(fd); @@ -3229,7 +3249,9 @@ int bpf_link_settle(struct bpf_link_primer *primer)  int bpf_link_new_fd(struct bpf_link *link)  { -	return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); +	return anon_inode_getfd("bpf-link", +				link->ops->poll ? 
&bpf_link_fops_poll : &bpf_link_fops, +				link, O_CLOEXEC);  }  struct bpf_link *bpf_link_get_from_fd(u32 ufd) @@ -3239,7 +3261,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)  	if (!f.file)  		return ERR_PTR(-EBADF); -	if (f.file->f_op != &bpf_link_fops) { +	if (f.file->f_op != &bpf_link_fops && f.file->f_op != &bpf_link_fops_poll) {  		fdput(f);  		return ERR_PTR(-EINVAL);  	} @@ -4971,7 +4993,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,  					     uattr);  	else if (f.file->f_op == &btf_fops)  		err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); -	else if (f.file->f_op == &bpf_link_fops) +	else if (f.file->f_op == &bpf_link_fops || f.file->f_op == &bpf_link_fops_poll)  		err = bpf_link_get_info_by_fd(f.file, f.file->private_data,  					      attr, uattr);  	else @@ -5106,7 +5128,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,  	if (!file)  		return -EBADF; -	if (file->f_op == &bpf_link_fops) { +	if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) {  		struct bpf_link *link = file->private_data;  		if (link->ops == &bpf_raw_tp_link_lops) { @@ -5416,10 +5438,11 @@ static int link_detach(union bpf_attr *attr)  	return ret;  } -static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) +struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)  {  	return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);  } +EXPORT_SYMBOL(bpf_link_inc_not_zero);  struct bpf_link *bpf_link_by_id(u32 id)  { @@ -5960,7 +5983,7 @@ const struct bpf_prog_ops bpf_syscall_prog_ops = {  };  #ifdef CONFIG_SYSCTL -static int bpf_stats_handler(struct ctl_table *table, int write, +static int bpf_stats_handler(const struct ctl_table *table, int write,  			     void *buffer, size_t *lenp, loff_t *ppos)  {  	struct static_key *key = (struct static_key *)table->data; @@ -5995,7 +6018,7 @@ void __weak unpriv_ebpf_notify(int new_state)  {  } -static int bpf_unpriv_handler(struct ctl_table *table, int write, +static int bpf_unpriv_handler(const struct ctl_table *table, int write,  			      void *buffer, size_t *lenp, loff_t *ppos)  {  	int ret, unpriv_enable = *(int *)table->data; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index ec4e97c61eef..02aa9db8d796 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -261,6 +261,7 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)  	u32 saved_tid = info->tid;  	struct task_struct *curr_task;  	unsigned int curr_fd = info->fd; +	struct file *f;  	/* If this function returns a non-NULL file object,  	 * it held a reference to the task/file. 
@@ -286,12 +287,8 @@ again:  	}  	rcu_read_lock(); -	for (;; curr_fd++) { -		struct file *f; -		f = task_lookup_next_fdget_rcu(curr_task, &curr_fd); -		if (!f) -			break; - +	f = task_lookup_next_fdget_rcu(curr_task, &curr_fd); +	if (f) {  		/* set info->fd */  		info->fd = curr_fd;  		rcu_read_unlock(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 77da1f438bec..d8520095ca03 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2982,8 +2982,10 @@ static int check_subprogs(struct bpf_verifier_env *env)  		if (code == (BPF_JMP | BPF_CALL) &&  		    insn[i].src_reg == 0 && -		    insn[i].imm == BPF_FUNC_tail_call) +		    insn[i].imm == BPF_FUNC_tail_call) {  			subprog[cur_subprog].has_tail_call = true; +			subprog[cur_subprog].tail_call_reachable = true; +		}  		if (BPF_CLASS(code) == BPF_LD &&  		    (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))  			subprog[cur_subprog].has_ld_abs = true; @@ -3215,7 +3217,8 @@ static int insn_def_regno(const struct bpf_insn *insn)  	case BPF_ST:  		return -1;  	case BPF_STX: -		if (BPF_MODE(insn->code) == BPF_ATOMIC && +		if ((BPF_MODE(insn->code) == BPF_ATOMIC || +		     BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) &&  		    (insn->imm & BPF_FETCH)) {  			if (insn->imm == BPF_CMPXCHG)  				return BPF_REG_0; @@ -3991,7 +3994,7 @@ static bool idset_contains(struct bpf_idset *s, u32 id)  	u32 i;  	for (i = 0; i < s->count; ++i) -		if (s->ids[i] == id) +		if (s->ids[i] == (id & ~BPF_ADD_CONST))  			return true;  	return false; @@ -4001,7 +4004,7 @@ static int idset_push(struct bpf_idset *s, u32 id)  {  	if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids)))  		return -EFAULT; -	s->ids[s->count++] = id; +	s->ids[s->count++] = id & ~BPF_ADD_CONST;  	return 0;  } @@ -4438,8 +4441,20 @@ static bool __is_pointer_value(bool allow_ptr_leaks,  static void assign_scalar_id_before_mov(struct bpf_verifier_env *env,  					struct bpf_reg_state *src_reg)  { -	if (src_reg->type == SCALAR_VALUE && !src_reg->id && -	    !tnum_is_const(src_reg->var_off)) +	if (src_reg->type != SCALAR_VALUE) +		return; + +	if (src_reg->id & BPF_ADD_CONST) { +		/* +		 * The verifier is processing rX = rY insn and +		 * rY->id has special linked register already. +		 * Cleared it, since multiple rX += const are not supported. +		 */ +		src_reg->id = 0; +		src_reg->off = 0; +	} + +	if (!src_reg->id && !tnum_is_const(src_reg->var_off))  		/* Ensure that src_reg has a valid ID that will be copied to  		 * dst_reg and then will be used by find_equal_scalars() to  		 * propagate min/max range. @@ -4549,11 +4564,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,  			state->stack[spi].spilled_ptr.id = 0;  	} else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&  		   env->bpf_capable) { -		struct bpf_reg_state fake_reg = {}; +		struct bpf_reg_state *tmp_reg = &env->fake_reg[0]; -		__mark_reg_known(&fake_reg, insn->imm); -		fake_reg.type = SCALAR_VALUE; -		save_register_state(env, state, spi, &fake_reg, size); +		memset(tmp_reg, 0, sizeof(*tmp_reg)); +		__mark_reg_known(tmp_reg, insn->imm); +		tmp_reg->type = SCALAR_VALUE; +		save_register_state(env, state, spi, tmp_reg, size);  	} else if (reg && is_spillable_regtype(reg->type)) {  		/* register containing pointer is being spilled into stack */  		if (size != BPF_REG_SIZE) { @@ -5448,7 +5464,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,  		 * this program. To check that [x1, x2) overlaps with [y1, y2),  		 * it is sufficient to check x1 < y2 && y1 < x2.  		
 */ -		if (reg->smin_value + off < p + btf_field_type_size(field->type) && +		if (reg->smin_value + off < p + field->size &&  		    p < reg->umax_value + off + size) {  			switch (field->type) {  			case BPF_KPTR_UNREF: @@ -6235,6 +6251,7 @@ static void set_sext32_default_val(struct bpf_reg_state *reg, int size)  	}  	reg->u32_min_value = 0;  	reg->u32_max_value = U32_MAX; +	reg->var_off = tnum_subreg(tnum_unknown);  }  static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) @@ -6279,6 +6296,7 @@ static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size)  		reg->s32_max_value = s32_max;  		reg->u32_min_value = (u32)s32_min;  		reg->u32_max_value = (u32)s32_max; +		reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max));  		return;  	} @@ -7712,6 +7730,13 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn  	struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno];  	int err; +	if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { +		verbose(env, +			"arg#%d expected pointer to stack or const struct bpf_dynptr\n", +			regno); +		return -EINVAL; +	} +  	/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an  	 * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):  	 */ @@ -8882,7 +8907,8 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)  	enum bpf_attach_type eatype = env->prog->expected_attach_type;  	enum bpf_prog_type type = resolve_prog_type(env->prog); -	if (func_id != BPF_FUNC_map_update_elem) +	if (func_id != BPF_FUNC_map_update_elem && +	    func_id != BPF_FUNC_map_delete_elem)  		return false;  	/* It's not possible to get access to a locked struct sock in these @@ -8893,6 +8919,11 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)  		if (eatype == BPF_TRACE_ITER)  			return true;  		break; +	case BPF_PROG_TYPE_SOCK_OPS: +		/* map_update allowed only via dedicated helpers with event type checks */ +		if (func_id == BPF_FUNC_map_delete_elem) +			return true; +		break;  	case BPF_PROG_TYPE_SOCKET_FILTER:  	case BPF_PROG_TYPE_SCHED_CLS:  	case BPF_PROG_TYPE_SCHED_ACT: @@ -8988,7 +9019,6 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,  	case BPF_MAP_TYPE_SOCKMAP:  		if (func_id != BPF_FUNC_sk_redirect_map &&  		    func_id != BPF_FUNC_sock_map_update && -		    func_id != BPF_FUNC_map_delete_elem &&  		    func_id != BPF_FUNC_msg_redirect_map &&  		    func_id != BPF_FUNC_sk_select_reuseport &&  		    func_id != BPF_FUNC_map_lookup_elem && @@ -8998,7 +9028,6 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,  	case BPF_MAP_TYPE_SOCKHASH:  		if (func_id != BPF_FUNC_sk_redirect_hash &&  		    func_id != BPF_FUNC_sock_hash_update && -		    func_id != BPF_FUNC_map_delete_elem &&  		    func_id != BPF_FUNC_msg_redirect_hash &&  		    func_id != BPF_FUNC_sk_select_reuseport &&  		    func_id != BPF_FUNC_map_lookup_elem && @@ -9457,6 +9486,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,  				return -EINVAL;  			}  		} else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { +			ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR); +			if (ret) +				return ret; +  			ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);  			if (ret)  				return ret; @@ -10910,7 +10943,7 @@ enum {  };  BTF_ID_LIST(kf_arg_btf_ids) -BTF_ID(struct, bpf_dynptr_kern) +BTF_ID(struct, bpf_dynptr)  BTF_ID(struct, bpf_list_head)  BTF_ID(struct, bpf_list_node)  
BTF_ID(struct, bpf_rb_root) @@ -11124,7 +11157,11 @@ BTF_ID(func, bpf_iter_css_task_new)  #else  BTF_ID_UNUSED  #endif +#ifdef CONFIG_BPF_EVENTS  BTF_ID(func, bpf_session_cookie) +#else +BTF_ID_UNUSED +#endif  static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)  { @@ -11179,6 +11216,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,  	if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))  		return KF_ARG_PTR_TO_CTX; +	if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg)) +		return KF_ARG_PTR_TO_NULL; +  	if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))  		return KF_ARG_PTR_TO_ALLOC_BTF_ID; @@ -11224,9 +11264,6 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,  	if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))  		return KF_ARG_PTR_TO_CALLBACK; -	if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg)) -		return KF_ARG_PTR_TO_NULL; -  	if (argno + 1 < nargs &&  	    (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) ||  	     is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]))) @@ -11257,6 +11294,8 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,  	bool strict_type_match = false;  	const struct btf *reg_btf;  	const char *reg_ref_tname; +	bool taking_projection; +	bool struct_same;  	u32 reg_ref_id;  	if (base_type(reg->type) == PTR_TO_BTF_ID) { @@ -11296,11 +11335,19 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,  	    btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))  		strict_type_match = true; -	WARN_ON_ONCE(is_kfunc_trusted_args(meta) && reg->off); +	WARN_ON_ONCE(is_kfunc_release(meta) && +		     (reg->off || !tnum_is_const(reg->var_off) || +		      reg->var_off.value));  	reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, ®_ref_id);  	reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); -	if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) { +	struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match); +	/* If kfunc is accepting a projection type (ie. __sk_buff), it cannot +	 * actually use it -- it must cast to the underlying type. So we allow +	 * caller to pass in the underlying type. 
+	 */ +	taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname); +	if (!taking_projection && !struct_same) {  		verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",  			meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,  			btf_type_str(reg_ref_t), reg_ref_tname); @@ -11640,7 +11687,7 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,  	node_off = reg->off + reg->var_off.value;  	field = reg_find_field_offset(reg, node_off, node_field_type); -	if (!field || field->offset != node_off) { +	if (!field) {  		verbose(env, "%s not found at offset=%u\n", node_type_name, node_off);  		return -EINVAL;  	} @@ -11872,12 +11919,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_  					return -EINVAL;  				}  			} -  			fallthrough;  		case KF_ARG_PTR_TO_CTX: -			/* Trusted arguments have the same offset checks as release arguments */ -			arg_type |= OBJ_RELEASE; -			break;  		case KF_ARG_PTR_TO_DYNPTR:  		case KF_ARG_PTR_TO_ITER:  		case KF_ARG_PTR_TO_LIST_HEAD: @@ -11890,7 +11933,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_  		case KF_ARG_PTR_TO_REFCOUNTED_KPTR:  		case KF_ARG_PTR_TO_CONST_STR:  		case KF_ARG_PTR_TO_WORKQUEUE: -			/* Trusted by default */  			break;  		default:  			WARN_ON_ONCE(1); @@ -11946,12 +11988,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_  			enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;  			int clone_ref_obj_id = 0; -			if (reg->type != PTR_TO_STACK && -			    reg->type != CONST_PTR_TO_DYNPTR) { -				verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i); -				return -EINVAL; -			} -  			if (reg->type == CONST_PTR_TO_DYNPTR)  				dynptr_arg_type |= MEM_RDONLY; @@ -12690,46 +12726,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  	return 0;  } -static bool signed_add_overflows(s64 a, s64 b) -{ -	/* Do the add in u64, where overflow is well-defined */ -	s64 res = (s64)((u64)a + (u64)b); - -	if (b < 0) -		return res > a; -	return res < a; -} - -static bool signed_add32_overflows(s32 a, s32 b) -{ -	/* Do the add in u32, where overflow is well-defined */ -	s32 res = (s32)((u32)a + (u32)b); - -	if (b < 0) -		return res > a; -	return res < a; -} - -static bool signed_sub_overflows(s64 a, s64 b) -{ -	/* Do the sub in u64, where overflow is well-defined */ -	s64 res = (s64)((u64)a - (u64)b); - -	if (b < 0) -		return res < a; -	return res > a; -} - -static bool signed_sub32_overflows(s32 a, s32 b) -{ -	/* Do the sub in u32, where overflow is well-defined */ -	s32 res = (s32)((u32)a - (u32)b); - -	if (b < 0) -		return res < a; -	return res > a; -} -  static bool check_reg_sane_offset(struct bpf_verifier_env *env,  				  const struct bpf_reg_state *reg,  				  enum bpf_reg_type type) @@ -13211,21 +13207,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  		 * added into the variable offset, and we copy the fixed offset  		 * from ptr_reg.  		 
*/ -		if (signed_add_overflows(smin_ptr, smin_val) || -		    signed_add_overflows(smax_ptr, smax_val)) { +		if (check_add_overflow(smin_ptr, smin_val, &dst_reg->smin_value) || +		    check_add_overflow(smax_ptr, smax_val, &dst_reg->smax_value)) {  			dst_reg->smin_value = S64_MIN;  			dst_reg->smax_value = S64_MAX; -		} else { -			dst_reg->smin_value = smin_ptr + smin_val; -			dst_reg->smax_value = smax_ptr + smax_val;  		} -		if (umin_ptr + umin_val < umin_ptr || -		    umax_ptr + umax_val < umax_ptr) { +		if (check_add_overflow(umin_ptr, umin_val, &dst_reg->umin_value) || +		    check_add_overflow(umax_ptr, umax_val, &dst_reg->umax_value)) {  			dst_reg->umin_value = 0;  			dst_reg->umax_value = U64_MAX; -		} else { -			dst_reg->umin_value = umin_ptr + umin_val; -			dst_reg->umax_value = umax_ptr + umax_val;  		}  		dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);  		dst_reg->off = ptr_reg->off; @@ -13268,14 +13258,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  		/* A new variable offset is created.  If the subtrahend is known  		 * nonnegative, then any reg->range we had before is still good.  		 */ -		if (signed_sub_overflows(smin_ptr, smax_val) || -		    signed_sub_overflows(smax_ptr, smin_val)) { +		if (check_sub_overflow(smin_ptr, smax_val, &dst_reg->smin_value) || +		    check_sub_overflow(smax_ptr, smin_val, &dst_reg->smax_value)) {  			/* Overflow possible, we know nothing */  			dst_reg->smin_value = S64_MIN;  			dst_reg->smax_value = S64_MAX; -		} else { -			dst_reg->smin_value = smin_ptr - smax_val; -			dst_reg->smax_value = smax_ptr - smin_val;  		}  		if (umin_ptr < umax_val) {  			/* Overflow possible, we know nothing */ @@ -13328,71 +13315,56 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  static void scalar32_min_max_add(struct bpf_reg_state *dst_reg,  				 struct bpf_reg_state *src_reg)  { -	s32 smin_val = src_reg->s32_min_value; -	s32 smax_val = src_reg->s32_max_value; -	u32 umin_val = src_reg->u32_min_value; -	u32 umax_val = src_reg->u32_max_value; +	s32 *dst_smin = &dst_reg->s32_min_value; +	s32 *dst_smax = &dst_reg->s32_max_value; +	u32 *dst_umin = &dst_reg->u32_min_value; +	u32 *dst_umax = &dst_reg->u32_max_value; -	if (signed_add32_overflows(dst_reg->s32_min_value, smin_val) || -	    signed_add32_overflows(dst_reg->s32_max_value, smax_val)) { -		dst_reg->s32_min_value = S32_MIN; -		dst_reg->s32_max_value = S32_MAX; -	} else { -		dst_reg->s32_min_value += smin_val; -		dst_reg->s32_max_value += smax_val; +	if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) || +	    check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) { +		*dst_smin = S32_MIN; +		*dst_smax = S32_MAX;  	} -	if (dst_reg->u32_min_value + umin_val < umin_val || -	    dst_reg->u32_max_value + umax_val < umax_val) { -		dst_reg->u32_min_value = 0; -		dst_reg->u32_max_value = U32_MAX; -	} else { -		dst_reg->u32_min_value += umin_val; -		dst_reg->u32_max_value += umax_val; +	if (check_add_overflow(*dst_umin, src_reg->u32_min_value, dst_umin) || +	    check_add_overflow(*dst_umax, src_reg->u32_max_value, dst_umax)) { +		*dst_umin = 0; +		*dst_umax = U32_MAX;  	}  }  static void scalar_min_max_add(struct bpf_reg_state *dst_reg,  			       struct bpf_reg_state *src_reg)  { -	s64 smin_val = src_reg->smin_value; -	s64 smax_val = src_reg->smax_value; -	u64 umin_val = src_reg->umin_value; -	u64 umax_val = src_reg->umax_value; +	s64 *dst_smin = &dst_reg->smin_value; +	s64 *dst_smax = &dst_reg->smax_value; +	u64 *dst_umin = 
&dst_reg->umin_value; +	u64 *dst_umax = &dst_reg->umax_value; -	if (signed_add_overflows(dst_reg->smin_value, smin_val) || -	    signed_add_overflows(dst_reg->smax_value, smax_val)) { -		dst_reg->smin_value = S64_MIN; -		dst_reg->smax_value = S64_MAX; -	} else { -		dst_reg->smin_value += smin_val; -		dst_reg->smax_value += smax_val; +	if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) || +	    check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) { +		*dst_smin = S64_MIN; +		*dst_smax = S64_MAX;  	} -	if (dst_reg->umin_value + umin_val < umin_val || -	    dst_reg->umax_value + umax_val < umax_val) { -		dst_reg->umin_value = 0; -		dst_reg->umax_value = U64_MAX; -	} else { -		dst_reg->umin_value += umin_val; -		dst_reg->umax_value += umax_val; +	if (check_add_overflow(*dst_umin, src_reg->umin_value, dst_umin) || +	    check_add_overflow(*dst_umax, src_reg->umax_value, dst_umax)) { +		*dst_umin = 0; +		*dst_umax = U64_MAX;  	}  }  static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,  				 struct bpf_reg_state *src_reg)  { -	s32 smin_val = src_reg->s32_min_value; -	s32 smax_val = src_reg->s32_max_value; +	s32 *dst_smin = &dst_reg->s32_min_value; +	s32 *dst_smax = &dst_reg->s32_max_value;  	u32 umin_val = src_reg->u32_min_value;  	u32 umax_val = src_reg->u32_max_value; -	if (signed_sub32_overflows(dst_reg->s32_min_value, smax_val) || -	    signed_sub32_overflows(dst_reg->s32_max_value, smin_val)) { +	if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) || +	    check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) {  		/* Overflow possible, we know nothing */ -		dst_reg->s32_min_value = S32_MIN; -		dst_reg->s32_max_value = S32_MAX; -	} else { -		dst_reg->s32_min_value -= smax_val; -		dst_reg->s32_max_value -= smin_val; +		*dst_smin = S32_MIN; +		*dst_smax = S32_MAX;  	}  	if (dst_reg->u32_min_value < umax_val) {  		/* Overflow possible, we know nothing */ @@ -13408,19 +13380,16 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,  static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,  			       struct bpf_reg_state *src_reg)  { -	s64 smin_val = src_reg->smin_value; -	s64 smax_val = src_reg->smax_value; +	s64 *dst_smin = &dst_reg->smin_value; +	s64 *dst_smax = &dst_reg->smax_value;  	u64 umin_val = src_reg->umin_value;  	u64 umax_val = src_reg->umax_value; -	if (signed_sub_overflows(dst_reg->smin_value, smax_val) || -	    signed_sub_overflows(dst_reg->smax_value, smin_val)) { +	if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) || +	    check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) {  		/* Overflow possible, we know nothing */ -		dst_reg->smin_value = S64_MIN; -		dst_reg->smax_value = S64_MAX; -	} else { -		dst_reg->smin_value -= smax_val; -		dst_reg->smax_value -= smin_val; +		*dst_smin = S64_MIN; +		*dst_smax = S64_MAX;  	}  	if (dst_reg->umin_value < umax_val) {  		/* Overflow possible, we know nothing */ @@ -14026,6 +13995,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,  	struct bpf_func_state *state = vstate->frame[vstate->curframe];  	struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;  	struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; +	bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);  	u8 opcode = BPF_OP(insn->code);  	int err; @@ -14048,11 +14018,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,  	if (dst_reg->type != SCALAR_VALUE)  		ptr_reg = dst_reg; -	else -		/* Make sure ID is cleared otherwise dst_reg min/max could 
be -		 * incorrectly propagated into other registers by find_equal_scalars() -		 */ -		dst_reg->id = 0; +  	if (BPF_SRC(insn->code) == BPF_X) {  		src_reg = ®s[insn->src_reg];  		if (src_reg->type != SCALAR_VALUE) { @@ -14116,7 +14082,43 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,  		verbose(env, "verifier internal error: no src_reg\n");  		return -EINVAL;  	} -	return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); +	err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); +	if (err) +		return err; +	/* +	 * Compilers can generate the code +	 * r1 = r2 +	 * r1 += 0x1 +	 * if r2 < 1000 goto ... +	 * use r1 in memory access +	 * So remember constant delta between r2 and r1 and update r1 after +	 * 'if' condition. +	 */ +	if (env->bpf_capable && BPF_OP(insn->code) == BPF_ADD && +	    dst_reg->id && is_reg_const(src_reg, alu32)) { +		u64 val = reg_const_value(src_reg, alu32); + +		if ((dst_reg->id & BPF_ADD_CONST) || +		    /* prevent overflow in find_equal_scalars() later */ +		    val > (u32)S32_MAX) { +			/* +			 * If the register already went through rX += val +			 * we cannot accumulate another val into rx->off. +			 */ +			dst_reg->off = 0; +			dst_reg->id = 0; +		} else { +			dst_reg->id |= BPF_ADD_CONST; +			dst_reg->off = val; +		} +	} else { +		/* +		 * Make sure ID is cleared otherwise dst_reg min/max could be +		 * incorrectly propagated into other registers by find_equal_scalars() +		 */ +		dst_reg->id = 0; +	} +	return 0;  }  /* check validity of 32-bit and 64-bit arithmetic operations */ @@ -15088,12 +15090,36 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,  static void find_equal_scalars(struct bpf_verifier_state *vstate,  			       struct bpf_reg_state *known_reg)  { +	struct bpf_reg_state fake_reg;  	struct bpf_func_state *state;  	struct bpf_reg_state *reg;  	bpf_for_each_reg_in_vstate(vstate, state, reg, ({ -		if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) +		if (reg->type != SCALAR_VALUE || reg == known_reg) +			continue; +		if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST)) +			continue; +		if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) || +		    reg->off == known_reg->off) { +			copy_register_state(reg, known_reg); +		} else { +			s32 saved_off = reg->off; + +			fake_reg.type = SCALAR_VALUE; +			__mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off); + +			/* reg = known_reg; reg += delta */  			copy_register_state(reg, known_reg); +			/* +			 * Must preserve off, id and add_const flag, +			 * otherwise another find_equal_scalars() will be incorrect. 
+			 */ +			reg->off = saved_off; + +			scalar32_min_max_add(reg, &fake_reg); +			scalar_min_max_add(reg, &fake_reg); +			reg->var_off = tnum_add(reg->var_off, fake_reg.var_off); +		}  	}));  } @@ -15105,7 +15131,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,  	struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;  	struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;  	struct bpf_reg_state *eq_branch_regs; -	struct bpf_reg_state fake_reg = {};  	u8 opcode = BPF_OP(insn->code);  	bool is_jmp32;  	int pred = -1; @@ -15171,7 +15196,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,  			verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");  			return -EINVAL;  		} -		src_reg = &fake_reg; +		src_reg = &env->fake_reg[0]; +		memset(src_reg, 0, sizeof(*src_reg));  		src_reg->type = SCALAR_VALUE;  		__mark_reg_known(src_reg, insn->imm);  	} @@ -15231,10 +15257,16 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,  				      &other_branch_regs[insn->src_reg],  				      dst_reg, src_reg, opcode, is_jmp32);  	} else /* BPF_SRC(insn->code) == BPF_K */ { +		/* reg_set_min_max() can mangle the fake_reg. Make a copy +		 * so that these are two different memory locations. The +		 * src_reg is not used beyond here in context of K. +		 */ +		memcpy(&env->fake_reg[1], &env->fake_reg[0], +		       sizeof(env->fake_reg[0]));  		err = reg_set_min_max(env,  				      &other_branch_regs[insn->dst_reg], -				      src_reg /* fake one */, -				      dst_reg, src_reg /* same fake one */, +				      &env->fake_reg[0], +				      dst_reg, &env->fake_reg[1],  				      opcode, is_jmp32);  	}  	if (err) @@ -16722,6 +16754,10 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,  		}  		if (!rold->precise && exact == NOT_EXACT)  			return true; +		if ((rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST)) +			return false; +		if ((rold->id & BPF_ADD_CONST) && (rold->off != rcur->off)) +			return false;  		/* Why check_ids() for scalar registers?  		 
*  		 * Consider the following BPF code: @@ -16848,8 +16884,9 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,  		spi = i / BPF_REG_SIZE;  		if (exact != NOT_EXACT && -		    old->stack[spi].slot_type[i % BPF_REG_SIZE] != -		    cur->stack[spi].slot_type[i % BPF_REG_SIZE]) +		    (i >= cur->allocated_stack || +		     old->stack[spi].slot_type[i % BPF_REG_SIZE] != +		     cur->stack[spi].slot_type[i % BPF_REG_SIZE]))  			return false;  		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) @@ -17433,11 +17470,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  				goto skip_inf_loop_check;  			}  			if (is_may_goto_insn_at(env, insn_idx)) { -				if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) { +				if (sl->state.may_goto_depth != cur->may_goto_depth && +				    states_equal(env, &sl->state, cur, RANGE_WITHIN)) {  					update_loop_entry(cur, &sl->state);  					goto hit;  				} -				goto skip_inf_loop_check;  			}  			if (calls_callback(env, insn_idx)) {  				if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) @@ -18603,8 +18640,7 @@ static void release_maps(struct bpf_verifier_env *env)  /* drop refcnt of maps used by the rejected program */  static void release_btfs(struct bpf_verifier_env *env)  { -	__bpf_free_used_btfs(env->prog->aux, env->used_btfs, -			     env->used_btf_cnt); +	__bpf_free_used_btfs(env->used_btfs, env->used_btf_cnt);  }  /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ @@ -18715,6 +18751,41 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of  	return new_prog;  } +/* + * For all jmp insns in a given 'prog' that point to 'tgt_idx' insn adjust the + * jump offset by 'delta'. + */ +static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta) +{ +	struct bpf_insn *insn = prog->insnsi; +	u32 insn_cnt = prog->len, i; +	s32 imm; +	s16 off; + +	for (i = 0; i < insn_cnt; i++, insn++) { +		u8 code = insn->code; + +		if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) || +		    BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT) +			continue; + +		if (insn->code == (BPF_JMP32 | BPF_JA)) { +			if (i + 1 + insn->imm != tgt_idx) +				continue; +			if (check_add_overflow(insn->imm, delta, &imm)) +				return -ERANGE; +			insn->imm = imm; +		} else { +			if (i + 1 + insn->off != tgt_idx) +				continue; +			if (check_add_overflow(insn->off, delta, &off)) +				return -ERANGE; +			insn->off = off; +		} +	} +	return 0; +} +  static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,  					      u32 off, u32 cnt)  { @@ -19989,7 +20060,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env)  			stack_depth_extra = 8;  			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off); -			insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2); +			if (insn->off >= 0) +				insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2); +			else +				insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);  			insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);  			insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off);  			cnt = 4; @@ -20305,7 +20379,7 @@ patch_map_ops_generic:  			goto next_insn;  		} -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)  		/* Implement bpf_get_smp_processor_id() inline. 
*/  		if (insn->imm == BPF_FUNC_get_smp_processor_id &&  		    prog->jit_requested && bpf_jit_supports_percpu_insn()) { @@ -20531,6 +20605,13 @@ next_insn:  		if (!new_prog)  			return -ENOMEM;  		env->prog = prog = new_prog; +		/* +		 * If may_goto is a first insn of a prog there could be a jmp +		 * insn that points to it, hence adjust all such jmps to point +		 * to insn after BPF_ST that inits may_goto count. +		 * Adjustment will succeed because bpf_patch_insn_data() didn't fail. +		 */ +		WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1));  	}  	/* Since poke tab is now finalized, publish aux to tracker. */ @@ -21052,8 +21133,12 @@ BTF_SET_START(btf_non_sleepable_error_inject)   * Assume non-sleepable from bpf safety point of view.   */  BTF_ID(func, __filemap_add_folio) +#ifdef CONFIG_FAIL_PAGE_ALLOC  BTF_ID(func, should_fail_alloc_page) +#endif +#ifdef CONFIG_FAILSLAB  BTF_ID(func, should_failslab) +#endif  BTF_SET_END(btf_non_sleepable_error_inject)  static int check_non_sleepable_error_inject(u32 btf_id) | 
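
The bpf_timer_cancel_and_free() comment above describes why the free path now punts to a workqueue: two timer callbacks on two CPUs can each delete the map element that owns the other timer, and each hrtimer_cancel() would then wait on the other callback. A hedged sketch of a BPF program shape that can produce exactly that situation -- the map name, element layout and SEC() placement are illustrative; only bpf_timer_*() and bpf_map_delete_elem() are the long-standing helper APIs:

/* Assumes vmlinux.h plus <bpf/bpf_helpers.h>, built with clang -target bpf. */
struct elem {
	struct bpf_timer t;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 2);
	__type(key, int);
	__type(value, struct elem);
} timers SEC(".maps");

/* Each element's timer is armed elsewhere with bpf_timer_init(),
 * bpf_timer_set_callback(..., timer_cb) and bpf_timer_start().  The
 * callback for element *key deletes the *other* element, so callbacks
 * firing concurrently on two CPUs can each free the timer whose callback
 * is still running -- the cross-CPU case the workqueue punt avoids.
 */
static int timer_cb(void *map, int *key, struct elem *val)
{
	int other = 1 - *key;

	bpf_map_delete_elem(map, &other);
	return 0;
}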
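
The dynptr hunks above only change the kernel-side signatures from struct bpf_dynptr_kern to the UAPI-visible struct bpf_dynptr (casting internally), so BPF-side callers keep the same prototype. A small sketch of the usual slice-with-fallback pattern; the extern declaration mirrors the kernel signature above, and the dynptr is assumed to have been produced elsewhere (for example by bpf_dynptr_from_skb()):

extern void *bpf_dynptr_slice(const struct bpf_dynptr *p, __u32 offset,
			      void *buffer__opt, __u32 buffer__szk) __ksym;

/* Read a 4-byte field at offset 0.  If the bytes are not available as one
 * contiguous region, the kfunc copies them into 'buf' and returns 'buf'
 * instead of a direct pointer, so 'val' is always safe to dereference.
 */
static long read_field(struct bpf_dynptr *dptr)
{
	__u32 buf;
	__u32 *val;

	val = bpf_dynptr_slice(dptr, 0, &buf, sizeof(buf));
	if (!val)
		return -1;	/* NULL dynptr or out-of-bounds request */
	return *val;
}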
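
The bpf_iter_bits_new()/_next()/_destroy() kfuncs added in the helpers.c hunk above follow the usual open-coded iterator contract (KF_ITER_NEW/NEXT/DESTROY). A minimal sketch of walking the bits of one 64-bit word; the struct definition is copied from the hunk only to keep the sketch self-contained (it would normally come from vmlinux.h), and the __ksym externs are the assumed BPF-side prototypes:

struct bpf_iter_bits {
	__u64 __opaque[2];
} __attribute__((aligned(8)));

extern int bpf_iter_bits_new(struct bpf_iter_bits *it,
			     const __u64 *unsafe_ptr__ign, __u32 nr_words) __ksym;
extern int *bpf_iter_bits_next(struct bpf_iter_bits *it) __ksym;
extern void bpf_iter_bits_destroy(struct bpf_iter_bits *it) __ksym;

/* Count the set bits of a single word.  nr_words == 1 takes the bits_copy
 * fast path; larger areas go through bpf_mem_alloc() and are capped as the
 * kernel-doc above notes.
 */
static __u64 count_bits(const __u64 *word)
{
	struct bpf_iter_bits it;
	__u64 nr_set = 0;
	int *bit;

	bpf_iter_bits_new(&it, word, 1);
	while ((bit = bpf_iter_bits_next(&it)))
		nr_set++;		/* *bit is the index of the set bit */
	bpf_iter_bits_destroy(&it);
	return nr_set;
}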
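
The ringbuf.c hunk above introduces pending_pos, the position of the oldest record whose header still carries BPF_RINGBUF_BUSY_BIT, and rejects a reserve whose distance from that record would exceed the buffer size, so a not-yet-committed record can no longer be overrun by later producers. A sketch of the out-of-order commit pattern that creates such a pending record, using the existing ringbuf helpers (map size and record length are arbitrary):

struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(max_entries, 4096);
} rb SEC(".maps");

/* 'a' stays BUSY until it is submitted or discarded; with the pending_pos
 * check, reserves that would wrap past a's header now fail instead.
 */
static void out_of_order_commit(void)
{
	void *a = bpf_ringbuf_reserve(&rb, 64, 0);
	void *b = bpf_ringbuf_reserve(&rb, 64, 0);

	if (b)
		bpf_ringbuf_submit(b, 0);	/* committed before a */
	if (a)
		bpf_ringbuf_submit(a, 0);
}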
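
The syscall.c changes above add a second set of file operations, bpf_link_fops_poll, used only when a link's ops provide a .poll handler, and teach the fd-based lookups to accept either variant. For such links user space can simply poll() or epoll-wait on the link fd; which link types implement .poll, and what readiness means for them, is outside this hunk. A minimal user-space sketch, assuming link_fd was obtained elsewhere (e.g. via bpf_link_create()):

#include <poll.h>
#include <stdio.h>

static int wait_on_link(int link_fd, int timeout_ms)
{
	struct pollfd pfd = {
		.fd     = link_fd,
		.events = POLLIN | POLLHUP,
	};
	int ret = poll(&pfd, 1, timeout_ms);

	if (ret > 0)
		printf("link fd %d ready, revents=0x%x\n", link_fd, pfd.revents);
	return ret;	/* 0 on timeout, <0 on error */
}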
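
The verifier hunks above replace the hand-rolled signed_add_overflows()/signed_sub_overflows() helpers with check_add_overflow()/check_sub_overflow() from <linux/overflow.h>, which wrap the compiler's __builtin_*_overflow: they store the wrapped result and return true when it overflowed, letting the bounds code saturate in a single branch. A stand-alone user-space illustration of that saturation pattern (using the builtin directly, since the kernel macro is not available here):

#include <stdint.h>
#include <stdio.h>

/* Same shape as the new scalar_min_max_add(): on any overflow, widen the
 * signed bounds to the full range rather than keep a wrapped value.
 */
static void bounds_add(int64_t *smin, int64_t *smax, int64_t add_min, int64_t add_max)
{
	if (__builtin_add_overflow(*smin, add_min, smin) ||
	    __builtin_add_overflow(*smax, add_max, smax)) {
		*smin = INT64_MIN;
		*smax = INT64_MAX;
	}
}

int main(void)
{
	int64_t smin = INT64_MAX - 1, smax = INT64_MAX;

	bounds_add(&smin, &smax, 0, 2);		/* smax would overflow */
	printf("smin=%lld smax=%lld\n", (long long)smin, (long long)smax);
	return 0;
}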
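
The adjust_reg_min_max_vals() and find_equal_scalars() changes above let the verifier remember a constant delta between two scalar registers (the BPF_ADD_CONST flag in reg->id plus the delta in reg->off), so a range proven for one register after a conditional jump is also applied, shifted by the delta, to its offset copy. A hypothetical C shape that can compile to the quoted 'r1 = r2; r1 += 1; if r2 < ...' pattern; whether a given compiler emits exactly this form is not guaranteed:

__u32 vals[64];			/* global array, backed by an array map */

static __u32 peek_next(__u32 idx)
{
	__u32 next = idx + 1;	/* copy of idx with a known +1 delta */

	if (idx >= 63)		/* bound is proven on idx ...           */
		return 0;
	return vals[next];	/* ... and the recorded delta lets it
				 * carry over to next for this access   */
}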
