Diffstat (limited to 'kernel/bpf')
-rw-r--r--   kernel/bpf/Makefile       |   2
-rw-r--r--   kernel/bpf/arraymap.c     | 137
-rw-r--r--   kernel/bpf/bpf_lru_list.c |   2
-rw-r--r--   kernel/bpf/cgroup.c       |   5
-rw-r--r--   kernel/bpf/core.c         |  21
-rw-r--r--   kernel/bpf/hashtab.c      | 185
-rw-r--r--   kernel/bpf/lpm_trie.c     |  14
-rw-r--r--   kernel/bpf/map_in_map.c   |  97
-rw-r--r--   kernel/bpf/map_in_map.h   |  23
-rw-r--r--   kernel/bpf/stackmap.c     |  14
-rw-r--r--   kernel/bpf/syscall.c      | 181
-rw-r--r--   kernel/bpf/verifier.c     | 198
12 files changed, 625 insertions, 254 deletions
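The bulk of the new code below implements map-in-map support: two new map types, BPF_MAP_TYPE_ARRAY_OF_MAPS and BPF_MAP_TYPE_HASH_OF_MAPS, whose slots hold other maps and which are created against a template map passed through the new inner_map_fd attribute (note BPF_MAP_CREATE_LAST_FIELD moving to inner_map_fd in syscall.c). The remaining changes replace the boot-time map/prog type registration lists with static tables generated from <linux/bpf_types.h>, add map_gen_lookup() so eligible bpf_map_lookup_elem() calls can be inlined by the verifier, and introduce the BPF_PROG_TEST_RUN command. The following is a minimal user-space sketch of creating an outer array-of-maps; it assumes the uapi definitions from this tree's include/uapi/linux/bpf.h, and the helper names are illustrative only, not an existing API.

/* Sketch only: user-space creation of an outer "array of maps".  Assumes
 * the uapi from this tree's include/uapi/linux/bpf.h (BPF_MAP_TYPE_ARRAY_OF_MAPS
 * and the inner_map_fd attribute added here); function names are illustrative.
 */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int sys_bpf_map_create(union bpf_attr *attr)
{
	return syscall(__NR_bpf, BPF_MAP_CREATE, attr, sizeof(*attr));
}

int create_array_of_maps(void)
{
	union bpf_attr attr;
	int inner_fd, outer_fd;

	/* Inner map used as the template.  Maps stored in the outer map later
	 * must match its type, key_size, value_size, map_flags and max_entries
	 * (bpf_map_meta_equal() compares exactly these fields).
	 */
	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_HASH;
	attr.key_size    = sizeof(__u32);
	attr.value_size  = sizeof(__u64);
	attr.max_entries = 1024;
	inner_fd = sys_bpf_map_create(&attr);
	if (inner_fd < 0)
		return -1;

	/* Outer map: values are map fds, so value_size must be sizeof(u32)
	 * (enforced by fd_array_map_alloc()/fd_htab_map_alloc()).  The template
	 * fd is only used to record metadata and is not stored in the map.
	 */
	memset(&attr, 0, sizeof(attr));
	attr.map_type     = BPF_MAP_TYPE_ARRAY_OF_MAPS;
	attr.key_size     = sizeof(__u32);
	attr.value_size   = sizeof(__u32);
	attr.max_entries  = 8;
	attr.inner_map_fd = inner_fd;
	outer_fd = sys_bpf_map_create(&attr);

	/* Actual inner maps are plugged in from user space afterwards with
	 * BPF_MAP_UPDATE_ELEM, passing a map fd as the value.
	 */
	return outer_fd;
}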
| diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e1ce4f4fd7fd..e1e5e658f2db 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@  obj-y := core.o  obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o  ifeq ($(CONFIG_PERF_EVENTS),y)  obj-$(CONFIG_BPF_SYSCALL) += stackmap.o  endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 6b6f41f0b211..5e00b2333c26 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1,4 +1,5 @@  /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016,2017 Facebook   *   * This program is free software; you can redistribute it and/or   * modify it under the terms of version 2 of the GNU General Public @@ -16,6 +17,8 @@  #include <linux/filter.h>  #include <linux/perf_event.h> +#include "map_in_map.h" +  static void bpf_array_free_percpu(struct bpf_array *array)  {  	int i; @@ -113,6 +116,30 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)  	return array->value + array->elem_size * index;  } +/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ +static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ +	struct bpf_insn *insn = insn_buf; +	u32 elem_size = round_up(map->value_size, 8); +	const int ret = BPF_REG_0; +	const int map_ptr = BPF_REG_1; +	const int index = BPF_REG_2; + +	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); +	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); +	*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + +	if (is_power_of_2(elem_size)) { +		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); +	} else { +		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); +	} +	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); +	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); +	*insn++ = BPF_MOV64_IMM(ret, 0); +	return insn - insn_buf; +} +  /* Called from eBPF program */  static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)  { @@ -155,7 +182,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)  static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)  {  	struct bpf_array *array = container_of(map, struct bpf_array, map); -	u32 index = *(u32 *)key; +	u32 index = key ? 
*(u32 *)key : U32_MAX;  	u32 *next = (u32 *)next_key;  	if (index >= array->map.max_entries) { @@ -260,21 +287,17 @@ static void array_map_free(struct bpf_map *map)  	bpf_map_area_free(array);  } -static const struct bpf_map_ops array_ops = { +const struct bpf_map_ops array_map_ops = {  	.map_alloc = array_map_alloc,  	.map_free = array_map_free,  	.map_get_next_key = array_map_get_next_key,  	.map_lookup_elem = array_map_lookup_elem,  	.map_update_elem = array_map_update_elem,  	.map_delete_elem = array_map_delete_elem, +	.map_gen_lookup = array_map_gen_lookup,  }; -static struct bpf_map_type_list array_type __ro_after_init = { -	.ops = &array_ops, -	.type = BPF_MAP_TYPE_ARRAY, -}; - -static const struct bpf_map_ops percpu_array_ops = { +const struct bpf_map_ops percpu_array_map_ops = {  	.map_alloc = array_map_alloc,  	.map_free = array_map_free,  	.map_get_next_key = array_map_get_next_key, @@ -283,19 +306,6 @@ static const struct bpf_map_ops percpu_array_ops = {  	.map_delete_elem = array_map_delete_elem,  }; -static struct bpf_map_type_list percpu_array_type __ro_after_init = { -	.ops = &percpu_array_ops, -	.type = BPF_MAP_TYPE_PERCPU_ARRAY, -}; - -static int __init register_array_map(void) -{ -	bpf_register_map_type(&array_type); -	bpf_register_map_type(&percpu_array_type); -	return 0; -} -late_initcall(register_array_map); -  static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)  {  	/* only file descriptors can be stored in this type of map */ @@ -399,7 +409,7 @@ void bpf_fd_array_map_clear(struct bpf_map *map)  		fd_array_map_delete_elem(map, &i);  } -static const struct bpf_map_ops prog_array_ops = { +const struct bpf_map_ops prog_array_map_ops = {  	.map_alloc = fd_array_map_alloc,  	.map_free = fd_array_map_free,  	.map_get_next_key = array_map_get_next_key, @@ -409,18 +419,6 @@ static const struct bpf_map_ops prog_array_ops = {  	.map_fd_put_ptr = prog_fd_array_put_ptr,  }; -static struct bpf_map_type_list prog_array_type __ro_after_init = { -	.ops = &prog_array_ops, -	.type = BPF_MAP_TYPE_PROG_ARRAY, -}; - -static int __init register_prog_array_map(void) -{ -	bpf_register_map_type(&prog_array_type); -	return 0; -} -late_initcall(register_prog_array_map); -  static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,  						   struct file *map_file)  { @@ -511,7 +509,7 @@ static void perf_event_fd_array_release(struct bpf_map *map,  	rcu_read_unlock();  } -static const struct bpf_map_ops perf_event_array_ops = { +const struct bpf_map_ops perf_event_array_map_ops = {  	.map_alloc = fd_array_map_alloc,  	.map_free = fd_array_map_free,  	.map_get_next_key = array_map_get_next_key, @@ -522,18 +520,6 @@ static const struct bpf_map_ops perf_event_array_ops = {  	.map_release = perf_event_fd_array_release,  }; -static struct bpf_map_type_list perf_event_array_type __ro_after_init = { -	.ops = &perf_event_array_ops, -	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, -}; - -static int __init register_perf_event_array_map(void) -{ -	bpf_register_map_type(&perf_event_array_type); -	return 0; -} -late_initcall(register_perf_event_array_map); -  #ifdef CONFIG_CGROUPS  static void *cgroup_fd_array_get_ptr(struct bpf_map *map,  				     struct file *map_file /* not used */, @@ -554,7 +540,7 @@ static void cgroup_fd_array_free(struct bpf_map *map)  	fd_array_map_free(map);  } -static const struct bpf_map_ops cgroup_array_ops = { +const struct bpf_map_ops cgroup_array_map_ops = {  	.map_alloc = fd_array_map_alloc,  	.map_free = cgroup_fd_array_free,  	.map_get_next_key = 
array_map_get_next_key, @@ -563,16 +549,53 @@ static const struct bpf_map_ops cgroup_array_ops = {  	.map_fd_get_ptr = cgroup_fd_array_get_ptr,  	.map_fd_put_ptr = cgroup_fd_array_put_ptr,  }; +#endif -static struct bpf_map_type_list cgroup_array_type __ro_after_init = { -	.ops = &cgroup_array_ops, -	.type = BPF_MAP_TYPE_CGROUP_ARRAY, -}; +static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) +{ +	struct bpf_map *map, *inner_map_meta; + +	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); +	if (IS_ERR(inner_map_meta)) +		return inner_map_meta; -static int __init register_cgroup_array_map(void) +	map = fd_array_map_alloc(attr); +	if (IS_ERR(map)) { +		bpf_map_meta_free(inner_map_meta); +		return map; +	} + +	map->inner_map_meta = inner_map_meta; + +	return map; +} + +static void array_of_map_free(struct bpf_map *map)  { -	bpf_register_map_type(&cgroup_array_type); -	return 0; +	/* map->inner_map_meta is only accessed by syscall which +	 * is protected by fdget/fdput. +	 */ +	bpf_map_meta_free(map->inner_map_meta); +	bpf_fd_array_map_clear(map); +	fd_array_map_free(map);  } -late_initcall(register_cgroup_array_map); -#endif + +static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) +{ +	struct bpf_map **inner_map = array_map_lookup_elem(map, key); + +	if (!inner_map) +		return NULL; + +	return READ_ONCE(*inner_map); +} + +const struct bpf_map_ops array_of_maps_map_ops = { +	.map_alloc = array_of_map_alloc, +	.map_free = array_of_map_free, +	.map_get_next_key = array_map_get_next_key, +	.map_lookup_elem = array_of_map_lookup_elem, +	.map_delete_elem = fd_array_map_delete_elem, +	.map_fd_get_ptr = bpf_map_fd_get_ptr, +	.map_fd_put_ptr = bpf_map_fd_put_ptr, +}; diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index f62d1d56f41d..e6ef4401a138 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -13,7 +13,7 @@  #define LOCAL_FREE_TARGET		(128)  #define LOCAL_NR_SCANS			LOCAL_FREE_TARGET -#define PERCPU_FREE_TARGET		(16) +#define PERCPU_FREE_TARGET		(4)  #define PERCPU_NR_SCANS			PERCPU_FREE_TARGET  /* Helpers to get the local list index */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index da0f53690295..ea6033cba947 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -154,7 +154,7 @@ int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,  /**   * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering - * @sk: The socken sending or receiving traffic + * @sk: The socket sending or receiving traffic   * @skb: The skb that is being sent or received   * @type: The type of program to be exectuted   * @@ -189,10 +189,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,  	prog = rcu_dereference(cgrp->bpf.effective[type]);  	if (prog) {  		unsigned int offset = skb->data - skb_network_header(skb); +		struct sock *save_sk = skb->sk; +		skb->sk = sk;  		__skb_push(skb, offset);  		ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 
0 : -EPERM;  		__skb_pull(skb, offset); +		skb->sk = save_sk;  	}  	rcu_read_unlock(); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b4f1cb0c5ac7..dedf367f59bb 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -76,8 +76,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns  struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)  { -	gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | -			  gfp_extra_flags; +	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;  	struct bpf_prog_aux *aux;  	struct bpf_prog *fp; @@ -107,8 +106,7 @@ EXPORT_SYMBOL_GPL(bpf_prog_alloc);  struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,  				  gfp_t gfp_extra_flags)  { -	gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | -			  gfp_extra_flags; +	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;  	struct bpf_prog *fp;  	u32 pages, delta;  	int ret; @@ -394,27 +392,23 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)  void bpf_prog_kallsyms_add(struct bpf_prog *fp)  { -	unsigned long flags; -  	if (!bpf_prog_kallsyms_candidate(fp) ||  	    !capable(CAP_SYS_ADMIN))  		return; -	spin_lock_irqsave(&bpf_lock, flags); +	spin_lock_bh(&bpf_lock);  	bpf_prog_ksym_node_add(fp->aux); -	spin_unlock_irqrestore(&bpf_lock, flags); +	spin_unlock_bh(&bpf_lock);  }  void bpf_prog_kallsyms_del(struct bpf_prog *fp)  { -	unsigned long flags; -  	if (!bpf_prog_kallsyms_candidate(fp))  		return; -	spin_lock_irqsave(&bpf_lock, flags); +	spin_lock_bh(&bpf_lock);  	bpf_prog_ksym_node_del(fp->aux); -	spin_unlock_irqrestore(&bpf_lock, flags); +	spin_unlock_bh(&bpf_lock);  }  static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) @@ -659,8 +653,7 @@ out:  static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,  					      gfp_t gfp_extra_flags)  { -	gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | -			  gfp_extra_flags; +	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;  	struct bpf_prog *fp;  	fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 361a69dfe543..004334ea13ba 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -16,6 +16,7 @@  #include <linux/rculist_nulls.h>  #include "percpu_freelist.h"  #include "bpf_lru_list.h" +#include "map_in_map.h"  struct bucket {  	struct hlist_nulls_head head; @@ -86,6 +87,11 @@ static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size  	return *(void __percpu **)(l->key + key_size);  } +static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) +{ +	return *(void **)(l->key + roundup(map->key_size, 8)); +} +  static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)  {  	return (struct htab_elem *) (htab->elems + i * htab->elem_size); @@ -426,7 +432,11 @@ again:  	return NULL;  } -/* Called from syscall or from eBPF program */ +/* Called from syscall or from eBPF program directly, so + * arguments have to match bpf_map_lookup_elem() exactly. + * The return value is adjusted by BPF instructions + * in htab_map_gen_lookup(). + */  static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)  {  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map); @@ -458,6 +468,30 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)  	return NULL;  } +/* inline bpf_map_lookup_elem() call. 
+ * Instead of: + * bpf_prog + *   bpf_map_lookup_elem + *     map->ops->map_lookup_elem + *       htab_map_lookup_elem + *         __htab_map_lookup_elem + * do: + * bpf_prog + *   __htab_map_lookup_elem + */ +static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ +	struct bpf_insn *insn = insn_buf; +	const int ret = BPF_REG_0; + +	*insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); +	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); +	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret, +				offsetof(struct htab_elem, key) + +				round_up(map->key_size, 8)); +	return insn - insn_buf; +} +  static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)  {  	struct htab_elem *l = __htab_map_lookup_elem(map, key); @@ -506,12 +540,15 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)  	struct hlist_nulls_head *head;  	struct htab_elem *l, *next_l;  	u32 hash, key_size; -	int i; +	int i = 0;  	WARN_ON_ONCE(!rcu_read_lock_held());  	key_size = map->key_size; +	if (!key) +		goto find_first_elem; +  	hash = htab_map_hash(key, key_size);  	head = select_bucket(htab, hash); @@ -519,10 +556,8 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)  	/* lookup the key */  	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); -	if (!l) { -		i = 0; +	if (!l)  		goto find_first_elem; -	}  	/* key was found, get next key in the same bucket */  	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), @@ -582,6 +617,14 @@ static void htab_elem_free_rcu(struct rcu_head *head)  static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)  { +	struct bpf_map *map = &htab->map; + +	if (map->ops->map_fd_put_ptr) { +		void *ptr = fd_htab_map_get_ptr(map, l); + +		map->ops->map_fd_put_ptr(ptr); +	} +  	if (htab_is_prealloc(htab)) {  		pcpu_freelist_push(&htab->freelist, &l->fnode);  	} else { @@ -1027,6 +1070,7 @@ static void delete_all_elements(struct bpf_htab *htab)  		}  	}  } +  /* Called when map->refcnt goes to zero, either from workqueue or from syscall */  static void htab_map_free(struct bpf_map *map)  { @@ -1053,21 +1097,17 @@ static void htab_map_free(struct bpf_map *map)  	kfree(htab);  } -static const struct bpf_map_ops htab_ops = { +const struct bpf_map_ops htab_map_ops = {  	.map_alloc = htab_map_alloc,  	.map_free = htab_map_free,  	.map_get_next_key = htab_map_get_next_key,  	.map_lookup_elem = htab_map_lookup_elem,  	.map_update_elem = htab_map_update_elem,  	.map_delete_elem = htab_map_delete_elem, +	.map_gen_lookup = htab_map_gen_lookup,  }; -static struct bpf_map_type_list htab_type __ro_after_init = { -	.ops = &htab_ops, -	.type = BPF_MAP_TYPE_HASH, -}; - -static const struct bpf_map_ops htab_lru_ops = { +const struct bpf_map_ops htab_lru_map_ops = {  	.map_alloc = htab_map_alloc,  	.map_free = htab_map_free,  	.map_get_next_key = htab_map_get_next_key, @@ -1076,11 +1116,6 @@ static const struct bpf_map_ops htab_lru_ops = {  	.map_delete_elem = htab_lru_map_delete_elem,  }; -static struct bpf_map_type_list htab_lru_type __ro_after_init = { -	.ops = &htab_lru_ops, -	.type = BPF_MAP_TYPE_LRU_HASH, -}; -  /* Called from eBPF program */  static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)  { @@ -1154,7 +1189,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,  	return ret;  } -static const struct bpf_map_ops htab_percpu_ops = { +const struct bpf_map_ops htab_percpu_map_ops = {  	
.map_alloc = htab_map_alloc,  	.map_free = htab_map_free,  	.map_get_next_key = htab_map_get_next_key, @@ -1163,12 +1198,7 @@ static const struct bpf_map_ops htab_percpu_ops = {  	.map_delete_elem = htab_map_delete_elem,  }; -static struct bpf_map_type_list htab_percpu_type __ro_after_init = { -	.ops = &htab_percpu_ops, -	.type = BPF_MAP_TYPE_PERCPU_HASH, -}; - -static const struct bpf_map_ops htab_lru_percpu_ops = { +const struct bpf_map_ops htab_lru_percpu_map_ops = {  	.map_alloc = htab_map_alloc,  	.map_free = htab_map_free,  	.map_get_next_key = htab_map_get_next_key, @@ -1177,17 +1207,102 @@ static const struct bpf_map_ops htab_lru_percpu_ops = {  	.map_delete_elem = htab_lru_map_delete_elem,  }; -static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = { -	.ops = &htab_lru_percpu_ops, -	.type = BPF_MAP_TYPE_LRU_PERCPU_HASH, -}; +static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) +{ +	struct bpf_map *map; + +	if (attr->value_size != sizeof(u32)) +		return ERR_PTR(-EINVAL); + +	/* pointer is stored internally */ +	attr->value_size = sizeof(void *); +	map = htab_map_alloc(attr); +	attr->value_size = sizeof(u32); -static int __init register_htab_map(void) +	return map; +} + +static void fd_htab_map_free(struct bpf_map *map)  { -	bpf_register_map_type(&htab_type); -	bpf_register_map_type(&htab_percpu_type); -	bpf_register_map_type(&htab_lru_type); -	bpf_register_map_type(&htab_lru_percpu_type); -	return 0; +	struct bpf_htab *htab = container_of(map, struct bpf_htab, map); +	struct hlist_nulls_node *n; +	struct hlist_nulls_head *head; +	struct htab_elem *l; +	int i; + +	for (i = 0; i < htab->n_buckets; i++) { +		head = select_bucket(htab, i); + +		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { +			void *ptr = fd_htab_map_get_ptr(map, l); + +			map->ops->map_fd_put_ptr(ptr); +		} +	} + +	htab_map_free(map); +} + +/* only called from syscall */ +int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, +				void *key, void *value, u64 map_flags) +{ +	void *ptr; +	int ret; +	u32 ufd = *(u32 *)value; + +	ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); +	if (IS_ERR(ptr)) +		return PTR_ERR(ptr); + +	ret = htab_map_update_elem(map, key, &ptr, map_flags); +	if (ret) +		map->ops->map_fd_put_ptr(ptr); + +	return ret; +} + +static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) +{ +	struct bpf_map *map, *inner_map_meta; + +	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); +	if (IS_ERR(inner_map_meta)) +		return inner_map_meta; + +	map = fd_htab_map_alloc(attr); +	if (IS_ERR(map)) { +		bpf_map_meta_free(inner_map_meta); +		return map; +	} + +	map->inner_map_meta = inner_map_meta; + +	return map;  } -late_initcall(register_htab_map); + +static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) +{ +	struct bpf_map **inner_map  = htab_map_lookup_elem(map, key); + +	if (!inner_map) +		return NULL; + +	return READ_ONCE(*inner_map); +} + +static void htab_of_map_free(struct bpf_map *map) +{ +	bpf_map_meta_free(map->inner_map_meta); +	fd_htab_map_free(map); +} + +const struct bpf_map_ops htab_of_maps_map_ops = { +	.map_alloc = htab_of_map_alloc, +	.map_free = htab_of_map_free, +	.map_get_next_key = htab_map_get_next_key, +	.map_lookup_elem = htab_of_map_lookup_elem, +	.map_delete_elem = htab_map_delete_elem, +	.map_fd_get_ptr = bpf_map_fd_get_ptr, +	.map_fd_put_ptr = bpf_map_fd_put_ptr, +}; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index b37bd9ab7f57..39cfafd895b8 100644 --- a/kernel/bpf/lpm_trie.c +++ 
b/kernel/bpf/lpm_trie.c @@ -505,7 +505,7 @@ static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)  	return -ENOTSUPP;  } -static const struct bpf_map_ops trie_ops = { +const struct bpf_map_ops trie_map_ops = {  	.map_alloc = trie_alloc,  	.map_free = trie_free,  	.map_get_next_key = trie_get_next_key, @@ -513,15 +513,3 @@ static const struct bpf_map_ops trie_ops = {  	.map_update_elem = trie_update_elem,  	.map_delete_elem = trie_delete_elem,  }; - -static struct bpf_map_type_list trie_type __ro_after_init = { -	.ops = &trie_ops, -	.type = BPF_MAP_TYPE_LPM_TRIE, -}; - -static int __init register_trie_map(void) -{ -	bpf_register_map_type(&trie_type); -	return 0; -} -late_initcall(register_trie_map); diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c new file mode 100644 index 000000000000..59bcdf821ae4 --- /dev/null +++ b/kernel/bpf/map_in_map.c @@ -0,0 +1,97 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/slab.h> +#include <linux/bpf.h> + +#include "map_in_map.h" + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) +{ +	struct bpf_map *inner_map, *inner_map_meta; +	struct fd f; + +	f = fdget(inner_map_ufd); +	inner_map = __bpf_map_get(f); +	if (IS_ERR(inner_map)) +		return inner_map; + +	/* prog_array->owner_prog_type and owner_jited +	 * is a runtime binding.  Doing static check alone +	 * in the verifier is not enough. +	 */ +	if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { +		fdput(f); +		return ERR_PTR(-ENOTSUPP); +	} + +	/* Does not support >1 level map-in-map */ +	if (inner_map->inner_map_meta) { +		fdput(f); +		return ERR_PTR(-EINVAL); +	} + +	inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER); +	if (!inner_map_meta) { +		fdput(f); +		return ERR_PTR(-ENOMEM); +	} + +	inner_map_meta->map_type = inner_map->map_type; +	inner_map_meta->key_size = inner_map->key_size; +	inner_map_meta->value_size = inner_map->value_size; +	inner_map_meta->map_flags = inner_map->map_flags; +	inner_map_meta->ops = inner_map->ops; +	inner_map_meta->max_entries = inner_map->max_entries; + +	fdput(f); +	return inner_map_meta; +} + +void bpf_map_meta_free(struct bpf_map *map_meta) +{ +	kfree(map_meta); +} + +bool bpf_map_meta_equal(const struct bpf_map *meta0, +			const struct bpf_map *meta1) +{ +	/* No need to compare ops because it is covered by map_type */ +	return meta0->map_type == meta1->map_type && +		meta0->key_size == meta1->key_size && +		meta0->value_size == meta1->value_size && +		meta0->map_flags == meta1->map_flags && +		meta0->max_entries == meta1->max_entries; +} + +void *bpf_map_fd_get_ptr(struct bpf_map *map, +			 struct file *map_file /* not used */, +			 int ufd) +{ +	struct bpf_map *inner_map; +	struct fd f; + +	f = fdget(ufd); +	inner_map = __bpf_map_get(f); +	if (IS_ERR(inner_map)) +		return inner_map; + +	if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) +		inner_map = bpf_map_inc(inner_map, false); +	else +		inner_map = ERR_PTR(-EINVAL); + +	fdput(f); +	return inner_map; +} + +void bpf_map_fd_put_ptr(void *ptr) +{ +	/* ptr->ops->map_free() has to go through one +	 * rcu grace period by itself. 
+	 */ +	bpf_map_put(ptr); +} diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h new file mode 100644 index 000000000000..177fadb689dc --- /dev/null +++ b/kernel/bpf/map_in_map.h @@ -0,0 +1,23 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __MAP_IN_MAP_H__ +#define __MAP_IN_MAP_H__ + +#include <linux/types.h> + +struct file; +struct bpf_map; + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); +void bpf_map_meta_free(struct bpf_map *map_meta); +bool bpf_map_meta_equal(const struct bpf_map *meta0, +			const struct bpf_map *meta1); +void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, +			 int ufd); +void bpf_map_fd_put_ptr(void *ptr); + +#endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 22aa45cd0324..4dfd6f2ec2f9 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -264,7 +264,7 @@ static void stack_map_free(struct bpf_map *map)  	put_callchain_buffers();  } -static const struct bpf_map_ops stack_map_ops = { +const struct bpf_map_ops stack_map_ops = {  	.map_alloc = stack_map_alloc,  	.map_free = stack_map_free,  	.map_get_next_key = stack_map_get_next_key, @@ -272,15 +272,3 @@ static const struct bpf_map_ops stack_map_ops = {  	.map_update_elem = stack_map_update_elem,  	.map_delete_elem = stack_map_delete_elem,  }; - -static struct bpf_map_type_list stack_map_type __ro_after_init = { -	.ops = &stack_map_ops, -	.type = BPF_MAP_TYPE_STACK_TRACE, -}; - -static int __init register_stack_map(void) -{ -	bpf_register_map_type(&stack_map_type); -	return 0; -} -late_initcall(register_stack_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7af0dcc5d755..fd2411fd6914 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -27,30 +27,29 @@ DEFINE_PER_CPU(int, bpf_prog_active);  int sysctl_unprivileged_bpf_disabled __read_mostly; -static LIST_HEAD(bpf_map_types); +static const struct bpf_map_ops * const bpf_map_types[] = { +#define BPF_PROG_TYPE(_id, _ops) +#define BPF_MAP_TYPE(_id, _ops) \ +	[_id] = &_ops, +#include <linux/bpf_types.h> +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +};  static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)  { -	struct bpf_map_type_list *tl;  	struct bpf_map *map; -	list_for_each_entry(tl, &bpf_map_types, list_node) { -		if (tl->type == attr->map_type) { -			map = tl->ops->map_alloc(attr); -			if (IS_ERR(map)) -				return map; -			map->ops = tl->ops; -			map->map_type = attr->map_type; -			return map; -		} -	} -	return ERR_PTR(-EINVAL); -} +	if (attr->map_type >= ARRAY_SIZE(bpf_map_types) || +	    !bpf_map_types[attr->map_type]) +		return ERR_PTR(-EINVAL); -/* boot time registration of different map implementations */ -void bpf_register_map_type(struct bpf_map_type_list *tl) -{ -	list_add(&tl->list_node, &bpf_map_types); +	map = bpf_map_types[attr->map_type]->map_alloc(attr); +	if (IS_ERR(map)) +		return map; +	map->ops = bpf_map_types[attr->map_type]; +	map->map_type = attr->map_type; +	return map;  }  void *bpf_map_area_alloc(size_t size) @@ -68,8 +67,7 @@ void *bpf_map_area_alloc(size_t size)  			return area;  	} -	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | flags, -			 PAGE_KERNEL); +	return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL);  }  void bpf_map_area_free(void *area) @@ -215,7 +213,7 @@ int bpf_map_new_fd(struct bpf_map *map)  		   offsetof(union 
bpf_attr, CMD##_LAST_FIELD) - \  		   sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD map_flags +#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd  /* called via syscall */  static int map_create(union bpf_attr *attr)  { @@ -352,6 +350,9 @@ static int map_lookup_elem(union bpf_attr *attr)  		err = bpf_percpu_array_copy(map, key, value);  	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {  		err = bpf_stackmap_copy(map, key, value); +	} else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || +		   map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { +		err = -ENOTSUPP;  	} else {  		rcu_read_lock();  		ptr = map->ops->map_lookup_elem(map, key); @@ -438,11 +439,17 @@ static int map_update_elem(union bpf_attr *attr)  		err = bpf_percpu_array_update(map, key, value, attr->flags);  	} else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||  		   map->map_type == BPF_MAP_TYPE_PROG_ARRAY || -		   map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { +		   map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || +		   map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) {  		rcu_read_lock();  		err = bpf_fd_array_map_update_elem(map, f.file, key, value,  						   attr->flags);  		rcu_read_unlock(); +	} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { +		rcu_read_lock(); +		err = bpf_fd_htab_map_update_elem(map, f.file, key, value, +						  attr->flags); +		rcu_read_unlock();  	} else {  		rcu_read_lock();  		err = map->ops->map_update_elem(map, key, value, attr->flags); @@ -528,14 +535,18 @@ static int map_get_next_key(union bpf_attr *attr)  	if (IS_ERR(map))  		return PTR_ERR(map); -	err = -ENOMEM; -	key = kmalloc(map->key_size, GFP_USER); -	if (!key) -		goto err_put; +	if (ukey) { +		err = -ENOMEM; +		key = kmalloc(map->key_size, GFP_USER); +		if (!key) +			goto err_put; -	err = -EFAULT; -	if (copy_from_user(key, ukey, map->key_size) != 0) -		goto free_key; +		err = -EFAULT; +		if (copy_from_user(key, ukey, map->key_size) != 0) +			goto free_key; +	} else { +		key = NULL; +	}  	err = -ENOMEM;  	next_key = kmalloc(map->key_size, GFP_USER); @@ -564,79 +575,23 @@ err_put:  	return err;  } -static LIST_HEAD(bpf_prog_types); +static const struct bpf_verifier_ops * const bpf_prog_types[] = { +#define BPF_PROG_TYPE(_id, _ops) \ +	[_id] = &_ops, +#define BPF_MAP_TYPE(_id, _ops) +#include <linux/bpf_types.h> +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +};  static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)  { -	struct bpf_prog_type_list *tl; - -	list_for_each_entry(tl, &bpf_prog_types, list_node) { -		if (tl->type == type) { -			prog->aux->ops = tl->ops; -			prog->type = type; -			return 0; -		} -	} - -	return -EINVAL; -} - -void bpf_register_prog_type(struct bpf_prog_type_list *tl) -{ -	list_add(&tl->list_node, &bpf_prog_types); -} - -/* fixup insn->imm field of bpf_call instructions: - * if (insn->imm == BPF_FUNC_map_lookup_elem) - *      insn->imm = bpf_map_lookup_elem - __bpf_call_base; - * else if (insn->imm == BPF_FUNC_map_update_elem) - *      insn->imm = bpf_map_update_elem - __bpf_call_base; - * else ... 
- * - * this function is called after eBPF program passed verification - */ -static void fixup_bpf_calls(struct bpf_prog *prog) -{ -	const struct bpf_func_proto *fn; -	int i; +	if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) +		return -EINVAL; -	for (i = 0; i < prog->len; i++) { -		struct bpf_insn *insn = &prog->insnsi[i]; - -		if (insn->code == (BPF_JMP | BPF_CALL)) { -			/* we reach here when program has bpf_call instructions -			 * and it passed bpf_check(), means that -			 * ops->get_func_proto must have been supplied, check it -			 */ -			BUG_ON(!prog->aux->ops->get_func_proto); - -			if (insn->imm == BPF_FUNC_get_route_realm) -				prog->dst_needed = 1; -			if (insn->imm == BPF_FUNC_get_prandom_u32) -				bpf_user_rnd_init_once(); -			if (insn->imm == BPF_FUNC_xdp_adjust_head) -				prog->xdp_adjust_head = 1; -			if (insn->imm == BPF_FUNC_tail_call) { -				/* mark bpf_tail_call as different opcode -				 * to avoid conditional branch in -				 * interpeter for every normal call -				 * and to prevent accidental JITing by -				 * JIT compiler that doesn't support -				 * bpf_tail_call yet -				 */ -				insn->imm = 0; -				insn->code |= BPF_X; -				continue; -			} - -			fn = prog->aux->ops->get_func_proto(insn->imm); -			/* all functions that have prototype and verifier allowed -			 * programs to call them, must be real in-kernel functions -			 */ -			BUG_ON(!fn->func); -			insn->imm = fn->func - __bpf_call_base; -		} -	} +	prog->aux->ops = bpf_prog_types[type]; +	prog->type = type; +	return 0;  }  /* drop refcnt on maps used by eBPF program and free auxilary data */ @@ -892,9 +847,6 @@ static int bpf_prog_load(union bpf_attr *attr)  	if (err < 0)  		goto free_used_maps; -	/* fixup BPF_CALL->imm field */ -	fixup_bpf_calls(prog); -  	/* eBPF program is ready to be JITed */  	prog = bpf_prog_select_runtime(prog, &err);  	if (err < 0) @@ -1020,6 +972,28 @@ static int bpf_prog_detach(const union bpf_attr *attr)  }  #endif /* CONFIG_CGROUP_BPF */ +#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration + +static int bpf_prog_test_run(const union bpf_attr *attr, +			     union bpf_attr __user *uattr) +{ +	struct bpf_prog *prog; +	int ret = -ENOTSUPP; + +	if (CHECK_ATTR(BPF_PROG_TEST_RUN)) +		return -EINVAL; + +	prog = bpf_prog_get(attr->test.prog_fd); +	if (IS_ERR(prog)) +		return PTR_ERR(prog); + +	if (prog->aux->ops->test_run) +		ret = prog->aux->ops->test_run(prog, attr, uattr); + +	bpf_prog_put(prog); +	return ret; +} +  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)  {  	union bpf_attr attr = {}; @@ -1086,7 +1060,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz  	case BPF_OBJ_GET:  		err = bpf_obj_get(&attr);  		break; -  #ifdef CONFIG_CGROUP_BPF  	case BPF_PROG_ATTACH:  		err = bpf_prog_attach(&attr); @@ -1095,7 +1068,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz  		err = bpf_prog_detach(&attr);  		break;  #endif - +	case BPF_PROG_TEST_RUN: +		err = bpf_prog_test_run(&attr, uattr); +		break;  	default:  		err = -EINVAL;  		break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a834068a400e..c2ff608c1984 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -143,6 +143,8 @@ struct bpf_verifier_stack_elem {  #define BPF_COMPLEXITY_LIMIT_INSNS	65536  #define BPF_COMPLEXITY_LIMIT_STACK	1024 +#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) +  struct bpf_call_arg_meta {  	struct bpf_map *map_ptr;  	bool raw_mode; @@ -1215,6 
+1217,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)  		    func_id != BPF_FUNC_current_task_under_cgroup)  			goto error;  		break; +	case BPF_MAP_TYPE_ARRAY_OF_MAPS: +	case BPF_MAP_TYPE_HASH_OF_MAPS: +		if (func_id != BPF_FUNC_map_lookup_elem) +			goto error;  	default:  		break;  	} @@ -1291,7 +1297,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)  	}  } -static int check_call(struct bpf_verifier_env *env, int func_id) +static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)  {  	struct bpf_verifier_state *state = &env->cur_state;  	const struct bpf_func_proto *fn = NULL; @@ -1375,6 +1381,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id)  	} else if (fn->ret_type == RET_VOID) {  		regs[BPF_REG_0].type = NOT_INIT;  	} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { +		struct bpf_insn_aux_data *insn_aux; +  		regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;  		regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0;  		/* remember map_ptr, so that check_map_access() @@ -1387,6 +1395,11 @@ static int check_call(struct bpf_verifier_env *env, int func_id)  		}  		regs[BPF_REG_0].map_ptr = meta.map_ptr;  		regs[BPF_REG_0].id = ++env->id_gen; +		insn_aux = &env->insn_aux_data[insn_idx]; +		if (!insn_aux->map_ptr) +			insn_aux->map_ptr = meta.map_ptr; +		else if (insn_aux->map_ptr != meta.map_ptr) +			insn_aux->map_ptr = BPF_MAP_PTR_POISON;  	} else {  		verbose("unknown return type %d of func %s#%d\n",  			fn->ret_type, func_id_name(func_id), func_id); @@ -1911,6 +1924,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  			return 0;  		} else if (opcode == BPF_ADD &&  			   BPF_CLASS(insn->code) == BPF_ALU64 && +			   dst_reg->type == PTR_TO_STACK && +			   ((BPF_SRC(insn->code) == BPF_X && +			     regs[insn->src_reg].type == CONST_IMM) || +			    BPF_SRC(insn->code) == BPF_K)) { +			if (BPF_SRC(insn->code) == BPF_X) +				dst_reg->imm += regs[insn->src_reg].imm; +			else +				dst_reg->imm += insn->imm; +			return 0; +		} else if (opcode == BPF_ADD && +			   BPF_CLASS(insn->code) == BPF_ALU64 &&  			   (dst_reg->type == PTR_TO_PACKET ||  			    (BPF_SRC(insn->code) == BPF_X &&  			     regs[insn->src_reg].type == PTR_TO_PACKET))) { @@ -2112,14 +2136,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,  	struct bpf_reg_state *reg = ®s[regno];  	if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { -		reg->type = type; +		if (type == UNKNOWN_VALUE) { +			__mark_reg_unknown_value(regs, regno); +		} else if (reg->map_ptr->inner_map_meta) { +			reg->type = CONST_PTR_TO_MAP; +			reg->map_ptr = reg->map_ptr->inner_map_meta; +		} else { +			reg->type = type; +		}  		/* We don't need id from this point onwards anymore, thus we  		 * should better reset it, so that state pruning has chances  		 * to take effect.  		 
*/  		reg->id = 0; -		if (type == UNKNOWN_VALUE) -			__mark_reg_unknown_value(regs, regno);  	}  } @@ -2960,7 +2989,7 @@ static int do_check(struct bpf_verifier_env *env)  					return -EINVAL;  				} -				err = check_call(env, insn->imm); +				err = check_call(env, insn->imm, insn_idx);  				if (err)  					return err; @@ -3044,16 +3073,33 @@ process_bpf_exit:  	return 0;  } +static int check_map_prealloc(struct bpf_map *map) +{ +	return (map->map_type != BPF_MAP_TYPE_HASH && +		map->map_type != BPF_MAP_TYPE_PERCPU_HASH && +		map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) || +		!(map->map_flags & BPF_F_NO_PREALLOC); +} +  static int check_map_prog_compatibility(struct bpf_map *map,  					struct bpf_prog *prog)  { -	if (prog->type == BPF_PROG_TYPE_PERF_EVENT && -	    (map->map_type == BPF_MAP_TYPE_HASH || -	     map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && -	    (map->map_flags & BPF_F_NO_PREALLOC)) { -		verbose("perf_event programs can only use preallocated hash map\n"); -		return -EINVAL; +	/* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use +	 * preallocated hash maps, since doing memory allocation +	 * in overflow_handler can crash depending on where nmi got +	 * triggered. +	 */ +	if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { +		if (!check_map_prealloc(map)) { +			verbose("perf_event programs can only use preallocated hash map\n"); +			return -EINVAL; +		} +		if (map->inner_map_meta && +		    !check_map_prealloc(map->inner_map_meta)) { +			verbose("perf_event programs can only use preallocated inner hash map\n"); +			return -EINVAL; +		}  	}  	return 0;  } @@ -3182,6 +3228,41 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)  			insn->src_reg = 0;  } +/* single env->prog->insni[off] instruction was replaced with the range + * insni[off, off + cnt).  
Adjust corresponding insn_aux_data by copying + * [0, off) and [off, end) to new locations, so the patched range stays zero + */ +static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, +				u32 off, u32 cnt) +{ +	struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; + +	if (cnt == 1) +		return 0; +	new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len); +	if (!new_data) +		return -ENOMEM; +	memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); +	memcpy(new_data + off + cnt - 1, old_data + off, +	       sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); +	env->insn_aux_data = new_data; +	vfree(old_data); +	return 0; +} + +static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, +					    const struct bpf_insn *patch, u32 len) +{ +	struct bpf_prog *new_prog; + +	new_prog = bpf_patch_insn_single(env->prog, off, patch, len); +	if (!new_prog) +		return NULL; +	if (adjust_insn_aux_data(env, new_prog->len, off, len)) +		return NULL; +	return new_prog; +} +  /* convert load instructions that access fields of 'struct __sk_buff'   * into sequence of instructions that access fields of 'struct sk_buff'   */ @@ -3201,10 +3282,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)  			verbose("bpf verifier is misconfigured\n");  			return -EINVAL;  		} else if (cnt) { -			new_prog = bpf_patch_insn_single(env->prog, 0, -							 insn_buf, cnt); +			new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);  			if (!new_prog)  				return -ENOMEM; +  			env->prog = new_prog;  			delta += cnt - 1;  		} @@ -3229,7 +3310,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)  		else  			continue; -		if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) +		if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)  			continue;  		cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); @@ -3238,8 +3319,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)  			return -EINVAL;  		} -		new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf, -						 cnt); +		new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);  		if (!new_prog)  			return -ENOMEM; @@ -3253,6 +3333,89 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)  	return 0;  } +/* fixup insn->imm field of bpf_call instructions + * and inline eligible helpers as explicit sequence of BPF instructions + * + * this function is called after eBPF program passed verification + */ +static int fixup_bpf_calls(struct bpf_verifier_env *env) +{ +	struct bpf_prog *prog = env->prog; +	struct bpf_insn *insn = prog->insnsi; +	const struct bpf_func_proto *fn; +	const int insn_cnt = prog->len; +	struct bpf_insn insn_buf[16]; +	struct bpf_prog *new_prog; +	struct bpf_map *map_ptr; +	int i, cnt, delta = 0; + +	for (i = 0; i < insn_cnt; i++, insn++) { +		if (insn->code != (BPF_JMP | BPF_CALL)) +			continue; + +		if (insn->imm == BPF_FUNC_get_route_realm) +			prog->dst_needed = 1; +		if (insn->imm == BPF_FUNC_get_prandom_u32) +			bpf_user_rnd_init_once(); +		if (insn->imm == BPF_FUNC_tail_call) { +			/* If we tail call into other programs, we +			 * cannot make any assumptions since they can +			 * be replaced dynamically during runtime in +			 * the program array. 
+			 */ +			prog->cb_access = 1; + +			/* mark bpf_tail_call as different opcode to avoid +			 * conditional branch in the interpeter for every normal +			 * call and to prevent accidental JITing by JIT compiler +			 * that doesn't support bpf_tail_call yet +			 */ +			insn->imm = 0; +			insn->code |= BPF_X; +			continue; +		} + +		if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) { +			map_ptr = env->insn_aux_data[i + delta].map_ptr; +			if (map_ptr == BPF_MAP_PTR_POISON || +			    !map_ptr->ops->map_gen_lookup) +				goto patch_call_imm; + +			cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); +			if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { +				verbose("bpf verifier is misconfigured\n"); +				return -EINVAL; +			} + +			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, +						       cnt); +			if (!new_prog) +				return -ENOMEM; + +			delta += cnt - 1; + +			/* keep walking new program and skip insns we just inserted */ +			env->prog = prog = new_prog; +			insn      = new_prog->insnsi + i + delta; +			continue; +		} + +patch_call_imm: +		fn = prog->aux->ops->get_func_proto(insn->imm); +		/* all functions that have prototype and verifier allowed +		 * programs to call them, must be real in-kernel functions +		 */ +		if (!fn->func) { +			verbose("kernel subsystem misconfigured func %s#%d\n", +				func_id_name(insn->imm), insn->imm); +			return -EFAULT; +		} +		insn->imm = fn->func - __bpf_call_base; +	} + +	return 0; +} +  static void free_states(struct bpf_verifier_env *env)  {  	struct bpf_verifier_state_list *sl, *sln; @@ -3348,6 +3511,9 @@ skip_full_check:  		/* program is valid, convert *(u32*)(ctx + off) accesses */  		ret = convert_ctx_accesses(env); +	if (ret == 0) +		ret = fixup_bpf_calls(env); +  	if (log_level && log_len >= log_size - 1) {  		BUG_ON(log_len >= log_size);  		/* verifier log exceeded user supplied buffer */ | 
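On the eBPF program side, a lookup in the outer map returns the inner map pointer (array_of_map_lookup_elem()/htab_of_map_lookup_elem() return READ_ONCE(*inner_map)), and the mark_map_reg() change above retypes that register as CONST_PTR_TO_MAP using the stored inner_map_meta once the NULL check passes, so a second bpf_map_lookup_elem() is verified against the template map. A rough sketch of such a program follows, using the SEC()/struct bpf_map_def conventions from samples/bpf of this era; those conventions and bpf_helpers.h are loader-side assumptions, not part of this diff.

/* Illustrative only: relies on samples/bpf-style bpf_helpers.h for SEC()
 * and the bpf_map_lookup_elem() wrapper.  The loader is expected to create
 * the outer map with an inner_map_fd template whose key/value sizes match
 * the second lookup below.
 */
#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") outer_map = {
	.type        = BPF_MAP_TYPE_ARRAY_OF_MAPS,
	.key_size    = sizeof(__u32),
	.value_size  = sizeof(__u32),	/* slots hold map fds, filled from user space */
	.max_entries = 8,
};

SEC("socket")
int lookup_via_outer(struct __sk_buff *skb)
{
	__u32 outer_key = 0, inner_key = 0;
	void *inner_map;
	__u64 *value;

	/* Returns PTR_TO_MAP_VALUE_OR_NULL; after the NULL check, mark_map_reg()
	 * retypes the register as CONST_PTR_TO_MAP of the template
	 * (inner_map_meta), so it can be passed as a map pointer below.
	 */
	inner_map = bpf_map_lookup_elem(&outer_map, &outer_key);
	if (!inner_map)
		return 0;

	/* Checked against the template's key_size/value_size. */
	value = bpf_map_lookup_elem(inner_map, &inner_key);
	if (!value)
		return 0;

	return *value != 0;
}

char _license[] SEC("license") = "GPL";

Note that check_map_func_compatibility() limits programs to bpf_map_lookup_elem() on these outer map types; populating or clearing outer slots is done from user space through the map update/delete commands, and the BPF_MAP_LOOKUP_ELEM syscall path returns -ENOTSUPP for them.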
