diff options
Diffstat (limited to 'kernel/bpf')
-rw-r--r-- | kernel/bpf/Makefile | 7 | ||||
-rw-r--r-- | kernel/bpf/arraymap.c | 211 | ||||
-rw-r--r-- | kernel/bpf/core.c | 39 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 516 | ||||
-rw-r--r-- | kernel/bpf/helpers.c | 9 | ||||
-rw-r--r-- | kernel/bpf/inode.c | 407 | ||||
-rw-r--r-- | kernel/bpf/percpu_freelist.c | 100 | ||||
-rw-r--r-- | kernel/bpf/percpu_freelist.h | 31 | ||||
-rw-r--r-- | kernel/bpf/stackmap.c | 290 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 318 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 177 |
11 files changed, 1909 insertions, 196 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e6983be12bd3..eed911d091da 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,2 +1,7 @@ obj-y := core.o -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o + +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o +ifeq ($(CONFIG_PERF_EVENTS),y) +obj-$(CONFIG_BPF_SYSCALL) += stackmap.o +endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 29ace107f236..76d5a794e426 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -15,26 +15,65 @@ #include <linux/slab.h> #include <linux/mm.h> #include <linux/filter.h> +#include <linux/perf_event.h> + +static void bpf_array_free_percpu(struct bpf_array *array) +{ + int i; + + for (i = 0; i < array->map.max_entries; i++) + free_percpu(array->pptrs[i]); +} + +static int bpf_array_alloc_percpu(struct bpf_array *array) +{ + void __percpu *ptr; + int i; + + for (i = 0; i < array->map.max_entries; i++) { + ptr = __alloc_percpu_gfp(array->elem_size, 8, + GFP_USER | __GFP_NOWARN); + if (!ptr) { + bpf_array_free_percpu(array); + return -ENOMEM; + } + array->pptrs[i] = ptr; + } + + return 0; +} /* Called from syscall */ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; struct bpf_array *array; - u32 elem_size, array_size; + u64 array_size; + u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size == 0) + attr->value_size == 0 || attr->map_flags) return ERR_PTR(-EINVAL); + if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1)) + /* if value_size is bigger, the user space won't be able to + * access the elements. + */ + return ERR_PTR(-E2BIG); + elem_size = round_up(attr->value_size, 8); - /* check round_up into zero and u32 overflow */ - if (elem_size == 0 || - attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size) + array_size = sizeof(*array); + if (percpu) + array_size += (u64) attr->max_entries * sizeof(void *); + else + array_size += (u64) attr->max_entries * elem_size; + + /* make sure there is no u32 overflow later in round_up() */ + if (array_size >= U32_MAX - PAGE_SIZE) return ERR_PTR(-ENOMEM); - array_size = sizeof(*array) + attr->max_entries * elem_size; /* allocate all map elements and zero-initialize them */ array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); @@ -45,12 +84,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } /* copy mandatory map attributes */ + array->map.map_type = attr->map_type; array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; - array->elem_size = elem_size; + if (!percpu) + goto out; + + array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); + + if (array_size >= U32_MAX - PAGE_SIZE || + elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) { + kvfree(array); + return ERR_PTR(-ENOMEM); + } +out: + array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; + return &array->map; } @@ -60,12 +112,50 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) return NULL; return array->value + array->elem_size * index; } +/* Called from eBPF program */ +static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return this_cpu_ptr(array->pptrs[index]); +} + +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(index >= array->map.max_entries)) + return -ENOENT; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); + off += size; + } + rcu_read_unlock(); + return 0; +} + /* Called from syscall */ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -92,19 +182,62 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (map_flags > BPF_EXIST) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; - if (map_flags == BPF_NOEXIST) + if (unlikely(map_flags == BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; - memcpy(array->value + array->elem_size * index, value, array->elem_size); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + memcpy(this_cpu_ptr(array->pptrs[index]), + value, map->value_size); + else + memcpy(array->value + array->elem_size * index, + value, map->value_size); + return 0; +} + +int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + if (unlikely(index >= array->map.max_entries)) + /* all elements were pre-allocated, cannot insert a new one */ + return -E2BIG; + + if (unlikely(map_flags == BPF_NOEXIST)) + /* all elements already exist */ + return -EEXIST; + + /* the user space will provide round_up(value_size, 8) bytes that + * will be copied into per-cpu area. bpf programs can only access + * value_size of it. During lookup the same extra bytes will be + * returned or zeros which were zero-filled by percpu_alloc, + * so no kernel data leaks possible + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); + off += size; + } + rcu_read_unlock(); return 0; } @@ -126,6 +259,9 @@ static void array_map_free(struct bpf_map *map) */ synchronize_rcu(); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + bpf_array_free_percpu(array); + kvfree(array); } @@ -143,9 +279,24 @@ static struct bpf_map_type_list array_type __read_mostly = { .type = BPF_MAP_TYPE_ARRAY, }; +static const struct bpf_map_ops percpu_array_ops = { + .map_alloc = array_map_alloc, + .map_free = array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = percpu_array_map_lookup_elem, + .map_update_elem = array_map_update_elem, + .map_delete_elem = array_map_delete_elem, +}; + +static struct bpf_map_type_list percpu_array_type __read_mostly = { + .ops = &percpu_array_ops, + .type = BPF_MAP_TYPE_PERCPU_ARRAY, +}; + static int __init register_array_map(void) { bpf_register_map_type(&array_type); + bpf_register_map_type(&percpu_array_type); return 0; } late_initcall(register_array_map); @@ -284,28 +435,38 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) { struct perf_event *event; const struct perf_event_attr *attr; + struct file *file; + + file = perf_event_get(fd); + if (IS_ERR(file)) + return file; - event = perf_event_get(fd); - if (IS_ERR(event)) - return event; + event = file->private_data; attr = perf_event_attrs(event); if (IS_ERR(attr)) - return (void *)attr; + goto err; - if (attr->type != PERF_TYPE_RAW && - attr->type != PERF_TYPE_HARDWARE) { - perf_event_release_kernel(event); - return ERR_PTR(-EINVAL); - } - return event; + if (attr->inherit) + goto err; + + if (attr->type == PERF_TYPE_RAW) + return file; + + if (attr->type == PERF_TYPE_HARDWARE) + return file; + + if (attr->type == PERF_TYPE_SOFTWARE && + attr->config == PERF_COUNT_SW_BPF_OUTPUT) + return file; +err: + fput(file); + return ERR_PTR(-EINVAL); } static void perf_event_fd_array_put_ptr(void *ptr) { - struct perf_event *event = ptr; - - perf_event_release_kernel(event); + fput((struct file *)ptr); } static const struct bpf_map_ops perf_event_array_ops = { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 67c380cfa9ca..be0abf669ced 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -27,6 +27,7 @@ #include <linux/random.h> #include <linux/moduleloader.h> #include <linux/bpf.h> +#include <linux/frame.h> #include <asm/unaligned.h> @@ -82,6 +83,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (fp == NULL) return NULL; + kmemcheck_annotate_bitfield(fp, meta); + aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); if (aux == NULL) { vfree(fp); @@ -90,6 +93,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) fp->pages = size / PAGE_SIZE; fp->aux = aux; + fp->aux->prog = fp; return fp; } @@ -110,8 +114,11 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); if (fp != NULL) { + kmemcheck_annotate_bitfield(fp, meta); + memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = size / PAGE_SIZE; + fp->aux->prog = fp; /* We keep fp->aux from fp_old around in the new * reallocated structure. @@ -300,10 +307,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; ARG1 = (u64) (unsigned long) ctx; - /* Registers used in classic BPF programs need to be reset first. */ - regs[BPF_REG_A] = 0; - regs[BPF_REG_X] = 0; - select_insn: goto *jumptable[insn->code]; @@ -647,6 +650,7 @@ load_byte: WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); return 0; } +STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) @@ -722,11 +726,36 @@ void bpf_prog_free(struct bpf_prog *fp) struct bpf_prog_aux *aux = fp->aux; INIT_WORK(&aux->work, bpf_prog_free_deferred); - aux->prog = fp; schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); +/* RNG for unpriviledged user space with separated state from prandom_u32(). */ +static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state); + +void bpf_user_rnd_init_once(void) +{ + prandom_init_once(&bpf_user_rnd_state); +} + +u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* Should someone ever have the rather unwise idea to use some + * of the registers passed into this function, then note that + * this function is called from native eBPF and classic-to-eBPF + * transformations. Register assignments from both sides are + * different, f.e. classic always sets fn(ctx, A, X) here. + */ + struct rnd_state *state; + u32 res; + + state = &get_cpu_var(bpf_user_rnd_state); + res = prandom_u32_state(state); + put_cpu_var(state); + + return res; +} + /* Weak definitions of helper functions in case we don't have bpf syscall. */ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 83c209d9b17a..fff3650d52fc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -13,38 +14,127 @@ #include <linux/jhash.h> #include <linux/filter.h> #include <linux/vmalloc.h> +#include "percpu_freelist.h" + +struct bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; struct bpf_htab { struct bpf_map map; - struct hlist_head *buckets; - spinlock_t lock; - u32 count; /* number of elements in this hashtable */ + struct bucket *buckets; + void *elems; + struct pcpu_freelist freelist; + atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ }; /* each htab element is struct htab_elem + key + value */ struct htab_elem { - struct hlist_node hash_node; + union { + struct hlist_node hash_node; + struct bpf_htab *htab; + struct pcpu_freelist_node fnode; + }; struct rcu_head rcu; u32 hash; char key[0] __aligned(8); }; +static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, + void __percpu *pptr) +{ + *(void __percpu **)(l->key + key_size) = pptr; +} + +static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) +{ + return *(void __percpu **)(l->key + key_size); +} + +static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) +{ + return (struct htab_elem *) (htab->elems + i * htab->elem_size); +} + +static void htab_free_elems(struct bpf_htab *htab) +{ + int i; + + if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + goto free_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + void __percpu *pptr; + + pptr = htab_elem_get_ptr(get_htab_elem(htab, i), + htab->map.key_size); + free_percpu(pptr); + } +free_elems: + vfree(htab->elems); +} + +static int prealloc_elems_and_freelist(struct bpf_htab *htab) +{ + int err = -ENOMEM, i; + + htab->elems = vzalloc(htab->elem_size * htab->map.max_entries); + if (!htab->elems) + return -ENOMEM; + + if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + goto skip_percpu_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + u32 size = round_up(htab->map.value_size, 8); + void __percpu *pptr; + + pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN); + if (!pptr) + goto free_elems; + htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, + pptr); + } + +skip_percpu_elems: + err = pcpu_freelist_init(&htab->freelist); + if (err) + goto free_elems; + + pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, + htab->map.max_entries); + return 0; + +free_elems: + htab_free_elems(htab); + return err; +} + /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; struct bpf_htab *htab; int err, i; + u64 cost; + + if (attr->map_flags & ~BPF_F_NO_PREALLOC) + /* reserved bits should not be used */ + return ERR_PTR(-EINVAL); htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); /* mandatory map attributes */ + htab->map.map_type = attr->map_type; htab->map.key_size = attr->key_size; htab->map.value_size = attr->value_size; htab->map.max_entries = attr->max_entries; + htab->map.map_flags = attr->map_flags; /* check sanity of attributes. * value_size == 0 may be allowed in the future to use map as a set @@ -64,32 +154,74 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ goto free_htab; - err = -ENOMEM; + if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) - + MAX_BPF_STACK - sizeof(struct htab_elem)) + /* if value_size is bigger, the user space won't be able to + * access the elements via bpf syscall. This check also makes + * sure that the elem_size doesn't overflow and it's + * kmalloc-able later in htab_map_update_elem() + */ + goto free_htab; + + if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE) + /* make sure the size for pcpu_alloc() is reasonable */ + goto free_htab; + + htab->elem_size = sizeof(struct htab_elem) + + round_up(htab->map.key_size, 8); + if (percpu) + htab->elem_size += sizeof(void *); + else + htab->elem_size += round_up(htab->map.value_size, 8); + /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || - htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) + htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; - htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), + cost = (u64) htab->n_buckets * sizeof(struct bucket) + + (u64) htab->elem_size * htab->map.max_entries; + + if (percpu) + cost += (u64) round_up(htab->map.value_size, 8) * + num_possible_cpus() * htab->map.max_entries; + + if (cost >= U32_MAX - PAGE_SIZE) + /* make sure page count doesn't overflow */ + goto free_htab; + + htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* if map size is larger than memlock limit, reject it early */ + err = bpf_map_precharge_memlock(htab->map.pages); + if (err) + goto free_htab; + + err = -ENOMEM; + htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), GFP_USER | __GFP_NOWARN); if (!htab->buckets) { - htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head)); + htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket)); if (!htab->buckets) goto free_htab; } - for (i = 0; i < htab->n_buckets; i++) - INIT_HLIST_HEAD(&htab->buckets[i]); + for (i = 0; i < htab->n_buckets; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } - spin_lock_init(&htab->lock); - htab->count = 0; + if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { + err = prealloc_elems_and_freelist(htab); + if (err) + goto free_buckets; + } - htab->elem_size = sizeof(struct htab_elem) + - round_up(htab->map.key_size, 8) + - htab->map.value_size; return &htab->map; +free_buckets: + kvfree(htab->buckets); free_htab: kfree(htab); return ERR_PTR(err); @@ -100,11 +232,16 @@ static inline u32 htab_map_hash(const void *key, u32 key_len) return jhash(key, key_len, 0); } -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) { return &htab->buckets[hash & (htab->n_buckets - 1)]; } +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, u32 key_size) { @@ -118,7 +255,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, } /* Called from syscall or from eBPF program */ -static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_head *head; @@ -136,6 +273,13 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) l = lookup_elem_raw(head, hash, key, key_size); + return l; +} + +static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + if (l) return l->key + round_up(map->key_size, 8); @@ -196,90 +340,254 @@ find_first_elem: } } - /* itereated over all buckets and all elements */ + /* iterated over all buckets and all elements */ return -ENOENT; } +static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) +{ + if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) + free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); + kfree(l); + +} + +static void htab_elem_free_rcu(struct rcu_head *head) +{ + struct htab_elem *l = container_of(head, struct htab_elem, rcu); + struct bpf_htab *htab = l->htab; + + /* must increment bpf_prog_active to avoid kprobe+bpf triggering while + * we're calling kfree, otherwise deadlock is possible if kprobes + * are placed somewhere inside of slub + */ + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + htab_elem_free(htab, l); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); +} + +static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +{ + if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { + pcpu_freelist_push(&htab->freelist, &l->fnode); + } else { + atomic_dec(&htab->count); + l->htab = htab; + call_rcu(&l->rcu, htab_elem_free_rcu); + } +} + +static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, + void *value, u32 key_size, u32 hash, + bool percpu, bool onallcpus) +{ + u32 size = htab->map.value_size; + bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); + struct htab_elem *l_new; + void __percpu *pptr; + + if (prealloc) { + l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); + if (!l_new) + return ERR_PTR(-E2BIG); + } else { + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return ERR_PTR(-ENOMEM); + } + + memcpy(l_new->key, key, key_size); + if (percpu) { + /* round up value_size to 8 bytes */ + size = round_up(size, 8); + + if (prealloc) { + pptr = htab_elem_get_ptr(l_new, key_size); + } else { + /* alloc_percpu zero-fills */ + pptr = __alloc_percpu_gfp(size, 8, + GFP_ATOMIC | __GFP_NOWARN); + if (!pptr) { + kfree(l_new); + return ERR_PTR(-ENOMEM); + } + } + + if (!onallcpus) { + /* copy true value_size bytes */ + memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + } else { + int off = 0, cpu; + + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } + if (!prealloc) + htab_elem_set_ptr(l_new, key_size, pptr); + } else { + memcpy(l_new->key + round_up(key_size, 8), value, size); + } + + l_new->hash = hash; + return l_new; +} + +static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, + u64 map_flags) +{ + if (l_old && map_flags == BPF_NOEXIST) + /* elem already exists */ + return -EEXIST; + + if (!l_old && map_flags == BPF_EXIST) + /* elem doesn't exist, cannot update it */ + return -ENOENT; + + return 0; +} + /* Called from syscall or from eBPF program */ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new, *l_old; + struct htab_elem *l_new = NULL, *l_old; struct hlist_head *head; unsigned long flags; - u32 key_size; + struct bucket *b; + u32 key_size, hash; int ret; - if (map_flags > BPF_EXIST) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; WARN_ON_ONCE(!rcu_read_lock_held()); - /* allocate new element outside of lock */ - l_new = kmalloc(htab->elem_size, GFP_ATOMIC); - if (!l_new) - return -ENOMEM; - key_size = map->key_size; - memcpy(l_new->key, key, key_size); - memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); + hash = htab_map_hash(key, key_size); - l_new->hash = htab_map_hash(l_new->key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; /* bpf_map_update_elem() can be called in_irq() */ - spin_lock_irqsave(&htab->lock, flags); - - head = select_bucket(htab, l_new->hash); + raw_spin_lock_irqsave(&b->lock, flags); - l_old = lookup_elem_raw(head, l_new->hash, key, key_size); + l_old = lookup_elem_raw(head, hash, key, key_size); - if (!l_old && unlikely(htab->count >= map->max_entries)) { - /* if elem with this 'key' doesn't exist and we've reached - * max_entries limit, fail insertion of new elem - */ - ret = -E2BIG; + ret = check_flags(htab, l_old, map_flags); + if (ret) goto err; - } - if (l_old && map_flags == BPF_NOEXIST) { - /* elem already exists */ - ret = -EEXIST; + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); + if (IS_ERR(l_new)) { + /* all pre-allocated elements are in use or memory exhausted */ + ret = PTR_ERR(l_new); goto err; } - if (!l_old && map_flags == BPF_EXIST) { - /* elem doesn't exist, cannot update it */ - ret = -ENOENT; - goto err; - } - - /* add new element to the head of the list, so that concurrent - * search will find it before old elem + /* add new element to the head of the list, so that + * concurrent search will find it before old elem */ hlist_add_head_rcu(&l_new->hash_node, head); if (l_old) { hlist_del_rcu(&l_old->hash_node); - kfree_rcu(l_old, rcu); - } else { - htab->count++; + free_htab_elem(htab, l_old); } - spin_unlock_irqrestore(&htab->lock, flags); + ret = 0; +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + return ret; +} - return 0; +static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new = NULL, *l_old; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + if (l_old) { + void __percpu *pptr = htab_elem_get_ptr(l_old, key_size); + u32 size = htab->map.value_size; + + /* per-cpu hash map can update value in-place */ + if (!onallcpus) { + memcpy(this_cpu_ptr(pptr), value, size); + } else { + int off = 0, cpu; + + size = round_up(size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } + } else { + l_new = alloc_htab_elem(htab, key, value, key_size, + hash, true, onallcpus); + if (IS_ERR(l_new)) { + ret = PTR_ERR(l_new); + goto err; + } + hlist_add_head_rcu(&l_new->hash_node, head); + } + ret = 0; err: - spin_unlock_irqrestore(&htab->lock, flags); - kfree(l_new); + raw_spin_unlock_irqrestore(&b->lock, flags); return ret; } +static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + return __htab_percpu_map_update_elem(map, key, value, map_flags, false); +} + /* Called from syscall or from eBPF program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_head *head; + struct bucket *b; struct htab_elem *l; unsigned long flags; u32 hash, key_size; @@ -290,21 +598,20 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) key_size = map->key_size; hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; - spin_lock_irqsave(&htab->lock, flags); - - head = select_bucket(htab, hash); + raw_spin_lock_irqsave(&b->lock, flags); l = lookup_elem_raw(head, hash, key, key_size); if (l) { hlist_del_rcu(&l->hash_node); - htab->count--; - kfree_rcu(l, rcu); + free_htab_elem(htab, l); ret = 0; } - spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); return ret; } @@ -319,12 +626,10 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - htab->count--; - kfree(l); + htab_elem_free(htab, l); } } } - /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -337,10 +642,16 @@ static void htab_map_free(struct bpf_map *map) */ synchronize_rcu(); - /* some of kfree_rcu() callbacks for elements of this map may not have - * executed. It's ok. Proceed to free residual elements and map itself + /* some of free_htab_elem() callbacks for elements of this map may + * not have executed. Wait for them. */ - delete_all_elements(htab); + rcu_barrier(); + if (htab->map.map_flags & BPF_F_NO_PREALLOC) { + delete_all_elements(htab); + } else { + htab_free_elems(htab); + pcpu_freelist_destroy(&htab->freelist); + } kvfree(htab->buckets); kfree(htab); } @@ -359,9 +670,76 @@ static struct bpf_map_type_list htab_type __read_mostly = { .type = BPF_MAP_TYPE_HASH, }; +/* Called from eBPF program */ +static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) + return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); + else + return NULL; +} + +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) +{ + struct htab_elem *l; + void __percpu *pptr; + int ret = -ENOENT; + int cpu, off = 0; + u32 size; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + l = __htab_map_lookup_elem(map, key); + if (!l) + goto out; + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, + per_cpu_ptr(pptr, cpu), size); + off += size; + } + ret = 0; +out: + rcu_read_unlock(); + return ret; +} + +int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + int ret; + + rcu_read_lock(); + ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); + rcu_read_unlock(); + + return ret; +} + +static const struct bpf_map_ops htab_percpu_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_percpu_map_lookup_elem, + .map_update_elem = htab_percpu_map_update_elem, + .map_delete_elem = htab_map_delete_elem, +}; + +static struct bpf_map_type_list htab_percpu_type __read_mostly = { + .ops = &htab_percpu_ops, + .type = BPF_MAP_TYPE_PERCPU_HASH, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); + bpf_register_map_type(&htab_percpu_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1447ec09421e..50da680c479f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -93,13 +93,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = { .arg2_type = ARG_PTR_TO_MAP_KEY, }; -static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) -{ - return prandom_u32(); -} - const struct bpf_func_proto bpf_get_prandom_u32_proto = { - .func = bpf_get_prandom_u32, + .func = bpf_user_rnd_u32, .gpl_only = false, .ret_type = RET_INTEGER, }; @@ -171,7 +166,7 @@ static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) if (!task) return -EINVAL; - memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); + strlcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); return 0; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c new file mode 100644 index 000000000000..f2ece3c174a5 --- /dev/null +++ b/kernel/bpf/inode.c @@ -0,0 +1,407 @@ +/* + * Minimal file system backend for holding eBPF maps and programs, + * used by bpf(2) object pinning. + * + * Authors: + * + * Daniel Borkmann <daniel@iogearbox.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/magic.h> +#include <linux/major.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/fs.h> +#include <linux/kdev_t.h> +#include <linux/filter.h> +#include <linux/bpf.h> + +enum bpf_type { + BPF_TYPE_UNSPEC = 0, + BPF_TYPE_PROG, + BPF_TYPE_MAP, +}; + +static void *bpf_any_get(void *raw, enum bpf_type type) +{ + switch (type) { + case BPF_TYPE_PROG: + atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt); + break; + case BPF_TYPE_MAP: + bpf_map_inc(raw, true); + break; + default: + WARN_ON_ONCE(1); + break; + } + + return raw; +} + +static void bpf_any_put(void *raw, enum bpf_type type) +{ + switch (type) { + case BPF_TYPE_PROG: + bpf_prog_put(raw); + break; + case BPF_TYPE_MAP: + bpf_map_put_with_uref(raw); + break; + default: + WARN_ON_ONCE(1); + break; + } +} + +static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) +{ + void *raw; + + *type = BPF_TYPE_MAP; + raw = bpf_map_get_with_uref(ufd); + if (IS_ERR(raw)) { + *type = BPF_TYPE_PROG; + raw = bpf_prog_get(ufd); + } + + return raw; +} + +static const struct inode_operations bpf_dir_iops; + +static const struct inode_operations bpf_prog_iops = { }; +static const struct inode_operations bpf_map_iops = { }; + +static struct inode *bpf_get_inode(struct super_block *sb, + const struct inode *dir, + umode_t mode) +{ + struct inode *inode; + + switch (mode & S_IFMT) { + case S_IFDIR: + case S_IFREG: + break; + default: + return ERR_PTR(-EINVAL); + } + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOSPC); + + inode->i_ino = get_next_ino(); + inode->i_atime = CURRENT_TIME; + inode->i_mtime = inode->i_atime; + inode->i_ctime = inode->i_atime; + + inode_init_owner(inode, dir, mode); + + return inode; +} + +static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) +{ + *type = BPF_TYPE_UNSPEC; + if (inode->i_op == &bpf_prog_iops) + *type = BPF_TYPE_PROG; + else if (inode->i_op == &bpf_map_iops) + *type = BPF_TYPE_MAP; + else + return -EACCES; + + return 0; +} + +static bool bpf_dname_reserved(const struct dentry *dentry) +{ + return strchr(dentry->d_name.name, '.'); +} + +static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + + if (bpf_dname_reserved(dentry)) + return -EPERM; + + inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &bpf_dir_iops; + inode->i_fop = &simple_dir_operations; + + inc_nlink(inode); + inc_nlink(dir); + + d_instantiate(dentry, inode); + dget(dentry); + + return 0; +} + +static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, + umode_t mode, const struct inode_operations *iops) +{ + struct inode *inode; + + if (bpf_dname_reserved(dentry)) + return -EPERM; + + inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = iops; + inode->i_private = dentry->d_fsdata; + + d_instantiate(dentry, inode); + dget(dentry); + + return 0; +} + +static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t devt) +{ + enum bpf_type type = MINOR(devt); + + if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) || + dentry->d_fsdata == NULL) + return -EPERM; + + switch (type) { + case BPF_TYPE_PROG: + return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops); + case BPF_TYPE_MAP: + return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops); + default: + return -EPERM; + } +} + +static int bpf_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + if (bpf_dname_reserved(new_dentry)) + return -EPERM; + + return simple_link(old_dentry, dir, new_dentry); +} + +static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (bpf_dname_reserved(new_dentry)) + return -EPERM; + + return simple_rename(old_dir, old_dentry, new_dir, new_dentry); +} + +static const struct inode_operations bpf_dir_iops = { + .lookup = simple_lookup, + .mknod = bpf_mkobj, + .mkdir = bpf_mkdir, + .rmdir = simple_rmdir, + .rename = bpf_rename, + .link = bpf_link, + .unlink = simple_unlink, +}; + +static int bpf_obj_do_pin(const struct filename *pathname, void *raw, + enum bpf_type type) +{ + struct dentry *dentry; + struct inode *dir; + struct path path; + umode_t mode; + dev_t devt; + int ret; + + dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); + devt = MKDEV(UNNAMED_MAJOR, type); + + ret = security_path_mknod(&path, dentry, mode, devt); + if (ret) + goto out; + + dir = d_inode(path.dentry); + if (dir->i_op != &bpf_dir_iops) { + ret = -EPERM; + goto out; + } + + dentry->d_fsdata = raw; + ret = vfs_mknod(dir, dentry, mode, devt); + dentry->d_fsdata = NULL; +out: + done_path_create(&path, dentry); + return ret; +} + +int bpf_obj_pin_user(u32 ufd, const char __user *pathname) +{ + struct filename *pname; + enum bpf_type type; + void *raw; + int ret; + + pname = getname(pathname); + if (IS_ERR(pname)) + return PTR_ERR(pname); + + raw = bpf_fd_probe_obj(ufd, &type); + if (IS_ERR(raw)) { + ret = PTR_ERR(raw); + goto out; + } + + ret = bpf_obj_do_pin(pname, raw, type); + if (ret != 0) + bpf_any_put(raw, type); +out: + putname(pname); + return ret; +} + +static void *bpf_obj_do_get(const struct filename *pathname, + enum bpf_type *type) +{ + struct inode *inode; + struct path path; + void *raw; + int ret; + + ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + + inode = d_backing_inode(path.dentry); + ret = inode_permission(inode, MAY_WRITE); + if (ret) + goto out; + + ret = bpf_inode_type(inode, type); + if (ret) + goto out; + + raw = bpf_any_get(inode->i_private, *type); + touch_atime(&path); + + path_put(&path); + return raw; +out: + path_put(&path); + return ERR_PTR(ret); +} + +int bpf_obj_get_user(const char __user *pathname) +{ + enum bpf_type type = BPF_TYPE_UNSPEC; + struct filename *pname; + int ret = -ENOENT; + void *raw; + + pname = getname(pathname); + if (IS_ERR(pname)) + return PTR_ERR(pname); + + raw = bpf_obj_do_get(pname, &type); + if (IS_ERR(raw)) { + ret = PTR_ERR(raw); + goto out; + } + + if (type == BPF_TYPE_PROG) + ret = bpf_prog_new_fd(raw); + else if (type == BPF_TYPE_MAP) + ret = bpf_map_new_fd(raw); + else + goto out; + + if (ret < 0) + bpf_any_put(raw, type); +out: + putname(pname); + return ret; +} + +static void bpf_evict_inode(struct inode *inode) +{ + enum bpf_type type; + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + + if (!bpf_inode_type(inode, &type)) + bpf_any_put(inode->i_private, type); +} + +static const struct super_operations bpf_super_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = bpf_evict_inode, +}; + +static int bpf_fill_super(struct super_block *sb, void *data, int silent) +{ + static struct tree_descr bpf_rfiles[] = { { "" } }; + struct inode *inode; + int ret; + + ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); + if (ret) + return ret; + + sb->s_op = &bpf_super_ops; + + inode = sb->s_root->d_inode; + inode->i_op = &bpf_dir_iops; + inode->i_mode &= ~S_IALLUGO; + inode->i_mode |= S_ISVTX | S_IRWXUGO; + + return 0; +} + +static struct dentry *bpf_mount(struct file_system_type *type, int flags, + const char *dev_name, void *data) +{ + return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super); +} + +static struct file_system_type bpf_fs_type = { + .owner = THIS_MODULE, + .name = "bpf", + .mount = bpf_mount, + .kill_sb = kill_litter_super, + .fs_flags = FS_USERNS_MOUNT, +}; + +MODULE_ALIAS_FS("bpf"); + +static int __init bpf_init(void) +{ + int ret; + + ret = sysfs_create_mount_point(fs_kobj, "bpf"); + if (ret) + return ret; + + ret = register_filesystem(&bpf_fs_type); + if (ret) + sysfs_remove_mount_point(fs_kobj, "bpf"); + + return ret; +} +fs_initcall(bpf_init); diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c new file mode 100644 index 000000000000..5c51d1985b51 --- /dev/null +++ b/kernel/bpf/percpu_freelist.c @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include "percpu_freelist.h" + +int pcpu_freelist_init(struct pcpu_freelist *s) +{ + int cpu; + + s->freelist = alloc_percpu(struct pcpu_freelist_head); + if (!s->freelist) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); + + raw_spin_lock_init(&head->lock); + head->first = NULL; + } + return 0; +} + +void pcpu_freelist_destroy(struct pcpu_freelist *s) +{ + free_percpu(s->freelist); +} + +static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, + struct pcpu_freelist_node *node) +{ + raw_spin_lock(&head->lock); + node->next = head->first; + head->first = node; + raw_spin_unlock(&head->lock); +} + +void pcpu_freelist_push(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); + + __pcpu_freelist_push(head, node); +} + +void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, + u32 nr_elems) +{ + struct pcpu_freelist_head *head; + unsigned long flags; + int i, cpu, pcpu_entries; + + pcpu_entries = nr_elems / num_possible_cpus() + 1; + i = 0; + + /* disable irq to workaround lockdep false positive + * in bpf usage pcpu_freelist_populate() will never race + * with pcpu_freelist_push() + */ + local_irq_save(flags); + for_each_possible_cpu(cpu) { +again: + head = per_cpu_ptr(s->freelist, cpu); + __pcpu_freelist_push(head, buf); + i++; + buf += elem_size; + if (i == nr_elems) + break; + if (i % pcpu_entries) + goto again; + } + local_irq_restore(flags); +} + +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) +{ + struct pcpu_freelist_head *head; + struct pcpu_freelist_node *node; + int orig_cpu, cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + head = per_cpu_ptr(s->freelist, cpu); + raw_spin_lock(&head->lock); + node = head->first; + if (node) { + head->first = node->next; + raw_spin_unlock(&head->lock); + return node; + } + raw_spin_unlock(&head->lock); + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + if (cpu == orig_cpu) + return NULL; + } +} diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h new file mode 100644 index 000000000000..3049aae8ea1e --- /dev/null +++ b/kernel/bpf/percpu_freelist.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __PERCPU_FREELIST_H__ +#define __PERCPU_FREELIST_H__ +#include <linux/spinlock.h> +#include <linux/percpu.h> + +struct pcpu_freelist_head { + struct pcpu_freelist_node *first; + raw_spinlock_t lock; +}; + +struct pcpu_freelist { + struct pcpu_freelist_head __percpu *freelist; +}; + +struct pcpu_freelist_node { + struct pcpu_freelist_node *next; +}; + +void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); +void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, + u32 nr_elems); +int pcpu_freelist_init(struct pcpu_freelist *); +void pcpu_freelist_destroy(struct pcpu_freelist *s); +#endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c new file mode 100644 index 000000000000..499d9e933f8e --- /dev/null +++ b/kernel/bpf/stackmap.c @@ -0,0 +1,290 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/bpf.h> +#include <linux/jhash.h> +#include <linux/filter.h> +#include <linux/vmalloc.h> +#include <linux/stacktrace.h> +#include <linux/perf_event.h> +#include "percpu_freelist.h" + +struct stack_map_bucket { + struct pcpu_freelist_node fnode; + u32 hash; + u32 nr; + u64 ip[]; +}; + +struct bpf_stack_map { + struct bpf_map map; + void *elems; + struct pcpu_freelist freelist; + u32 n_buckets; + struct stack_map_bucket *buckets[]; +}; + +static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) +{ + u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; + int err; + + smap->elems = vzalloc(elem_size * smap->map.max_entries); + if (!smap->elems) + return -ENOMEM; + + err = pcpu_freelist_init(&smap->freelist); + if (err) + goto free_elems; + + pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size, + smap->map.max_entries); + return 0; + +free_elems: + vfree(smap->elems); + return err; +} + +/* Called from syscall */ +static struct bpf_map *stack_map_alloc(union bpf_attr *attr) +{ + u32 value_size = attr->value_size; + struct bpf_stack_map *smap; + u64 cost, n_buckets; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + if (attr->map_flags) + return ERR_PTR(-EINVAL); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + value_size < 8 || value_size % 8 || + value_size / 8 > PERF_MAX_STACK_DEPTH) + return ERR_PTR(-EINVAL); + + /* hash table size must be power of 2 */ + n_buckets = roundup_pow_of_two(attr->max_entries); + + cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-E2BIG); + + smap = kzalloc(cost, GFP_USER | __GFP_NOWARN); + if (!smap) { + smap = vzalloc(cost); + if (!smap) + return ERR_PTR(-ENOMEM); + } + + err = -E2BIG; + cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_smap; + + smap->map.map_type = attr->map_type; + smap->map.key_size = attr->key_size; + smap->map.value_size = value_size; + smap->map.max_entries = attr->max_entries; + smap->n_buckets = n_buckets; + smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + err = bpf_map_precharge_memlock(smap->map.pages); + if (err) + goto free_smap; + + err = get_callchain_buffers(); + if (err) + goto free_smap; + + err = prealloc_elems_and_freelist(smap); + if (err) + goto put_buffers; + + return &smap->map; + +put_buffers: + put_callchain_buffers(); +free_smap: + kvfree(smap); + return ERR_PTR(err); +} + +static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +{ + struct pt_regs *regs = (struct pt_regs *) (long) r1; + struct bpf_map *map = (struct bpf_map *) (long) r2; + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct perf_callchain_entry *trace; + struct stack_map_bucket *bucket, *new_bucket, *old_bucket; + u32 max_depth = map->value_size / 8; + /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */ + u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + u32 hash, id, trace_nr, trace_len; + bool user = flags & BPF_F_USER_STACK; + bool kernel = !user; + u64 *ips; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + trace = get_perf_callchain(regs, init_nr, kernel, user, false, false); + + if (unlikely(!trace)) + /* couldn't fetch the stack trace */ + return -EFAULT; + + /* get_perf_callchain() guarantees that trace->nr >= init_nr + * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth + */ + trace_nr = trace->nr - init_nr; + + if (trace_nr <= skip) + /* skipping more than usable stack trace */ + return -EFAULT; + + trace_nr -= skip; + trace_len = trace_nr * sizeof(u64); + ips = trace->ip + skip + init_nr; + hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); + id = hash & (smap->n_buckets - 1); + bucket = READ_ONCE(smap->buckets[id]); + + if (bucket && bucket->hash == hash) { + if (flags & BPF_F_FAST_STACK_CMP) + return id; + if (bucket->nr == trace_nr && + memcmp(bucket->ip, ips, trace_len) == 0) + return id; + } + + /* this call stack is not in the map, try to add it */ + if (bucket && !(flags & BPF_F_REUSE_STACKID)) + return -EEXIST; + + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + + memcpy(new_bucket->ip, ips, trace_len); + new_bucket->hash = hash; + new_bucket->nr = trace_nr; + + old_bucket = xchg(&smap->buckets[id], new_bucket); + if (old_bucket) + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return id; +} + +const struct bpf_func_proto bpf_get_stackid_proto = { + .func = bpf_get_stackid, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +/* Called from eBPF program */ +static void *stack_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +/* Called from syscall */ +int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct stack_map_bucket *bucket, *old_bucket; + u32 id = *(u32 *)key, trace_len; + + if (unlikely(id >= smap->n_buckets)) + return -ENOENT; + + bucket = xchg(&smap->buckets[id], NULL); + if (!bucket) + return -ENOENT; + + trace_len = bucket->nr * sizeof(u64); + memcpy(value, bucket->ip, trace_len); + memset(value + trace_len, 0, map->value_size - trace_len); + + old_bucket = xchg(&smap->buckets[id], bucket); + if (old_bucket) + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return 0; +} + +static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EINVAL; +} + +static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return -EINVAL; +} + +/* Called from syscall or from eBPF program */ +static int stack_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct stack_map_bucket *old_bucket; + u32 id = *(u32 *)key; + + if (unlikely(id >= smap->n_buckets)) + return -E2BIG; + + old_bucket = xchg(&smap->buckets[id], NULL); + if (old_bucket) { + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return 0; + } else { + return -ENOENT; + } +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void stack_map_free(struct bpf_map *map) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + + /* wait for bpf programs to complete before freeing stack map */ + synchronize_rcu(); + + vfree(smap->elems); + pcpu_freelist_destroy(&smap->freelist); + kvfree(smap); + put_callchain_buffers(); +} + +static const struct bpf_map_ops stack_map_ops = { + .map_alloc = stack_map_alloc, + .map_free = stack_map_free, + .map_get_next_key = stack_map_get_next_key, + .map_lookup_elem = stack_map_lookup_elem, + .map_update_elem = stack_map_update_elem, + .map_delete_elem = stack_map_delete_elem, +}; + +static struct bpf_map_type_list stack_map_type __read_mostly = { + .ops = &stack_map_ops, + .type = BPF_MAP_TYPE_STACK_TRACE, +}; + +static int __init register_stack_map(void) +{ + bpf_register_map_type(&stack_map_type); + return 0; +} +late_initcall(register_stack_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 35bac8e8b071..2a2efe1bc76c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -18,6 +18,10 @@ #include <linux/filter.h> #include <linux/version.h> +DEFINE_PER_CPU(int, bpf_prog_active); + +int sysctl_unprivileged_bpf_disabled __read_mostly; + static LIST_HEAD(bpf_map_types); static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) @@ -44,15 +48,63 @@ void bpf_register_map_type(struct bpf_map_type_list *tl) list_add(&tl->list_node, &bpf_map_types); } +int bpf_map_precharge_memlock(u32 pages) +{ + struct user_struct *user = get_current_user(); + unsigned long memlock_limit, cur; + + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + cur = atomic_long_read(&user->locked_vm); + free_uid(user); + if (cur + pages > memlock_limit) + return -EPERM; + return 0; +} + +static int bpf_map_charge_memlock(struct bpf_map *map) +{ + struct user_struct *user = get_current_user(); + unsigned long memlock_limit; + + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + atomic_long_add(map->pages, &user->locked_vm); + + if (atomic_long_read(&user->locked_vm) > memlock_limit) { + atomic_long_sub(map->pages, &user->locked_vm); + free_uid(user); + return -EPERM; + } + map->user = user; + return 0; +} + +static void bpf_map_uncharge_memlock(struct bpf_map *map) +{ + struct user_struct *user = map->user; + + atomic_long_sub(map->pages, &user->locked_vm); + free_uid(user); +} + /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); + bpf_map_uncharge_memlock(map); /* implementation dependent freeing */ map->ops->map_free(map); } +static void bpf_map_put_uref(struct bpf_map *map) +{ + if (atomic_dec_and_test(&map->usercnt)) { + if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) + bpf_fd_array_map_clear(map); + } +} + /* decrement map refcnt and schedule it for freeing via workqueue * (unrelying map implementation ops->map_free() might sleep) */ @@ -64,24 +116,48 @@ void bpf_map_put(struct bpf_map *map) } } -static int bpf_map_release(struct inode *inode, struct file *filp) +void bpf_map_put_with_uref(struct bpf_map *map) { - struct bpf_map *map = filp->private_data; - - if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) - /* prog_array stores refcnt-ed bpf_prog pointers - * release them all when user space closes prog_array_fd - */ - bpf_fd_array_map_clear(map); - + bpf_map_put_uref(map); bpf_map_put(map); +} + +static int bpf_map_release(struct inode *inode, struct file *filp) +{ + bpf_map_put_with_uref(filp->private_data); return 0; } +#ifdef CONFIG_PROC_FS +static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_map *map = filp->private_data; + + seq_printf(m, + "map_type:\t%u\n" + "key_size:\t%u\n" + "value_size:\t%u\n" + "max_entries:\t%u\n", + map->map_type, + map->key_size, + map->value_size, + map->max_entries); +} +#endif + static const struct file_operations bpf_map_fops = { - .release = bpf_map_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_map_show_fdinfo, +#endif + .release = bpf_map_release, }; +int bpf_map_new_fd(struct bpf_map *map) +{ + return anon_inode_getfd("bpf-map", &bpf_map_fops, map, + O_RDWR | O_CLOEXEC); +} + /* helper macro to check that unused fields 'union bpf_attr' are zero */ #define CHECK_ATTR(CMD) \ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ @@ -90,7 +166,7 @@ static const struct file_operations bpf_map_fops = { offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD max_entries +#define BPF_MAP_CREATE_LAST_FIELD map_flags /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -107,9 +183,13 @@ static int map_create(union bpf_attr *attr) return PTR_ERR(map); atomic_set(&map->refcnt, 1); + atomic_set(&map->usercnt, 1); - err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); + err = bpf_map_charge_memlock(map); + if (err) + goto free_map; + err = bpf_map_new_fd(map); if (err < 0) /* failed to allocate fd */ goto free_map; @@ -124,19 +204,36 @@ free_map: /* if error is returned, fd is released. * On success caller should complete fd access with matching fdput() */ -struct bpf_map *bpf_map_get(struct fd f) +struct bpf_map *__bpf_map_get(struct fd f) { - struct bpf_map *map; - if (!f.file) return ERR_PTR(-EBADF); - if (f.file->f_op != &bpf_map_fops) { fdput(f); return ERR_PTR(-EINVAL); } - map = f.file->private_data; + return f.file->private_data; +} + +void bpf_map_inc(struct bpf_map *map, bool uref) +{ + atomic_inc(&map->refcnt); + if (uref) + atomic_inc(&map->usercnt); +} + +struct bpf_map *bpf_map_get_with_uref(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_map *map; + + map = __bpf_map_get(f); + if (IS_ERR(map)) + return map; + + bpf_map_inc(map, true); + fdput(f); return map; } @@ -147,6 +244,11 @@ static void __user *u64_to_ptr(__u64 val) return (void __user *) (unsigned long) val; } +int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +{ + return -ENOTSUPP; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value @@ -157,6 +259,7 @@ static int map_lookup_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value, *ptr; + u32 value_size; struct fd f; int err; @@ -164,7 +267,7 @@ static int map_lookup_elem(union bpf_attr *attr) return -EINVAL; f = fdget(ufd); - map = bpf_map_get(f); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -177,23 +280,37 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; - rcu_read_lock(); - ptr = map->ops->map_lookup_elem(map, key); - if (ptr) - memcpy(value, ptr, map->value_size); - rcu_read_unlock(); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { + err = bpf_stackmap_copy(map, key, value); + } else { + rcu_read_lock(); + ptr = map->ops->map_lookup_elem(map, key); + if (ptr) + memcpy(value, ptr, value_size); + rcu_read_unlock(); + err = ptr ? 0 : -ENOENT; + } - err = -ENOENT; - if (!ptr) + if (err) goto free_value; err = -EFAULT; - if (copy_to_user(uvalue, value, map->value_size) != 0) + if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; err = 0; @@ -216,6 +333,7 @@ static int map_update_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; + u32 value_size; struct fd f; int err; @@ -223,7 +341,7 @@ static int map_update_elem(union bpf_attr *attr) return -EINVAL; f = fdget(ufd); - map = bpf_map_get(f); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -236,21 +354,37 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; err = -EFAULT; - if (copy_from_user(value, uvalue, map->value_size) != 0) + if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; - /* eBPF program that use maps are running under rcu_read_lock(), - * therefore all map accessors rely on this fact, so do the same here + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from + * inside bpf map update or delete otherwise deadlocks are possible */ - rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value, attr->flags); - rcu_read_unlock(); + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_update(map, key, value, attr->flags); + } else { + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, attr->flags); + rcu_read_unlock(); + } + __this_cpu_dec(bpf_prog_active); + preempt_enable(); free_value: kfree(value); @@ -276,7 +410,7 @@ static int map_delete_elem(union bpf_attr *attr) return -EINVAL; f = fdget(ufd); - map = bpf_map_get(f); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -289,9 +423,13 @@ static int map_delete_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + preempt_disable(); + __this_cpu_inc(bpf_prog_active); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); free_key: kfree(key); @@ -317,7 +455,7 @@ static int map_get_next_key(union bpf_attr *attr) return -EINVAL; f = fdget(ufd); - map = bpf_map_get(f); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -402,6 +540,10 @@ static void fixup_bpf_calls(struct bpf_prog *prog) */ BUG_ON(!prog->aux->ops->get_func_proto); + if (insn->imm == BPF_FUNC_get_route_realm) + prog->dst_needed = 1; + if (insn->imm == BPF_FUNC_get_prandom_u32) + bpf_user_rnd_init_once(); if (insn->imm == BPF_FUNC_tail_call) { /* mark bpf_tail_call as different opcode * to avoid conditional branch in @@ -436,29 +578,51 @@ static void free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } -static void __prog_put_rcu(struct rcu_head *rcu) +static int bpf_prog_charge_memlock(struct bpf_prog *prog) +{ + struct user_struct *user = get_current_user(); + unsigned long memlock_limit; + + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + atomic_long_add(prog->pages, &user->locked_vm); + if (atomic_long_read(&user->locked_vm) > memlock_limit) { + atomic_long_sub(prog->pages, &user->locked_vm); + free_uid(user); + return -EPERM; + } + prog->aux->user = user; + return 0; +} + +static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) +{ + struct user_struct *user = prog->aux->user; + + atomic_long_sub(prog->pages, &user->locked_vm); + free_uid(user); +} + +static void __prog_put_common(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); free_used_maps(aux); + bpf_prog_uncharge_memlock(aux->prog); bpf_prog_free(aux->prog); } /* version of bpf_prog_put() that is called after a grace period */ void bpf_prog_put_rcu(struct bpf_prog *prog) { - if (atomic_dec_and_test(&prog->aux->refcnt)) { - prog->aux->prog = prog; - call_rcu(&prog->aux->rcu, __prog_put_rcu); - } + if (atomic_dec_and_test(&prog->aux->refcnt)) + call_rcu(&prog->aux->rcu, __prog_put_common); } void bpf_prog_put(struct bpf_prog *prog) { - if (atomic_dec_and_test(&prog->aux->refcnt)) { - free_used_maps(prog->aux); - bpf_prog_free(prog); - } + if (atomic_dec_and_test(&prog->aux->refcnt)) + __prog_put_common(&prog->aux->rcu); } EXPORT_SYMBOL_GPL(bpf_prog_put); @@ -474,21 +638,22 @@ static const struct file_operations bpf_prog_fops = { .release = bpf_prog_release, }; -static struct bpf_prog *get_prog(struct fd f) +int bpf_prog_new_fd(struct bpf_prog *prog) { - struct bpf_prog *prog; + return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, + O_RDWR | O_CLOEXEC); +} +static struct bpf_prog *__bpf_prog_get(struct fd f) +{ if (!f.file) return ERR_PTR(-EBADF); - if (f.file->f_op != &bpf_prog_fops) { fdput(f); return ERR_PTR(-EINVAL); } - prog = f.file->private_data; - - return prog; + return f.file->private_data; } /* called by sockets/tracing/seccomp before attaching program to an event @@ -499,13 +664,13 @@ struct bpf_prog *bpf_prog_get(u32 ufd) struct fd f = fdget(ufd); struct bpf_prog *prog; - prog = get_prog(f); - + prog = __bpf_prog_get(f); if (IS_ERR(prog)) return prog; atomic_inc(&prog->aux->refcnt); fdput(f); + return prog; } EXPORT_SYMBOL_GPL(bpf_prog_get); @@ -540,11 +705,18 @@ static int bpf_prog_load(union bpf_attr *attr) attr->kern_version != LINUX_VERSION_CODE) return -EINVAL; + if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) return -ENOMEM; + err = bpf_prog_charge_memlock(prog); + if (err) + goto free_prog_nouncharge; + prog->len = attr->insn_cnt; err = -EFAULT; @@ -553,10 +725,10 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; prog->orig_prog = NULL; - prog->jited = false; + prog->jited = 0; atomic_set(&prog->aux->refcnt, 1); - prog->gpl_compatible = is_gpl; + prog->gpl_compatible = is_gpl ? 1 : 0; /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); @@ -576,7 +748,7 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_used_maps; - err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); + err = bpf_prog_new_fd(prog); if (err < 0) /* failed to allocate fd */ goto free_used_maps; @@ -586,20 +758,36 @@ static int bpf_prog_load(union bpf_attr *attr) free_used_maps: free_used_maps(prog->aux); free_prog: + bpf_prog_uncharge_memlock(prog); +free_prog_nouncharge: bpf_prog_free(prog); return err; } +#define BPF_OBJ_LAST_FIELD bpf_fd + +static int bpf_obj_pin(const union bpf_attr *attr) +{ + if (CHECK_ATTR(BPF_OBJ)) + return -EINVAL; + + return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname)); +} + +static int bpf_obj_get(const union bpf_attr *attr) +{ + if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) + return -EINVAL; + + return bpf_obj_get_user(u64_to_ptr(attr->pathname)); +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; int err; - /* the syscall is limited to root temporarily. This restriction will be - * lifted when security audit is clean. Note that eBPF+tracing must have - * this restriction, since it may pass kernel data to user space - */ - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled) return -EPERM; if (!access_ok(VERIFY_READ, uattr, 1)) @@ -654,6 +842,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_LOAD: err = bpf_prog_load(&attr); break; + case BPF_OBJ_PIN: + err = bpf_obj_pin(&attr); + break; + case BPF_OBJ_GET: + err = bpf_obj_get(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b074b23000d6..2e08f8e9b771 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -199,6 +199,7 @@ struct verifier_env { struct verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ + bool allow_ptr_leaks; }; /* verbose verifier prints what it's seeing @@ -213,7 +214,7 @@ static DEFINE_MUTEX(bpf_verifier_lock); * verbose() is used to dump the verification trace to the log, so the user * can figure out what's wrong with the program */ -static void verbose(const char *fmt, ...) +static __printf(1, 2) void verbose(const char *fmt, ...) { va_list args; @@ -244,6 +245,8 @@ static const struct { } func_limit[] = { {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output}, + {BPF_MAP_TYPE_STACK_TRACE, BPF_FUNC_get_stackid}, }; static void print_verifier_state(struct verifier_env *env) @@ -538,6 +541,21 @@ static int bpf_size_to_bytes(int bpf_size) return -EINVAL; } +static bool is_spillable_regtype(enum bpf_reg_type type) +{ + switch (type) { + case PTR_TO_MAP_VALUE: + case PTR_TO_MAP_VALUE_OR_NULL: + case PTR_TO_STACK: + case PTR_TO_CTX: + case FRAME_PTR: + case CONST_PTR_TO_MAP: + return true; + default: + return false; + } +} + /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -550,9 +568,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, */ if (value_regno >= 0 && - (state->regs[value_regno].type == PTR_TO_MAP_VALUE || - state->regs[value_regno].type == PTR_TO_STACK || - state->regs[value_regno].type == PTR_TO_CTX)) { + is_spillable_regtype(state->regs[value_regno].type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { @@ -643,6 +659,20 @@ static int check_ctx_access(struct verifier_env *env, int off, int size, return -EACCES; } +static bool is_pointer_value(struct verifier_env *env, int regno) +{ + if (env->allow_ptr_leaks) + return false; + + switch (env->cur_state.regs[regno].type) { + case UNKNOWN_VALUE: + case CONST_IMM: + return false; + default: + return true; + } +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -669,11 +699,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, } if (state->regs[regno].type == PTR_TO_MAP_VALUE) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose("R%d leaks addr into map\n", value_regno); + return -EACCES; + } err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); } else if (state->regs[regno].type == PTR_TO_CTX) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose("R%d leaks addr into ctx\n", value_regno); + return -EACCES; + } err = check_ctx_access(env, off, size, t); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); @@ -684,10 +724,17 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, verbose("invalid stack off=%d size=%d\n", off, size); return -EACCES; } - if (t == BPF_WRITE) + if (t == BPF_WRITE) { + if (!env->allow_ptr_leaks && + state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && + size != BPF_REG_SIZE) { + verbose("attempt to corrupt spilled pointer on stack\n"); + return -EACCES; + } err = check_stack_write(state, off, size, value_regno); - else + } else { err = check_stack_read(state, off, size, value_regno); + } } else { verbose("R%d invalid mem access '%s'\n", regno, reg_type_str[state->regs[regno].type]); @@ -732,15 +779,24 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized */ -static int check_stack_boundary(struct verifier_env *env, - int regno, int access_size) +static int check_stack_boundary(struct verifier_env *env, int regno, + int access_size, bool zero_size_allowed) { struct verifier_state *state = &env->cur_state; struct reg_state *regs = state->regs; int off, i; - if (regs[regno].type != PTR_TO_STACK) + if (regs[regno].type != PTR_TO_STACK) { + if (zero_size_allowed && access_size == 0 && + regs[regno].type == CONST_IMM && + regs[regno].imm == 0) + return 0; + + verbose("R%d type=%s expected=%s\n", regno, + reg_type_str[regs[regno].type], + reg_type_str[PTR_TO_STACK]); return -EACCES; + } off = regs[regno].imm; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || @@ -775,18 +831,32 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return -EACCES; } - if (arg_type == ARG_ANYTHING) + if (arg_type == ARG_ANYTHING) { + if (is_pointer_value(env, regno)) { + verbose("R%d leaks addr into helper function\n", regno); + return -EACCES; + } return 0; + } - if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || + if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - } else if (arg_type == ARG_CONST_STACK_SIZE) { + } else if (arg_type == ARG_CONST_STACK_SIZE || + arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { expected_type = CONST_IMM; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; } else if (arg_type == ARG_PTR_TO_CTX) { expected_type = PTR_TO_CTX; + } else if (arg_type == ARG_PTR_TO_STACK) { + expected_type = PTR_TO_STACK; + /* One exception here. In case function allows for NULL to be + * passed in as argument, it's a CONST_IMM type. Final test + * happens during stack boundary checking. + */ + if (reg->type == CONST_IMM && reg->imm == 0) + expected_type = CONST_IMM; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -816,8 +886,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->key_size); - + err = check_stack_boundary(env, regno, (*mapp)->key_size, + false); } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -827,9 +897,12 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->value_size); + err = check_stack_boundary(env, regno, (*mapp)->value_size, + false); + } else if (arg_type == ARG_CONST_STACK_SIZE || + arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { + bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); - } else if (arg_type == ARG_CONST_STACK_SIZE) { /* bpf_xxx(..., buf, len) call will access 'len' bytes * from stack pointer 'buf'. Check it * note: regno == len, regno - 1 == buf @@ -839,7 +912,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); return -EACCES; } - err = check_stack_boundary(env, regno - 1, reg->imm); + err = check_stack_boundary(env, regno - 1, reg->imm, + zero_size_allowed); } return err; @@ -860,8 +934,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) * don't allow any other map type to be passed into * the special func; */ - if (bool_map != bool_func) + if (bool_func && bool_map != bool_func) { + verbose("cannot pass map_type %d into func %d\n", + map->map_type, func_id); return -EINVAL; + } } return 0; @@ -950,8 +1027,9 @@ static int check_call(struct verifier_env *env, int func_id) } /* check validity of 32-bit and 64-bit arithmetic operations */ -static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) +static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) { + struct reg_state *regs = env->cur_state.regs; u8 opcode = BPF_OP(insn->code); int err; @@ -976,6 +1054,12 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) if (err) return err; + if (is_pointer_value(env, insn->dst_reg)) { + verbose("R%d pointer arithmetic prohibited\n", + insn->dst_reg); + return -EACCES; + } + /* check dest operand */ err = check_reg_arg(regs, insn->dst_reg, DST_OP); if (err) @@ -1012,6 +1096,11 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) */ regs[insn->dst_reg] = regs[insn->src_reg]; } else { + if (is_pointer_value(env, insn->src_reg)) { + verbose("R%d partial copy of pointer\n", + insn->src_reg); + return -EACCES; + } regs[insn->dst_reg].type = UNKNOWN_VALUE; regs[insn->dst_reg].map_ptr = NULL; } @@ -1058,11 +1147,31 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) return -EINVAL; } + if ((opcode == BPF_LSH || opcode == BPF_RSH || + opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { + int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; + + if (insn->imm < 0 || insn->imm >= size) { + verbose("invalid shift %d\n", insn->imm); + return -EINVAL; + } + } + /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && regs[insn->dst_reg].type == FRAME_PTR && - BPF_SRC(insn->code) == BPF_K) + BPF_SRC(insn->code) == BPF_K) { stack_relative = true; + } else if (is_pointer_value(env, insn->dst_reg)) { + verbose("R%d pointer arithmetic prohibited\n", + insn->dst_reg); + return -EACCES; + } else if (BPF_SRC(insn->code) == BPF_X && + is_pointer_value(env, insn->src_reg)) { + verbose("R%d pointer arithmetic prohibited\n", + insn->src_reg); + return -EACCES; + } /* check dest operand */ err = check_reg_arg(regs, insn->dst_reg, DST_OP); @@ -1101,6 +1210,12 @@ static int check_cond_jmp_op(struct verifier_env *env, err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) return err; + + if (is_pointer_value(env, insn->src_reg)) { + verbose("R%d pointer comparison prohibited\n", + insn->src_reg); + return -EACCES; + } } else { if (insn->src_reg != BPF_REG_0) { verbose("BPF_JMP uses reserved fields\n"); @@ -1155,6 +1270,9 @@ static int check_cond_jmp_op(struct verifier_env *env, regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = 0; } + } else if (is_pointer_value(env, insn->dst_reg)) { + verbose("R%d pointer comparison prohibited\n", insn->dst_reg); + return -EACCES; } else if (BPF_SRC(insn->code) == BPF_K && (opcode == BPF_JEQ || opcode == BPF_JNE)) { @@ -1658,7 +1776,7 @@ static int do_check(struct verifier_env *env) } if (class == BPF_ALU || class == BPF_ALU64) { - err = check_alu_op(regs, insn); + err = check_alu_op(env, insn); if (err) return err; @@ -1816,6 +1934,11 @@ static int do_check(struct verifier_env *env) if (err) return err; + if (is_pointer_value(env, BPF_REG_0)) { + verbose("R0 leaks addr as return value\n"); + return -EACCES; + } + process_bpf_exit: insn_idx = pop_stack(env, &prev_insn_idx); if (insn_idx < 0) { @@ -1902,8 +2025,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) } f = fdget(insn->imm); - - map = bpf_map_get(f); + map = __bpf_map_get(f); if (IS_ERR(map)) { verbose("fd %d is not pointing to valid bpf_map\n", insn->imm); @@ -1935,8 +2057,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) * will be used by the valid program until it's unloaded * and all maps are released in free_bpf_prog_info() */ - atomic_inc(&map->refcnt); - + bpf_map_inc(map, false); fdput(f); next_insn: insn++; @@ -1987,7 +2108,7 @@ static void adjust_branches(struct bpf_prog *prog, int pos, int delta) /* adjust offset of jmps if necessary */ if (i < pos && i + insn->off + 1 > pos) insn->off += delta; - else if (i > pos && i + insn->off + 1 < pos) + else if (i > pos + delta && i + insn->off + 1 <= pos + delta) insn->off -= delta; } } @@ -2024,7 +2145,7 @@ static int convert_ctx_accesses(struct verifier_env *env) cnt = env->prog->aux->ops-> convert_ctx_access(type, insn->dst_reg, insn->src_reg, - insn->off, insn_buf); + insn->off, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; @@ -2144,6 +2265,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); + ret = do_check(env); skip_full_check: |