Diffstat (limited to 'kernel/bpf/hashtab.c')
-rw-r--r--   kernel/bpf/hashtab.c | 174
1 file changed, 124 insertions, 50 deletions
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index a1468e3f5af2..d541c8486c95 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -27,9 +27,62 @@
 	.map_delete_batch =			\
 	generic_map_delete_batch
 
+/*
+ * The bucket lock has two protection scopes:
+ *
+ * 1) Serializing concurrent operations from BPF programs on different
+ *    CPUs
+ *
+ * 2) Serializing concurrent operations from BPF programs and sys_bpf()
+ *
+ * BPF programs can execute in any context including perf, kprobes and
+ * tracing. As there are almost no limits where perf, kprobes and tracing
+ * can be invoked from, the lock operations need to be protected against
+ * deadlocks. Deadlocks can be caused by recursion and by an invocation in
+ * the lock held section when functions which acquire this lock are invoked
+ * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
+ * variable bpf_prog_active, which prevents BPF programs attached to perf
+ * events, kprobes and tracing from being invoked before the prior
+ * invocation from one of these contexts completed. sys_bpf() uses the same
+ * mechanism by pinning the task to the current CPU and incrementing the
+ * recursion protection across the map operation.
+ *
+ * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
+ * operations like memory allocations (even with GFP_ATOMIC) from atomic
+ * contexts. This is required because even with GFP_ATOMIC the memory
+ * allocator calls into code paths which acquire locks with long held lock
+ * sections. To ensure deterministic behaviour these locks are regular
+ * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
+ * true atomic contexts on an RT kernel are the low level hardware
+ * handling, scheduling, low level interrupt handling, NMIs etc. None of
+ * these contexts should ever do memory allocations.
+ *
+ * As regular device interrupt handlers and soft interrupts are forced into
+ * thread context, the existing code which does
+ *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
+ * just works.
+ *
+ * In theory the BPF locks could be converted to regular spinlocks as well,
+ * but the bucket locks and percpu_freelist locks can be taken from
+ * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
+ * atomic contexts even on RT. These mechanisms require preallocated maps,
+ * so there is no need to invoke memory allocations within the lock held
+ * sections.
+ *
+ * BPF maps which need dynamic allocation are only used from (forced)
+ * thread context on RT and can therefore use regular spinlocks, which in
+ * turn allows memory allocations from within the lock held section.
+ *
+ * On a non RT kernel this distinction is neither possible nor required.
+ * spinlock maps to raw_spinlock and the extra code is optimized out by the
+ * compiler.
+ */
 struct bucket {
 	struct hlist_nulls_head head;
-	raw_spinlock_t lock;
+	union {
+		raw_spinlock_t raw_lock;
+		spinlock_t     lock;
+	};
 };
 
 struct bpf_htab {
@@ -65,9 +118,54 @@ struct htab_elem {
 		struct bpf_lru_node lru_node;
 	};
 	u32 hash;
-	char key[0] __aligned(8);
+	char key[] __aligned(8);
 };
 
+static inline bool htab_is_prealloc(const struct bpf_htab *htab)
+{
+	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+}
+
+static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
+{
+	return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
+}
+
+static void htab_init_buckets(struct bpf_htab *htab)
+{
+	unsigned i;
+
+	for (i = 0; i < htab->n_buckets; i++) {
+		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
+		if (htab_use_raw_lock(htab))
+			raw_spin_lock_init(&htab->buckets[i].raw_lock);
+		else
+			spin_lock_init(&htab->buckets[i].lock);
+	}
+}
+
+static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab,
+					     struct bucket *b)
+{
+	unsigned long flags;
+
+	if (htab_use_raw_lock(htab))
+		raw_spin_lock_irqsave(&b->raw_lock, flags);
+	else
+		spin_lock_irqsave(&b->lock, flags);
+	return flags;
+}
+
+static inline void htab_unlock_bucket(const struct bpf_htab *htab,
+				      struct bucket *b,
+				      unsigned long flags)
+{
+	if (htab_use_raw_lock(htab))
+		raw_spin_unlock_irqrestore(&b->raw_lock, flags);
+	else
+		spin_unlock_irqrestore(&b->lock, flags);
+}
+
 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
 
 static bool htab_is_lru(const struct bpf_htab *htab)
@@ -82,11 +180,6 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
 		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
 }
 
-static bool htab_is_prealloc(const struct bpf_htab *htab)
-{
-	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
-}
-
 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
 				     void __percpu *pptr)
 {
@@ -328,8 +421,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
 	struct bpf_htab *htab;
-	int err, i;
 	u64 cost;
+	int err;
 
 	htab = kzalloc(sizeof(*htab), GFP_USER);
 	if (!htab)
@@ -391,10 +484,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	else
 		htab->hashrnd = get_random_int();
 
-	for (i = 0; i < htab->n_buckets; i++) {
-		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
-		raw_spin_lock_init(&htab->buckets[i].lock);
-	}
+	htab_init_buckets(htab);
 
 	if (prealloc) {
 		err = prealloc_init(htab);
@@ -602,7 +692,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 	b = __select_bucket(htab, tgt_l->hash);
 	head = &b->head;
 
-	raw_spin_lock_irqsave(&b->lock, flags);
+	flags = htab_lock_bucket(htab, b);
 
 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 		if (l == tgt_l) {
@@ -610,7 +700,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 			break;
 		}
 
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	htab_unlock_bucket(htab, b, flags);
 
 	return l == tgt_l;
 }
@@ -686,15 +776,7 @@ static void htab_elem_free_rcu(struct rcu_head *head)
 	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
 	struct bpf_htab *htab = l->htab;
 
-	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
-	 * we're calling kfree, otherwise deadlock is possible if kprobes
-	 * are placed somewhere inside of slub
-	 */
-	preempt_disable();
-	__this_cpu_inc(bpf_prog_active);
 	htab_elem_free(htab, l);
-	__this_cpu_dec(bpf_prog_active);
-	preempt_enable();
 }
 
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
@@ -884,8 +966,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		 */
 	}
 
-	/* bpf_map_update_elem() can be called in_irq() */
-	raw_spin_lock_irqsave(&b->lock, flags);
+	flags = htab_lock_bucket(htab, b);
 
 	l_old = lookup_elem_raw(head, hash, key, key_size);
 
@@ -926,7 +1007,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	}
 	ret = 0;
 err:
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	htab_unlock_bucket(htab, b, flags);
 	return ret;
 }
 
@@ -964,8 +1045,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 		return -ENOMEM;
 	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
 
-	/* bpf_map_update_elem() can be called in_irq() */
-	raw_spin_lock_irqsave(&b->lock, flags);
+	flags = htab_lock_bucket(htab, b);
 
 	l_old = lookup_elem_raw(head, hash, key, key_size);
 
@@ -984,7 +1064,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 
 	ret = 0;
 err:
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	htab_unlock_bucket(htab, b, flags);
 
 	if (ret)
 		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
@@ -1019,8 +1099,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
-	/* bpf_map_update_elem() can be called in_irq() */
-	raw_spin_lock_irqsave(&b->lock, flags);
+	flags = htab_lock_bucket(htab, b);
 
 	l_old = lookup_elem_raw(head, hash, key, key_size);
 
@@ -1043,7 +1122,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 	}
 	ret = 0;
 err:
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	htab_unlock_bucket(htab, b, flags);
 	return ret;
 }
 
@@ -1083,8 +1162,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 			return -ENOMEM;
 	}
 
-	/* bpf_map_update_elem() can be called in_irq() */
-	raw_spin_lock_irqsave(&b->lock, flags);
+	flags = htab_lock_bucket(htab, b);
 
 	l_old = lookup_elem_raw(head, hash, key, key_size);
 
@@ -1106,7 +1184,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 	}
 	ret = 0;
 err:
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	htab_unlock_bucket(htab, b, flags);
 	if (l_new)
 		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
 	return ret;
@@ -1144,7 +1222,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
-	raw_spin_lock_irqsave(&b->lock, flags);
+	flags = htab_lock_bucket(htab, b);
 
 	l = lookup_elem_raw(head, hash, key, key_size);
 
@@ -1154,7 +1232,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 		ret = 0;
 	}
 
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	htab_unlock_bucket(htab, b, flags);
 	return ret;
 }
 
@@ -1176,7 +1254,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
-	raw_spin_lock_irqsave(&b->lock, flags);
+	flags = htab_lock_bucket(htab, b);
 
 	l = lookup_elem_raw(head, hash, key, key_size);
 
@@ -1185,7 +1263,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 		ret = 0;
 	}
 
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	htab_unlock_bucket(htab, b, flags);
 	if (l)
 		bpf_lru_push_free(&htab->lru, &l->lru_node);
 	return ret;
@@ -1325,8 +1403,7 @@ alloc:
 	}
 
 again:
-	preempt_disable();
-	this_cpu_inc(bpf_prog_active);
+	bpf_disable_instrumentation();
 	rcu_read_lock();
 again_nocopy:
 	dst_key = keys;
@@ -1335,7 +1412,7 @@ again_nocopy:
 	head = &b->head;
 	/* do not grab the lock unless need it (bucket_cnt > 0). */
 	if (locked)
-		raw_spin_lock_irqsave(&b->lock, flags);
+		flags = htab_lock_bucket(htab, b);
 
 	bucket_cnt = 0;
 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
@@ -1352,10 +1429,9 @@ again_nocopy:
 		/* Note that since bucket_cnt > 0 here, it is implicit
 		 * that the locked was grabbed, so release it.
 		 */
-		raw_spin_unlock_irqrestore(&b->lock, flags);
+		htab_unlock_bucket(htab, b, flags);
 		rcu_read_unlock();
-		this_cpu_dec(bpf_prog_active);
-		preempt_enable();
+		bpf_enable_instrumentation();
 		goto after_loop;
 	}
 
@@ -1364,10 +1440,9 @@ again_nocopy:
 		/* Note that since bucket_cnt > 0 here, it is implicit
 		 * that the locked was grabbed, so release it.
 		 */
-		raw_spin_unlock_irqrestore(&b->lock, flags);
+		htab_unlock_bucket(htab, b, flags);
 		rcu_read_unlock();
-		this_cpu_dec(bpf_prog_active);
-		preempt_enable();
+		bpf_enable_instrumentation();
 		kvfree(keys);
 		kvfree(values);
 		goto alloc;
@@ -1418,7 +1493,7 @@ again_nocopy:
 		dst_val += value_size;
 	}
 
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	htab_unlock_bucket(htab, b, flags);
 	locked = false;
 
 	while (node_to_free) {
@@ -1437,8 +1512,7 @@ next_batch:
 	}
 
 	rcu_read_unlock();
-	this_cpu_dec(bpf_prog_active);
-	preempt_enable();
+	bpf_enable_instrumentation();
 
 	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
 	    key_size * bucket_cnt) ||
 	    copy_to_user(uvalues + total * value_size, values,
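
The lock flavour picked by htab_use_raw_lock() above depends only on whether the map is preallocated, and that choice is made by user space at map creation time through BPF_F_NO_PREALLOC. The following is a minimal userspace sketch of the two cases, for illustration only: it goes through the raw bpf(2) syscall, the key/value sizes and map size are arbitrary, and nothing in it is part of the patch itself (creating the maps needs root or CAP_BPF).

#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Create a BPF hash map with the given map_flags via the bpf(2) syscall. */
static int create_hash_map(unsigned int map_flags)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_HASH;
	attr.key_size    = sizeof(int);		/* arbitrary, illustration only */
	attr.value_size  = sizeof(long);
	attr.max_entries = 128;
	attr.map_flags   = map_flags;

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}

int main(void)
{
	/* Default: elements are preallocated, so the kernel side above keeps
	 * using the raw bucket lock even on PREEMPT_RT. */
	int prealloc_fd = create_hash_map(0);

	/* BPF_F_NO_PREALLOC: elements are allocated at runtime, so on an RT
	 * kernel the bucket lock falls back to the regular spinlock. */
	int noprealloc_fd = create_hash_map(BPF_F_NO_PREALLOC);

	printf("prealloc fd=%d, no-prealloc fd=%d\n", prealloc_fd, noprealloc_fd);
	return 0;
}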
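Because the new helpers are spread across several hunks, here is the same pattern collected into one self-contained userspace analogue: decide the lock flavour once at init time, then route every acquire/release through a single pair of helpers so the call sites stay flavour-agnostic. This is hypothetical illustration code, not kernel code; pthread_spinlock_t stands in for raw_spinlock_t, pthread_mutex_t for the spinlock_t that becomes sleepable on RT, and the xtab/xbucket names are made up.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct xbucket {
	union {					/* only one member is ever used */
		pthread_spinlock_t raw_lock;	/* "always atomic" flavour */
		pthread_mutex_t    lock;	/* "may sleep" flavour */
	};
};

struct xtab {
	bool use_raw_lock;	/* decided once at init, never changes later */
	struct xbucket b;
};

static void xtab_init(struct xtab *t, bool use_raw_lock)
{
	t->use_raw_lock = use_raw_lock;
	if (use_raw_lock)
		pthread_spin_init(&t->b.raw_lock, PTHREAD_PROCESS_PRIVATE);
	else
		pthread_mutex_init(&t->b.lock, NULL);
}

static void xtab_lock_bucket(struct xtab *t)
{
	if (t->use_raw_lock)
		pthread_spin_lock(&t->b.raw_lock);
	else
		pthread_mutex_lock(&t->b.lock);
}

static void xtab_unlock_bucket(struct xtab *t)
{
	if (t->use_raw_lock)
		pthread_spin_unlock(&t->b.raw_lock);
	else
		pthread_mutex_unlock(&t->b.lock);
}

int main(void)
{
	struct xtab t;

	xtab_init(&t, true);		/* analogue of a preallocated map */
	xtab_lock_bucket(&t);
	puts("critical section under the raw flavour");
	xtab_unlock_bucket(&t);
	return 0;
}

Build with: cc -pthread example.c. The point mirrored from the patch is purely structural: the call sites never mention the lock type, so swapping the flavour is a local decision in the init and lock/unlock helpers.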
