Diffstat (limited to 'kernel'): 102 files changed, 4675 insertions(+), 2010 deletions(-)
| diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 84d882f3e299..fbba478ae522 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -242,6 +242,9 @@ config QUEUED_SPINLOCKS  	def_bool y if ARCH_USE_QUEUED_SPINLOCKS  	depends on SMP +config BPF_ARCH_SPINLOCK +	bool +  config ARCH_USE_QUEUED_RWLOCKS  	bool diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 25632a75d630..c72e0d8e1e65 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -253,8 +253,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,  {  	struct bpf_array *array = container_of(map, struct bpf_array, map);  	u32 index = *(u32 *)key; +	char *val; -	if (unlikely(map_flags > BPF_EXIST)) +	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))  		/* unknown flags */  		return -EINVAL; @@ -262,17 +263,25 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,  		/* all elements were pre-allocated, cannot insert a new one */  		return -E2BIG; -	if (unlikely(map_flags == BPF_NOEXIST)) +	if (unlikely(map_flags & BPF_NOEXIST))  		/* all elements already exist */  		return -EEXIST; -	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) +	if (unlikely((map_flags & BPF_F_LOCK) && +		     !map_value_has_spin_lock(map))) +		return -EINVAL; + +	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {  		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),  		       value, map->value_size); -	else -		memcpy(array->value + -		       array->elem_size * (index & array->index_mask), -		       value, map->value_size); +	} else { +		val = array->value + +			array->elem_size * (index & array->index_mask); +		if (map_flags & BPF_F_LOCK) +			copy_map_value_locked(map, val, value, false); +		else +			copy_map_value(map, val, value); +	}  	return 0;  } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c57bd10340ed..bd3921b1514b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -157,7 +157,7 @@   *   */ -#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE) +#define BITS_PER_U128 (sizeof(u64) * BITS_PER_BYTE * 2)  #define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1)  #define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK)  #define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) @@ -355,6 +355,11 @@ static bool btf_type_is_struct(const struct btf_type *t)  	return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;  } +static bool __btf_type_is_struct(const struct btf_type *t) +{ +	return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; +} +  static bool btf_type_is_array(const struct btf_type *t)  {  	return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; @@ -525,7 +530,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)  /*   * Regular int is not a bit field and it must be either - * u8/u16/u32/u64. + * u8/u16/u32/u64 or __int128.   
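
The arraymap hunk above is the update side of the new BPF_F_LOCK flag: the flag is masked out before the usual BPF_EXIST/BPF_NOEXIST checks, and when it is set the value is copied under the element's embedded bpf_spin_lock via copy_map_value_locked(). A minimal userspace sketch of driving this through the bpf(2) syscall, assuming a map whose BTF-described value embeds a struct bpf_spin_lock and a uapi header that already defines BPF_F_LOCK (added by this series); the matching lookup-side flag handling is in the kernel/bpf/syscall.c hunk further down. Illustrative only, not part of the patch:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sys_bpf(int cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

/* update the element while holding its embedded bpf_spin_lock */
static int update_locked(int map_fd, const void *key, const void *value)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (__u64)(unsigned long)key;
	attr.value  = (__u64)(unsigned long)value;
	attr.flags  = BPF_ANY | BPF_F_LOCK;	/* BPF_F_LOCK is masked off before the flag checks above */

	return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr);
}

/* read a consistent copy of the element; the lock bytes come back zeroed */
static int lookup_locked(int map_fd, const void *key, void *value)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (__u64)(unsigned long)key;
	attr.value  = (__u64)(unsigned long)value;
	attr.flags  = BPF_F_LOCK;

	return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr);
}
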
*/  static bool btf_type_int_is_regular(const struct btf_type *t)  { @@ -538,7 +543,8 @@ static bool btf_type_int_is_regular(const struct btf_type *t)  	if (BITS_PER_BYTE_MASKED(nr_bits) ||  	    BTF_INT_OFFSET(int_data) ||  	    (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && -	     nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { +	     nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) && +	     nr_bytes != (2 * sizeof(u64)))) {  		return false;  	} @@ -1063,9 +1069,9 @@ static int btf_int_check_member(struct btf_verifier_env *env,  	nr_copy_bits = BTF_INT_BITS(int_data) +  		BITS_PER_BYTE_MASKED(struct_bits_off); -	if (nr_copy_bits > BITS_PER_U64) { +	if (nr_copy_bits > BITS_PER_U128) {  		btf_verifier_log_member(env, struct_type, member, -					"nr_copy_bits exceeds 64"); +					"nr_copy_bits exceeds 128");  		return -EINVAL;  	} @@ -1119,9 +1125,9 @@ static int btf_int_check_kflag_member(struct btf_verifier_env *env,  	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);  	nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off); -	if (nr_copy_bits > BITS_PER_U64) { +	if (nr_copy_bits > BITS_PER_U128) {  		btf_verifier_log_member(env, struct_type, member, -					"nr_copy_bits exceeds 64"); +					"nr_copy_bits exceeds 128");  		return -EINVAL;  	} @@ -1168,9 +1174,9 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env,  	nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); -	if (nr_bits > BITS_PER_U64) { +	if (nr_bits > BITS_PER_U128) {  		btf_verifier_log_type(env, t, "nr_bits exceeds %zu", -				      BITS_PER_U64); +				      BITS_PER_U128);  		return -EINVAL;  	} @@ -1211,31 +1217,93 @@ static void btf_int_log(struct btf_verifier_env *env,  			 btf_int_encoding_str(BTF_INT_ENCODING(int_data)));  } +static void btf_int128_print(struct seq_file *m, void *data) +{ +	/* data points to a __int128 number. 
+	 * Suppose +	 *     int128_num = *(__int128 *)data; +	 * The below formulas shows what upper_num and lower_num represents: +	 *     upper_num = int128_num >> 64; +	 *     lower_num = int128_num & 0xffffffffFFFFFFFFULL; +	 */ +	u64 upper_num, lower_num; + +#ifdef __BIG_ENDIAN_BITFIELD +	upper_num = *(u64 *)data; +	lower_num = *(u64 *)(data + 8); +#else +	upper_num = *(u64 *)(data + 8); +	lower_num = *(u64 *)data; +#endif +	if (upper_num == 0) +		seq_printf(m, "0x%llx", lower_num); +	else +		seq_printf(m, "0x%llx%016llx", upper_num, lower_num); +} + +static void btf_int128_shift(u64 *print_num, u16 left_shift_bits, +			     u16 right_shift_bits) +{ +	u64 upper_num, lower_num; + +#ifdef __BIG_ENDIAN_BITFIELD +	upper_num = print_num[0]; +	lower_num = print_num[1]; +#else +	upper_num = print_num[1]; +	lower_num = print_num[0]; +#endif + +	/* shake out un-needed bits by shift/or operations */ +	if (left_shift_bits >= 64) { +		upper_num = lower_num << (left_shift_bits - 64); +		lower_num = 0; +	} else { +		upper_num = (upper_num << left_shift_bits) | +			    (lower_num >> (64 - left_shift_bits)); +		lower_num = lower_num << left_shift_bits; +	} + +	if (right_shift_bits >= 64) { +		lower_num = upper_num >> (right_shift_bits - 64); +		upper_num = 0; +	} else { +		lower_num = (lower_num >> right_shift_bits) | +			    (upper_num << (64 - right_shift_bits)); +		upper_num = upper_num >> right_shift_bits; +	} + +#ifdef __BIG_ENDIAN_BITFIELD +	print_num[0] = upper_num; +	print_num[1] = lower_num; +#else +	print_num[0] = lower_num; +	print_num[1] = upper_num; +#endif +} +  static void btf_bitfield_seq_show(void *data, u8 bits_offset,  				  u8 nr_bits, struct seq_file *m)  {  	u16 left_shift_bits, right_shift_bits;  	u8 nr_copy_bytes;  	u8 nr_copy_bits; -	u64 print_num; +	u64 print_num[2] = {};  	nr_copy_bits = nr_bits + bits_offset;  	nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); -	print_num = 0; -	memcpy(&print_num, data, nr_copy_bytes); +	memcpy(print_num, data, nr_copy_bytes);  #ifdef __BIG_ENDIAN_BITFIELD  	left_shift_bits = bits_offset;  #else -	left_shift_bits = BITS_PER_U64 - nr_copy_bits; +	left_shift_bits = BITS_PER_U128 - nr_copy_bits;  #endif -	right_shift_bits = BITS_PER_U64 - nr_bits; +	right_shift_bits = BITS_PER_U128 - nr_bits; -	print_num <<= left_shift_bits; -	print_num >>= right_shift_bits; - -	seq_printf(m, "0x%llx", print_num); +	btf_int128_shift(print_num, left_shift_bits, right_shift_bits); +	btf_int128_print(m, print_num);  } @@ -1250,7 +1318,7 @@ static void btf_int_bits_seq_show(const struct btf *btf,  	/*  	 * bits_offset is at most 7. -	 * BTF_INT_OFFSET() cannot exceed 64 bits. +	 * BTF_INT_OFFSET() cannot exceed 128 bits.  	 */  	total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);  	data += BITS_ROUNDDOWN_BYTES(total_bits_offset); @@ -1274,6 +1342,9 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,  	}  	switch (nr_bits) { +	case 128: +		btf_int128_print(m, data); +		break;  	case 64:  		if (sign)  			seq_printf(m, "%lld", *(s64 *)data); @@ -1980,6 +2051,43 @@ static void btf_struct_log(struct btf_verifier_env *env,  	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));  } +/* find 'struct bpf_spin_lock' in map value. 
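
btf_bitfield_seq_show() above extracts a bitfield with the classic pair of shifts, now spread across two u64 halves by btf_int128_shift(): shift left until the field's top bit reaches the top of the word, then shift right until its bottom bit lands at bit 0. The plain 64-bit, little-endian form of the trick looks like this (standalone illustration, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* isolate an nr_bits-wide field starting at bit_off of a little-endian word */
static uint64_t extract_bitfield64(uint64_t word, unsigned int bit_off,
				   unsigned int nr_bits)
{
	unsigned int left  = 64 - (bit_off + nr_bits);	/* drop the bits above the field */
	unsigned int right = 64 - nr_bits;		/* drop the bits below it */

	return (word << left) >> right;
}

int main(void)
{
	uint64_t word = 0xaULL << 12;	/* a 4-bit field holding 0xa at bit offset 12 */

	printf("0x%llx\n", (unsigned long long)extract_bitfield64(word, 12, 4));
	return 0;
}

btf_int128_shift() applies the same two shifts to a u64[2] pair, handling shift counts of 64 or more by moving bits across the two halves.
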
+ * return >= 0 offset if found + * and < 0 in case of error + */ +int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) +{ +	const struct btf_member *member; +	u32 i, off = -ENOENT; + +	if (!__btf_type_is_struct(t)) +		return -EINVAL; + +	for_each_member(i, t, member) { +		const struct btf_type *member_type = btf_type_by_id(btf, +								    member->type); +		if (!__btf_type_is_struct(member_type)) +			continue; +		if (member_type->size != sizeof(struct bpf_spin_lock)) +			continue; +		if (strcmp(__btf_name_by_offset(btf, member_type->name_off), +			   "bpf_spin_lock")) +			continue; +		if (off != -ENOENT) +			/* only one 'struct bpf_spin_lock' is allowed */ +			return -E2BIG; +		off = btf_member_bit_offset(t, member); +		if (off % 8) +			/* valid C code cannot generate such BTF */ +			return -EINVAL; +		off /= 8; +		if (off % __alignof__(struct bpf_spin_lock)) +			/* valid struct bpf_spin_lock will be 4 byte aligned */ +			return -EINVAL; +	} +	return off; +} +  static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,  				u32 type_id, void *data, u8 bits_offset,  				struct seq_file *m) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index d17d05570a3f..4e807973aa80 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -230,6 +230,7 @@ cleanup:   * @cgrp: The cgroup which descendants to traverse   * @prog: A program to attach   * @type: Type of attach operation + * @flags: Option flags   *   * Must be called with cgroup_mutex held.   */ @@ -363,7 +364,7 @@ cleanup:   * Must be called with cgroup_mutex held.   */  int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, -			enum bpf_attach_type type, u32 unused_flags) +			enum bpf_attach_type type)  {  	struct list_head *progs = &cgrp->bpf.progs[type];  	enum bpf_cgroup_storage_type stype; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f908b9356025..ff09d32a8a1b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -78,7 +78,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns  	return NULL;  } -struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)  {  	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;  	struct bpf_prog_aux *aux; @@ -104,6 +104,32 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)  	return fp;  } + +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +{ +	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; +	struct bpf_prog *prog; +	int cpu; + +	prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); +	if (!prog) +		return NULL; + +	prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); +	if (!prog->aux->stats) { +		kfree(prog->aux); +		vfree(prog); +		return NULL; +	} + +	for_each_possible_cpu(cpu) { +		struct bpf_prog_stats *pstats; + +		pstats = per_cpu_ptr(prog->aux->stats, cpu); +		u64_stats_init(&pstats->syncp); +	} +	return prog; +}  EXPORT_SYMBOL_GPL(bpf_prog_alloc);  int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) @@ -231,7 +257,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,  void __bpf_prog_free(struct bpf_prog *fp)  { -	kfree(fp->aux); +	if (fp->aux) { +		free_percpu(fp->aux->stats); +		kfree(fp->aux); +	}  	vfree(fp);  } @@ -307,15 +336,16 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)  	return 0;  } -static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 
delta, -				u32 curr, const bool probe_pass) +static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, +				s32 end_new, u32 curr, const bool probe_pass)  {  	const s64 imm_min = S32_MIN, imm_max = S32_MAX; +	s32 delta = end_new - end_old;  	s64 imm = insn->imm; -	if (curr < pos && curr + imm + 1 > pos) +	if (curr < pos && curr + imm + 1 >= end_old)  		imm += delta; -	else if (curr > pos + delta && curr + imm + 1 <= pos + delta) +	else if (curr >= end_new && curr + imm + 1 < end_new)  		imm -= delta;  	if (imm < imm_min || imm > imm_max)  		return -ERANGE; @@ -324,15 +354,16 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta,  	return 0;  } -static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta, -				u32 curr, const bool probe_pass) +static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, +				s32 end_new, u32 curr, const bool probe_pass)  {  	const s32 off_min = S16_MIN, off_max = S16_MAX; +	s32 delta = end_new - end_old;  	s32 off = insn->off; -	if (curr < pos && curr + off + 1 > pos) +	if (curr < pos && curr + off + 1 >= end_old)  		off += delta; -	else if (curr > pos + delta && curr + off + 1 <= pos + delta) +	else if (curr >= end_new && curr + off + 1 < end_new)  		off -= delta;  	if (off < off_min || off > off_max)  		return -ERANGE; @@ -341,10 +372,10 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta,  	return 0;  } -static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, -			    const bool probe_pass) +static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old, +			    s32 end_new, const bool probe_pass)  { -	u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0); +	u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0);  	struct bpf_insn *insn = prog->insnsi;  	int ret = 0; @@ -356,22 +387,23 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,  		 * do any other adjustments. Therefore skip the patchlet.  		 */  		if (probe_pass && i == pos) { -			i += delta + 1; -			insn++; +			i = end_new; +			insn = prog->insnsi + end_old;  		}  		code = insn->code; -		if (BPF_CLASS(code) != BPF_JMP || +		if ((BPF_CLASS(code) != BPF_JMP && +		     BPF_CLASS(code) != BPF_JMP32) ||  		    BPF_OP(code) == BPF_EXIT)  			continue;  		/* Adjust offset of jmps if we cross patch boundaries. */  		if (BPF_OP(code) == BPF_CALL) {  			if (insn->src_reg != BPF_PSEUDO_CALL)  				continue; -			ret = bpf_adj_delta_to_imm(insn, pos, delta, i, -						   probe_pass); +			ret = bpf_adj_delta_to_imm(insn, pos, end_old, +						   end_new, i, probe_pass);  		} else { -			ret = bpf_adj_delta_to_off(insn, pos, delta, i, -						   probe_pass); +			ret = bpf_adj_delta_to_off(insn, pos, end_old, +						   end_new, i, probe_pass);  		}  		if (ret)  			break; @@ -421,7 +453,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,  	 * we afterwards may not fail anymore.  	 */  	if (insn_adj_cnt > cnt_max && -	    bpf_adj_branches(prog, off, insn_delta, true)) +	    bpf_adj_branches(prog, off, off + 1, off + len, true))  		return NULL;  	/* Several new instructions need to be inserted. Make room @@ -453,13 +485,25 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,  	 * the ship has sailed to reverse to the original state. An  	 * overflow cannot happen at this point.  	 
*/ -	BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false)); +	BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false));  	bpf_adj_linfo(prog_adj, off, insn_delta);  	return prog_adj;  } +int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) +{ +	/* Branch offsets can't overflow when program is shrinking, no need +	 * to call bpf_adj_branches(..., true) here +	 */ +	memmove(prog->insnsi + off, prog->insnsi + off + cnt, +		sizeof(struct bpf_insn) * (prog->len - off - cnt)); +	prog->len -= cnt; + +	return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false)); +} +  void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)  {  	int i; @@ -495,7 +539,7 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,  	*symbol_end   = addr + hdr->pages * PAGE_SIZE;  } -static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)  {  	const char *end = sym + KSYM_NAME_LEN;  	const struct btf_type *type; @@ -934,6 +978,27 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,  		*to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);  		break; +	case BPF_JMP32 | BPF_JEQ  | BPF_K: +	case BPF_JMP32 | BPF_JNE  | BPF_K: +	case BPF_JMP32 | BPF_JGT  | BPF_K: +	case BPF_JMP32 | BPF_JLT  | BPF_K: +	case BPF_JMP32 | BPF_JGE  | BPF_K: +	case BPF_JMP32 | BPF_JLE  | BPF_K: +	case BPF_JMP32 | BPF_JSGT | BPF_K: +	case BPF_JMP32 | BPF_JSLT | BPF_K: +	case BPF_JMP32 | BPF_JSGE | BPF_K: +	case BPF_JMP32 | BPF_JSLE | BPF_K: +	case BPF_JMP32 | BPF_JSET | BPF_K: +		/* Accommodate for extra offset in case of a backjump. */ +		off = from->off; +		if (off < 0) +			off -= 2; +		*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); +		*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); +		*to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX, +				      off); +		break; +  	case BPF_LD | BPF_IMM | BPF_DW:  		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);  		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); @@ -1130,6 +1195,31 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);  	INSN_2(JMP, CALL),			\  	/* Exit instruction. */			\  	INSN_2(JMP, EXIT),			\ +	/* 32-bit Jump instructions. */		\ +	/*   Register based. */			\ +	INSN_3(JMP32, JEQ,  X),			\ +	INSN_3(JMP32, JNE,  X),			\ +	INSN_3(JMP32, JGT,  X),			\ +	INSN_3(JMP32, JLT,  X),			\ +	INSN_3(JMP32, JGE,  X),			\ +	INSN_3(JMP32, JLE,  X),			\ +	INSN_3(JMP32, JSGT, X),			\ +	INSN_3(JMP32, JSLT, X),			\ +	INSN_3(JMP32, JSGE, X),			\ +	INSN_3(JMP32, JSLE, X),			\ +	INSN_3(JMP32, JSET, X),			\ +	/*   Immediate based. */		\ +	INSN_3(JMP32, JEQ,  K),			\ +	INSN_3(JMP32, JNE,  K),			\ +	INSN_3(JMP32, JGT,  K),			\ +	INSN_3(JMP32, JLT,  K),			\ +	INSN_3(JMP32, JGE,  K),			\ +	INSN_3(JMP32, JLE,  K),			\ +	INSN_3(JMP32, JSGT, K),			\ +	INSN_3(JMP32, JSLT, K),			\ +	INSN_3(JMP32, JSGE, K),			\ +	INSN_3(JMP32, JSLE, K),			\ +	INSN_3(JMP32, JSET, K),			\  	/* Jump instructions. */		\  	/*   Register based. */			\  	INSN_3(JMP, JEQ,  X),			\ @@ -1202,8 +1292,9 @@ bool bpf_opcode_in_insntable(u8 code)  #ifndef CONFIG_BPF_JIT_ALWAYS_ON  /**   *	__bpf_prog_run - run eBPF program on a given context - *	@ctx: is the data we are operating on + *	@regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers   *	@insn: is the array of eBPF instructions + *	@stack: is the eBPF storage stack   *   * Decode and execute eBPF instructions.   
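
The core.c changes above wire a whole new BPF_JMP32 instruction class into constant blinding, the opcode table, and (just below) the interpreter's COND_JMP macro. The only semantic difference from BPF_JMP is that the comparison uses the low 32 bits of the operands, as u32 or s32 depending on the opcode. A standalone illustration (not part of the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t dst = 0xffffffff00000001ULL;
	uint64_t imm = 5;

	/* BPF_JMP | BPF_JGT | BPF_K: full 64-bit unsigned compare -> taken */
	printf("jmp   jgt: %d\n", dst > imm);
	/* BPF_JMP32 | BPF_JGT | BPF_K: low 32 bits only, 1 > 5 -> not taken */
	printf("jmp32 jgt: %d\n", (uint32_t)dst > (uint32_t)imm);
	return 0;
}
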
*/ @@ -1390,145 +1481,49 @@ select_insn:  out:  		CONT;  	} -	/* JMP */  	JMP_JA:  		insn += insn->off;  		CONT; -	JMP_JEQ_X: -		if (DST == SRC) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JEQ_K: -		if (DST == IMM) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JNE_X: -		if (DST != SRC) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JNE_K: -		if (DST != IMM) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JGT_X: -		if (DST > SRC) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JGT_K: -		if (DST > IMM) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JLT_X: -		if (DST < SRC) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JLT_K: -		if (DST < IMM) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JGE_X: -		if (DST >= SRC) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JGE_K: -		if (DST >= IMM) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JLE_X: -		if (DST <= SRC) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JLE_K: -		if (DST <= IMM) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JSGT_X: -		if (((s64) DST) > ((s64) SRC)) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JSGT_K: -		if (((s64) DST) > ((s64) IMM)) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JSLT_X: -		if (((s64) DST) < ((s64) SRC)) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JSLT_K: -		if (((s64) DST) < ((s64) IMM)) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JSGE_X: -		if (((s64) DST) >= ((s64) SRC)) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JSGE_K: -		if (((s64) DST) >= ((s64) IMM)) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JSLE_X: -		if (((s64) DST) <= ((s64) SRC)) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JSLE_K: -		if (((s64) DST) <= ((s64) IMM)) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JSET_X: -		if (DST & SRC) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT; -	JMP_JSET_K: -		if (DST & IMM) { -			insn += insn->off; -			CONT_JMP; -		} -		CONT;  	JMP_EXIT:  		return BPF_R0; - +	/* JMP */ +#define COND_JMP(SIGN, OPCODE, CMP_OP)				\ +	JMP_##OPCODE##_X:					\ +		if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) {	\ +			insn += insn->off;			\ +			CONT_JMP;				\ +		}						\ +		CONT;						\ +	JMP32_##OPCODE##_X:					\ +		if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) {	\ +			insn += insn->off;			\ +			CONT_JMP;				\ +		}						\ +		CONT;						\ +	JMP_##OPCODE##_K:					\ +		if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) {	\ +			insn += insn->off;			\ +			CONT_JMP;				\ +		}						\ +		CONT;						\ +	JMP32_##OPCODE##_K:					\ +		if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) {	\ +			insn += insn->off;			\ +			CONT_JMP;				\ +		}						\ +		CONT; +	COND_JMP(u, JEQ, ==) +	COND_JMP(u, JNE, !=) +	COND_JMP(u, JGT, >) +	COND_JMP(u, JLT, <) +	COND_JMP(u, JGE, >=) +	COND_JMP(u, JLE, <=) +	COND_JMP(u, JSET, &) +	COND_JMP(s, JSGT, >) +	COND_JMP(s, JSLT, <) +	COND_JMP(s, JSGE, >=) +	COND_JMP(s, JSLE, <=) +#undef COND_JMP  	/* STX and ST and LDX*/  #define LDST(SIZEOP, SIZE)						\  	STX_MEM_##SIZEOP:						\ @@ -2036,6 +2031,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;  const struct bpf_func_proto bpf_map_push_elem_proto __weak;  const struct bpf_func_proto bpf_map_pop_elem_proto __weak;  const struct bpf_func_proto bpf_map_peek_elem_proto __weak; +const struct bpf_func_proto bpf_spin_lock_proto __weak; +const struct bpf_func_proto bpf_spin_unlock_proto __weak;  const struct 
bpf_func_proto bpf_get_prandom_u32_proto __weak;  const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; @@ -2101,6 +2098,10 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,  	return -EFAULT;  } +DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); +EXPORT_SYMBOL(bpf_stats_enabled_key); +int sysctl_bpf_stats_enabled __read_mostly; +  /* All definitions of tracepoints related to BPF. */  #define CREATE_TRACE_POINTS  #include <linux/bpf_trace.h> diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index d6b76377cb6e..de73f55e42fd 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -67,7 +67,7 @@ const char *const bpf_class_string[8] = {  	[BPF_STX]   = "stx",  	[BPF_ALU]   = "alu",  	[BPF_JMP]   = "jmp", -	[BPF_RET]   = "BUG", +	[BPF_JMP32] = "jmp32",  	[BPF_ALU64] = "alu64",  }; @@ -136,23 +136,22 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,  			else  				print_bpf_end_insn(verbose, cbs->private_data, insn);  		} else if (BPF_OP(insn->code) == BPF_NEG) { -			verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n", -				insn->code, insn->dst_reg, -				class == BPF_ALU ? "(u32) " : "", +			verbose(cbs->private_data, "(%02x) %c%d = -%c%d\n", +				insn->code, class == BPF_ALU ? 'w' : 'r', +				insn->dst_reg, class == BPF_ALU ? 'w' : 'r',  				insn->dst_reg);  		} else if (BPF_SRC(insn->code) == BPF_X) { -			verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n", -				insn->code, class == BPF_ALU ? "(u32) " : "", +			verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n", +				insn->code, class == BPF_ALU ? 'w' : 'r',  				insn->dst_reg,  				bpf_alu_string[BPF_OP(insn->code) >> 4], -				class == BPF_ALU ? "(u32) " : "", +				class == BPF_ALU ? 'w' : 'r',  				insn->src_reg);  		} else { -			verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n", -				insn->code, class == BPF_ALU ? "(u32) " : "", +			verbose(cbs->private_data, "(%02x) %c%d %s %d\n", +				insn->code, class == BPF_ALU ? 'w' : 'r',  				insn->dst_reg,  				bpf_alu_string[BPF_OP(insn->code) >> 4], -				class == BPF_ALU ? "(u32) " : "",  				insn->imm);  		}  	} else if (class == BPF_STX) { @@ -220,7 +219,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,  			verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code);  			return;  		} -	} else if (class == BPF_JMP) { +	} else if (class == BPF_JMP32 || class == BPF_JMP) {  		u8 opcode = BPF_OP(insn->code);  		if (opcode == BPF_CALL) { @@ -244,13 +243,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,  		} else if (insn->code == (BPF_JMP | BPF_EXIT)) {  			verbose(cbs->private_data, "(%02x) exit\n", insn->code);  		} else if (BPF_SRC(insn->code) == BPF_X) { -			verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n", -				insn->code, insn->dst_reg, +			verbose(cbs->private_data, +				"(%02x) if %c%d %s %c%d goto pc%+d\n", +				insn->code, class == BPF_JMP32 ? 'w' : 'r', +				insn->dst_reg,  				bpf_jmp_string[BPF_OP(insn->code) >> 4], +				class == BPF_JMP32 ? 'w' : 'r',  				insn->src_reg, insn->off);  		} else { -			verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n", -				insn->code, insn->dst_reg, +			verbose(cbs->private_data, +				"(%02x) if %c%d %s 0x%x goto pc%+d\n", +				insn->code, class == BPF_JMP32 ? 
'w' : 'r', +				insn->dst_reg,  				bpf_jmp_string[BPF_OP(insn->code) >> 4],  				insn->imm, insn->off);  		} diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index f9274114c88d..fed15cf94dca 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -718,21 +718,12 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)  	       BITS_PER_LONG == 64;  } -static u32 htab_size_value(const struct bpf_htab *htab, bool percpu) -{ -	u32 size = htab->map.value_size; - -	if (percpu || fd_htab_map_needs_adjust(htab)) -		size = round_up(size, 8); -	return size; -} -  static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,  					 void *value, u32 key_size, u32 hash,  					 bool percpu, bool onallcpus,  					 struct htab_elem *old_elem)  { -	u32 size = htab_size_value(htab, percpu); +	u32 size = htab->map.value_size;  	bool prealloc = htab_is_prealloc(htab);  	struct htab_elem *l_new, **pl_new;  	void __percpu *pptr; @@ -770,10 +761,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,  			l_new = ERR_PTR(-ENOMEM);  			goto dec_count;  		} +		check_and_init_map_lock(&htab->map, +					l_new->key + round_up(key_size, 8));  	}  	memcpy(l_new->key, key, key_size);  	if (percpu) { +		size = round_up(size, 8);  		if (prealloc) {  			pptr = htab_elem_get_ptr(l_new, key_size);  		} else { @@ -791,8 +785,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,  		if (!prealloc)  			htab_elem_set_ptr(l_new, key_size, pptr); -	} else { +	} else if (fd_htab_map_needs_adjust(htab)) { +		size = round_up(size, 8);  		memcpy(l_new->key + round_up(key_size, 8), value, size); +	} else { +		copy_map_value(&htab->map, +			       l_new->key + round_up(key_size, 8), +			       value);  	}  	l_new->hash = hash; @@ -805,11 +804,11 @@ dec_count:  static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,  		       u64 map_flags)  { -	if (l_old && map_flags == BPF_NOEXIST) +	if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)  		/* elem already exists */  		return -EEXIST; -	if (!l_old && map_flags == BPF_EXIST) +	if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)  		/* elem doesn't exist, cannot update it */  		return -ENOENT; @@ -828,7 +827,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,  	u32 key_size, hash;  	int ret; -	if (unlikely(map_flags > BPF_EXIST)) +	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))  		/* unknown flags */  		return -EINVAL; @@ -841,6 +840,28 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,  	b = __select_bucket(htab, hash);  	head = &b->head; +	if (unlikely(map_flags & BPF_F_LOCK)) { +		if (unlikely(!map_value_has_spin_lock(map))) +			return -EINVAL; +		/* find an element without taking the bucket lock */ +		l_old = lookup_nulls_elem_raw(head, hash, key, key_size, +					      htab->n_buckets); +		ret = check_flags(htab, l_old, map_flags); +		if (ret) +			return ret; +		if (l_old) { +			/* grab the element lock and update value in place */ +			copy_map_value_locked(map, +					      l_old->key + round_up(key_size, 8), +					      value, false); +			return 0; +		} +		/* fall through, grab the bucket lock and lookup again. +		 * 99.9% chance that the element won't be found, +		 * but second lookup under lock has to be done. 
+		 */ +	} +  	/* bpf_map_update_elem() can be called in_irq() */  	raw_spin_lock_irqsave(&b->lock, flags); @@ -850,6 +871,20 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,  	if (ret)  		goto err; +	if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { +		/* first lookup without the bucket lock didn't find the element, +		 * but second lookup with the bucket lock found it. +		 * This case is highly unlikely, but has to be dealt with: +		 * grab the element lock in addition to the bucket lock +		 * and update element in place +		 */ +		copy_map_value_locked(map, +				      l_old->key + round_up(key_size, 8), +				      value, false); +		ret = 0; +		goto err; +	} +  	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,  				l_old);  	if (IS_ERR(l_new)) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a74972b07e74..a411fc17d265 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -221,6 +221,102 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {  	.arg2_type	= ARG_CONST_SIZE,  }; +#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK) + +static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) +{ +	arch_spinlock_t *l = (void *)lock; +	union { +		__u32 val; +		arch_spinlock_t lock; +	} u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED }; + +	compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0"); +	BUILD_BUG_ON(sizeof(*l) != sizeof(__u32)); +	BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32)); +	arch_spin_lock(l); +} + +static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) +{ +	arch_spinlock_t *l = (void *)lock; + +	arch_spin_unlock(l); +} + +#else + +static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) +{ +	atomic_t *l = (void *)lock; + +	BUILD_BUG_ON(sizeof(*l) != sizeof(*lock)); +	do { +		atomic_cond_read_relaxed(l, !VAL); +	} while (atomic_xchg(l, 1)); +} + +static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) +{ +	atomic_t *l = (void *)lock; + +	atomic_set_release(l, 0); +} + +#endif + +static DEFINE_PER_CPU(unsigned long, irqsave_flags); + +notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) +{ +	unsigned long flags; + +	local_irq_save(flags); +	__bpf_spin_lock(lock); +	__this_cpu_write(irqsave_flags, flags); +	return 0; +} + +const struct bpf_func_proto bpf_spin_lock_proto = { +	.func		= bpf_spin_lock, +	.gpl_only	= false, +	.ret_type	= RET_VOID, +	.arg1_type	= ARG_PTR_TO_SPIN_LOCK, +}; + +notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) +{ +	unsigned long flags; + +	flags = __this_cpu_read(irqsave_flags); +	__bpf_spin_unlock(lock); +	local_irq_restore(flags); +	return 0; +} + +const struct bpf_func_proto bpf_spin_unlock_proto = { +	.func		= bpf_spin_unlock, +	.gpl_only	= false, +	.ret_type	= RET_VOID, +	.arg1_type	= ARG_PTR_TO_SPIN_LOCK, +}; + +void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, +			   bool lock_src) +{ +	struct bpf_spin_lock *lock; + +	if (lock_src) +		lock = src + map->spin_lock_off; +	else +		lock = dst + map->spin_lock_off; +	preempt_disable(); +	____bpf_spin_lock(lock); +	copy_map_value(map, dst, src); +	____bpf_spin_unlock(lock); +	preempt_enable(); +} +  #ifdef CONFIG_CGROUPS  BPF_CALL_0(bpf_get_current_cgroup_id)  { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 07a34ef562a0..6b572e2de7fb 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -131,7 +131,14 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, 
 	struct bpf_cgroup_storage *storage;  	struct bpf_storage_buffer *new; -	if (flags != BPF_ANY && flags != BPF_EXIST) +	if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST))) +		return -EINVAL; + +	if (unlikely(flags & BPF_NOEXIST)) +		return -EINVAL; + +	if (unlikely((flags & BPF_F_LOCK) && +		     !map_value_has_spin_lock(map)))  		return -EINVAL;  	storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, @@ -139,6 +146,11 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,  	if (!storage)  		return -ENOENT; +	if (flags & BPF_F_LOCK) { +		copy_map_value_locked(map, storage->buf->data, value, false); +		return 0; +	} +  	new = kmalloc_node(sizeof(struct bpf_storage_buffer) +  			   map->value_size,  			   __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, @@ -147,6 +159,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,  		return -ENOMEM;  	memcpy(&new->data[0], value, map->value_size); +	check_and_init_map_lock(map, new->data);  	new = xchg(&storage->buf, new);  	kfree_rcu(new, rcu); @@ -483,6 +496,7 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,  		storage->buf = kmalloc_node(size, flags, map->numa_node);  		if (!storage->buf)  			goto enomem; +		check_and_init_map_lock(map, storage->buf->data);  	} else {  		storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);  		if (!storage->percpu_buf) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index abf1002080df..93a5cbbde421 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -471,6 +471,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)  	}  	if (!node || node->prefixlen != key->prefixlen || +	    node->prefixlen != matchlen ||  	    (node->flags & LPM_TREE_NODE_FLAG_IM)) {  		ret = -ENOENT;  		goto out; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 52378d3e34b3..3dff41403583 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -37,6 +37,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)  		return ERR_PTR(-EINVAL);  	} +	if (map_value_has_spin_lock(inner_map)) { +		fdput(f); +		return ERR_PTR(-ENOTSUPP); +	} +  	inner_map_meta_size = sizeof(*inner_map_meta);  	/* In some cases verifier needs to access beyond just base map. */  	if (inner_map->ops == &array_map_ops) @@ -53,6 +58,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)  	inner_map_meta->value_size = inner_map->value_size;  	inner_map_meta->map_flags = inner_map->map_flags;  	inner_map_meta->max_entries = inner_map->max_entries; +	inner_map_meta->spin_lock_off = inner_map->spin_lock_off;  	/* Misc members not needed in bpf_map_meta_equal() check. 
*/  	inner_map_meta->ops = inner_map->ops; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 54cf2b9c44a4..ba635209ae9a 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -35,6 +35,7 @@ static DECLARE_RWSEM(bpf_devs_lock);  struct bpf_offload_dev {  	const struct bpf_prog_offload_ops *ops;  	struct list_head netdevs; +	void *priv;  };  struct bpf_offload_netdev { @@ -173,6 +174,41 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env)  	return ret;  } +void +bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, +			      struct bpf_insn *insn) +{ +	const struct bpf_prog_offload_ops *ops; +	struct bpf_prog_offload *offload; +	int ret = -EOPNOTSUPP; + +	down_read(&bpf_devs_lock); +	offload = env->prog->aux->offload; +	if (offload) { +		ops = offload->offdev->ops; +		if (!offload->opt_failed && ops->replace_insn) +			ret = ops->replace_insn(env, off, insn); +		offload->opt_failed |= ret; +	} +	up_read(&bpf_devs_lock); +} + +void +bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) +{ +	struct bpf_prog_offload *offload; +	int ret = -EOPNOTSUPP; + +	down_read(&bpf_devs_lock); +	offload = env->prog->aux->offload; +	if (offload) { +		if (!offload->opt_failed && offload->offdev->ops->remove_insns) +			ret = offload->offdev->ops->remove_insns(env, off, cnt); +		offload->opt_failed |= ret; +	} +	up_read(&bpf_devs_lock); +} +  static void __bpf_prog_offload_destroy(struct bpf_prog *prog)  {  	struct bpf_prog_offload *offload = prog->aux->offload; @@ -634,7 +670,7 @@ unlock:  EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);  struct bpf_offload_dev * -bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops) +bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv)  {  	struct bpf_offload_dev *offdev;  	int err; @@ -653,6 +689,7 @@ bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops)  		return ERR_PTR(-ENOMEM);  	offdev->ops = ops; +	offdev->priv = priv;  	INIT_LIST_HEAD(&offdev->netdevs);  	return offdev; @@ -665,3 +702,9 @@ void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev)  	kfree(offdev);  }  EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy); + +void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev) +{ +	return offdev->priv; +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_priv); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index d43b14535827..950ab2f28922 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -44,7 +44,7 @@ static void do_up_read(struct irq_work *entry)  	struct stack_map_irq_work *work;  	work = container_of(entry, struct stack_map_irq_work, irq_work); -	up_read(work->sem); +	up_read_non_owner(work->sem);  	work->sem = NULL;  } @@ -338,6 +338,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,  	} else {  		work->sem = ¤t->mm->mmap_sem;  		irq_work_queue(&work->irq_work); +		/* +		 * The irq_work will release the mmap_sem with +		 * up_read_non_owner(). The rwsem_release() is called +		 * here to release the lock from lockdep's perspective. 
+		 */ +		rwsem_release(¤t->mm->mmap_sem.dep_map, 1, _RET_IP_);  	}  } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8577bb7f8be6..62f6bced3a3c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -463,7 +463,7 @@ int map_check_no_btf(const struct bpf_map *map,  	return -ENOTSUPP;  } -static int map_check_btf(const struct bpf_map *map, const struct btf *btf, +static int map_check_btf(struct bpf_map *map, const struct btf *btf,  			 u32 btf_key_id, u32 btf_value_id)  {  	const struct btf_type *key_type, *value_type; @@ -478,6 +478,22 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf,  	if (!value_type || value_size != map->value_size)  		return -EINVAL; +	map->spin_lock_off = btf_find_spin_lock(btf, value_type); + +	if (map_value_has_spin_lock(map)) { +		if (map->map_type != BPF_MAP_TYPE_HASH && +		    map->map_type != BPF_MAP_TYPE_ARRAY && +		    map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) +			return -ENOTSUPP; +		if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > +		    map->value_size) { +			WARN_ONCE(1, +				  "verifier bug spin_lock_off %d value_size %d\n", +				  map->spin_lock_off, map->value_size); +			return -EFAULT; +		} +	} +  	if (map->ops->map_check_btf)  		ret = map->ops->map_check_btf(map, btf, key_type, value_type); @@ -542,6 +558,8 @@ static int map_create(union bpf_attr *attr)  		map->btf = btf;  		map->btf_key_type_id = attr->btf_key_type_id;  		map->btf_value_type_id = attr->btf_value_type_id; +	} else { +		map->spin_lock_off = -EINVAL;  	}  	err = security_bpf_map_alloc(map); @@ -559,12 +577,12 @@ static int map_create(union bpf_attr *attr)  	err = bpf_map_new_fd(map, f_flags);  	if (err < 0) {  		/* failed to allocate fd. -		 * bpf_map_put() is needed because the above +		 * bpf_map_put_with_uref() is needed because the above  		 * bpf_map_alloc_id() has published the map  		 * to the userspace and the userspace may  		 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.  		 
*/ -		bpf_map_put(map); +		bpf_map_put_with_uref(map);  		return err;  	} @@ -664,7 +682,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size)  }  /* last field in 'union bpf_attr' used by this command */ -#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags  static int map_lookup_elem(union bpf_attr *attr)  { @@ -680,6 +698,9 @@ static int map_lookup_elem(union bpf_attr *attr)  	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))  		return -EINVAL; +	if (attr->flags & ~BPF_F_LOCK) +		return -EINVAL; +  	f = fdget(ufd);  	map = __bpf_map_get(f);  	if (IS_ERR(map)) @@ -690,6 +711,12 @@ static int map_lookup_elem(union bpf_attr *attr)  		goto err_put;  	} +	if ((attr->flags & BPF_F_LOCK) && +	    !map_value_has_spin_lock(map)) { +		err = -EINVAL; +		goto err_put; +	} +  	key = __bpf_copy_key(ukey, map->key_size);  	if (IS_ERR(key)) {  		err = PTR_ERR(key); @@ -745,7 +772,13 @@ static int map_lookup_elem(union bpf_attr *attr)  			err = -ENOENT;  		} else {  			err = 0; -			memcpy(value, ptr, value_size); +			if (attr->flags & BPF_F_LOCK) +				/* lock 'ptr' and copy everything but lock */ +				copy_map_value_locked(map, value, ptr, true); +			else +				copy_map_value(map, value, ptr); +			/* mask lock, since value wasn't zero inited */ +			check_and_init_map_lock(map, value);  		}  		rcu_read_unlock();  	} @@ -808,6 +841,12 @@ static int map_update_elem(union bpf_attr *attr)  		goto err_put;  	} +	if ((attr->flags & BPF_F_LOCK) && +	    !map_value_has_spin_lock(map)) { +		err = -EINVAL; +		goto err_put; +	} +  	key = __bpf_copy_key(ukey, map->key_size);  	if (IS_ERR(key)) {  		err = PTR_ERR(key); @@ -1219,6 +1258,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)  static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)  {  	if (atomic_dec_and_test(&prog->aux->refcnt)) { +		perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);  		/* bpf_prog_free_id() must be called first */  		bpf_prog_free_id(prog, do_idr_lock);  		bpf_prog_kallsyms_del_all(prog); @@ -1244,24 +1284,54 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)  	return 0;  } +static void bpf_prog_get_stats(const struct bpf_prog *prog, +			       struct bpf_prog_stats *stats) +{ +	u64 nsecs = 0, cnt = 0; +	int cpu; + +	for_each_possible_cpu(cpu) { +		const struct bpf_prog_stats *st; +		unsigned int start; +		u64 tnsecs, tcnt; + +		st = per_cpu_ptr(prog->aux->stats, cpu); +		do { +			start = u64_stats_fetch_begin_irq(&st->syncp); +			tnsecs = st->nsecs; +			tcnt = st->cnt; +		} while (u64_stats_fetch_retry_irq(&st->syncp, start)); +		nsecs += tnsecs; +		cnt += tcnt; +	} +	stats->nsecs = nsecs; +	stats->cnt = cnt; +} +  #ifdef CONFIG_PROC_FS  static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)  {  	const struct bpf_prog *prog = filp->private_data;  	char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; +	struct bpf_prog_stats stats; +	bpf_prog_get_stats(prog, &stats);  	bin2hex(prog_tag, prog->tag, sizeof(prog->tag));  	seq_printf(m,  		   "prog_type:\t%u\n"  		   "prog_jited:\t%u\n"  		   "prog_tag:\t%s\n"  		   "memlock:\t%llu\n" -		   "prog_id:\t%u\n", +		   "prog_id:\t%u\n" +		   "run_time_ns:\t%llu\n" +		   "run_cnt:\t%llu\n",  		   prog->type,  		   prog->jited,  		   prog_tag,  		   prog->pages * 1ULL << PAGE_SHIFT, -		   prog->aux->id); +		   prog->aux->id, +		   stats.nsecs, +		   stats.cnt);  }  #endif @@ -1562,6 +1632,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)  	}  	bpf_prog_kallsyms_add(prog); +	
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);  	return err;  free_used_maps: @@ -1986,7 +2057,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)  	fd = bpf_map_new_fd(map, f_flags);  	if (fd < 0) -		bpf_map_put(map); +		bpf_map_put_with_uref(map);  	return fd;  } @@ -2083,6 +2154,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,  	struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);  	struct bpf_prog_info info = {};  	u32 info_len = attr->info.info_len; +	struct bpf_prog_stats stats;  	char __user *uinsns;  	u32 ulen;  	int err; @@ -2122,6 +2194,10 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,  	if (err)  		return err; +	bpf_prog_get_stats(prog, &stats); +	info.run_time_ns = stats.nsecs; +	info.run_cnt = stats.cnt; +  	if (!capable(CAP_SYS_ADMIN)) {  		info.jited_prog_len = 0;  		info.xlated_prog_len = 0; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 56674a7c3778..a7b96bf0e654 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -213,6 +213,7 @@ struct bpf_call_arg_meta {  	s64 msize_smax_value;  	u64 msize_umax_value;  	int ptr_id; +	int func_id;  };  static DEFINE_MUTEX(bpf_verifier_lock); @@ -330,10 +331,19 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type)  	       type == PTR_TO_PACKET_META;  } +static bool type_is_sk_pointer(enum bpf_reg_type type) +{ +	return type == PTR_TO_SOCKET || +		type == PTR_TO_SOCK_COMMON || +		type == PTR_TO_TCP_SOCK; +} +  static bool reg_type_may_be_null(enum bpf_reg_type type)  {  	return type == PTR_TO_MAP_VALUE_OR_NULL || -	       type == PTR_TO_SOCKET_OR_NULL; +	       type == PTR_TO_SOCKET_OR_NULL || +	       type == PTR_TO_SOCK_COMMON_OR_NULL || +	       type == PTR_TO_TCP_SOCK_OR_NULL;  }  static bool type_is_refcounted(enum bpf_reg_type type) @@ -351,6 +361,12 @@ static bool reg_is_refcounted(const struct bpf_reg_state *reg)  	return type_is_refcounted(reg->type);  } +static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) +{ +	return reg->type == PTR_TO_MAP_VALUE && +		map_value_has_spin_lock(reg->map_ptr); +} +  static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg)  {  	return type_is_refcounted_or_null(reg->type); @@ -370,6 +386,12 @@ static bool is_release_function(enum bpf_func_id func_id)  	return func_id == BPF_FUNC_sk_release;  } +static bool is_acquire_function(enum bpf_func_id func_id) +{ +	return func_id == BPF_FUNC_sk_lookup_tcp || +		func_id == BPF_FUNC_sk_lookup_udp; +} +  /* string representation of 'enum bpf_reg_type' */  static const char * const reg_type_str[] = {  	[NOT_INIT]		= "?", @@ -385,6 +407,10 @@ static const char * const reg_type_str[] = {  	[PTR_TO_FLOW_KEYS]	= "flow_keys",  	[PTR_TO_SOCKET]		= "sock",  	[PTR_TO_SOCKET_OR_NULL] = "sock_or_null", +	[PTR_TO_SOCK_COMMON]	= "sock_common", +	[PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", +	[PTR_TO_TCP_SOCK]	= "tcp_sock", +	[PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",  };  static char slot_type_char[] = { @@ -611,13 +637,10 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)  }  /* release function corresponding to acquire_reference_state(). Idempotent. 
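
With the per-cpu counters added in kernel/bpf/core.c and aggregated by bpf_prog_get_stats() above, every program now reports run_time_ns and run_cnt both in /proc/<pid>/fdinfo/<fd> and through BPF_OBJ_GET_INFO_BY_FD. A userspace sketch of reading them, assuming a libbpf that provides bpf_obj_get_info_by_fd() and a kernel with the new bpf_stats_enabled sysctl turned on; illustrative only, not part of the patch:

#include <bpf/bpf.h>
#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>

static void print_prog_stats(int prog_fd)
{
	struct bpf_prog_info info;
	__u32 len = sizeof(info);

	memset(&info, 0, sizeof(info));
	if (bpf_obj_get_info_by_fd(prog_fd, &info, &len))
		return;

	/* the same numbers appear in /proc/<pid>/fdinfo/<fd> as
	 * run_time_ns: and run_cnt:
	 */
	printf("prog id %u ran %llu times for %llu ns total\n",
	       info.id, (unsigned long long)info.run_cnt,
	       (unsigned long long)info.run_time_ns);
}
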
*/ -static int __release_reference_state(struct bpf_func_state *state, int ptr_id) +static int release_reference_state(struct bpf_func_state *state, int ptr_id)  {  	int i, last_idx; -	if (!ptr_id) -		return -EFAULT; -  	last_idx = state->acquired_refs - 1;  	for (i = 0; i < state->acquired_refs; i++) {  		if (state->refs[i].id == ptr_id) { @@ -629,21 +652,7 @@ static int __release_reference_state(struct bpf_func_state *state, int ptr_id)  			return 0;  		}  	} -	return -EFAULT; -} - -/* variation on the above for cases where we expect that there must be an - * outstanding reference for the specified ptr_id. - */ -static int release_reference_state(struct bpf_verifier_env *env, int ptr_id) -{ -	struct bpf_func_state *state = cur_func(env); -	int err; - -	err = __release_reference_state(state, ptr_id); -	if (WARN_ON_ONCE(err != 0)) -		verbose(env, "verifier internal error: can't release reference\n"); -	return err; +	return -EINVAL;  }  static int transfer_reference_state(struct bpf_func_state *dst, @@ -712,6 +721,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,  	}  	dst_state->speculative = src->speculative;  	dst_state->curframe = src->curframe; +	dst_state->active_spin_lock = src->active_spin_lock;  	for (i = 0; i <= src->curframe; i++) {  		dst = dst_state->frame[i];  		if (!dst) { @@ -1095,7 +1105,7 @@ static int check_subprogs(struct bpf_verifier_env *env)  	for (i = 0; i < insn_cnt; i++) {  		u8 code = insn[i].code; -		if (BPF_CLASS(code) != BPF_JMP) +		if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)  			goto next;  		if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)  			goto next; @@ -1201,6 +1211,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type)  	case CONST_PTR_TO_MAP:  	case PTR_TO_SOCKET:  	case PTR_TO_SOCKET_OR_NULL: +	case PTR_TO_SOCK_COMMON: +	case PTR_TO_SOCK_COMMON_OR_NULL: +	case PTR_TO_TCP_SOCK: +	case PTR_TO_TCP_SOCK_OR_NULL:  		return true;  	default:  		return false; @@ -1483,6 +1497,21 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,  	if (err)  		verbose(env, "R%d max value is outside of the array range\n",  			regno); + +	if (map_value_has_spin_lock(reg->map_ptr)) { +		u32 lock = reg->map_ptr->spin_lock_off; + +		/* if any part of struct bpf_spin_lock can be touched by +		 * load/store reject this program. +		 * To check that [x1, x2) overlaps with [y1, y2) +		 * it is sufficient to check x1 < y2 && y1 < x2. 
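
The overlap test quoted above is the standard half-open interval check: [x1, x2) and [y1, y2) intersect exactly when x1 < y2 && y1 < x2. Applied here, the access [smin + off, umax + off + size) is rejected as soon as it touches any byte of the 4-byte bpf_spin_lock. A worked standalone example (not part of the patch):

#include <stdbool.h>
#include <stdio.h>

static bool ranges_overlap(long long x1, long long x2,
			   long long y1, long long y2)
{
	return x1 < y2 && y1 < x2;
}

int main(void)
{
	long long lock = 8;	/* the lock occupies bytes [8, 12) of the value */

	/* 4-byte load at offset 4: [4, 8) does not touch the lock -> allowed */
	printf("%d\n", ranges_overlap(4, 8, lock, lock + 4));
	/* 8-byte load at offset 4: [4, 12) overlaps the lock -> rejected */
	printf("%d\n", ranges_overlap(4, 12, lock, lock + 4));
	return 0;
}
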
+		 */ +		if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) && +		     lock < reg->umax_value + off + size) { +			verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n"); +			return -EACCES; +		} +	}  	return err;  } @@ -1617,12 +1646,14 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off,  	return 0;  } -static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, -			     int size, enum bpf_access_type t) +static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, +			     u32 regno, int off, int size, +			     enum bpf_access_type t)  {  	struct bpf_reg_state *regs = cur_regs(env);  	struct bpf_reg_state *reg = ®s[regno]; -	struct bpf_insn_access_aux info; +	struct bpf_insn_access_aux info = {}; +	bool valid;  	if (reg->smin_value < 0) {  		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", @@ -1630,13 +1661,31 @@ static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off,  		return -EACCES;  	} -	if (!bpf_sock_is_valid_access(off, size, t, &info)) { -		verbose(env, "invalid bpf_sock access off=%d size=%d\n", -			off, size); -		return -EACCES; +	switch (reg->type) { +	case PTR_TO_SOCK_COMMON: +		valid = bpf_sock_common_is_valid_access(off, size, t, &info); +		break; +	case PTR_TO_SOCKET: +		valid = bpf_sock_is_valid_access(off, size, t, &info); +		break; +	case PTR_TO_TCP_SOCK: +		valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); +		break; +	default: +		valid = false;  	} -	return 0; + +	if (valid) { +		env->insn_aux_data[insn_idx].ctx_field_size = +			info.ctx_field_size; +		return 0; +	} + +	verbose(env, "R%d invalid %s access off=%d size=%d\n", +		regno, reg_type_str[reg->type], off, size); + +	return -EACCES;  }  static bool __is_pointer_value(bool allow_ptr_leaks, @@ -1662,8 +1711,14 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)  {  	const struct bpf_reg_state *reg = reg_state(env, regno); -	return reg->type == PTR_TO_CTX || -	       reg->type == PTR_TO_SOCKET; +	return reg->type == PTR_TO_CTX; +} + +static bool is_sk_reg(struct bpf_verifier_env *env, int regno) +{ +	const struct bpf_reg_state *reg = reg_state(env, regno); + +	return type_is_sk_pointer(reg->type);  }  static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) @@ -1774,6 +1829,12 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,  	case PTR_TO_SOCKET:  		pointer_desc = "sock ";  		break; +	case PTR_TO_SOCK_COMMON: +		pointer_desc = "sock_common "; +		break; +	case PTR_TO_TCP_SOCK: +		pointer_desc = "tcp_sock "; +		break;  	default:  		break;  	} @@ -1977,11 +2038,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn  			 * PTR_TO_PACKET[_META,_END]. In the latter  			 * case, we know the offset is zero.  			 
*/ -			if (reg_type == SCALAR_VALUE) +			if (reg_type == SCALAR_VALUE) {  				mark_reg_unknown(env, regs, value_regno); -			else +			} else {  				mark_reg_known_zero(env, regs,  						    value_regno); +				if (reg_type_may_be_null(reg_type)) +					regs[value_regno].id = ++env->id_gen; +			}  			regs[value_regno].type = reg_type;  		} @@ -2027,12 +2091,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn  		err = check_flow_keys_access(env, off, size);  		if (!err && t == BPF_READ && value_regno >= 0)  			mark_reg_unknown(env, regs, value_regno); -	} else if (reg->type == PTR_TO_SOCKET) { +	} else if (type_is_sk_pointer(reg->type)) {  		if (t == BPF_WRITE) { -			verbose(env, "cannot write into socket\n"); +			verbose(env, "R%d cannot write into %s\n", +				regno, reg_type_str[reg->type]);  			return -EACCES;  		} -		err = check_sock_access(env, regno, off, size, t); +		err = check_sock_access(env, insn_idx, regno, off, size, t);  		if (!err && value_regno >= 0)  			mark_reg_unknown(env, regs, value_regno);  	} else { @@ -2076,7 +2141,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins  	if (is_ctx_reg(env, insn->dst_reg) ||  	    is_pkt_reg(env, insn->dst_reg) || -	    is_flow_key_reg(env, insn->dst_reg)) { +	    is_flow_key_reg(env, insn->dst_reg) || +	    is_sk_reg(env, insn->dst_reg)) {  		verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",  			insn->dst_reg,  			reg_type_str[reg_state(env, insn->dst_reg)->type]); @@ -2192,6 +2258,91 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,  	}  } +/* Implementation details: + * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL + * Two bpf_map_lookups (even with the same key) will have different reg->id. + * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after + * value_or_null->value transition, since the verifier only cares about + * the range of access to valid map value pointer and doesn't care about actual + * address of the map element. + * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps + * reg->id > 0 after value_or_null->value transition. By doing so + * two bpf_map_lookups will be considered two different pointers that + * point to different bpf_spin_locks. + * The verifier allows taking only one bpf_spin_lock at a time to avoid + * dead-locks. + * Since only one bpf_spin_lock is allowed the checks are simpler than + * reg_is_refcounted() logic. The verifier needs to remember only + * one spin_lock instead of array of acquired_refs. + * cur_state->active_spin_lock remembers which map value element got locked + * and clears it after bpf_spin_unlock. + */ +static int process_spin_lock(struct bpf_verifier_env *env, int regno, +			     bool is_lock) +{ +	struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; +	struct bpf_verifier_state *cur = env->cur_state; +	bool is_const = tnum_is_const(reg->var_off); +	struct bpf_map *map = reg->map_ptr; +	u64 val = reg->var_off.value; + +	if (reg->type != PTR_TO_MAP_VALUE) { +		verbose(env, "R%d is not a pointer to map_value\n", regno); +		return -EINVAL; +	} +	if (!is_const) { +		verbose(env, +			"R%d doesn't have constant offset. 
bpf_spin_lock has to be at the constant offset\n", +			regno); +		return -EINVAL; +	} +	if (!map->btf) { +		verbose(env, +			"map '%s' has to have BTF in order to use bpf_spin_lock\n", +			map->name); +		return -EINVAL; +	} +	if (!map_value_has_spin_lock(map)) { +		if (map->spin_lock_off == -E2BIG) +			verbose(env, +				"map '%s' has more than one 'struct bpf_spin_lock'\n", +				map->name); +		else if (map->spin_lock_off == -ENOENT) +			verbose(env, +				"map '%s' doesn't have 'struct bpf_spin_lock'\n", +				map->name); +		else +			verbose(env, +				"map '%s' is not a struct type or bpf_spin_lock is mangled\n", +				map->name); +		return -EINVAL; +	} +	if (map->spin_lock_off != val + reg->off) { +		verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n", +			val + reg->off); +		return -EINVAL; +	} +	if (is_lock) { +		if (cur->active_spin_lock) { +			verbose(env, +				"Locking two bpf_spin_locks are not allowed\n"); +			return -EINVAL; +		} +		cur->active_spin_lock = reg->id; +	} else { +		if (!cur->active_spin_lock) { +			verbose(env, "bpf_spin_unlock without taking a lock\n"); +			return -EINVAL; +		} +		if (cur->active_spin_lock != reg->id) { +			verbose(env, "bpf_spin_unlock of different lock\n"); +			return -EINVAL; +		} +		cur->active_spin_lock = 0; +	} +	return 0; +} +  static bool arg_type_is_mem_ptr(enum bpf_arg_type type)  {  	return type == ARG_PTR_TO_MEM || @@ -2258,6 +2409,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  		err = check_ctx_reg(env, reg, regno);  		if (err < 0)  			return err; +	} else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { +		expected_type = PTR_TO_SOCK_COMMON; +		/* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ +		if (!type_is_sk_pointer(type)) +			goto err_type;  	} else if (arg_type == ARG_PTR_TO_SOCKET) {  		expected_type = PTR_TO_SOCKET;  		if (type != expected_type) @@ -2268,6 +2424,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  			return -EFAULT;  		}  		meta->ptr_id = reg->id; +	} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { +		if (meta->func_id == BPF_FUNC_spin_lock) { +			if (process_spin_lock(env, regno, true)) +				return -EACCES; +		} else if (meta->func_id == BPF_FUNC_spin_unlock) { +			if (process_spin_lock(env, regno, false)) +				return -EACCES; +		} else { +			verbose(env, "verifier internal error\n"); +			return -EFAULT; +		}  	} else if (arg_type_is_mem_ptr(arg_type)) {  		expected_type = PTR_TO_STACK;  		/* One exception here. 
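
From the program side, process_spin_lock() above is what lets the following kind of code verify: a BTF-described map value embedding struct bpf_spin_lock, locked and unlocked around the critical section with the helpers added in kernel/bpf/helpers.c. A sketch, assuming a reasonably recent libbpf for the BTF-defined map syntax (the feature itself only needs the value type to be described by BTF so btf_find_spin_lock() can locate the lock); illustrative only, not part of the patch:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct counter {
	struct bpf_spin_lock lock;
	__u64 packets;
	__u64 bytes;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, struct counter);
} stats SEC(".maps");

SEC("tc")
int count(struct __sk_buff *skb)
{
	__u32 key = 0;
	struct counter *c = bpf_map_lookup_elem(&stats, &key);

	if (!c)
		return 0;

	bpf_spin_lock(&c->lock);	/* only one lock may be held at a time */
	c->packets++;
	c->bytes += skb->len;
	bpf_spin_unlock(&c->lock);	/* must be released before returning */
	return 0;
}

char _license[] SEC("license") = "GPL";
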
In case function allows for NULL to be @@ -2661,7 +2828,7 @@ static int release_reference(struct bpf_verifier_env *env,  	for (i = 0; i <= vstate->curframe; i++)  		release_reg_references(env, vstate->frame[i], meta->ptr_id); -	return release_reference_state(env, meta->ptr_id); +	return release_reference_state(cur_func(env), meta->ptr_id);  }  static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -2887,6 +3054,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn  		return err;  	} +	meta.func_id = func_id;  	/* check args */  	err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);  	if (err) @@ -2926,8 +3094,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn  		}  	} else if (is_release_function(func_id)) {  		err = release_reference(env, &meta); -		if (err) +		if (err) { +			verbose(env, "func %s#%d reference has not been acquired before\n", +				func_id_name(func_id), func_id);  			return err; +		}  	}  	regs = cur_regs(env); @@ -2969,17 +3140,30 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn  		regs[BPF_REG_0].map_ptr = meta.map_ptr;  		if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {  			regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; +			if (map_value_has_spin_lock(meta.map_ptr)) +				regs[BPF_REG_0].id = ++env->id_gen;  		} else {  			regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;  			regs[BPF_REG_0].id = ++env->id_gen;  		}  	} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { -		int id = acquire_reference_state(env, insn_idx); -		if (id < 0) -			return id;  		mark_reg_known_zero(env, regs, BPF_REG_0);  		regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; -		regs[BPF_REG_0].id = id; +		if (is_acquire_function(func_id)) { +			int id = acquire_reference_state(env, insn_idx); + +			if (id < 0) +				return id; +			/* For release_reference() */ +			regs[BPF_REG_0].id = id; +		} else { +			/* For mark_ptr_or_null_reg() */ +			regs[BPF_REG_0].id = ++env->id_gen; +		} +	} else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { +		mark_reg_known_zero(env, regs, BPF_REG_0); +		regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; +		regs[BPF_REG_0].id = ++env->id_gen;  	} else {  		verbose(env, "unknown return type %d of func %s#%d\n",  			fn->ret_type, func_id_name(func_id), func_id); @@ -3239,6 +3423,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  	case PTR_TO_PACKET_END:  	case PTR_TO_SOCKET:  	case PTR_TO_SOCKET_OR_NULL: +	case PTR_TO_SOCK_COMMON: +	case PTR_TO_SOCK_COMMON_OR_NULL: +	case PTR_TO_TCP_SOCK: +	case PTR_TO_TCP_SOCK_OR_NULL:  		verbose(env, "R%d pointer arithmetic on %s prohibited\n",  			dst, reg_type_str[ptr_reg->type]);  		return -EACCES; @@ -4031,11 +4219,50 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,   *  0 - branch will not be taken and fall-through to next insn   * -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10]   */ -static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) +static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, +			   bool is_jmp32)  { +	struct bpf_reg_state reg_lo; +	s64 sval; +  	if (__is_pointer_value(false, reg))  		return -1; +	if (is_jmp32) { +		reg_lo = *reg; +		reg = ®_lo; +		/* For JMP32, only low 32 bits are compared, coerce_reg_to_size +		 * could truncate high bits and update umin/umax according to +		 * information of low bits. 
+		 */ +		coerce_reg_to_size(reg, 4); +		/* smin/smax need special handling. For example, after coerce, +		 * if smin_value is 0x00000000ffffffffLL, the value is -1 when +		 * used as operand to JMP32. It is a negative number from s32's +		 * point of view, while it is a positive number when seen as +		 * s64. The smin/smax are kept as s64, therefore, when used with +		 * JMP32, they need to be transformed into s32, then sign +		 * extended back to s64. +		 * +		 * Also, smin/smax were copied from umin/umax. If umin/umax has +		 * different sign bit, then min/max relationship doesn't +		 * maintain after casting into s32, for this case, set smin/smax +		 * to safest range. +		 */ +		if ((reg->umax_value ^ reg->umin_value) & +		    (1ULL << 31)) { +			reg->smin_value = S32_MIN; +			reg->smax_value = S32_MAX; +		} +		reg->smin_value = (s64)(s32)reg->smin_value; +		reg->smax_value = (s64)(s32)reg->smax_value; + +		val = (u32)val; +		sval = (s64)(s32)val; +	} else { +		sval = (s64)val; +	} +  	switch (opcode) {  	case BPF_JEQ:  		if (tnum_is_const(reg->var_off)) @@ -4058,9 +4285,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)  			return 0;  		break;  	case BPF_JSGT: -		if (reg->smin_value > (s64)val) +		if (reg->smin_value > sval)  			return 1; -		else if (reg->smax_value < (s64)val) +		else if (reg->smax_value < sval)  			return 0;  		break;  	case BPF_JLT: @@ -4070,9 +4297,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)  			return 0;  		break;  	case BPF_JSLT: -		if (reg->smax_value < (s64)val) +		if (reg->smax_value < sval)  			return 1; -		else if (reg->smin_value >= (s64)val) +		else if (reg->smin_value >= sval)  			return 0;  		break;  	case BPF_JGE: @@ -4082,9 +4309,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)  			return 0;  		break;  	case BPF_JSGE: -		if (reg->smin_value >= (s64)val) +		if (reg->smin_value >= sval)  			return 1; -		else if (reg->smax_value < (s64)val) +		else if (reg->smax_value < sval)  			return 0;  		break;  	case BPF_JLE: @@ -4094,9 +4321,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)  			return 0;  		break;  	case BPF_JSLE: -		if (reg->smax_value <= (s64)val) +		if (reg->smax_value <= sval)  			return 1; -		else if (reg->smin_value > (s64)val) +		else if (reg->smin_value > sval)  			return 0;  		break;  	} @@ -4104,6 +4331,29 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)  	return -1;  } +/* Generate min value of the high 32-bit from TNUM info. */ +static u64 gen_hi_min(struct tnum var) +{ +	return var.value & ~0xffffffffULL; +} + +/* Generate max value of the high 32-bit from TNUM info. */ +static u64 gen_hi_max(struct tnum var) +{ +	return (var.value | var.mask) & ~0xffffffffULL; +} + +/* Return true if VAL is compared with a s64 sign extended from s32, and they + * are with the same signedness. + */ +static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg) +{ +	return ((s32)sval >= 0 && +		reg->smin_value >= 0 && reg->smax_value <= S32_MAX) || +	       ((s32)sval < 0 && +		reg->smax_value <= 0 && reg->smin_value >= S32_MIN); +} +  /* Adjusts the register min/max values in the case that the dst_reg is the   * variable register that we are working on, and src_reg is a constant or we're   * simply doing a BPF_K check. 
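The smin/smax subtlety described above is easy to reproduce outside the verifier. The snippet below is only a user-space illustration of the sign issue (the constant matches the example in the comment); it is not verifier code.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t v = 0x00000000ffffffffULL;

	/* BPF_JMP with BPF_JSGT 0: compared as s64, 4294967295 > 0 is true */
	printf("as s64: %d\n", (int64_t)v > 0);
	/* BPF_JMP32 with BPF_JSGT 0: only the low 32 bits count, i.e. -1, so false */
	printf("as s32: %d\n", (int32_t)(uint32_t)v > 0);
	return 0;
}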
@@ -4111,8 +4361,10 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)   */  static void reg_set_min_max(struct bpf_reg_state *true_reg,  			    struct bpf_reg_state *false_reg, u64 val, -			    u8 opcode) +			    u8 opcode, bool is_jmp32)  { +	s64 sval; +  	/* If the dst_reg is a pointer, we can't learn anything about its  	 * variable offset from the compare (unless src_reg were a pointer into  	 * the same object, but we don't bother with that. @@ -4122,19 +4374,31 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,  	if (__is_pointer_value(false, false_reg))  		return; +	val = is_jmp32 ? (u32)val : val; +	sval = is_jmp32 ? (s64)(s32)val : (s64)val; +  	switch (opcode) {  	case BPF_JEQ: -		/* If this is false then we know nothing Jon Snow, but if it is -		 * true then we know for sure. -		 */ -		__mark_reg_known(true_reg, val); -		break;  	case BPF_JNE: -		/* If this is true we know nothing Jon Snow, but if it is false -		 * we know the value for sure; +	{ +		struct bpf_reg_state *reg = +			opcode == BPF_JEQ ? true_reg : false_reg; + +		/* For BPF_JEQ, if this is false we know nothing Jon Snow, but +		 * if it is true we know the value for sure. Likewise for +		 * BPF_JNE.  		 */ -		__mark_reg_known(false_reg, val); +		if (is_jmp32) { +			u64 old_v = reg->var_off.value; +			u64 hi_mask = ~0xffffffffULL; + +			reg->var_off.value = (old_v & hi_mask) | val; +			reg->var_off.mask &= hi_mask; +		} else { +			__mark_reg_known(reg, val); +		}  		break; +	}  	case BPF_JSET:  		false_reg->var_off = tnum_and(false_reg->var_off,  					      tnum_const(~val)); @@ -4142,38 +4406,61 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,  			true_reg->var_off = tnum_or(true_reg->var_off,  						    tnum_const(val));  		break; -	case BPF_JGT: -		false_reg->umax_value = min(false_reg->umax_value, val); -		true_reg->umin_value = max(true_reg->umin_value, val + 1); -		break; -	case BPF_JSGT: -		false_reg->smax_value = min_t(s64, false_reg->smax_value, val); -		true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1); -		break; -	case BPF_JLT: -		false_reg->umin_value = max(false_reg->umin_value, val); -		true_reg->umax_value = min(true_reg->umax_value, val - 1); -		break; -	case BPF_JSLT: -		false_reg->smin_value = max_t(s64, false_reg->smin_value, val); -		true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1); -		break;  	case BPF_JGE: -		false_reg->umax_value = min(false_reg->umax_value, val - 1); -		true_reg->umin_value = max(true_reg->umin_value, val); +	case BPF_JGT: +	{ +		u64 false_umax = opcode == BPF_JGT ? val    : val - 1; +		u64 true_umin = opcode == BPF_JGT ? val + 1 : val; + +		if (is_jmp32) { +			false_umax += gen_hi_max(false_reg->var_off); +			true_umin += gen_hi_min(true_reg->var_off); +		} +		false_reg->umax_value = min(false_reg->umax_value, false_umax); +		true_reg->umin_value = max(true_reg->umin_value, true_umin);  		break; +	}  	case BPF_JSGE: -		false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1); -		true_reg->smin_value = max_t(s64, true_reg->smin_value, val); +	case BPF_JSGT: +	{ +		s64 false_smax = opcode == BPF_JSGT ? sval    : sval - 1; +		s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval; + +		/* If the full s64 was not sign-extended from s32 then don't +		 * deduct further info. 
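For the JMP32 BPF_JEQ/BPF_JNE case above, equality only pins the low half of the register while the high 32 bits keep whatever certainty they already had. A stand-alone model of that var_off update, using a local stand-in for struct tnum rather than the kernel's:

#include <stdio.h>
#include <stdint.h>

struct tnum { uint64_t value; uint64_t mask; };	/* mask bit set => bit unknown */

static struct tnum jmp32_jeq_true(struct tnum t, uint32_t imm)
{
	uint64_t hi_mask = ~0xffffffffULL;

	t.value = (t.value & hi_mask) | imm;	/* low 32 bits now exactly imm */
	t.mask &= hi_mask;			/* high 32 bits keep their old state */
	return t;
}

int main(void)
{
	struct tnum reg = { .value = 0, .mask = ~0ULL };	/* fully unknown */
	struct tnum out = jmp32_jeq_true(reg, 7);

	printf("value=%#llx mask=%#llx\n",
	       (unsigned long long)out.value, (unsigned long long)out.mask);
	return 0;	/* prints value=0x7 mask=0xffffffff00000000 */
}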
+		 */ +		if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) +			break; +		false_reg->smax_value = min(false_reg->smax_value, false_smax); +		true_reg->smin_value = max(true_reg->smin_value, true_smin);  		break; +	}  	case BPF_JLE: -		false_reg->umin_value = max(false_reg->umin_value, val + 1); -		true_reg->umax_value = min(true_reg->umax_value, val); +	case BPF_JLT: +	{ +		u64 false_umin = opcode == BPF_JLT ? val    : val + 1; +		u64 true_umax = opcode == BPF_JLT ? val - 1 : val; + +		if (is_jmp32) { +			false_umin += gen_hi_min(false_reg->var_off); +			true_umax += gen_hi_max(true_reg->var_off); +		} +		false_reg->umin_value = max(false_reg->umin_value, false_umin); +		true_reg->umax_value = min(true_reg->umax_value, true_umax);  		break; +	}  	case BPF_JSLE: -		false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1); -		true_reg->smax_value = min_t(s64, true_reg->smax_value, val); +	case BPF_JSLT: +	{ +		s64 false_smin = opcode == BPF_JSLT ? sval    : sval + 1; +		s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval; + +		if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) +			break; +		false_reg->smin_value = max(false_reg->smin_value, false_smin); +		true_reg->smax_value = min(true_reg->smax_value, true_smax);  		break; +	}  	default:  		break;  	} @@ -4196,24 +4483,34 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,   */  static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,  				struct bpf_reg_state *false_reg, u64 val, -				u8 opcode) +				u8 opcode, bool is_jmp32)  { +	s64 sval; +  	if (__is_pointer_value(false, false_reg))  		return; +	val = is_jmp32 ? (u32)val : val; +	sval = is_jmp32 ? (s64)(s32)val : (s64)val; +  	switch (opcode) {  	case BPF_JEQ: -		/* If this is false then we know nothing Jon Snow, but if it is -		 * true then we know for sure. -		 */ -		__mark_reg_known(true_reg, val); -		break;  	case BPF_JNE: -		/* If this is true we know nothing Jon Snow, but if it is false -		 * we know the value for sure; -		 */ -		__mark_reg_known(false_reg, val); +	{ +		struct bpf_reg_state *reg = +			opcode == BPF_JEQ ? true_reg : false_reg; + +		if (is_jmp32) { +			u64 old_v = reg->var_off.value; +			u64 hi_mask = ~0xffffffffULL; + +			reg->var_off.value = (old_v & hi_mask) | val; +			reg->var_off.mask &= hi_mask; +		} else { +			__mark_reg_known(reg, val); +		}  		break; +	}  	case BPF_JSET:  		false_reg->var_off = tnum_and(false_reg->var_off,  					      tnum_const(~val)); @@ -4221,38 +4518,58 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,  			true_reg->var_off = tnum_or(true_reg->var_off,  						    tnum_const(val));  		break; -	case BPF_JGT: -		true_reg->umax_value = min(true_reg->umax_value, val - 1); -		false_reg->umin_value = max(false_reg->umin_value, val); -		break; -	case BPF_JSGT: -		true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1); -		false_reg->smin_value = max_t(s64, false_reg->smin_value, val); -		break; -	case BPF_JLT: -		true_reg->umin_value = max(true_reg->umin_value, val + 1); -		false_reg->umax_value = min(false_reg->umax_value, val); -		break; -	case BPF_JSLT: -		true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1); -		false_reg->smax_value = min_t(s64, false_reg->smax_value, val); -		break;  	case BPF_JGE: -		true_reg->umax_value = min(true_reg->umax_value, val); -		false_reg->umin_value = max(false_reg->umin_value, val + 1); +	case BPF_JGT: +	{ +		u64 false_umin = opcode == BPF_JGT ? val    : val + 1; +		u64 true_umax = opcode == BPF_JGT ? 
val - 1 : val; + +		if (is_jmp32) { +			false_umin += gen_hi_min(false_reg->var_off); +			true_umax += gen_hi_max(true_reg->var_off); +		} +		false_reg->umin_value = max(false_reg->umin_value, false_umin); +		true_reg->umax_value = min(true_reg->umax_value, true_umax);  		break; +	}  	case BPF_JSGE: -		true_reg->smax_value = min_t(s64, true_reg->smax_value, val); -		false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1); +	case BPF_JSGT: +	{ +		s64 false_smin = opcode == BPF_JSGT ? sval    : sval + 1; +		s64 true_smax = opcode == BPF_JSGT ? sval - 1 : sval; + +		if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) +			break; +		false_reg->smin_value = max(false_reg->smin_value, false_smin); +		true_reg->smax_value = min(true_reg->smax_value, true_smax);  		break; +	}  	case BPF_JLE: -		true_reg->umin_value = max(true_reg->umin_value, val); -		false_reg->umax_value = min(false_reg->umax_value, val - 1); +	case BPF_JLT: +	{ +		u64 false_umax = opcode == BPF_JLT ? val    : val - 1; +		u64 true_umin = opcode == BPF_JLT ? val + 1 : val; + +		if (is_jmp32) { +			false_umax += gen_hi_max(false_reg->var_off); +			true_umin += gen_hi_min(true_reg->var_off); +		} +		false_reg->umax_value = min(false_reg->umax_value, false_umax); +		true_reg->umin_value = max(true_reg->umin_value, true_umin);  		break; +	}  	case BPF_JSLE: -		true_reg->smin_value = max_t(s64, true_reg->smin_value, val); -		false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1); +	case BPF_JSLT: +	{ +		s64 false_smax = opcode == BPF_JSLT ? sval    : sval - 1; +		s64 true_smin = opcode == BPF_JSLT ? sval + 1 : sval; + +		if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) +			break; +		false_reg->smax_value = min(false_reg->smax_value, false_smax); +		true_reg->smin_value = max(true_reg->smin_value, true_smin);  		break; +	}  	default:  		break;  	} @@ -4343,8 +4660,13 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,  			}  		} else if (reg->type == PTR_TO_SOCKET_OR_NULL) {  			reg->type = PTR_TO_SOCKET; +		} else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { +			reg->type = PTR_TO_SOCK_COMMON; +		} else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { +			reg->type = PTR_TO_TCP_SOCK;  		} -		if (is_null || !reg_is_refcounted(reg)) { +		if (is_null || !(reg_is_refcounted(reg) || +				 reg_may_point_to_spin_lock(reg))) {  			/* We don't need id from this point onwards anymore,  			 * thus we should better reset it, so that state  			 * pruning has chances to take effect. @@ -4366,7 +4688,7 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,  	int i, j;  	if (reg_is_refcounted_or_null(®s[regno]) && is_null) -		__release_reference_state(state, id); +		release_reference_state(state, id);  	for (i = 0; i < MAX_BPF_REG; i++)  		mark_ptr_or_null_reg(state, ®s[i], id, is_null); @@ -4390,6 +4712,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,  	if (BPF_SRC(insn->code) != BPF_X)  		return false; +	/* Pointers are always 64-bit. 
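gen_hi_min()/gen_hi_max() let the 32-bit bound learned from a JMP32 comparison be lifted back into a full 64-bit bound by adding the known (or possible) high bits of the register. A user-space sketch of that arithmetic with local types and made-up values:

#include <stdio.h>
#include <stdint.h>

struct tnum { uint64_t value; uint64_t mask; };

static uint64_t gen_hi_min(struct tnum var)
{
	return var.value & ~0xffffffffULL;		/* known-1 high bits only */
}

static uint64_t gen_hi_max(struct tnum var)
{
	return (var.value | var.mask) & ~0xffffffffULL;	/* all possibly-1 high bits */
}

int main(void)
{
	/* high word known to be 0x1, low word completely unknown */
	struct tnum var = { .value = 0x100000000ULL, .mask = 0xffffffffULL };
	uint64_t true_umin = 10;			/* learned from "w-reg > 9" */

	/* under JMP32 BPF_JGT, the 64-bit umin is at least hi_min + 10 */
	printf("umin >= %#llx, umax <= %#llx\n",
	       (unsigned long long)(true_umin + gen_hi_min(var)),
	       (unsigned long long)(gen_hi_max(var) + 0xffffffffULL));
	return 0;
}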
*/ +	if (BPF_CLASS(insn->code) == BPF_JMP32) +		return false; +  	switch (BPF_OP(insn->code)) {  	case BPF_JGT:  		if ((dst_reg->type == PTR_TO_PACKET && @@ -4482,16 +4808,18 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,  	struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;  	struct bpf_reg_state *dst_reg, *other_branch_regs;  	u8 opcode = BPF_OP(insn->code); +	bool is_jmp32;  	int err; -	if (opcode > BPF_JSLE) { -		verbose(env, "invalid BPF_JMP opcode %x\n", opcode); +	/* Only conditional jumps are expected to reach here. */ +	if (opcode == BPF_JA || opcode > BPF_JSLE) { +		verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);  		return -EINVAL;  	}  	if (BPF_SRC(insn->code) == BPF_X) {  		if (insn->imm != 0) { -			verbose(env, "BPF_JMP uses reserved fields\n"); +			verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");  			return -EINVAL;  		} @@ -4507,7 +4835,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,  		}  	} else {  		if (insn->src_reg != BPF_REG_0) { -			verbose(env, "BPF_JMP uses reserved fields\n"); +			verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");  			return -EINVAL;  		}  	} @@ -4518,9 +4846,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,  		return err;  	dst_reg = ®s[insn->dst_reg]; +	is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;  	if (BPF_SRC(insn->code) == BPF_K) { -		int pred = is_branch_taken(dst_reg, insn->imm, opcode); +		int pred = is_branch_taken(dst_reg, insn->imm, opcode, +					   is_jmp32);  		if (pred == 1) {  			 /* only follow the goto, ignore fall-through */ @@ -4548,30 +4878,51 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,  	 * comparable.  	 */  	if (BPF_SRC(insn->code) == BPF_X) { +		struct bpf_reg_state *src_reg = ®s[insn->src_reg]; +		struct bpf_reg_state lo_reg0 = *dst_reg; +		struct bpf_reg_state lo_reg1 = *src_reg; +		struct bpf_reg_state *src_lo, *dst_lo; + +		dst_lo = &lo_reg0; +		src_lo = &lo_reg1; +		coerce_reg_to_size(dst_lo, 4); +		coerce_reg_to_size(src_lo, 4); +  		if (dst_reg->type == SCALAR_VALUE && -		    regs[insn->src_reg].type == SCALAR_VALUE) { -			if (tnum_is_const(regs[insn->src_reg].var_off)) +		    src_reg->type == SCALAR_VALUE) { +			if (tnum_is_const(src_reg->var_off) || +			    (is_jmp32 && tnum_is_const(src_lo->var_off)))  				reg_set_min_max(&other_branch_regs[insn->dst_reg], -						dst_reg, regs[insn->src_reg].var_off.value, -						opcode); -			else if (tnum_is_const(dst_reg->var_off)) +						dst_reg, +						is_jmp32 +						? src_lo->var_off.value +						: src_reg->var_off.value, +						opcode, is_jmp32); +			else if (tnum_is_const(dst_reg->var_off) || +				 (is_jmp32 && tnum_is_const(dst_lo->var_off)))  				reg_set_min_max_inv(&other_branch_regs[insn->src_reg], -						    ®s[insn->src_reg], -						    dst_reg->var_off.value, opcode); -			else if (opcode == BPF_JEQ || opcode == BPF_JNE) +						    src_reg, +						    is_jmp32 +						    ? 
dst_lo->var_off.value +						    : dst_reg->var_off.value, +						    opcode, is_jmp32); +			else if (!is_jmp32 && +				 (opcode == BPF_JEQ || opcode == BPF_JNE))  				/* Comparing for equality, we can combine knowledge */  				reg_combine_min_max(&other_branch_regs[insn->src_reg],  						    &other_branch_regs[insn->dst_reg], -						    ®s[insn->src_reg], -						    ®s[insn->dst_reg], opcode); +						    src_reg, dst_reg, opcode);  		}  	} else if (dst_reg->type == SCALAR_VALUE) {  		reg_set_min_max(&other_branch_regs[insn->dst_reg], -					dst_reg, insn->imm, opcode); +					dst_reg, insn->imm, opcode, is_jmp32);  	} -	/* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ -	if (BPF_SRC(insn->code) == BPF_K && +	/* detect if R == 0 where R is returned from bpf_map_lookup_elem(). +	 * NOTE: these optimizations below are related with pointer comparison +	 *       which will never be JMP32. +	 */ +	if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&  	    insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&  	    reg_type_may_be_null(dst_reg->type)) {  		/* Mark all identical registers in each branch as either @@ -4713,6 +5064,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)  		return err;  	} +	if (env->cur_state->active_spin_lock) { +		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n"); +		return -EINVAL; +	} +  	if (regs[BPF_REG_6].type != PTR_TO_CTX) {  		verbose(env,  			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -4900,7 +5256,8 @@ peek_stack:  		goto check_state;  	t = insn_stack[cur_stack - 1]; -	if (BPF_CLASS(insns[t].code) == BPF_JMP) { +	if (BPF_CLASS(insns[t].code) == BPF_JMP || +	    BPF_CLASS(insns[t].code) == BPF_JMP32) {  		u8 opcode = BPF_OP(insns[t].code);  		if (opcode == BPF_EXIT) { @@ -4997,13 +5354,14 @@ static int check_btf_func(struct bpf_verifier_env *env,  			  const union bpf_attr *attr,  			  union bpf_attr __user *uattr)  { -	u32 i, nfuncs, urec_size, min_size, prev_offset; +	u32 i, nfuncs, urec_size, min_size;  	u32 krec_size = sizeof(struct bpf_func_info);  	struct bpf_func_info *krecord;  	const struct btf_type *type;  	struct bpf_prog *prog;  	const struct btf *btf;  	void __user *urecord; +	u32 prev_offset = 0;  	int ret = 0;  	nfuncs = attr->func_info_cnt; @@ -5447,8 +5805,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,  	case PTR_TO_MAP_VALUE:  		/* If the new min/max/var_off satisfy the old ones and  		 * everything else matches, we are OK. 
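What check_cond_jmp_op() now has to accept can be seen by hand-encoding one of the new instructions. This is purely illustrative and assumes a UAPI <linux/bpf.h> that already defines the BPF_JMP32 instruction class:

#include <stdio.h>
#include <linux/bpf.h>

int main(void)
{
	/* "if w1 > 9 goto +2": compare only the low 32 bits of r1 */
	struct bpf_insn insn = {
		.code    = BPF_JMP32 | BPF_JGT | BPF_K,
		.dst_reg = BPF_REG_1,
		.src_reg = 0,		/* BPF_K form: src_reg must stay 0 */
		.off     = 2,
		.imm     = 9,
	};

	printf("opcode byte: %#x\n", insn.code);
	return 0;
}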
-		 * We don't care about the 'id' value, because nothing -		 * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL) +		 * 'id' is not compared, since it's only used for maps with +		 * bpf_spin_lock inside map element and in such cases if +		 * the rest of the prog is valid for one map element then +		 * it's valid for all map elements regardless of the key +		 * used in bpf_map_lookup()  		 */  		return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&  		       range_within(rold, rcur) && @@ -5496,6 +5857,10 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,  	case PTR_TO_FLOW_KEYS:  	case PTR_TO_SOCKET:  	case PTR_TO_SOCKET_OR_NULL: +	case PTR_TO_SOCK_COMMON: +	case PTR_TO_SOCK_COMMON_OR_NULL: +	case PTR_TO_TCP_SOCK: +	case PTR_TO_TCP_SOCK_OR_NULL:  		/* Only valid matches are exact, which memcmp() above  		 * would have accepted  		 */ @@ -5651,6 +6016,9 @@ static bool states_equal(struct bpf_verifier_env *env,  	if (old->speculative && !cur->speculative)  		return false; +	if (old->active_spin_lock != cur->active_spin_lock) +		return false; +  	/* for states to be equal callsites have to be the same  	 * and all frame states need to be equivalent  	 */ @@ -5813,6 +6181,10 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)  	case PTR_TO_CTX:  	case PTR_TO_SOCKET:  	case PTR_TO_SOCKET_OR_NULL: +	case PTR_TO_SOCK_COMMON: +	case PTR_TO_SOCK_COMMON_OR_NULL: +	case PTR_TO_TCP_SOCK: +	case PTR_TO_TCP_SOCK_OR_NULL:  		return false;  	default:  		return true; @@ -6055,7 +6427,7 @@ static int do_check(struct bpf_verifier_env *env)  			if (err)  				return err; -		} else if (class == BPF_JMP) { +		} else if (class == BPF_JMP || class == BPF_JMP32) {  			u8 opcode = BPF_OP(insn->code);  			if (opcode == BPF_CALL) { @@ -6063,11 +6435,18 @@ static int do_check(struct bpf_verifier_env *env)  				    insn->off != 0 ||  				    (insn->src_reg != BPF_REG_0 &&  				     insn->src_reg != BPF_PSEUDO_CALL) || -				    insn->dst_reg != BPF_REG_0) { +				    insn->dst_reg != BPF_REG_0 || +				    class == BPF_JMP32) {  					verbose(env, "BPF_CALL uses reserved fields\n");  					return -EINVAL;  				} +				if (env->cur_state->active_spin_lock && +				    (insn->src_reg == BPF_PSEUDO_CALL || +				     insn->imm != BPF_FUNC_spin_unlock)) { +					verbose(env, "function calls are not allowed while holding a lock\n"); +					return -EINVAL; +				}  				if (insn->src_reg == BPF_PSEUDO_CALL)  					err = check_func_call(env, insn, &env->insn_idx);  				else @@ -6079,7 +6458,8 @@ static int do_check(struct bpf_verifier_env *env)  				if (BPF_SRC(insn->code) != BPF_K ||  				    insn->imm != 0 ||  				    insn->src_reg != BPF_REG_0 || -				    insn->dst_reg != BPF_REG_0) { +				    insn->dst_reg != BPF_REG_0 || +				    class == BPF_JMP32) {  					verbose(env, "BPF_JA uses reserved fields\n");  					return -EINVAL;  				} @@ -6091,11 +6471,17 @@ static int do_check(struct bpf_verifier_env *env)  				if (BPF_SRC(insn->code) != BPF_K ||  				    insn->imm != 0 ||  				    insn->src_reg != BPF_REG_0 || -				    insn->dst_reg != BPF_REG_0) { +				    insn->dst_reg != BPF_REG_0 || +				    class == BPF_JMP32) {  					verbose(env, "BPF_EXIT uses reserved fields\n");  					return -EINVAL;  				} +				if (env->cur_state->active_spin_lock) { +					verbose(env, "bpf_spin_unlock is missing\n"); +					return -EINVAL; +				} +  				if (state->curframe) {  					/* exit from nested function */  					env->prev_insn_idx = env->insn_idx; @@ -6193,6 +6579,19 @@ static int 
check_map_prealloc(struct bpf_map *map)  		!(map->map_flags & BPF_F_NO_PREALLOC);  } +static bool is_tracing_prog_type(enum bpf_prog_type type) +{ +	switch (type) { +	case BPF_PROG_TYPE_KPROBE: +	case BPF_PROG_TYPE_TRACEPOINT: +	case BPF_PROG_TYPE_PERF_EVENT: +	case BPF_PROG_TYPE_RAW_TRACEPOINT: +		return true; +	default: +		return false; +	} +} +  static int check_map_prog_compatibility(struct bpf_verifier_env *env,  					struct bpf_map *map,  					struct bpf_prog *prog) @@ -6215,6 +6614,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,  		}  	} +	if ((is_tracing_prog_type(prog->type) || +	     prog->type == BPF_PROG_TYPE_SOCKET_FILTER) && +	    map_value_has_spin_lock(map)) { +		verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); +		return -EINVAL; +	} +  	if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&  	    !bpf_offload_prog_map_match(prog, map)) {  		verbose(env, "offload device mismatch between prog and map\n"); @@ -6431,6 +6837,153 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of  	return new_prog;  } +static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env, +					      u32 off, u32 cnt) +{ +	int i, j; + +	/* find first prog starting at or after off (first to remove) */ +	for (i = 0; i < env->subprog_cnt; i++) +		if (env->subprog_info[i].start >= off) +			break; +	/* find first prog starting at or after off + cnt (first to stay) */ +	for (j = i; j < env->subprog_cnt; j++) +		if (env->subprog_info[j].start >= off + cnt) +			break; +	/* if j doesn't start exactly at off + cnt, we are just removing +	 * the front of previous prog +	 */ +	if (env->subprog_info[j].start != off + cnt) +		j--; + +	if (j > i) { +		struct bpf_prog_aux *aux = env->prog->aux; +		int move; + +		/* move fake 'exit' subprog as well */ +		move = env->subprog_cnt + 1 - j; + +		memmove(env->subprog_info + i, +			env->subprog_info + j, +			sizeof(*env->subprog_info) * move); +		env->subprog_cnt -= j - i; + +		/* remove func_info */ +		if (aux->func_info) { +			move = aux->func_info_cnt - j; + +			memmove(aux->func_info + i, +				aux->func_info + j, +				sizeof(*aux->func_info) * move); +			aux->func_info_cnt -= j - i; +			/* func_info->insn_off is set after all code rewrites, +			 * in adjust_btf_func() - no need to adjust +			 */ +		} +	} else { +		/* convert i from "first prog to remove" to "first to adjust" */ +		if (env->subprog_info[i].start == off) +			i++; +	} + +	/* update fake 'exit' subprog as well */ +	for (; i <= env->subprog_cnt; i++) +		env->subprog_info[i].start -= cnt; + +	return 0; +} + +static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off, +				      u32 cnt) +{ +	struct bpf_prog *prog = env->prog; +	u32 i, l_off, l_cnt, nr_linfo; +	struct bpf_line_info *linfo; + +	nr_linfo = prog->aux->nr_linfo; +	if (!nr_linfo) +		return 0; + +	linfo = prog->aux->linfo; + +	/* find first line info to remove, count lines to be removed */ +	for (i = 0; i < nr_linfo; i++) +		if (linfo[i].insn_off >= off) +			break; + +	l_off = i; +	l_cnt = 0; +	for (; i < nr_linfo; i++) +		if (linfo[i].insn_off < off + cnt) +			l_cnt++; +		else +			break; + +	/* First live insn doesn't match first live linfo, it needs to "inherit" +	 * last removed linfo.  prog is already modified, so prog->len == off +	 * means no live instructions after (tail of the program was removed). 
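The bookkeeping in adjust_subprog_starts_after_remove() ultimately slides every subprogram that starts at or beyond the deleted region down by the number of removed instructions. The simplified stand-alone sketch below shows only that shift; it deliberately ignores the merging of subprograms that fall entirely inside the removed range, which the real function also handles:

#include <stdio.h>

#define NR_SUBPROGS 4

int main(void)
{
	/* starts of subprogs 0..2 plus the fake 'exit' subprog */
	unsigned int start[NR_SUBPROGS] = { 0, 10, 25, 40 };
	unsigned int off = 12, cnt = 5;	/* remove insns [12, 17) */
	int i;

	for (i = 0; i < NR_SUBPROGS; i++)
		if (start[i] >= off + cnt)
			start[i] -= cnt;

	for (i = 0; i < NR_SUBPROGS; i++)
		printf("subprog %d starts at %u\n", i, start[i]);
	return 0;	/* 0, 10, 20, 35 */
}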
+	 */ +	if (prog->len != off && l_cnt && +	    (i == nr_linfo || linfo[i].insn_off != off + cnt)) { +		l_cnt--; +		linfo[--i].insn_off = off + cnt; +	} + +	/* remove the line info which refer to the removed instructions */ +	if (l_cnt) { +		memmove(linfo + l_off, linfo + i, +			sizeof(*linfo) * (nr_linfo - i)); + +		prog->aux->nr_linfo -= l_cnt; +		nr_linfo = prog->aux->nr_linfo; +	} + +	/* pull all linfo[i].insn_off >= off + cnt in by cnt */ +	for (i = l_off; i < nr_linfo; i++) +		linfo[i].insn_off -= cnt; + +	/* fix up all subprogs (incl. 'exit') which start >= off */ +	for (i = 0; i <= env->subprog_cnt; i++) +		if (env->subprog_info[i].linfo_idx > l_off) { +			/* program may have started in the removed region but +			 * may not be fully removed +			 */ +			if (env->subprog_info[i].linfo_idx >= l_off + l_cnt) +				env->subprog_info[i].linfo_idx -= l_cnt; +			else +				env->subprog_info[i].linfo_idx = l_off; +		} + +	return 0; +} + +static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) +{ +	struct bpf_insn_aux_data *aux_data = env->insn_aux_data; +	unsigned int orig_prog_len = env->prog->len; +	int err; + +	if (bpf_prog_is_dev_bound(env->prog->aux)) +		bpf_prog_offload_remove_insns(env, off, cnt); + +	err = bpf_remove_insns(env->prog, off, cnt); +	if (err) +		return err; + +	err = adjust_subprog_starts_after_remove(env, off, cnt); +	if (err) +		return err; + +	err = bpf_adj_linfo_after_remove(env, off, cnt); +	if (err) +		return err; + +	memmove(aux_data + off,	aux_data + off + cnt, +		sizeof(*aux_data) * (orig_prog_len - off - cnt)); + +	return 0; +} +  /* The verifier does more data flow analysis than llvm and will not   * explore branches that are dead at run time. Malicious programs can   * have dead code too. Therefore replace all dead at-run-time code @@ -6457,6 +7010,91 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)  	}  } +static bool insn_is_cond_jump(u8 code) +{ +	u8 op; + +	if (BPF_CLASS(code) == BPF_JMP32) +		return true; + +	if (BPF_CLASS(code) != BPF_JMP) +		return false; + +	op = BPF_OP(code); +	return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL; +} + +static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env) +{ +	struct bpf_insn_aux_data *aux_data = env->insn_aux_data; +	struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); +	struct bpf_insn *insn = env->prog->insnsi; +	const int insn_cnt = env->prog->len; +	int i; + +	for (i = 0; i < insn_cnt; i++, insn++) { +		if (!insn_is_cond_jump(insn->code)) +			continue; + +		if (!aux_data[i + 1].seen) +			ja.off = insn->off; +		else if (!aux_data[i + 1 + insn->off].seen) +			ja.off = 0; +		else +			continue; + +		if (bpf_prog_is_dev_bound(env->prog->aux)) +			bpf_prog_offload_replace_insn(env, i, &ja); + +		memcpy(insn, &ja, sizeof(ja)); +	} +} + +static int opt_remove_dead_code(struct bpf_verifier_env *env) +{ +	struct bpf_insn_aux_data *aux_data = env->insn_aux_data; +	int insn_cnt = env->prog->len; +	int i, err; + +	for (i = 0; i < insn_cnt; i++) { +		int j; + +		j = 0; +		while (i + j < insn_cnt && !aux_data[i + j].seen) +			j++; +		if (!j) +			continue; + +		err = verifier_remove_insns(env, i, j); +		if (err) +			return err; +		insn_cnt = env->prog->len; +	} + +	return 0; +} + +static int opt_remove_nops(struct bpf_verifier_env *env) +{ +	const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); +	struct bpf_insn *insn = env->prog->insnsi; +	int insn_cnt = env->prog->len; +	int i, err; + +	for (i = 0; i < insn_cnt; i++) { +		if (memcmp(&insn[i], &ja, sizeof(ja))) +		
	continue; + +		err = verifier_remove_insns(env, i, 1); +		if (err) +			return err; +		insn_cnt--; +		i--; +	} + +	return 0; +} +  /* convert load instructions that access fields of a context type into a   * sequence of instructions that access fields of the underlying structure:   *     struct __sk_buff    -> struct sk_buff @@ -6549,8 +7187,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)  			convert_ctx_access = ops->convert_ctx_access;  			break;  		case PTR_TO_SOCKET: +		case PTR_TO_SOCK_COMMON:  			convert_ctx_access = bpf_sock_convert_ctx_access;  			break; +		case PTR_TO_TCP_SOCK: +			convert_ctx_access = bpf_tcp_sock_convert_ctx_access; +			break;  		default:  			continue;  		} @@ -6678,7 +7320,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)  		subprog_end = env->subprog_info[i + 1].start;  		len = subprog_end - subprog_start; -		func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); +		/* BPF_PROG_RUN doesn't call subprogs directly, +		 * hence main prog stats include the runtime of subprogs. +		 * subprogs don't have IDs and not reachable via prog_get_next_id +		 * func[i]->aux->stats will never be accessed and stays NULL +		 */ +		func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);  		if (!func[i])  			goto out_free;  		memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], @@ -6917,7 +7564,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)  			u32 off_reg;  			aux = &env->insn_aux_data[i + delta]; -			if (!aux->alu_state) +			if (!aux->alu_state || +			    aux->alu_state == BPF_ALU_NON_POINTER)  				continue;  			isneg = aux->alu_state & BPF_ALU_NEG_VALUE; @@ -7147,7 +7795,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,  {  	struct bpf_verifier_env *env;  	struct bpf_verifier_log *log; -	int ret = -EINVAL; +	int i, len, ret = -EINVAL; +	bool is_priv;  	/* no program is valid */  	if (ARRAY_SIZE(bpf_verifier_ops) == 0) @@ -7161,12 +7810,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,  		return -ENOMEM;  	log = &env->log; +	len = (*prog)->len;  	env->insn_aux_data = -		vzalloc(array_size(sizeof(struct bpf_insn_aux_data), -				   (*prog)->len)); +		vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len));  	ret = -ENOMEM;  	if (!env->insn_aux_data)  		goto err_free_env; +	for (i = 0; i < len; i++) +		env->insn_aux_data[i].orig_idx = i;  	env->prog = *prog;  	env->ops = bpf_verifier_ops[env->prog->type]; @@ -7194,6 +7845,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,  	if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)  		env->strict_alignment = false; +	is_priv = capable(CAP_SYS_ADMIN); +	env->allow_ptr_leaks = is_priv; +  	ret = replace_map_fd_with_map_ptr(env);  	if (ret < 0)  		goto skip_full_check; @@ -7211,8 +7865,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,  	if (!env->explored_states)  		goto skip_full_check; -	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); -  	ret = check_subprogs(env);  	if (ret < 0)  		goto skip_full_check; @@ -7242,8 +7894,17 @@ skip_full_check:  		ret = check_max_stack_depth(env);  	/* instruction rewrites happen after this point */ -	if (ret == 0) -		sanitize_dead_code(env); +	if (is_priv) { +		if (ret == 0) +			opt_hard_wire_dead_code_branches(env); +		if (ret == 0) +			ret = opt_remove_dead_code(env); +		if (ret == 0) +			ret = opt_remove_nops(env); +	} else { +		if (ret == 0) +			sanitize_dead_code(env); +	}  	if (ret == 0)  		/* program is valid, convert *(u32*)(ctx + off) accesses */ diff --git 
a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index c950864016e2..c9a35f09e4b9 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -198,7 +198,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,  void cgroup_free_root(struct cgroup_root *root);  void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags); +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);  int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);  struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,  			       struct cgroup_root *root, unsigned long magic, diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 583b969b0c0e..f94a7229974e 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1116,13 +1116,11 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,  			     void *data, unsigned long magic,  			     struct cgroup_namespace *ns)  { -	struct super_block *pinned_sb = NULL;  	struct cgroup_sb_opts opts;  	struct cgroup_root *root;  	struct cgroup_subsys *ss;  	struct dentry *dentry;  	int i, ret; -	bool new_root = false;  	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); @@ -1184,29 +1182,6 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,  		if (root->flags ^ opts.flags)  			pr_warn("new mount options do not match the existing superblock, will be ignored\n"); -		/* -		 * We want to reuse @root whose lifetime is governed by its -		 * ->cgrp.  Let's check whether @root is alive and keep it -		 * that way.  As cgroup_kill_sb() can happen anytime, we -		 * want to block it by pinning the sb so that @root doesn't -		 * get killed before mount is complete. -		 * -		 * With the sb pinned, tryget_live can reliably indicate -		 * whether @root can be reused.  If it's being killed, -		 * drain it.  We can use wait_queue for the wait but this -		 * path is super cold.  Let's just sleep a bit and retry. -		 */ -		pinned_sb = kernfs_pin_sb(root->kf_root, NULL); -		if (IS_ERR(pinned_sb) || -		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { -			mutex_unlock(&cgroup_mutex); -			if (!IS_ERR_OR_NULL(pinned_sb)) -				deactivate_super(pinned_sb); -			msleep(10); -			ret = restart_syscall(); -			goto out_free; -		} -  		ret = 0;  		goto out_unlock;  	} @@ -1232,15 +1207,20 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,  		ret = -ENOMEM;  		goto out_unlock;  	} -	new_root = true;  	init_cgroup_root(root, &opts); -	ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD); +	ret = cgroup_setup_root(root, opts.subsys_mask);  	if (ret)  		cgroup_free_root(root);  out_unlock: +	if (!ret && !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { +		mutex_unlock(&cgroup_mutex); +		msleep(10); +		ret = restart_syscall(); +		goto out_free; +	}  	mutex_unlock(&cgroup_mutex);  out_free:  	kfree(opts.release_agent); @@ -1252,25 +1232,13 @@ out_free:  	dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,  				 CGROUP_SUPER_MAGIC, ns); -	/* -	 * There's a race window after we release cgroup_mutex and before -	 * allocating a superblock. Make sure a concurrent process won't -	 * be able to re-use the root during this window by delaying the -	 * initialization of root refcnt. 
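For context, the path being reworked here is the one a v1 mount takes from user space into cgroup1_mount(). An illustrative (root-only) example; the mount point is assumed to exist and the controller must not already be bound to another hierarchy:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* equivalent of: mount -t cgroup -o cpu none /sys/fs/cgroup/cpu */
	if (mount("none", "/sys/fs/cgroup/cpu", "cgroup", 0, "cpu") != 0) {
		perror("mount");
		return 1;
	}
	printf("cgroup v1 cpu hierarchy mounted\n");
	return 0;
}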
-	 */ -	if (new_root) { -		mutex_lock(&cgroup_mutex); -		percpu_ref_reinit(&root->cgrp.self.refcnt); -		mutex_unlock(&cgroup_mutex); +	if (!IS_ERR(dentry) && percpu_ref_is_dying(&root->cgrp.self.refcnt)) { +		struct super_block *sb = dentry->d_sb; +		dput(dentry); +		deactivate_locked_super(sb); +		msleep(10); +		dentry = ERR_PTR(restart_syscall());  	} - -	/* -	 * If @pinned_sb, we're reusing an existing root and holding an -	 * extra ref on its sb.  Mount is complete.  Put the extra ref. -	 */ -	if (pinned_sb) -		deactivate_super(pinned_sb); -  	return dentry;  } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f31bd61c9466..17828333f7c3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1927,7 +1927,7 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)  		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);  } -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)  {  	LIST_HEAD(tmp_links);  	struct cgroup *root_cgrp = &root->cgrp; @@ -1944,7 +1944,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)  	root_cgrp->ancestor_ids[0] = ret;  	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, -			      ref_flags, GFP_KERNEL); +			      0, GFP_KERNEL);  	if (ret)  		goto out; @@ -2033,7 +2033,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,  			       struct cgroup_namespace *ns)  {  	struct dentry *dentry; -	bool new_sb; +	bool new_sb = false;  	dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb); @@ -2043,6 +2043,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,  	 */  	if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {  		struct dentry *nsdentry; +		struct super_block *sb = dentry->d_sb;  		struct cgroup *cgrp;  		mutex_lock(&cgroup_mutex); @@ -2053,12 +2054,14 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,  		spin_unlock_irq(&css_set_lock);  		mutex_unlock(&cgroup_mutex); -		nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); +		nsdentry = kernfs_node_dentry(cgrp->kn, sb);  		dput(dentry); +		if (IS_ERR(nsdentry)) +			deactivate_locked_super(sb);  		dentry = nsdentry;  	} -	if (IS_ERR(dentry) || !new_sb) +	if (!new_sb)  		cgroup_put(&root->cgrp);  	return dentry; @@ -2118,18 +2121,16 @@ static void cgroup_kill_sb(struct super_block *sb)  	struct cgroup_root *root = cgroup_root_from_kf(kf_root);  	/* -	 * If @root doesn't have any mounts or children, start killing it. +	 * If @root doesn't have any children, start killing it.  	 * This prevents new mounts by disabling percpu_ref_tryget_live().  	 * cgroup_mount() may wait for @root's release.  	 *  	 * And don't kill the default root.  	 
*/ -	if (!list_empty(&root->cgrp.self.children) || -	    root == &cgrp_dfl_root) -		cgroup_put(&root->cgrp); -	else +	if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && +	    !percpu_ref_is_dying(&root->cgrp.self.refcnt))  		percpu_ref_kill(&root->cgrp.self.refcnt); - +	cgroup_put(&root->cgrp);  	kernfs_kill_sb(sb);  } @@ -3533,6 +3534,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,  	return ret ?: nbytes;  } +static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) +{ +	struct cftype *cft = of->kn->priv; + +	if (cft->poll) +		return cft->poll(of, pt); + +	return kernfs_generic_poll(of, pt); +} +  static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)  {  	return seq_cft(seq)->seq_start(seq, ppos); @@ -3571,6 +3582,7 @@ static struct kernfs_ops cgroup_kf_single_ops = {  	.open			= cgroup_file_open,  	.release		= cgroup_file_release,  	.write			= cgroup_file_write, +	.poll			= cgroup_file_poll,  	.seq_show		= cgroup_seqfile_show,  }; @@ -3579,6 +3591,7 @@ static struct kernfs_ops cgroup_kf_ops = {  	.open			= cgroup_file_open,  	.release		= cgroup_file_release,  	.write			= cgroup_file_write, +	.poll			= cgroup_file_poll,  	.seq_start		= cgroup_seqfile_start,  	.seq_next		= cgroup_seqfile_next,  	.seq_stop		= cgroup_seqfile_stop, @@ -5399,7 +5412,7 @@ int __init cgroup_init(void)  	hash_add(css_set_table, &init_css_set.hlist,  		 css_set_hash(init_css_set.subsys)); -	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0)); +	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));  	mutex_unlock(&cgroup_mutex); @@ -5996,7 +6009,7 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,  	int ret;  	mutex_lock(&cgroup_mutex); -	ret = __cgroup_bpf_detach(cgrp, prog, type, flags); +	ret = __cgroup_bpf_detach(cgrp, prog, type);  	mutex_unlock(&cgroup_mutex);  	return ret;  } diff --git a/kernel/compat.c b/kernel/compat.c index f01affa17e22..d8a36c6ad7c9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -20,7 +20,6 @@  #include <linux/syscalls.h>  #include <linux/unistd.h>  #include <linux/security.h> -#include <linux/timex.h>  #include <linux/export.h>  #include <linux/migrate.h>  #include <linux/posix-timers.h> @@ -30,69 +29,6 @@  #include <linux/uaccess.h> -int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp) -{ -	struct compat_timex tx32; - -	memset(txc, 0, sizeof(struct timex)); -	if (copy_from_user(&tx32, utp, sizeof(struct compat_timex))) -		return -EFAULT; - -	txc->modes = tx32.modes; -	txc->offset = tx32.offset; -	txc->freq = tx32.freq; -	txc->maxerror = tx32.maxerror; -	txc->esterror = tx32.esterror; -	txc->status = tx32.status; -	txc->constant = tx32.constant; -	txc->precision = tx32.precision; -	txc->tolerance = tx32.tolerance; -	txc->time.tv_sec = tx32.time.tv_sec; -	txc->time.tv_usec = tx32.time.tv_usec; -	txc->tick = tx32.tick; -	txc->ppsfreq = tx32.ppsfreq; -	txc->jitter = tx32.jitter; -	txc->shift = tx32.shift; -	txc->stabil = tx32.stabil; -	txc->jitcnt = tx32.jitcnt; -	txc->calcnt = tx32.calcnt; -	txc->errcnt = tx32.errcnt; -	txc->stbcnt = tx32.stbcnt; - -	return 0; -} - -int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc) -{ -	struct compat_timex tx32; - -	memset(&tx32, 0, sizeof(struct compat_timex)); -	tx32.modes = txc->modes; -	tx32.offset = txc->offset; -	tx32.freq = txc->freq; -	tx32.maxerror = txc->maxerror; -	tx32.esterror = txc->esterror; -	tx32.status = txc->status; -	tx32.constant = txc->constant; -	tx32.precision = txc->precision; 
-	tx32.tolerance = txc->tolerance; -	tx32.time.tv_sec = txc->time.tv_sec; -	tx32.time.tv_usec = txc->time.tv_usec; -	tx32.tick = txc->tick; -	tx32.ppsfreq = txc->ppsfreq; -	tx32.jitter = txc->jitter; -	tx32.shift = txc->shift; -	tx32.stabil = txc->stabil; -	tx32.jitcnt = txc->jitcnt; -	tx32.calcnt = txc->calcnt; -	tx32.errcnt = txc->errcnt; -	tx32.stbcnt = txc->stbcnt; -	tx32.tai = txc->tai; -	if (copy_to_user(utp, &tx32, sizeof(struct compat_timex))) -		return -EFAULT; -	return 0; -} -  static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv)  {  	return (!access_ok(ctv, sizeof(*ctv)) || diff --git a/kernel/cpu.c b/kernel/cpu.c index d1c6d152da89..025f419d16f6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -313,6 +313,15 @@ void cpus_write_unlock(void)  void lockdep_assert_cpus_held(void)  { +	/* +	 * We can't have hotplug operations before userspace starts running, +	 * and some init codepaths will knowingly not take the hotplug lock. +	 * This is all valid, so mute lockdep until it makes sense to report +	 * unheld locks. +	 */ +	if (system_state < SYSTEM_RUNNING) +		return; +  	percpu_rwsem_assert_held(&cpu_hotplug_lock);  } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 933cb3e45b98..093c9f917ed0 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -464,6 +464,8 @@ static int __init crash_save_vmcoreinfo_init(void)  	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);  #ifdef CONFIG_HUGETLB_PAGE  	VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); +#define PAGE_OFFLINE_MAPCOUNT_VALUE	(~PG_offline) +	VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);  #endif  	arch_crash_save_vmcoreinfo(); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 24a77c34e9ad..c2b41a263166 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Performance events callchain code, extracted from core.c:   * @@ -5,8 +6,6 @@   *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar   *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra   *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> - * - * For licensing details see kernel-base/COPYING   */  #include <linux/perf_event.h> diff --git a/kernel/events/core.c b/kernel/events/core.c index 26d6edab051a..5f59d848171e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Performance events core code:   * @@ -5,8 +6,6 @@   *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar   *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra   *  Copyright  ©  2009 Paul Mackerras, IBM Corp. 
<paulus@au1.ibm.com> - * - * For licensing details see kernel-base/COPYING   */  #include <linux/fs.h> @@ -385,6 +384,8 @@ static atomic_t nr_namespaces_events __read_mostly;  static atomic_t nr_task_events __read_mostly;  static atomic_t nr_freq_events __read_mostly;  static atomic_t nr_switch_events __read_mostly; +static atomic_t nr_ksymbol_events __read_mostly; +static atomic_t nr_bpf_events __read_mostly;  static LIST_HEAD(pmus);  static DEFINE_MUTEX(pmus_lock); @@ -1171,7 +1172,7 @@ static void perf_event_ctx_deactivate(struct perf_event_context *ctx)  static void get_ctx(struct perf_event_context *ctx)  { -	WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); +	refcount_inc(&ctx->refcount);  }  static void free_ctx(struct rcu_head *head) @@ -1185,7 +1186,7 @@ static void free_ctx(struct rcu_head *head)  static void put_ctx(struct perf_event_context *ctx)  { -	if (atomic_dec_and_test(&ctx->refcount)) { +	if (refcount_dec_and_test(&ctx->refcount)) {  		if (ctx->parent_ctx)  			put_ctx(ctx->parent_ctx);  		if (ctx->task && ctx->task != TASK_TOMBSTONE) @@ -1254,6 +1255,7 @@ static void put_ctx(struct perf_event_context *ctx)   *	      perf_event_context::lock   *	    perf_event::mmap_mutex   *	    mmap_sem + *	      perf_addr_filters_head::lock   *   *    cpu_hotplug_lock   *      pmus_lock @@ -1267,7 +1269,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting)  again:  	rcu_read_lock();  	ctx = READ_ONCE(event->ctx); -	if (!atomic_inc_not_zero(&ctx->refcount)) { +	if (!refcount_inc_not_zero(&ctx->refcount)) {  		rcu_read_unlock();  		goto again;  	} @@ -1400,7 +1402,7 @@ retry:  		}  		if (ctx->task == TASK_TOMBSTONE || -		    !atomic_inc_not_zero(&ctx->refcount)) { +		    !refcount_inc_not_zero(&ctx->refcount)) {  			raw_spin_unlock(&ctx->lock);  			ctx = NULL;  		} else { @@ -2797,7 +2799,7 @@ static int perf_event_stop(struct perf_event *event, int restart)   *   * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,   *      we update the addresses of corresponding vmas in - *	event::addr_filters_offs array and bump the event::addr_filters_gen; + *	event::addr_filter_ranges array and bump the event::addr_filters_gen;   * (p2) when an event is scheduled in (pmu::add), it calls   *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()   *      if the generation has changed since the previous call. 
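Converting perf_event_context::refcount to refcount_t keeps the inc-not-zero retry pattern that perf_event_ctx_lock_nested() relies on, while gaining saturation and underflow checks. A simplified user-space model of that pattern (no RCU, local types, not kernel code):

#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>

struct ctx { atomic_uint refcount; };

static bool ctx_tryget(struct ctx *c)
{
	unsigned int old = atomic_load(&c->refcount);

	do {
		if (old == 0)	/* already released: caller must look the ctx up again */
			return false;
	} while (!atomic_compare_exchange_weak(&c->refcount, &old, old + 1));
	return true;
}

int main(void)
{
	struct ctx live = { .refcount = 1 }, dead = { .refcount = 0 };

	printf("live: %d, dead: %d\n", ctx_tryget(&live), ctx_tryget(&dead));
	return 0;	/* live: 1, dead: 0 */
}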
@@ -4056,7 +4058,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)  	INIT_LIST_HEAD(&ctx->event_list);  	INIT_LIST_HEAD(&ctx->pinned_active);  	INIT_LIST_HEAD(&ctx->flexible_active); -	atomic_set(&ctx->refcount, 1); +	refcount_set(&ctx->refcount, 1);  }  static struct perf_event_context * @@ -4235,7 +4237,7 @@ static bool is_sb_event(struct perf_event *event)  	if (attr->mmap || attr->mmap_data || attr->mmap2 ||  	    attr->comm || attr->comm_exec || -	    attr->task || +	    attr->task || attr->ksymbol ||  	    attr->context_switch)  		return true;  	return false; @@ -4305,6 +4307,10 @@ static void unaccount_event(struct perf_event *event)  		dec = true;  	if (has_branch_stack(event))  		dec = true; +	if (event->attr.ksymbol) +		atomic_dec(&nr_ksymbol_events); +	if (event->attr.bpf_event) +		atomic_dec(&nr_bpf_events);  	if (dec) {  		if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -4440,7 +4446,7 @@ static void _free_event(struct perf_event *event)  	perf_event_free_bpf_prog(event);  	perf_addr_filters_splice(event, NULL); -	kfree(event->addr_filters_offs); +	kfree(event->addr_filter_ranges);  	if (event->destroy)  		event->destroy(event); @@ -5396,7 +5402,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event)  	rcu_read_lock();  	rb = rcu_dereference(event->rb);  	if (rb) { -		if (!atomic_inc_not_zero(&rb->refcount)) +		if (!refcount_inc_not_zero(&rb->refcount))  			rb = NULL;  	}  	rcu_read_unlock(); @@ -5406,7 +5412,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event)  void ring_buffer_put(struct ring_buffer *rb)  { -	if (!atomic_dec_and_test(&rb->refcount)) +	if (!refcount_dec_and_test(&rb->refcount))  		return;  	WARN_ON_ONCE(!list_empty(&rb->event_list)); @@ -5471,7 +5477,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)  		/* this has to be the last one */  		rb_free_aux(rb); -		WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); +		WARN_ON_ONCE(refcount_read(&rb->aux_refcount));  		mutex_unlock(&event->mmap_mutex);  	} @@ -6497,7 +6503,7 @@ void perf_prepare_sample(struct perf_event_header *header,  		data->phys_addr = perf_virt_to_phys(data->addr);  } -static __always_inline void +static __always_inline int  __perf_event_output(struct perf_event *event,  		    struct perf_sample_data *data,  		    struct pt_regs *regs, @@ -6507,13 +6513,15 @@ __perf_event_output(struct perf_event *event,  {  	struct perf_output_handle handle;  	struct perf_event_header header; +	int err;  	/* protect the callchain buffers */  	rcu_read_lock();  	perf_prepare_sample(&header, data, event, regs); -	if (output_begin(&handle, event, header.size)) +	err = output_begin(&handle, event, header.size); +	if (err)  		goto exit;  	perf_output_sample(&handle, &header, data, event); @@ -6522,6 +6530,7 @@ __perf_event_output(struct perf_event *event,  exit:  	rcu_read_unlock(); +	return err;  }  void @@ -6540,12 +6549,12 @@ perf_event_output_backward(struct perf_event *event,  	__perf_event_output(event, data, regs, perf_output_begin_backward);  } -void +int  perf_event_output(struct perf_event *event,  		  struct perf_sample_data *data,  		  struct pt_regs *regs)  { -	__perf_event_output(event, data, regs, perf_output_begin); +	return __perf_event_output(event, data, regs, perf_output_begin);  }  /* @@ -6686,7 +6695,8 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)  	raw_spin_lock_irqsave(&ifh->lock, flags);  	list_for_each_entry(filter, &ifh->list, entry) {  		if (filter->path.dentry) { -			
event->addr_filters_offs[count] = 0; +			event->addr_filter_ranges[count].start = 0; +			event->addr_filter_ranges[count].size = 0;  			restart++;  		} @@ -7366,28 +7376,47 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,  	return true;  } +static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter, +					struct vm_area_struct *vma, +					struct perf_addr_filter_range *fr) +{ +	unsigned long vma_size = vma->vm_end - vma->vm_start; +	unsigned long off = vma->vm_pgoff << PAGE_SHIFT; +	struct file *file = vma->vm_file; + +	if (!perf_addr_filter_match(filter, file, off, vma_size)) +		return false; + +	if (filter->offset < off) { +		fr->start = vma->vm_start; +		fr->size = min(vma_size, filter->size - (off - filter->offset)); +	} else { +		fr->start = vma->vm_start + filter->offset - off; +		fr->size = min(vma->vm_end - fr->start, filter->size); +	} + +	return true; +} +  static void __perf_addr_filters_adjust(struct perf_event *event, void *data)  {  	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);  	struct vm_area_struct *vma = data; -	unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags; -	struct file *file = vma->vm_file;  	struct perf_addr_filter *filter;  	unsigned int restart = 0, count = 0; +	unsigned long flags;  	if (!has_addr_filter(event))  		return; -	if (!file) +	if (!vma->vm_file)  		return;  	raw_spin_lock_irqsave(&ifh->lock, flags);  	list_for_each_entry(filter, &ifh->list, entry) { -		if (perf_addr_filter_match(filter, file, off, -					     vma->vm_end - vma->vm_start)) { -			event->addr_filters_offs[count] = vma->vm_start; +		if (perf_addr_filter_vma_adjust(filter, vma, +						&event->addr_filter_ranges[count]))  			restart++; -		}  		count++;  	} @@ -7658,6 +7687,207 @@ static void perf_log_throttle(struct perf_event *event, int enable)  	perf_output_end(&handle);  } +/* + * ksymbol register/unregister tracking + */ + +struct perf_ksymbol_event { +	const char	*name; +	int		name_len; +	struct { +		struct perf_event_header        header; +		u64				addr; +		u32				len; +		u16				ksym_type; +		u16				flags; +	} event_id; +}; + +static int perf_event_ksymbol_match(struct perf_event *event) +{ +	return event->attr.ksymbol; +} + +static void perf_event_ksymbol_output(struct perf_event *event, void *data) +{ +	struct perf_ksymbol_event *ksymbol_event = data; +	struct perf_output_handle handle; +	struct perf_sample_data sample; +	int ret; + +	if (!perf_event_ksymbol_match(event)) +		return; + +	perf_event_header__init_id(&ksymbol_event->event_id.header, +				   &sample, event); +	ret = perf_output_begin(&handle, event, +				ksymbol_event->event_id.header.size); +	if (ret) +		return; + +	perf_output_put(&handle, ksymbol_event->event_id); +	__output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len); +	perf_event__output_id_sample(event, &handle, &sample); + +	perf_output_end(&handle); +} + +void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, +			const char *sym) +{ +	struct perf_ksymbol_event ksymbol_event; +	char name[KSYM_NAME_LEN]; +	u16 flags = 0; +	int name_len; + +	if (!atomic_read(&nr_ksymbol_events)) +		return; + +	if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX || +	    ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) +		goto err; + +	strlcpy(name, sym, KSYM_NAME_LEN); +	name_len = strlen(name) + 1; +	while (!IS_ALIGNED(name_len, sizeof(u64))) +		name[name_len++] = '\0'; +	BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64)); + +	if (unregister) +		flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER; + +	
ksymbol_event = (struct perf_ksymbol_event){ +		.name = name, +		.name_len = name_len, +		.event_id = { +			.header = { +				.type = PERF_RECORD_KSYMBOL, +				.size = sizeof(ksymbol_event.event_id) + +					name_len, +			}, +			.addr = addr, +			.len = len, +			.ksym_type = ksym_type, +			.flags = flags, +		}, +	}; + +	perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL); +	return; +err: +	WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); +} + +/* + * bpf program load/unload tracking + */ + +struct perf_bpf_event { +	struct bpf_prog	*prog; +	struct { +		struct perf_event_header        header; +		u16				type; +		u16				flags; +		u32				id; +		u8				tag[BPF_TAG_SIZE]; +	} event_id; +}; + +static int perf_event_bpf_match(struct perf_event *event) +{ +	return event->attr.bpf_event; +} + +static void perf_event_bpf_output(struct perf_event *event, void *data) +{ +	struct perf_bpf_event *bpf_event = data; +	struct perf_output_handle handle; +	struct perf_sample_data sample; +	int ret; + +	if (!perf_event_bpf_match(event)) +		return; + +	perf_event_header__init_id(&bpf_event->event_id.header, +				   &sample, event); +	ret = perf_output_begin(&handle, event, +				bpf_event->event_id.header.size); +	if (ret) +		return; + +	perf_output_put(&handle, bpf_event->event_id); +	perf_event__output_id_sample(event, &handle, &sample); + +	perf_output_end(&handle); +} + +static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, +					 enum perf_bpf_event_type type) +{ +	bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; +	char sym[KSYM_NAME_LEN]; +	int i; + +	if (prog->aux->func_cnt == 0) { +		bpf_get_prog_name(prog, sym); +		perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, +				   (u64)(unsigned long)prog->bpf_func, +				   prog->jited_len, unregister, sym); +	} else { +		for (i = 0; i < prog->aux->func_cnt; i++) { +			struct bpf_prog *subprog = prog->aux->func[i]; + +			bpf_get_prog_name(subprog, sym); +			perf_event_ksymbol( +				PERF_RECORD_KSYMBOL_TYPE_BPF, +				(u64)(unsigned long)subprog->bpf_func, +				subprog->jited_len, unregister, sym); +		} +	} +} + +void perf_event_bpf_event(struct bpf_prog *prog, +			  enum perf_bpf_event_type type, +			  u16 flags) +{ +	struct perf_bpf_event bpf_event; + +	if (type <= PERF_BPF_EVENT_UNKNOWN || +	    type >= PERF_BPF_EVENT_MAX) +		return; + +	switch (type) { +	case PERF_BPF_EVENT_PROG_LOAD: +	case PERF_BPF_EVENT_PROG_UNLOAD: +		if (atomic_read(&nr_ksymbol_events)) +			perf_event_bpf_emit_ksymbols(prog, type); +		break; +	default: +		break; +	} + +	if (!atomic_read(&nr_bpf_events)) +		return; + +	bpf_event = (struct perf_bpf_event){ +		.prog = prog, +		.event_id = { +			.header = { +				.type = PERF_RECORD_BPF_EVENT, +				.size = sizeof(bpf_event.event_id), +			}, +			.type = type, +			.flags = flags, +			.id = prog->aux->id, +		}, +	}; + +	BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64)); + +	memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); +	perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); +} +  void perf_event_itrace_started(struct perf_event *event)  {  	event->attach_state |= PERF_ATTACH_ITRACE; @@ -8776,26 +9006,19 @@ static void perf_addr_filters_splice(struct perf_event *event,   * @filter; if so, adjust filter's address range.   * Called with mm::mmap_sem down for reading.   
*/ -static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter, -					    struct mm_struct *mm) +static void perf_addr_filter_apply(struct perf_addr_filter *filter, +				   struct mm_struct *mm, +				   struct perf_addr_filter_range *fr)  {  	struct vm_area_struct *vma;  	for (vma = mm->mmap; vma; vma = vma->vm_next) { -		struct file *file = vma->vm_file; -		unsigned long off = vma->vm_pgoff << PAGE_SHIFT; -		unsigned long vma_size = vma->vm_end - vma->vm_start; - -		if (!file) -			continue; - -		if (!perf_addr_filter_match(filter, file, off, vma_size)) +		if (!vma->vm_file)  			continue; -		return vma->vm_start; +		if (perf_addr_filter_vma_adjust(filter, vma, fr)) +			return;  	} - -	return 0;  }  /* @@ -8829,15 +9052,15 @@ static void perf_event_addr_filters_apply(struct perf_event *event)  	raw_spin_lock_irqsave(&ifh->lock, flags);  	list_for_each_entry(filter, &ifh->list, entry) { -		event->addr_filters_offs[count] = 0; +		event->addr_filter_ranges[count].start = 0; +		event->addr_filter_ranges[count].size = 0;  		/*  		 * Adjust base offset if the filter is associated to a binary  		 * that needs to be mapped:  		 */  		if (filter->path.dentry) -			event->addr_filters_offs[count] = -				perf_addr_filter_apply(filter, mm); +			perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);  		count++;  	} @@ -9788,6 +10011,15 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)  	if (ctx)  		perf_event_ctx_unlock(event->group_leader, ctx); +	if (!ret) { +		if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && +				event_has_any_exclude_flag(event)) { +			if (event->destroy) +				event->destroy(event); +			ret = -EINVAL; +		} +	} +  	if (ret)  		module_put(pmu->module); @@ -9916,6 +10148,10 @@ static void account_event(struct perf_event *event)  		inc = true;  	if (is_cgroup_event(event))  		inc = true; +	if (event->attr.ksymbol) +		atomic_inc(&nr_ksymbol_events); +	if (event->attr.bpf_event) +		atomic_inc(&nr_bpf_events);  	if (inc) {  		/* @@ -10098,14 +10334,28 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,  		goto err_pmu;  	if (has_addr_filter(event)) { -		event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, -						   sizeof(unsigned long), -						   GFP_KERNEL); -		if (!event->addr_filters_offs) { +		event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, +						    sizeof(struct perf_addr_filter_range), +						    GFP_KERNEL); +		if (!event->addr_filter_ranges) {  			err = -ENOMEM;  			goto err_per_task;  		} +		/* +		 * Clone the parent's vma offsets: they are valid until exec() +		 * even if the mm is not shared with the parent. 
+		 */ +		if (event->parent) { +			struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + +			raw_spin_lock_irq(&ifh->lock); +			memcpy(event->addr_filter_ranges, +			       event->parent->addr_filter_ranges, +			       pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range)); +			raw_spin_unlock_irq(&ifh->lock); +		} +  		/* force hw sync on the address filters */  		event->addr_filters_gen = 1;  	} @@ -10124,7 +10374,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,  	return event;  err_addr_filters: -	kfree(event->addr_filters_offs); +	kfree(event->addr_filter_ranges);  err_per_task:  	exclusive_event_destroy(event); @@ -10407,7 +10657,7 @@ __perf_event_ctx_lock_double(struct perf_event *group_leader,  again:  	rcu_read_lock();  	gctx = READ_ONCE(group_leader->ctx); -	if (!atomic_inc_not_zero(&gctx->refcount)) { +	if (!refcount_inc_not_zero(&gctx->refcount)) {  		rcu_read_unlock();  		goto again;  	} diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 5befb338a18d..c5cd852fe86b 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -1,18 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0+  /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - *   * Copyright (C) 2007 Alan Stern   * Copyright (C) IBM Corporation, 2009   * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com> diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 6dc725a7e7bc..79c47076700a 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -4,13 +4,14 @@  #include <linux/hardirq.h>  #include <linux/uaccess.h> +#include <linux/refcount.h>  /* Buffer handling */  #define RING_BUFFER_WRITABLE		0x01  struct ring_buffer { -	atomic_t			refcount; +	refcount_t			refcount;  	struct rcu_head			rcu_head;  #ifdef CONFIG_PERF_USE_VMALLOC  	struct work_struct		work; @@ -48,7 +49,7 @@ struct ring_buffer {  	atomic_t			aux_mmap_count;  	unsigned long			aux_mmap_locked;  	void				(*free_aux)(void *); -	atomic_t			aux_refcount; +	refcount_t			aux_refcount;  	void				**aux_pages;  	void				*aux_priv; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 5ab4fe3b1dcc..678ccec60d8f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Performance events ring-buffer code:   * @@ -5,8 +6,6 @@   *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar   *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra   *  Copyright  ©  2009 Paul Mackerras, IBM Corp. 
<paulus@au1.ibm.com> - * - * For licensing details see kernel-base/COPYING   */  #include <linux/perf_event.h> @@ -285,7 +284,7 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)  	else  		rb->overwrite = 1; -	atomic_set(&rb->refcount, 1); +	refcount_set(&rb->refcount, 1);  	INIT_LIST_HEAD(&rb->event_list);  	spin_lock_init(&rb->event_lock); @@ -358,7 +357,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,  	if (!atomic_read(&rb->aux_mmap_count))  		goto err; -	if (!atomic_inc_not_zero(&rb->aux_refcount)) +	if (!refcount_inc_not_zero(&rb->aux_refcount))  		goto err;  	/* @@ -658,7 +657,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,  			goto out;  	} -	rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, +	rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages,  					     overwrite);  	if (!rb->aux_priv)  		goto out; @@ -671,7 +670,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,  	 * we keep a refcount here to make sure either of the two can  	 * reference them safely.  	 */ -	atomic_set(&rb->aux_refcount, 1); +	refcount_set(&rb->aux_refcount, 1);  	rb->aux_overwrite = overwrite;  	rb->aux_watermark = watermark; @@ -690,7 +689,7 @@ out:  void rb_free_aux(struct ring_buffer *rb)  { -	if (atomic_dec_and_test(&rb->aux_refcount)) +	if (refcount_dec_and_test(&rb->aux_refcount))  		__rb_free_aux(rb);  } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8aef47ee7bfa..affa830a198c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * User-space Probes (UProbes)   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - *   * Copyright (C) IBM Corporation, 2008-2012   * Authors:   *	Srikar Dronamraju diff --git a/kernel/fork.c b/kernel/fork.c index b69248e6f0e0..77059b211608 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -429,7 +429,7 @@ static void release_task_stack(struct task_struct *tsk)  #ifdef CONFIG_THREAD_INFO_IN_TASK  void put_task_stack(struct task_struct *tsk)  { -	if (atomic_dec_and_test(&tsk->stack_refcount)) +	if (refcount_dec_and_test(&tsk->stack_refcount))  		release_task_stack(tsk);  }  #endif @@ -447,7 +447,7 @@ void free_task(struct task_struct *tsk)  	 * If the task had a separate stack allocation, it should be gone  	 * by now.  	 
*/ -	WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); +	WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);  #endif  	rt_mutex_debug_task_free(tsk);  	ftrace_graph_exit_task(tsk); @@ -710,14 +710,14 @@ static inline void free_signal_struct(struct signal_struct *sig)  static inline void put_signal_struct(struct signal_struct *sig)  { -	if (atomic_dec_and_test(&sig->sigcnt)) +	if (refcount_dec_and_test(&sig->sigcnt))  		free_signal_struct(sig);  }  void __put_task_struct(struct task_struct *tsk)  {  	WARN_ON(!tsk->exit_state); -	WARN_ON(atomic_read(&tsk->usage)); +	WARN_ON(refcount_read(&tsk->usage));  	WARN_ON(tsk == current);  	cgroup_free(tsk); @@ -867,7 +867,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)  	tsk->stack_vm_area = stack_vm_area;  #endif  #ifdef CONFIG_THREAD_INFO_IN_TASK -	atomic_set(&tsk->stack_refcount, 1); +	refcount_set(&tsk->stack_refcount, 1);  #endif  	if (err) @@ -896,7 +896,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)  	 * One for us, one for whoever does the "release_task()" (usually  	 * parent)  	 */ -	atomic_set(&tsk->usage, 2); +	refcount_set(&tsk->usage, 2);  #ifdef CONFIG_BLK_DEV_IO_TRACE  	tsk->btrace_seq = 0;  #endif @@ -1463,7 +1463,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)  	struct sighand_struct *sig;  	if (clone_flags & CLONE_SIGHAND) { -		atomic_inc(&current->sighand->count); +		refcount_inc(&current->sighand->count);  		return 0;  	}  	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); @@ -1471,7 +1471,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)  	if (!sig)  		return -ENOMEM; -	atomic_set(&sig->count, 1); +	refcount_set(&sig->count, 1);  	spin_lock_irq(&current->sighand->siglock);  	memcpy(sig->action, current->sighand->action, sizeof(sig->action));  	spin_unlock_irq(&current->sighand->siglock); @@ -1480,7 +1480,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)  void __cleanup_sighand(struct sighand_struct *sighand)  { -	if (atomic_dec_and_test(&sighand->count)) { +	if (refcount_dec_and_test(&sighand->count)) {  		signalfd_cleanup(sighand);  		/*  		 * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it @@ -1527,7 +1527,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)  	sig->nr_threads = 1;  	atomic_set(&sig->live, 1); -	atomic_set(&sig->sigcnt, 1); +	refcount_set(&sig->sigcnt, 1);  	/* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */  	sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); @@ -2082,7 +2082,7 @@ static __latent_entropy struct task_struct *copy_process(  		} else {  			current->signal->nr_threads++;  			atomic_inc(&current->signal->live); -			atomic_inc(&current->signal->sigcnt); +			refcount_inc(&current->signal->sigcnt);  			task_join_group_stop(p);  			list_add_tail_rcu(&p->thread_group,  					  &p->group_leader->thread_group); @@ -2439,7 +2439,7 @@ static int check_unshare_flags(unsigned long unshare_flags)  			return -EINVAL;  	}  	if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { -		if (atomic_read(&current->sighand->count) > 1) +		if (refcount_read(&current->sighand->count) > 1)  			return -EINVAL;  	}  	if (unshare_flags & CLONE_VM) { diff --git a/kernel/futex.c b/kernel/futex.c index a0514e01c3eb..c3b73b0311bc 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -68,6 +68,7 @@  #include <linux/freezer.h>  #include <linux/memblock.h>  #include <linux/fault-inject.h> +#include <linux/refcount.h>  #include <asm/futex.h> @@ -212,7 
+213,7 @@ struct futex_pi_state {  	struct rt_mutex pi_mutex;  	struct task_struct *owner; -	atomic_t refcount; +	refcount_t refcount;  	union futex_key key;  } __randomize_layout; @@ -321,12 +322,8 @@ static int __init fail_futex_debugfs(void)  	if (IS_ERR(dir))  		return PTR_ERR(dir); -	if (!debugfs_create_bool("ignore-private", mode, dir, -				 &fail_futex.ignore_private)) { -		debugfs_remove_recursive(dir); -		return -ENOMEM; -	} - +	debugfs_create_bool("ignore-private", mode, dir, +			    &fail_futex.ignore_private);  	return 0;  } @@ -803,7 +800,7 @@ static int refill_pi_state_cache(void)  	INIT_LIST_HEAD(&pi_state->list);  	/* pi_mutex gets initialized later */  	pi_state->owner = NULL; -	atomic_set(&pi_state->refcount, 1); +	refcount_set(&pi_state->refcount, 1);  	pi_state->key = FUTEX_KEY_INIT;  	current->pi_state_cache = pi_state; @@ -823,7 +820,7 @@ static struct futex_pi_state *alloc_pi_state(void)  static void get_pi_state(struct futex_pi_state *pi_state)  { -	WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount)); +	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));  }  /* @@ -835,7 +832,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)  	if (!pi_state)  		return; -	if (!atomic_dec_and_test(&pi_state->refcount)) +	if (!refcount_dec_and_test(&pi_state->refcount))  		return;  	/* @@ -865,7 +862,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)  		 * refcount is at 0 - put it back to 1.  		 */  		pi_state->owner = NULL; -		atomic_set(&pi_state->refcount, 1); +		refcount_set(&pi_state->refcount, 1);  		current->pi_state_cache = pi_state;  	}  } @@ -908,7 +905,7 @@ void exit_pi_state_list(struct task_struct *curr)  		 * In that case; drop the locks to let put_pi_state() make  		 * progress and retry the loop.  		 */ -		if (!atomic_inc_not_zero(&pi_state->refcount)) { +		if (!refcount_inc_not_zero(&pi_state->refcount)) {  			raw_spin_unlock_irq(&curr->pi_lock);  			cpu_relax();  			raw_spin_lock_irq(&curr->pi_lock); @@ -1064,7 +1061,7 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval,  	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently  	 * free pi_state before we can take a reference ourselves.  	 */ -	WARN_ON(!atomic_read(&pi_state->refcount)); +	WARN_ON(!refcount_read(&pi_state->refcount));  	/*  	 * Now that we have a pi_state, we can acquire wait_lock @@ -1467,8 +1464,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)  	 * Queue the task for later wakeup for after we've released  	 * the hb->lock. wake_q_add() grabs reference to p.  	 
*/ -	wake_q_add(wake_q, p); -	put_task_struct(p); +	wake_q_add_safe(wake_q, p);  }  /* @@ -3823,7 +3819,7 @@ err_unlock:  #endif /* CONFIG_COMPAT */  #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, +SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,  		struct old_timespec32 __user *, utime, u32 __user *, uaddr2,  		u32, val3)  { diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 45b68b4ea48b..f18cd5aa33e8 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -9,7 +9,7 @@  #include <linux/cpu.h>  static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, -				int cpus_per_vec) +				unsigned int cpus_per_vec)  {  	const struct cpumask *siblmsk;  	int cpu, sibl; @@ -95,15 +95,17 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,  }  static int __irq_build_affinity_masks(const struct irq_affinity *affd, -				      int startvec, int numvecs, int firstvec, +				      unsigned int startvec, +				      unsigned int numvecs, +				      unsigned int firstvec,  				      cpumask_var_t *node_to_cpumask,  				      const struct cpumask *cpu_mask,  				      struct cpumask *nmsk,  				      struct irq_affinity_desc *masks)  { -	int n, nodes, cpus_per_vec, extra_vecs, done = 0; -	int last_affv = firstvec + numvecs; -	int curvec = startvec; +	unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0; +	unsigned int last_affv = firstvec + numvecs; +	unsigned int curvec = startvec;  	nodemask_t nodemsk = NODE_MASK_NONE;  	if (!cpumask_weight(cpu_mask)) @@ -117,18 +119,16 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,  	 */  	if (numvecs <= nodes) {  		for_each_node_mask(n, nodemsk) { -			cpumask_or(&masks[curvec].mask, -					&masks[curvec].mask, -					node_to_cpumask[n]); +			cpumask_or(&masks[curvec].mask, &masks[curvec].mask, +				   node_to_cpumask[n]);  			if (++curvec == last_affv)  				curvec = firstvec;  		} -		done = numvecs; -		goto out; +		return numvecs;  	}  	for_each_node_mask(n, nodemsk) { -		int ncpus, v, vecs_to_assign, vecs_per_node; +		unsigned int ncpus, v, vecs_to_assign, vecs_per_node;  		/* Spread the vectors per node */  		vecs_per_node = (numvecs - (curvec - firstvec)) / nodes; @@ -163,8 +163,6 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,  			curvec = firstvec;  		--nodes;  	} - -out:  	return done;  } @@ -174,19 +172,24 @@ out:   *	2) spread other possible CPUs on these vectors   */  static int irq_build_affinity_masks(const struct irq_affinity *affd, -				    int startvec, int numvecs, int firstvec, -				    cpumask_var_t *node_to_cpumask, +				    unsigned int startvec, unsigned int numvecs, +				    unsigned int firstvec,  				    struct irq_affinity_desc *masks)  { -	int curvec = startvec, nr_present, nr_others; -	int ret = -ENOMEM; +	unsigned int curvec = startvec, nr_present, nr_others; +	cpumask_var_t *node_to_cpumask;  	cpumask_var_t nmsk, npresmsk; +	int ret = -ENOMEM;  	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))  		return ret;  	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) -		goto fail; +		goto fail_nmsk; + +	node_to_cpumask = alloc_node_to_cpumask(); +	if (!node_to_cpumask) +		goto fail_npresmsk;  	ret = 0;  	/* Stabilize the cpumasks */ @@ -217,13 +220,22 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,  	if (nr_present < numvecs)  		WARN_ON(nr_present + nr_others < numvecs); +	free_node_to_cpumask(node_to_cpumask); + + fail_npresmsk:  	
free_cpumask_var(npresmsk); - fail: + fail_nmsk:  	free_cpumask_var(nmsk);  	return ret;  } +static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) +{ +	affd->nr_sets = 1; +	affd->set_size[0] = affvecs; +} +  /**   * irq_create_affinity_masks - Create affinity masks for multiqueue spreading   * @nvecs:	The total number of vectors @@ -232,50 +244,62 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,   * Returns the irq_affinity_desc pointer or NULL if allocation failed.   */  struct irq_affinity_desc * -irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) +irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)  { -	int affvecs = nvecs - affd->pre_vectors - affd->post_vectors; -	int curvec, usedvecs; -	cpumask_var_t *node_to_cpumask; +	unsigned int affvecs, curvec, usedvecs, i;  	struct irq_affinity_desc *masks = NULL; -	int i, nr_sets;  	/* -	 * If there aren't any vectors left after applying the pre/post -	 * vectors don't bother with assigning affinity. +	 * Determine the number of vectors which need interrupt affinities +	 * assigned. If the pre/post request exhausts the available vectors +	 * then nothing to do here except for invoking the calc_sets() +	 * callback so the device driver can adjust to the situation. If there +	 * is only a single vector, then managing the queue is pointless as +	 * well.  	 */ -	if (nvecs == affd->pre_vectors + affd->post_vectors) +	if (nvecs > 1 && nvecs > affd->pre_vectors + affd->post_vectors) +		affvecs = nvecs - affd->pre_vectors - affd->post_vectors; +	else +		affvecs = 0; + +	/* +	 * Simple invocations do not provide a calc_sets() callback. Install +	 * the generic one. +	 */ +	if (!affd->calc_sets) +		affd->calc_sets = default_calc_sets; + +	/* Recalculate the sets */ +	affd->calc_sets(affd, affvecs); + +	if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))  		return NULL; -	node_to_cpumask = alloc_node_to_cpumask(); -	if (!node_to_cpumask) +	/* Nothing to assign? */ +	if (!affvecs)  		return NULL;  	masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);  	if (!masks) -		goto outnodemsk; +		return NULL;  	/* Fill out vectors at the beginning that don't need affinity */  	for (curvec = 0; curvec < affd->pre_vectors; curvec++)  		cpumask_copy(&masks[curvec].mask, irq_default_affinity); +  	/*  	 * Spread on present CPUs starting from affd->pre_vectors. If we  	 * have multiple sets, build each sets affinity mask separately.  	 */ -	nr_sets = affd->nr_sets; -	if (!nr_sets) -		nr_sets = 1; - -	for (i = 0, usedvecs = 0; i < nr_sets; i++) { -		int this_vecs = affd->sets ? 
affd->sets[i] : affvecs; +	for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { +		unsigned int this_vecs = affd->set_size[i];  		int ret;  		ret = irq_build_affinity_masks(affd, curvec, this_vecs, -						curvec, node_to_cpumask, masks); +					       curvec, masks);  		if (ret) {  			kfree(masks); -			masks = NULL; -			goto outnodemsk; +			return NULL;  		}  		curvec += this_vecs;  		usedvecs += this_vecs; @@ -293,8 +317,6 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)  	for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)  		masks[i].is_managed = 1; -outnodemsk: -	free_node_to_cpumask(node_to_cpumask);  	return masks;  } @@ -304,25 +326,22 @@ outnodemsk:   * @maxvec:	The maximum number of vectors available   * @affd:	Description of the affinity requirements   */ -int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd) +unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, +				       const struct irq_affinity *affd)  { -	int resv = affd->pre_vectors + affd->post_vectors; -	int vecs = maxvec - resv; -	int set_vecs; +	unsigned int resv = affd->pre_vectors + affd->post_vectors; +	unsigned int set_vecs;  	if (resv > minvec)  		return 0; -	if (affd->nr_sets) { -		int i; - -		for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++) -			set_vecs += affd->sets[i]; +	if (affd->calc_sets) { +		set_vecs = maxvec - resv;  	} else {  		get_online_cpus();  		set_vecs = cpumask_weight(cpu_possible_mask);  		put_online_cpus();  	} -	return resv + min(set_vecs, vecs); +	return resv + min(set_vecs, maxvec - resv);  } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 34e969069488..99b7dd6982a4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -730,6 +730,37 @@ out:  EXPORT_SYMBOL_GPL(handle_fasteoi_irq);  /** + *	handle_fasteoi_nmi - irq handler for NMI interrupt lines + *	@desc:	the interrupt description structure for this irq + * + *	A simple NMI-safe handler, considering the restrictions + *	from request_nmi. + * + *	Only a single callback will be issued to the chip: an ->eoi() + *	call when the interrupt has been serviced. This enables support + *	for modern forms of interrupt handlers, which handle the flow + *	details in hardware, transparently. + */ +void handle_fasteoi_nmi(struct irq_desc *desc) +{ +	struct irq_chip *chip = irq_desc_get_chip(desc); +	struct irqaction *action = desc->action; +	unsigned int irq = irq_desc_get_irq(desc); +	irqreturn_t res; + +	trace_irq_handler_entry(irq, action); +	/* +	 * NMIs cannot be shared, there is only one action. +	 */ +	res = action->handler(irq, action->dev_id); +	trace_irq_handler_exit(irq, action, res); + +	if (chip->irq_eoi) +		chip->irq_eoi(&desc->irq_data); +} +EXPORT_SYMBOL_GPL(handle_fasteoi_nmi); + +/**   *	handle_edge_irq - edge type IRQ handler   *	@desc:	the interrupt description structure for this irq   * @@ -855,7 +886,11 @@ void handle_percpu_irq(struct irq_desc *desc)  {  	struct irq_chip *chip = irq_desc_get_chip(desc); -	kstat_incr_irqs_this_cpu(desc); +	/* +	 * PER CPU interrupts are not serialized. Do not touch +	 * desc->tot_count. +	 */ +	__kstat_incr_irqs_this_cpu(desc);  	if (chip->irq_ack)  		chip->irq_ack(&desc->irq_data); @@ -884,7 +919,11 @@ void handle_percpu_devid_irq(struct irq_desc *desc)  	unsigned int irq = irq_desc_get_irq(desc);  	irqreturn_t res; -	kstat_incr_irqs_this_cpu(desc); +	/* +	 * PER CPU interrupts are not serialized. Do not touch +	 * desc->tot_count. 
+	 */ +	__kstat_incr_irqs_this_cpu(desc);  	if (chip->irq_ack)  		chip->irq_ack(&desc->irq_data); @@ -908,6 +947,29 @@ void handle_percpu_devid_irq(struct irq_desc *desc)  		chip->irq_eoi(&desc->irq_data);  } +/** + * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu + *				     dev ids + * @desc:	the interrupt description structure for this irq + * + * Similar to handle_fasteoi_nmi, but handling the dev_id cookie + * as a percpu pointer. + */ +void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc) +{ +	struct irq_chip *chip = irq_desc_get_chip(desc); +	struct irqaction *action = desc->action; +	unsigned int irq = irq_desc_get_irq(desc); +	irqreturn_t res; + +	trace_irq_handler_entry(irq, action); +	res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); +	trace_irq_handler_exit(irq, action, res); + +	if (chip->irq_eoi) +		chip->irq_eoi(&desc->irq_data); +} +  static void  __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,  		     int is_chained, const char *name) diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 6f636136cccc..516c00a5e867 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -56,6 +56,7 @@ static const struct irq_bit_descr irqchip_flags[] = {  	BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE),  	BIT_MASK_DESCR(IRQCHIP_EOI_THREADED),  	BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI), +	BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI),  };  static void @@ -140,6 +141,7 @@ static const struct irq_bit_descr irqdesc_istates[] = {  	BIT_MASK_DESCR(IRQS_WAITING),  	BIT_MASK_DESCR(IRQS_PENDING),  	BIT_MASK_DESCR(IRQS_SUSPENDED), +	BIT_MASK_DESCR(IRQS_NMI),  }; @@ -203,8 +205,8 @@ static ssize_t irq_debug_write(struct file *file, const char __user *user_buf,  		chip_bus_lock(desc);  		raw_spin_lock_irqsave(&desc->lock, flags); -		if (irq_settings_is_level(desc)) { -			/* Can't do level, sorry */ +		if (irq_settings_is_level(desc) || desc->istate & IRQS_NMI) { +			/* Can't do level nor NMIs, sorry */  			err = -EINVAL;  		} else {  			desc->istate |= IRQS_PENDING; @@ -256,8 +258,6 @@ static int __init irq_debugfs_init(void)  	int irq;  	root_dir = debugfs_create_dir("irq", NULL); -	if (!root_dir) -		return -ENOMEM;  	irq_domain_debugfs_init(root_dir); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 38554bc35375..6df5ddfdb0f8 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -166,7 +166,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags  			__irq_wake_thread(desc, action); -			/* Fall through to add to randomness */ +			/* Fall through - to add to randomness */  		case IRQ_HANDLED:  			*flags |= action->flags;  			break; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ca6afa267070..70c3053bc1f6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -49,6 +49,7 @@ enum {   * IRQS_WAITING			- irq is waiting   * IRQS_PENDING			- irq is pending and replayed later   * IRQS_SUSPENDED		- irq is suspended + * IRQS_NMI			- irq line is used to deliver NMIs   */  enum {  	IRQS_AUTODETECT		= 0x00000001, @@ -60,6 +61,7 @@ enum {  	IRQS_PENDING		= 0x00000200,  	IRQS_SUSPENDED		= 0x00000800,  	IRQS_TIMINGS		= 0x00001000, +	IRQS_NMI		= 0x00002000,  };  #include "debug.h" @@ -242,12 +244,18 @@ static inline void irq_state_set_masked(struct irq_desc *desc)  #undef __irqd_to_state -static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) +static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc)  {  	__this_cpu_inc(*desc->kstat_irqs);  	
__this_cpu_inc(kstat.irqs_sum);  } +static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) +{ +	__kstat_incr_irqs_this_cpu(desc); +	desc->tot_count++; +} +  static inline int irq_desc_get_node(struct irq_desc *desc)  {  	return irq_common_data_get_node(&desc->irq_common_data); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index ef8ad36cadcf..13539e12cd80 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -119,6 +119,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,  	desc->depth = 1;  	desc->irq_count = 0;  	desc->irqs_unhandled = 0; +	desc->tot_count = 0;  	desc->name = NULL;  	desc->owner = owner;  	for_each_possible_cpu(cpu) @@ -669,6 +670,41 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,  	set_irq_regs(old_regs);  	return ret;  } + +#ifdef CONFIG_IRQ_DOMAIN +/** + * handle_domain_nmi - Invoke the handler for a HW irq belonging to a domain + * @domain:	The domain where to perform the lookup + * @hwirq:	The HW irq number to convert to a logical one + * @regs:	Register file coming from the low-level handling code + * + * Returns:	0 on success, or -EINVAL if conversion has failed + */ +int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, +		      struct pt_regs *regs) +{ +	struct pt_regs *old_regs = set_irq_regs(regs); +	unsigned int irq; +	int ret = 0; + +	nmi_enter(); + +	irq = irq_find_mapping(domain, hwirq); + +	/* +	 * ack_bad_irq is not NMI-safe, just report +	 * an invalid interrupt. +	 */ +	if (likely(irq)) +		generic_handle_irq(irq); +	else +		ret = -EINVAL; + +	nmi_exit(); +	set_irq_regs(old_regs); +	return ret; +} +#endif  #endif  /* Dynamic interrupt handling */ @@ -919,11 +955,15 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)  unsigned int kstat_irqs(unsigned int irq)  {  	struct irq_desc *desc = irq_to_desc(irq); -	int cpu;  	unsigned int sum = 0; +	int cpu;  	if (!desc || !desc->kstat_irqs)  		return 0; +	if (!irq_settings_is_per_cpu_devid(desc) && +	    !irq_settings_is_per_cpu(desc)) +	    return desc->tot_count; +  	for_each_possible_cpu(cpu)  		sum += *per_cpu_ptr(desc->kstat_irqs, cpu);  	return sum; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8b0be4bd6565..3bf9793d8825 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -458,6 +458,20 @@ void irq_set_default_host(struct irq_domain *domain)  }  EXPORT_SYMBOL_GPL(irq_set_default_host); +/** + * irq_get_default_host() - Retrieve the "default" irq domain + * + * Returns: the default domain, if any. + * + * Modern code should never use this. This should only be used on + * systems that cannot implement a firmware->fwnode mapping (which + * both DT and ACPI provide). 
+ */ +struct irq_domain *irq_get_default_host(void) +{ +	return irq_default_domain; +} +  static void irq_domain_clear_mapping(struct irq_domain *domain,  				     irq_hw_number_t hwirq)  { @@ -1749,8 +1763,6 @@ void __init irq_domain_debugfs_init(struct dentry *root)  	struct irq_domain *d;  	domain_dir = debugfs_create_dir("domains", root); -	if (!domain_dir) -		return;  	debugfs_create_file("default", 0444, domain_dir, NULL,  			    &irq_domain_debug_fops); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 84b54a17b95d..9ec34a2a6638 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -341,7 +341,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)  	/* The release function is promised process context */  	might_sleep(); -	if (!desc) +	if (!desc || desc->istate & IRQS_NMI)  		return -EINVAL;  	/* Complete initialisation of *notify */ @@ -553,6 +553,21 @@ bool disable_hardirq(unsigned int irq)  }  EXPORT_SYMBOL_GPL(disable_hardirq); +/** + *	disable_nmi_nosync - disable an nmi without waiting + *	@irq: Interrupt to disable + * + *	Disable the selected interrupt line. Disables and enables are + *	nested. + *	The interrupt to disable must have been requested through request_nmi. + *	Unlike disable_nmi(), this function does not ensure existing + *	instances of the IRQ handler have completed before returning. + */ +void disable_nmi_nosync(unsigned int irq) +{ +	disable_irq_nosync(irq); +} +  void __enable_irq(struct irq_desc *desc)  {  	switch (desc->depth) { @@ -609,6 +624,20 @@ out:  }  EXPORT_SYMBOL(enable_irq); +/** + *	enable_nmi - enable handling of an nmi + *	@irq: Interrupt to enable + * + *	The interrupt to enable must have been requested through request_nmi. + *	Undoes the effect of one call to disable_nmi(). If this + *	matches the last disable, processing of interrupts on this + *	IRQ line is re-enabled. + */ +void enable_nmi(unsigned int irq) +{ +	enable_irq(irq); +} +  static int set_irq_wake_real(unsigned int irq, unsigned int on)  {  	struct irq_desc *desc = irq_to_desc(irq); @@ -644,6 +673,12 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)  	if (!desc)  		return -EINVAL; +	/* Don't use NMIs as wake up interrupts please */ +	if (desc->istate & IRQS_NMI) { +		ret = -EINVAL; +		goto out_unlock; +	} +  	/* wakeup-capable irqs can be shared between drivers that  	 * don't need to have the same sleep mode behaviors.  	 
*/ @@ -666,6 +701,8 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)  				irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);  		}  	} + +out_unlock:  	irq_put_desc_busunlock(desc, flags);  	return ret;  } @@ -726,6 +763,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)  	case IRQ_SET_MASK_OK_DONE:  		irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);  		irqd_set(&desc->irq_data, flags); +		/* fall through */  	case IRQ_SET_MASK_OK_NOCOPY:  		flags = irqd_get_trigger_type(&desc->irq_data); @@ -1128,6 +1166,39 @@ static void irq_release_resources(struct irq_desc *desc)  		c->irq_release_resources(d);  } +static bool irq_supports_nmi(struct irq_desc *desc) +{ +	struct irq_data *d = irq_desc_get_irq_data(desc); + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +	/* Only IRQs directly managed by the root irqchip can be set as NMI */ +	if (d->parent_data) +		return false; +#endif +	/* Don't support NMIs for chips behind a slow bus */ +	if (d->chip->irq_bus_lock || d->chip->irq_bus_sync_unlock) +		return false; + +	return d->chip->flags & IRQCHIP_SUPPORTS_NMI; +} + +static int irq_nmi_setup(struct irq_desc *desc) +{ +	struct irq_data *d = irq_desc_get_irq_data(desc); +	struct irq_chip *c = d->chip; + +	return c->irq_nmi_setup ? c->irq_nmi_setup(d) : -EINVAL; +} + +static void irq_nmi_teardown(struct irq_desc *desc) +{ +	struct irq_data *d = irq_desc_get_irq_data(desc); +	struct irq_chip *c = d->chip; + +	if (c->irq_nmi_teardown) +		c->irq_nmi_teardown(d); +} +  static int  setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)  { @@ -1302,9 +1373,17 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  		 * fields must have IRQF_SHARED set and the bits which  		 * set the trigger type must match. Also all must  		 * agree on ONESHOT. +		 * Interrupt lines used for NMIs cannot be shared.  		 */  		unsigned int oldtype; +		if (desc->istate & IRQS_NMI) { +			pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n", +				new->name, irq, desc->irq_data.chip->name); +			ret = -EINVAL; +			goto out_unlock; +		} +  		/*  		 * If nobody did set the configuration before, inherit  		 * the one provided by the requester. 
@@ -1756,6 +1835,59 @@ const void *free_irq(unsigned int irq, void *dev_id)  }  EXPORT_SYMBOL(free_irq); +/* This function must be called with desc->lock held */ +static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc) +{ +	const char *devname = NULL; + +	desc->istate &= ~IRQS_NMI; + +	if (!WARN_ON(desc->action == NULL)) { +		irq_pm_remove_action(desc, desc->action); +		devname = desc->action->name; +		unregister_handler_proc(irq, desc->action); + +		kfree(desc->action); +		desc->action = NULL; +	} + +	irq_settings_clr_disable_unlazy(desc); +	irq_shutdown(desc); + +	irq_release_resources(desc); + +	irq_chip_pm_put(&desc->irq_data); +	module_put(desc->owner); + +	return devname; +} + +const void *free_nmi(unsigned int irq, void *dev_id) +{ +	struct irq_desc *desc = irq_to_desc(irq); +	unsigned long flags; +	const void *devname; + +	if (!desc || WARN_ON(!(desc->istate & IRQS_NMI))) +		return NULL; + +	if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) +		return NULL; + +	/* NMI still enabled */ +	if (WARN_ON(desc->depth == 0)) +		disable_nmi_nosync(irq); + +	raw_spin_lock_irqsave(&desc->lock, flags); + +	irq_nmi_teardown(desc); +	devname = __cleanup_nmi(irq, desc); + +	raw_spin_unlock_irqrestore(&desc->lock, flags); + +	return devname; +} +  /**   *	request_threaded_irq - allocate an interrupt line   *	@irq: Interrupt line to allocate @@ -1925,6 +2057,101 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,  }  EXPORT_SYMBOL_GPL(request_any_context_irq); +/** + *	request_nmi - allocate an interrupt line for NMI delivery + *	@irq: Interrupt line to allocate + *	@handler: Function to be called when the IRQ occurs. + *		  Threaded handler for threaded interrupts. + *	@irqflags: Interrupt type flags + *	@name: An ascii name for the claiming device + *	@dev_id: A cookie passed back to the handler function + * + *	This call allocates interrupt resources and enables the + *	interrupt line and IRQ handling. It sets up the IRQ line + *	to be handled as an NMI. + * + *	An interrupt line delivering NMIs cannot be shared and IRQ handling + *	cannot be threaded. + * + *	Interrupt lines requested for NMI delivering must produce per cpu + *	interrupts and have auto enabling setting disabled. + * + *	Dev_id must be globally unique. Normally the address of the + *	device data structure is used as the cookie. Since the handler + *	receives this value it makes sense to use it. + * + *	If the interrupt line cannot be used to deliver NMIs, function + *	will fail and return a negative value. 
+ */ +int request_nmi(unsigned int irq, irq_handler_t handler, +		unsigned long irqflags, const char *name, void *dev_id) +{ +	struct irqaction *action; +	struct irq_desc *desc; +	unsigned long flags; +	int retval; + +	if (irq == IRQ_NOTCONNECTED) +		return -ENOTCONN; + +	/* NMI cannot be shared, used for Polling */ +	if (irqflags & (IRQF_SHARED | IRQF_COND_SUSPEND | IRQF_IRQPOLL)) +		return -EINVAL; + +	if (!(irqflags & IRQF_PERCPU)) +		return -EINVAL; + +	if (!handler) +		return -EINVAL; + +	desc = irq_to_desc(irq); + +	if (!desc || irq_settings_can_autoenable(desc) || +	    !irq_settings_can_request(desc) || +	    WARN_ON(irq_settings_is_per_cpu_devid(desc)) || +	    !irq_supports_nmi(desc)) +		return -EINVAL; + +	action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); +	if (!action) +		return -ENOMEM; + +	action->handler = handler; +	action->flags = irqflags | IRQF_NO_THREAD | IRQF_NOBALANCING; +	action->name = name; +	action->dev_id = dev_id; + +	retval = irq_chip_pm_get(&desc->irq_data); +	if (retval < 0) +		goto err_out; + +	retval = __setup_irq(irq, desc, action); +	if (retval) +		goto err_irq_setup; + +	raw_spin_lock_irqsave(&desc->lock, flags); + +	/* Setup NMI state */ +	desc->istate |= IRQS_NMI; +	retval = irq_nmi_setup(desc); +	if (retval) { +		__cleanup_nmi(irq, desc); +		raw_spin_unlock_irqrestore(&desc->lock, flags); +		return -EINVAL; +	} + +	raw_spin_unlock_irqrestore(&desc->lock, flags); + +	return 0; + +err_irq_setup: +	irq_chip_pm_put(&desc->irq_data); +err_out: +	kfree(action); + +	return retval; +} +  void enable_percpu_irq(unsigned int irq, unsigned int type)  {  	unsigned int cpu = smp_processor_id(); @@ -1959,6 +2186,11 @@ out:  }  EXPORT_SYMBOL_GPL(enable_percpu_irq); +void enable_percpu_nmi(unsigned int irq, unsigned int type) +{ +	enable_percpu_irq(irq, type); +} +  /**   * irq_percpu_is_enabled - Check whether the per cpu irq is enabled   * @irq:	Linux irq number to check for @@ -1998,6 +2230,11 @@ void disable_percpu_irq(unsigned int irq)  }  EXPORT_SYMBOL_GPL(disable_percpu_irq); +void disable_percpu_nmi(unsigned int irq) +{ +	disable_percpu_irq(irq); +} +  /*   * Internal function to unregister a percpu irqaction.   */ @@ -2029,6 +2266,8 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_  	/* Found it - now remove it from the list of entries: */  	desc->action = NULL; +	desc->istate &= ~IRQS_NMI; +  	raw_spin_unlock_irqrestore(&desc->lock, flags);  	unregister_handler_proc(irq, action); @@ -2082,6 +2321,19 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id)  }  EXPORT_SYMBOL_GPL(free_percpu_irq); +void free_percpu_nmi(unsigned int irq, void __percpu *dev_id) +{ +	struct irq_desc *desc = irq_to_desc(irq); + +	if (!desc || !irq_settings_is_per_cpu_devid(desc)) +		return; + +	if (WARN_ON(!(desc->istate & IRQS_NMI))) +		return; + +	kfree(__free_percpu_irq(irq, dev_id)); +} +  /**   *	setup_percpu_irq - setup a per-cpu interrupt   *	@irq: Interrupt line to setup @@ -2172,6 +2424,158 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler,  EXPORT_SYMBOL_GPL(__request_percpu_irq);  /** + *	request_percpu_nmi - allocate a percpu interrupt line for NMI delivery + *	@irq: Interrupt line to allocate + *	@handler: Function to be called when the IRQ occurs. + *	@name: An ascii name for the claiming device + *	@dev_id: A percpu cookie passed back to the handler function + * + *	This call allocates interrupt resources for a per CPU NMI. 
Per CPU NMIs + *	have to be setup on each CPU by calling prepare_percpu_nmi() before + *	being enabled on the same CPU by using enable_percpu_nmi(). + * + *	Dev_id must be globally unique. It is a per-cpu variable, and + *	the handler gets called with the interrupted CPU's instance of + *	that variable. + * + *	Interrupt lines requested for NMI delivering should have auto enabling + *	setting disabled. + * + *	If the interrupt line cannot be used to deliver NMIs, function + *	will fail returning a negative value. + */ +int request_percpu_nmi(unsigned int irq, irq_handler_t handler, +		       const char *name, void __percpu *dev_id) +{ +	struct irqaction *action; +	struct irq_desc *desc; +	unsigned long flags; +	int retval; + +	if (!handler) +		return -EINVAL; + +	desc = irq_to_desc(irq); + +	if (!desc || !irq_settings_can_request(desc) || +	    !irq_settings_is_per_cpu_devid(desc) || +	    irq_settings_can_autoenable(desc) || +	    !irq_supports_nmi(desc)) +		return -EINVAL; + +	/* The line cannot already be NMI */ +	if (desc->istate & IRQS_NMI) +		return -EINVAL; + +	action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); +	if (!action) +		return -ENOMEM; + +	action->handler = handler; +	action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD +		| IRQF_NOBALANCING; +	action->name = name; +	action->percpu_dev_id = dev_id; + +	retval = irq_chip_pm_get(&desc->irq_data); +	if (retval < 0) +		goto err_out; + +	retval = __setup_irq(irq, desc, action); +	if (retval) +		goto err_irq_setup; + +	raw_spin_lock_irqsave(&desc->lock, flags); +	desc->istate |= IRQS_NMI; +	raw_spin_unlock_irqrestore(&desc->lock, flags); + +	return 0; + +err_irq_setup: +	irq_chip_pm_put(&desc->irq_data); +err_out: +	kfree(action); + +	return retval; +} + +/** + *	prepare_percpu_nmi - performs CPU local setup for NMI delivery + *	@irq: Interrupt line to prepare for NMI delivery + * + *	This call prepares an interrupt line to deliver NMI on the current CPU, + *	before that interrupt line gets enabled with enable_percpu_nmi(). + * + *	As a CPU local operation, this should be called from non-preemptible + *	context. + * + *	If the interrupt line cannot be used to deliver NMIs, function + *	will fail returning a negative value. + */ +int prepare_percpu_nmi(unsigned int irq) +{ +	unsigned long flags; +	struct irq_desc *desc; +	int ret = 0; + +	WARN_ON(preemptible()); + +	desc = irq_get_desc_lock(irq, &flags, +				 IRQ_GET_DESC_CHECK_PERCPU); +	if (!desc) +		return -EINVAL; + +	if (WARN(!(desc->istate & IRQS_NMI), +		 KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", +		 irq)) { +		ret = -EINVAL; +		goto out; +	} + +	ret = irq_nmi_setup(desc); +	if (ret) { +		pr_err("Failed to setup NMI delivery: irq %u\n", irq); +		goto out; +	} + +out: +	irq_put_desc_unlock(desc, flags); +	return ret; +} + +/** + *	teardown_percpu_nmi - undoes NMI setup of IRQ line + *	@irq: Interrupt line from which CPU local NMI configuration should be + *	      removed + * + *	This call undoes the setup done by prepare_percpu_nmi(). + * + *	IRQ line should not be enabled for the current CPU. + * + *	As a CPU local operation, this should be called from non-preemptible + *	context. 
+ */ +void teardown_percpu_nmi(unsigned int irq) +{ +	unsigned long flags; +	struct irq_desc *desc; + +	WARN_ON(preemptible()); + +	desc = irq_get_desc_lock(irq, &flags, +				 IRQ_GET_DESC_CHECK_PERCPU); +	if (!desc) +		return; + +	if (WARN_ON(!(desc->istate & IRQS_NMI))) +		goto out; + +	irq_nmi_teardown(desc); +out: +	irq_put_desc_unlock(desc, flags); +} + +/**   *	irq_get_irqchip_state - returns the irqchip state of a interrupt.   *	@irq: Interrupt line that is forwarded to a VM   *	@which: One of IRQCHIP_STATE_* the caller wants to know about diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index f3a04994e063..14934afa9e68 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -494,7 +494,7 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)  static int get_ksymbol_bpf(struct kallsym_iter *iter)  { -	iter->module_name[0] = '\0'; +	strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN);  	iter->exported = 0;  	return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,  			       &iter->value, &iter->type, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f4ddfdd2d07e..c83e54727131 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1396,7 +1396,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr)  	       addr < (unsigned long)__kprobes_text_end;  } -bool within_kprobe_blacklist(unsigned long addr) +static bool __within_kprobe_blacklist(unsigned long addr)  {  	struct kprobe_blacklist_entry *ent; @@ -1410,7 +1410,26 @@ bool within_kprobe_blacklist(unsigned long addr)  		if (addr >= ent->start_addr && addr < ent->end_addr)  			return true;  	} +	return false; +} +bool within_kprobe_blacklist(unsigned long addr) +{ +	char symname[KSYM_NAME_LEN], *p; + +	if (__within_kprobe_blacklist(addr)) +		return true; + +	/* Check if the address is on a suffixed-symbol */ +	if (!lookup_symbol_name(addr, symname)) { +		p = strchr(symname, '.'); +		if (!p) +			return false; +		*p = '\0'; +		addr = (unsigned long)kprobe_lookup_name(symname, 0); +		if (addr) +			return __within_kprobe_blacklist(addr); +	}  	return false;  } diff --git a/kernel/kthread.c b/kernel/kthread.c index 087d18d771b5..5942eeafb9ac 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -20,6 +20,7 @@  #include <linux/freezer.h>  #include <linux/ptrace.h>  #include <linux/uaccess.h> +#include <linux/numa.h>  #include <trace/events/sched.h>  static DEFINE_SPINLOCK(kthread_create_lock); @@ -101,6 +102,12 @@ bool kthread_should_stop(void)  }  EXPORT_SYMBOL(kthread_should_stop); +bool __kthread_should_park(struct task_struct *k) +{ +	return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); +} +EXPORT_SYMBOL_GPL(__kthread_should_park); +  /**   * kthread_should_park - should this kthread park now?   
* @@ -114,7 +121,7 @@ EXPORT_SYMBOL(kthread_should_stop);   */  bool kthread_should_park(void)  { -	return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); +	return __kthread_should_park(current);  }  EXPORT_SYMBOL_GPL(kthread_should_park); @@ -599,7 +606,7 @@ void __kthread_init_worker(struct kthread_worker *worker,  				struct lock_class_key *key)  {  	memset(worker, 0, sizeof(struct kthread_worker)); -	spin_lock_init(&worker->lock); +	raw_spin_lock_init(&worker->lock);  	lockdep_set_class_and_name(&worker->lock, key, name);  	INIT_LIST_HEAD(&worker->work_list);  	INIT_LIST_HEAD(&worker->delayed_work_list); @@ -641,21 +648,21 @@ repeat:  	if (kthread_should_stop()) {  		__set_current_state(TASK_RUNNING); -		spin_lock_irq(&worker->lock); +		raw_spin_lock_irq(&worker->lock);  		worker->task = NULL; -		spin_unlock_irq(&worker->lock); +		raw_spin_unlock_irq(&worker->lock);  		return 0;  	}  	work = NULL; -	spin_lock_irq(&worker->lock); +	raw_spin_lock_irq(&worker->lock);  	if (!list_empty(&worker->work_list)) {  		work = list_first_entry(&worker->work_list,  					struct kthread_work, node);  		list_del_init(&work->node);  	}  	worker->current_work = work; -	spin_unlock_irq(&worker->lock); +	raw_spin_unlock_irq(&worker->lock);  	if (work) {  		__set_current_state(TASK_RUNNING); @@ -675,7 +682,7 @@ __kthread_create_worker(int cpu, unsigned int flags,  {  	struct kthread_worker *worker;  	struct task_struct *task; -	int node = -1; +	int node = NUMA_NO_NODE;  	worker = kzalloc(sizeof(*worker), GFP_KERNEL);  	if (!worker) @@ -812,12 +819,12 @@ bool kthread_queue_work(struct kthread_worker *worker,  	bool ret = false;  	unsigned long flags; -	spin_lock_irqsave(&worker->lock, flags); +	raw_spin_lock_irqsave(&worker->lock, flags);  	if (!queuing_blocked(worker, work)) {  		kthread_insert_work(worker, work, &worker->work_list);  		ret = true;  	} -	spin_unlock_irqrestore(&worker->lock, flags); +	raw_spin_unlock_irqrestore(&worker->lock, flags);  	return ret;  }  EXPORT_SYMBOL_GPL(kthread_queue_work); @@ -835,6 +842,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)  	struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);  	struct kthread_work *work = &dwork->work;  	struct kthread_worker *worker = work->worker; +	unsigned long flags;  	/*  	 * This might happen when a pending work is reinitialized. @@ -843,7 +851,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)  	if (WARN_ON_ONCE(!worker))  		return; -	spin_lock(&worker->lock); +	raw_spin_lock_irqsave(&worker->lock, flags);  	/* Work must not be used with >1 worker, see kthread_queue_work(). 
*/  	WARN_ON_ONCE(work->worker != worker); @@ -852,7 +860,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)  	list_del_init(&work->node);  	kthread_insert_work(worker, work, &worker->work_list); -	spin_unlock(&worker->lock); +	raw_spin_unlock_irqrestore(&worker->lock, flags);  }  EXPORT_SYMBOL(kthread_delayed_work_timer_fn); @@ -908,14 +916,14 @@ bool kthread_queue_delayed_work(struct kthread_worker *worker,  	unsigned long flags;  	bool ret = false; -	spin_lock_irqsave(&worker->lock, flags); +	raw_spin_lock_irqsave(&worker->lock, flags);  	if (!queuing_blocked(worker, work)) {  		__kthread_queue_delayed_work(worker, dwork, delay);  		ret = true;  	} -	spin_unlock_irqrestore(&worker->lock, flags); +	raw_spin_unlock_irqrestore(&worker->lock, flags);  	return ret;  }  EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); @@ -951,7 +959,7 @@ void kthread_flush_work(struct kthread_work *work)  	if (!worker)  		return; -	spin_lock_irq(&worker->lock); +	raw_spin_lock_irq(&worker->lock);  	/* Work must not be used with >1 worker, see kthread_queue_work(). */  	WARN_ON_ONCE(work->worker != worker); @@ -963,7 +971,7 @@ void kthread_flush_work(struct kthread_work *work)  	else  		noop = true; -	spin_unlock_irq(&worker->lock); +	raw_spin_unlock_irq(&worker->lock);  	if (!noop)  		wait_for_completion(&fwork.done); @@ -996,9 +1004,9 @@ static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork,  		 * any queuing is blocked by setting the canceling counter.  		 */  		work->canceling++; -		spin_unlock_irqrestore(&worker->lock, *flags); +		raw_spin_unlock_irqrestore(&worker->lock, *flags);  		del_timer_sync(&dwork->timer); -		spin_lock_irqsave(&worker->lock, *flags); +		raw_spin_lock_irqsave(&worker->lock, *flags);  		work->canceling--;  	} @@ -1045,7 +1053,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker,  	unsigned long flags;  	int ret = false; -	spin_lock_irqsave(&worker->lock, flags); +	raw_spin_lock_irqsave(&worker->lock, flags);  	/* Do not bother with canceling when never queued. */  	if (!work->worker) @@ -1062,7 +1070,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker,  fast_queue:  	__kthread_queue_delayed_work(worker, dwork, delay);  out: -	spin_unlock_irqrestore(&worker->lock, flags); +	raw_spin_unlock_irqrestore(&worker->lock, flags);  	return ret;  }  EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); @@ -1076,7 +1084,7 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)  	if (!worker)  		goto out; -	spin_lock_irqsave(&worker->lock, flags); +	raw_spin_lock_irqsave(&worker->lock, flags);  	/* Work must not be used with >1 worker, see kthread_queue_work(). */  	WARN_ON_ONCE(work->worker != worker); @@ -1090,13 +1098,13 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)  	 * In the meantime, block any queuing by setting the canceling counter.  	 
*/  	work->canceling++; -	spin_unlock_irqrestore(&worker->lock, flags); +	raw_spin_unlock_irqrestore(&worker->lock, flags);  	kthread_flush_work(work); -	spin_lock_irqsave(&worker->lock, flags); +	raw_spin_lock_irqsave(&worker->lock, flags);  	work->canceling--;  out_fast: -	spin_unlock_irqrestore(&worker->lock, flags); +	raw_spin_unlock_irqrestore(&worker->lock, flags);  out:  	return ret;  } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 95932333a48b..21cb81fe6359 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -45,11 +45,14 @@  #include <linux/hash.h>  #include <linux/ftrace.h>  #include <linux/stringify.h> +#include <linux/bitmap.h>  #include <linux/bitops.h>  #include <linux/gfp.h>  #include <linux/random.h>  #include <linux/jhash.h>  #include <linux/nmi.h> +#include <linux/rcupdate.h> +#include <linux/kprobes.h>  #include <asm/sections.h> @@ -81,6 +84,7 @@ module_param(lock_stat, int, 0644);   * code to recurse back into the lockdep code...   */  static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static struct task_struct *lockdep_selftest_task_struct;  static int graph_lock(void)  { @@ -130,13 +134,17 @@ static inline int debug_locks_off_graph_unlock(void)  unsigned long nr_list_entries;  static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; +static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES);  /*   * All data structures here are protected by the global debug_lock.   * - * Mutex key structs only get allocated, once during bootup, and never - * get freed - this significantly simplifies the debugging code. + * nr_lock_classes is the number of elements of lock_classes[] that is + * in use.   */ +#define KEYHASH_BITS		(MAX_LOCKDEP_KEYS_BITS - 1) +#define KEYHASH_SIZE		(1UL << KEYHASH_BITS) +static struct hlist_head lock_keys_hash[KEYHASH_SIZE];  unsigned long nr_lock_classes;  #ifndef CONFIG_DEBUG_LOCKDEP  static @@ -277,11 +285,42 @@ static inline void lock_release_holdtime(struct held_lock *hlock)  #endif  /* - * We keep a global list of all lock classes. The list only grows, - * never shrinks. The list is only accessed with the lockdep - * spinlock lock held. + * We keep a global list of all lock classes. The list is only accessed with + * the lockdep spinlock lock held. free_lock_classes is a list with free + * elements. These elements are linked together by the lock_entry member in + * struct lock_class.   */  LIST_HEAD(all_lock_classes); +static LIST_HEAD(free_lock_classes); + +/** + * struct pending_free - information about data structures about to be freed + * @zapped: Head of a list with struct lock_class elements. + * @lock_chains_being_freed: Bitmap that indicates which lock_chains[] elements + *	are about to be freed. + */ +struct pending_free { +	struct list_head zapped; +	DECLARE_BITMAP(lock_chains_being_freed, MAX_LOCKDEP_CHAINS); +}; + +/** + * struct delayed_free - data structures used for delayed freeing + * + * A data structure for delayed freeing of data structures that may be + * accessed by RCU readers at the time these were freed. + * + * @rcu_head:  Used to schedule an RCU callback for freeing data structures. + * @index:     Index of @pf to which freed data structures are added. + * @scheduled: Whether or not an RCU callback has been scheduled. + * @pf:        Array with information about data structures about to be freed. 
+ */ +static struct delayed_free { +	struct rcu_head		rcu_head; +	int			index; +	int			scheduled; +	struct pending_free	pf[2]; +} delayed_free;  /*   * The lockdep classes are in a hash-table as well, for fast lookup: @@ -331,6 +370,11 @@ void lockdep_on(void)  }  EXPORT_SYMBOL(lockdep_on); +void lockdep_set_selftest_task(struct task_struct *task) +{ +	lockdep_selftest_task_struct = task; +} +  /*   * Debugging switches:   */ @@ -599,7 +643,7 @@ static int very_verbose(struct lock_class *class)   * Is this the address of a static object:   */  #ifdef __KERNEL__ -static int static_obj(void *obj) +static int static_obj(const void *obj)  {  	unsigned long start = (unsigned long) &_stext,  		      end   = (unsigned long) &_end, @@ -716,6 +760,17 @@ static bool assign_lock_key(struct lockdep_map *lock)  {  	unsigned long can_addr, addr = (unsigned long)lock; +#ifdef __KERNEL__ +	/* +	 * lockdep_free_key_range() assumes that struct lock_class_key +	 * objects do not overlap. Since we use the address of lock +	 * objects as class key for static objects, check whether the +	 * size of lock_class_key objects does not exceed the size of +	 * the smallest lock object. +	 */ +	BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(raw_spinlock_t)); +#endif +  	if (__is_kernel_percpu_address(addr, &can_addr))  		lock->key = (void *)can_addr;  	else if (__is_module_percpu_address(addr, &can_addr)) @@ -735,6 +790,280 @@ static bool assign_lock_key(struct lockdep_map *lock)  	return true;  } +#ifdef CONFIG_DEBUG_LOCKDEP + +/* Check whether element @e occurs in list @h */ +static bool in_list(struct list_head *e, struct list_head *h) +{ +	struct list_head *f; + +	list_for_each(f, h) { +		if (e == f) +			return true; +	} + +	return false; +} + +/* + * Check whether entry @e occurs in any of the locks_after or locks_before + * lists. + */ +static bool in_any_class_list(struct list_head *e) +{ +	struct lock_class *class; +	int i; + +	for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { +		class = &lock_classes[i]; +		if (in_list(e, &class->locks_after) || +		    in_list(e, &class->locks_before)) +			return true; +	} +	return false; +} + +static bool class_lock_list_valid(struct lock_class *c, struct list_head *h) +{ +	struct lock_list *e; + +	list_for_each_entry(e, h, entry) { +		if (e->links_to != c) { +			printk(KERN_INFO "class %s: mismatch for lock entry %ld; class %s <> %s", +			       c->name ? : "(?)", +			       (unsigned long)(e - list_entries), +			       e->links_to && e->links_to->name ? +			       e->links_to->name : "(?)", +			       e->class && e->class->name ? e->class->name : +			       "(?)"); +			return false; +		} +	} +	return true; +} + +static u16 chain_hlocks[]; + +static bool check_lock_chain_key(struct lock_chain *chain) +{ +#ifdef CONFIG_PROVE_LOCKING +	u64 chain_key = 0; +	int i; + +	for (i = chain->base; i < chain->base + chain->depth; i++) +		chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); +	/* +	 * The 'unsigned long long' casts avoid that a compiler warning +	 * is reported when building tools/lib/lockdep. 
+	 */ +	if (chain->chain_key != chain_key) { +		printk(KERN_INFO "chain %lld: key %#llx <> %#llx\n", +		       (unsigned long long)(chain - lock_chains), +		       (unsigned long long)chain->chain_key, +		       (unsigned long long)chain_key); +		return false; +	} +#endif +	return true; +} + +static bool in_any_zapped_class_list(struct lock_class *class) +{ +	struct pending_free *pf; +	int i; + +	for (i = 0, pf = delayed_free.pf; i < ARRAY_SIZE(delayed_free.pf); i++, pf++) { +		if (in_list(&class->lock_entry, &pf->zapped)) +			return true; +	} + +	return false; +} + +static bool __check_data_structures(void) +{ +	struct lock_class *class; +	struct lock_chain *chain; +	struct hlist_head *head; +	struct lock_list *e; +	int i; + +	/* Check whether all classes occur in a lock list. */ +	for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { +		class = &lock_classes[i]; +		if (!in_list(&class->lock_entry, &all_lock_classes) && +		    !in_list(&class->lock_entry, &free_lock_classes) && +		    !in_any_zapped_class_list(class)) { +			printk(KERN_INFO "class %px/%s is not in any class list\n", +			       class, class->name ? : "(?)"); +			return false; +		} +	} + +	/* Check whether all classes have valid lock lists. */ +	for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { +		class = &lock_classes[i]; +		if (!class_lock_list_valid(class, &class->locks_before)) +			return false; +		if (!class_lock_list_valid(class, &class->locks_after)) +			return false; +	} + +	/* Check the chain_key of all lock chains. */ +	for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) { +		head = chainhash_table + i; +		hlist_for_each_entry_rcu(chain, head, entry) { +			if (!check_lock_chain_key(chain)) +				return false; +		} +	} + +	/* +	 * Check whether all list entries that are in use occur in a class +	 * lock list. +	 */ +	for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { +		e = list_entries + i; +		if (!in_any_class_list(&e->entry)) { +			printk(KERN_INFO "list entry %d is not in any class list; class %s <> %s\n", +			       (unsigned int)(e - list_entries), +			       e->class->name ? : "(?)", +			       e->links_to->name ? : "(?)"); +			return false; +		} +	} + +	/* +	 * Check whether all list entries that are not in use do not occur in +	 * a class lock list. +	 */ +	for_each_clear_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { +		e = list_entries + i; +		if (in_any_class_list(&e->entry)) { +			printk(KERN_INFO "list entry %d occurs in a class list; class %s <> %s\n", +			       (unsigned int)(e - list_entries), +			       e->class && e->class->name ? e->class->name : +			       "(?)", +			       e->links_to && e->links_to->name ? +			       e->links_to->name : "(?)"); +			return false; +		} +	} + +	return true; +} + +int check_consistency = 0; +module_param(check_consistency, int, 0644); + +static void check_data_structures(void) +{ +	static bool once = false; + +	if (check_consistency && !once) { +		if (!__check_data_structures()) { +			once = true; +			WARN_ON(once); +		} +	} +} + +#else /* CONFIG_DEBUG_LOCKDEP */ + +static inline void check_data_structures(void) { } + +#endif /* CONFIG_DEBUG_LOCKDEP */ + +/* + * Initialize the lock_classes[] array elements, the free_lock_classes list + * and also the delayed_free structure. 
+ */ +static void init_data_structures_once(void) +{ +	static bool initialization_happened; +	int i; + +	if (likely(initialization_happened)) +		return; + +	initialization_happened = true; + +	init_rcu_head(&delayed_free.rcu_head); +	INIT_LIST_HEAD(&delayed_free.pf[0].zapped); +	INIT_LIST_HEAD(&delayed_free.pf[1].zapped); + +	for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { +		list_add_tail(&lock_classes[i].lock_entry, &free_lock_classes); +		INIT_LIST_HEAD(&lock_classes[i].locks_after); +		INIT_LIST_HEAD(&lock_classes[i].locks_before); +	} +} + +static inline struct hlist_head *keyhashentry(const struct lock_class_key *key) +{ +	unsigned long hash = hash_long((uintptr_t)key, KEYHASH_BITS); + +	return lock_keys_hash + hash; +} + +/* Register a dynamically allocated key. */ +void lockdep_register_key(struct lock_class_key *key) +{ +	struct hlist_head *hash_head; +	struct lock_class_key *k; +	unsigned long flags; + +	if (WARN_ON_ONCE(static_obj(key))) +		return; +	hash_head = keyhashentry(key); + +	raw_local_irq_save(flags); +	if (!graph_lock()) +		goto restore_irqs; +	hlist_for_each_entry_rcu(k, hash_head, hash_entry) { +		if (WARN_ON_ONCE(k == key)) +			goto out_unlock; +	} +	hlist_add_head_rcu(&key->hash_entry, hash_head); +out_unlock: +	graph_unlock(); +restore_irqs: +	raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lockdep_register_key); + +/* Check whether a key has been registered as a dynamic key. */ +static bool is_dynamic_key(const struct lock_class_key *key) +{ +	struct hlist_head *hash_head; +	struct lock_class_key *k; +	bool found = false; + +	if (WARN_ON_ONCE(static_obj(key))) +		return false; + +	/* +	 * If lock debugging is disabled lock_keys_hash[] may contain +	 * pointers to memory that has already been freed. Avoid triggering +	 * a use-after-free in that case by returning early. +	 */ +	if (!debug_locks) +		return true; + +	hash_head = keyhashentry(key); + +	rcu_read_lock(); +	hlist_for_each_entry_rcu(k, hash_head, hash_entry) { +		if (k == key) { +			found = true; +			break; +		} +	} +	rcu_read_unlock(); + +	return found; +} +  /*   * Register a lock's class in the hash-table, if the class is not present   * yet. Otherwise we look it up. We cache the result in the lock object @@ -756,7 +1085,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)  	if (!lock->key) {  		if (!assign_lock_key(lock))  			return NULL; -	} else if (!static_obj(lock->key)) { +	} else if (!static_obj(lock->key) && !is_dynamic_key(lock->key)) {  		return NULL;  	} @@ -775,11 +1104,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)  			goto out_unlock_set;  	} -	/* -	 * Allocate a new key from the static array, and add it to -	 * the hash: -	 */ -	if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { +	init_data_structures_once(); + +	/* Allocate a new lock class and add it to the hash. 
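For context on lockdep_register_key() and is_dynamic_key() above: a lock class key may now live in dynamically allocated memory, provided it is registered before first use and unregistered before that memory is freed. A sketch under those assumptions; struct my_obj and the my_* helpers are hypothetical, only the lockdep and spinlock calls are real:

#include <linux/lockdep.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_obj {
	spinlock_t lock;
	struct lock_class_key key;	/* lives in kmalloc'ed memory */
};

static struct my_obj *my_obj_create(void)
{
	struct my_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (!obj)
		return NULL;
	lockdep_register_key(&obj->key);
	spin_lock_init(&obj->lock);
	lockdep_set_class(&obj->lock, &obj->key);
	return obj;
}

static void my_obj_destroy(struct my_obj *obj)
{
	/* Must happen before the memory holding the key is freed. */
	lockdep_unregister_key(&obj->key);
	kfree(obj);
}

lockdep_unregister_key() may sleep (it waits for an RCU grace period, see the later hunk), so my_obj_destroy() must be called from process context.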
*/ +	class = list_first_entry_or_null(&free_lock_classes, typeof(*class), +					 lock_entry); +	if (!class) {  		if (!debug_locks_off_graph_unlock()) {  			return NULL;  		} @@ -788,13 +1118,13 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)  		dump_stack();  		return NULL;  	} -	class = lock_classes + nr_lock_classes++; +	nr_lock_classes++;  	debug_atomic_inc(nr_unused_locks);  	class->key = key;  	class->name = lock->name;  	class->subclass = subclass; -	INIT_LIST_HEAD(&class->locks_before); -	INIT_LIST_HEAD(&class->locks_after); +	WARN_ON_ONCE(!list_empty(&class->locks_before)); +	WARN_ON_ONCE(!list_empty(&class->locks_after));  	class->name_version = count_matching_names(class);  	/*  	 * We use RCU's safe list-add method to make @@ -802,9 +1132,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)  	 */  	hlist_add_head_rcu(&class->hash_entry, hash_head);  	/* -	 * Add it to the global list of classes: +	 * Remove the class from the free list and add it to the global list +	 * of classes.  	 */ -	list_add_tail(&class->lock_entry, &all_lock_classes); +	list_move_tail(&class->lock_entry, &all_lock_classes);  	if (verbose(class)) {  		graph_unlock(); @@ -845,7 +1176,10 @@ out_set_class_cache:   */  static struct lock_list *alloc_list_entry(void)  { -	if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { +	int idx = find_first_zero_bit(list_entries_in_use, +				      ARRAY_SIZE(list_entries)); + +	if (idx >= ARRAY_SIZE(list_entries)) {  		if (!debug_locks_off_graph_unlock())  			return NULL; @@ -853,13 +1187,16 @@ static struct lock_list *alloc_list_entry(void)  		dump_stack();  		return NULL;  	} -	return list_entries + nr_list_entries++; +	nr_list_entries++; +	__set_bit(idx, list_entries_in_use); +	return list_entries + idx;  }  /*   * Add a new dependency to the head of the list:   */ -static int add_lock_to_list(struct lock_class *this, struct list_head *head, +static int add_lock_to_list(struct lock_class *this, +			    struct lock_class *links_to, struct list_head *head,  			    unsigned long ip, int distance,  			    struct stack_trace *trace)  { @@ -873,6 +1210,7 @@ static int add_lock_to_list(struct lock_class *this, struct list_head *head,  		return 0;  	entry->class = this; +	entry->links_to = links_to;  	entry->distance = distance;  	entry->trace = *trace;  	/* @@ -955,7 +1293,7 @@ static inline void mark_lock_accessed(struct lock_list *lock,  	unsigned long nr;  	nr = lock - list_entries; -	WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ +	WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */  	lock->parent = parent;  	lock->class->dep_gen_id = lockdep_dependency_gen_id;  } @@ -965,7 +1303,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock)  	unsigned long nr;  	nr = lock - list_entries; -	WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ +	WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */  	return lock->class->dep_gen_id == lockdep_dependency_gen_id;  } @@ -1624,29 +1962,18 @@ static const char *state_rnames[] = {  static inline const char *state_name(enum lock_usage_bit bit)  { -	return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2]; +	return (bit & LOCK_USAGE_READ_MASK) ? 
state_rnames[bit >> 2] : state_names[bit >> 2];  }  static int exclusive_bit(int new_bit)  { -	/* -	 * USED_IN -	 * USED_IN_READ -	 * ENABLED -	 * ENABLED_READ -	 * -	 * bit 0 - write/read -	 * bit 1 - used_in/enabled -	 * bit 2+  state -	 */ - -	int state = new_bit & ~3; -	int dir = new_bit & 2; +	int state = new_bit & LOCK_USAGE_STATE_MASK; +	int dir = new_bit & LOCK_USAGE_DIR_MASK;  	/*  	 * keep state, bit flip the direction and strip read.  	 */ -	return state | (dir ^ 2); +	return state | (dir ^ LOCK_USAGE_DIR_MASK);  }  static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, @@ -1842,6 +2169,24 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  	struct lock_list this;  	int ret; +	if (!hlock_class(prev)->key || !hlock_class(next)->key) { +		/* +		 * The warning statements below may trigger a use-after-free +		 * of the class name. It is better to trigger a use-after free +		 * and to have the class name most of the time instead of not +		 * having the class name available. +		 */ +		WARN_ONCE(!debug_locks_silent && !hlock_class(prev)->key, +			  "Detected use-after-free of lock class %px/%s\n", +			  hlock_class(prev), +			  hlock_class(prev)->name); +		WARN_ONCE(!debug_locks_silent && !hlock_class(next)->key, +			  "Detected use-after-free of lock class %px/%s\n", +			  hlock_class(next), +			  hlock_class(next)->name); +		return 2; +	} +  	/*  	 * Prove that the new <prev> -> <next> dependency would not  	 * create a circular dependency in the graph. (We do this by @@ -1918,14 +2263,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  	 * Ok, all validations passed, add the new lock  	 * to the previous lock's dependency list:  	 */ -	ret = add_lock_to_list(hlock_class(next), +	ret = add_lock_to_list(hlock_class(next), hlock_class(prev),  			       &hlock_class(prev)->locks_after,  			       next->acquire_ip, distance, trace);  	if (!ret)  		return 0; -	ret = add_lock_to_list(hlock_class(prev), +	ret = add_lock_to_list(hlock_class(prev), hlock_class(next),  			       &hlock_class(next)->locks_before,  			       next->acquire_ip, distance, trace);  	if (!ret) @@ -2018,8 +2363,8 @@ out_bug:  	return 0;  } -unsigned long nr_lock_chains;  struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; +static DECLARE_BITMAP(lock_chains_in_use, MAX_LOCKDEP_CHAINS);  int nr_chain_hlocks;  static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; @@ -2153,6 +2498,33 @@ static int check_no_collision(struct task_struct *curr,  }  /* + * Given an index that is >= -1, return the index of the next lock chain. + * Return -2 if there is no next lock chain. + */ +long lockdep_next_lockchain(long i) +{ +	i = find_next_bit(lock_chains_in_use, ARRAY_SIZE(lock_chains), i + 1); +	return i < ARRAY_SIZE(lock_chains) ? i : -2; +} + +unsigned long lock_chain_count(void) +{ +	return bitmap_weight(lock_chains_in_use, ARRAY_SIZE(lock_chains)); +} + +/* Must be called with the graph lock held. */ +static struct lock_chain *alloc_lock_chain(void) +{ +	int idx = find_first_zero_bit(lock_chains_in_use, +				      ARRAY_SIZE(lock_chains)); + +	if (unlikely(idx >= ARRAY_SIZE(lock_chains))) +		return NULL; +	__set_bit(idx, lock_chains_in_use); +	return lock_chains + idx; +} + +/*   * Adds a dependency chain into chain hashtable. And must be called with   * graph_lock held.   
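alloc_lock_chain() above, like alloc_list_entry() earlier in this patch, replaces bump allocation from a static array with a bitmap of in-use slots so that zapped entries can later be returned and reused. The pattern in isolation, as a generic sketch rather than lockdep code (my_slots and my_slots_in_use are illustrative):

#include <linux/bitmap.h>
#include <linux/bitops.h>

#define MY_NR_SLOTS 64

static struct my_entry {
	int data;
} my_slots[MY_NR_SLOTS];
static DECLARE_BITMAP(my_slots_in_use, MY_NR_SLOTS);

/* Returns NULL when all slots are in use. */
static struct my_entry *my_alloc_slot(void)
{
	int idx = find_first_zero_bit(my_slots_in_use, MY_NR_SLOTS);

	if (idx >= MY_NR_SLOTS)
		return NULL;
	__set_bit(idx, my_slots_in_use);
	return my_slots + idx;
}

static void my_free_slot(struct my_entry *e)
{
	__clear_bit(e - my_slots, my_slots_in_use);
}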
* @@ -2169,19 +2541,15 @@ static inline int add_chain_cache(struct task_struct *curr,  	int i, j;  	/* -	 * Allocate a new chain entry from the static array, and add -	 * it to the hash: -	 */ - -	/* -	 * We might need to take the graph lock, ensure we've got IRQs +	 * The caller must hold the graph lock, ensure we've got IRQs  	 * disabled to make this an IRQ-safe lock.. for recursion reasons  	 * lockdep won't complain about its own locking errors.  	 */  	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))  		return 0; -	if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { +	chain = alloc_lock_chain(); +	if (!chain) {  		if (!debug_locks_off_graph_unlock())  			return 0; @@ -2189,7 +2557,6 @@ static inline int add_chain_cache(struct task_struct *curr,  		dump_stack();  		return 0;  	} -	chain = lock_chains + nr_lock_chains++;  	chain->chain_key = chain_key;  	chain->irq_context = hlock->irq_context;  	i = get_first_held_lock(curr, hlock); @@ -2206,16 +2573,8 @@ static inline int add_chain_cache(struct task_struct *curr,  			chain_hlocks[chain->base + j] = lock_id;  		}  		chain_hlocks[chain->base + j] = class - lock_classes; -	} - -	if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS)  		nr_chain_hlocks += chain->depth; - -#ifdef CONFIG_DEBUG_LOCKDEP -	/* -	 * Important for check_no_collision(). -	 */ -	if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) { +	} else {  		if (!debug_locks_off_graph_unlock())  			return 0; @@ -2223,7 +2582,6 @@ static inline int add_chain_cache(struct task_struct *curr,  		dump_stack();  		return 0;  	} -#endif  	hlist_add_head_rcu(&chain->entry, hash_head);  	debug_atomic_inc(chain_lookup_misses); @@ -2233,19 +2591,16 @@ static inline int add_chain_cache(struct task_struct *curr,  }  /* - * Look up a dependency chain. + * Look up a dependency chain. Must be called with either the graph lock or + * the RCU read lock held.   */  static inline struct lock_chain *lookup_chain_cache(u64 chain_key)  {  	struct hlist_head *hash_head = chainhashentry(chain_key);  	struct lock_chain *chain; -	/* -	 * We can walk it lock-free, because entries only get added -	 * to the hash: -	 */  	hlist_for_each_entry_rcu(chain, hash_head, entry) { -		if (chain->chain_key == chain_key) { +		if (READ_ONCE(chain->chain_key) == chain_key) {  			debug_atomic_inc(chain_lookup_hits);  			return chain;  		} @@ -2662,8 +3017,8 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,  		enum lock_usage_bit new_bit)  {  	int excl_bit = exclusive_bit(new_bit); -	int read = new_bit & 1; -	int dir = new_bit & 2; +	int read = new_bit & LOCK_USAGE_READ_MASK; +	int dir = new_bit & LOCK_USAGE_DIR_MASK;  	/*  	 * mark USED_IN has to look forwards -- to ensure no dependency @@ -2687,19 +3042,19 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,  	 * states.  	 
*/  	if ((!read || !dir || STRICT_READ_CHECKS) && -			!usage(curr, this, excl_bit, state_name(new_bit & ~1))) +			!usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK)))  		return 0;  	/*  	 * Check for read in write conflicts  	 */  	if (!read) { -		if (!valid_state(curr, this, new_bit, excl_bit + 1)) +		if (!valid_state(curr, this, new_bit, excl_bit + LOCK_USAGE_READ_MASK))  			return 0;  		if (STRICT_READ_CHECKS && -			!usage(curr, this, excl_bit + 1, -				state_name(new_bit + 1))) +			!usage(curr, this, excl_bit + LOCK_USAGE_READ_MASK, +				state_name(new_bit + LOCK_USAGE_READ_MASK)))  			return 0;  	} @@ -2709,35 +3064,28 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,  	return 1;  } -enum mark_type { -#define LOCKDEP_STATE(__STATE)	__STATE, -#include "lockdep_states.h" -#undef LOCKDEP_STATE -}; -  /*   * Mark all held locks with a usage bit:   */  static int -mark_held_locks(struct task_struct *curr, enum mark_type mark) +mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit)  { -	enum lock_usage_bit usage_bit;  	struct held_lock *hlock;  	int i;  	for (i = 0; i < curr->lockdep_depth; i++) { +		enum lock_usage_bit hlock_bit = base_bit;  		hlock = curr->held_locks + i; -		usage_bit = 2 + (mark << 2); /* ENABLED */  		if (hlock->read) -			usage_bit += 1; /* READ */ +			hlock_bit += LOCK_USAGE_READ_MASK; -		BUG_ON(usage_bit >= LOCK_USAGE_STATES); +		BUG_ON(hlock_bit >= LOCK_USAGE_STATES);  		if (!hlock->check)  			continue; -		if (!mark_lock(curr, hlock, usage_bit)) +		if (!mark_lock(curr, hlock, hlock_bit))  			return 0;  	} @@ -2758,7 +3106,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip)  	 * We are going to turn hardirqs on, so set the  	 * usage bit for all held locks:  	 */ -	if (!mark_held_locks(curr, HARDIRQ)) +	if (!mark_held_locks(curr, LOCK_ENABLED_HARDIRQ))  		return;  	/*  	 * If we have softirqs enabled, then set the usage @@ -2766,7 +3114,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip)  	 * this bit from being set before)  	 */  	if (curr->softirqs_enabled) -		if (!mark_held_locks(curr, SOFTIRQ)) +		if (!mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ))  			return;  	curr->hardirq_enable_ip = ip; @@ -2814,6 +3162,7 @@ void lockdep_hardirqs_on(unsigned long ip)  	__trace_hardirqs_on_caller(ip);  	current->lockdep_recursion = 0;  } +NOKPROBE_SYMBOL(lockdep_hardirqs_on);  /*   * Hardirqs were disabled: @@ -2843,6 +3192,7 @@ void lockdep_hardirqs_off(unsigned long ip)  	} else  		debug_atomic_inc(redundant_hardirqs_off);  } +NOKPROBE_SYMBOL(lockdep_hardirqs_off);  /*   * Softirqs will be enabled: @@ -2880,7 +3230,7 @@ void trace_softirqs_on(unsigned long ip)  	 * enabled too:  	 */  	if (curr->hardirqs_enabled) -		mark_held_locks(curr, SOFTIRQ); +		mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);  	current->lockdep_recursion = 0;  } @@ -3119,13 +3469,12 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,  	if (DEBUG_LOCKS_WARN_ON(!key))  		return;  	/* -	 * Sanity check, the lock-class key must be persistent: +	 * Sanity check, the lock-class key must either have been allocated +	 * statically or must have been registered as a dynamic key.  	 */ -	if (!static_obj(key)) { -		printk("BUG: key %px not in .data!\n", key); -		/* -		 * What it says above ^^^^^, I suggest you read it. 
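The LOCK_USAGE_* masks used in mark_lock_irq() above rely on the bit layout that the old exclusive_bit() comment spelled out: bit 0 is read, bit 1 is used_in(0)/enabled(1), and bits 2 and up select the IRQ state. A small standalone illustration of that arithmetic; the macro names mirror the lockdep_internals.h hunk later in this diff, and the function is a local copy for demonstration only:

#define LOCK_USAGE_READ_MASK	1
#define LOCK_USAGE_DIR_MASK	2
#define LOCK_USAGE_STATE_MASK	(~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK))

/* Same computation as exclusive_bit(): flip the direction, strip read. */
static int exclusive_bit_demo(int new_bit)
{
	int state = new_bit & LOCK_USAGE_STATE_MASK;
	int dir = new_bit & LOCK_USAGE_DIR_MASK;

	return state | (dir ^ LOCK_USAGE_DIR_MASK);
}

/*
 * For the first IRQ state (state bits == 0):
 *   USED_IN = 0b00, USED_IN_READ = 0b01, ENABLED = 0b10, ENABLED_READ = 0b11.
 * exclusive_bit_demo(0b01) == 0b10: a lock read-acquired in that context
 * conflicts with the same context being enabled while the lock is write-held.
 */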
-		 */ +	if (!static_obj(key) && !is_dynamic_key(key)) { +		if (debug_locks) +			printk(KERN_ERR "BUG: key %px has not been registered!\n", key);  		DEBUG_LOCKS_WARN_ON(1);  		return;  	} @@ -3335,6 +3684,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,  	if (nest_lock && !__lock_is_held(nest_lock, -1))  		return print_lock_nested_lock_not_held(curr, hlock, ip); +	if (!debug_locks_silent) { +		WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key); +		WARN_ON_ONCE(!hlock_class(hlock)->key); +	} +  	if (!validate_chain(curr, lock, hlock, chain_head, chain_key))  		return 0; @@ -3497,6 +3851,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name,  	unsigned int depth;  	int i; +	if (unlikely(!debug_locks)) +		return 0; +  	depth = curr->lockdep_depth;  	/*  	 * This function is about (re)setting the class of a held lock, @@ -3535,6 +3892,9 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)  	unsigned int depth;  	int i; +	if (unlikely(!debug_locks)) +		return 0; +  	depth = curr->lockdep_depth;  	/*  	 * This function is about (re)setting the class of a held lock, @@ -3650,7 +4010,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)  	return 0;  } -static int __lock_is_held(const struct lockdep_map *lock, int read) +static nokprobe_inline +int __lock_is_held(const struct lockdep_map *lock, int read)  {  	struct task_struct *curr = current;  	int i; @@ -3883,6 +4244,7 @@ int lock_is_held_type(const struct lockdep_map *lock, int read)  	return ret;  }  EXPORT_SYMBOL_GPL(lock_is_held_type); +NOKPROBE_SYMBOL(lock_is_held_type);  struct pin_cookie lock_pin_lock(struct lockdep_map *lock)  { @@ -4123,29 +4485,131 @@ void lockdep_reset(void)  	raw_local_irq_restore(flags);  } +/* Remove a class from a lock chain. Must be called with the graph lock held. */ +static void remove_class_from_lock_chain(struct pending_free *pf, +					 struct lock_chain *chain, +					 struct lock_class *class) +{ +#ifdef CONFIG_PROVE_LOCKING +	struct lock_chain *new_chain; +	u64 chain_key; +	int i; + +	for (i = chain->base; i < chain->base + chain->depth; i++) { +		if (chain_hlocks[i] != class - lock_classes) +			continue; +		/* The code below leaks one chain_hlock[] entry. */ +		if (--chain->depth > 0) { +			memmove(&chain_hlocks[i], &chain_hlocks[i + 1], +				(chain->base + chain->depth - i) * +				sizeof(chain_hlocks[0])); +		} +		/* +		 * Each lock class occurs at most once in a lock chain so once +		 * we found a match we can break out of this loop. +		 */ +		goto recalc; +	} +	/* Since the chain has not been modified, return. */ +	return; + +recalc: +	chain_key = 0; +	for (i = chain->base; i < chain->base + chain->depth; i++) +		chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); +	if (chain->depth && chain->chain_key == chain_key) +		return; +	/* Overwrite the chain key for concurrent RCU readers. */ +	WRITE_ONCE(chain->chain_key, chain_key); +	/* +	 * Note: calling hlist_del_rcu() from inside a +	 * hlist_for_each_entry_rcu() loop is safe. +	 */ +	hlist_del_rcu(&chain->entry); +	__set_bit(chain - lock_chains, pf->lock_chains_being_freed); +	if (chain->depth == 0) +		return; +	/* +	 * If the modified lock chain matches an existing lock chain, drop +	 * the modified lock chain. 
+	 */ +	if (lookup_chain_cache(chain_key)) +		return; +	new_chain = alloc_lock_chain(); +	if (WARN_ON_ONCE(!new_chain)) { +		debug_locks_off(); +		return; +	} +	*new_chain = *chain; +	hlist_add_head_rcu(&new_chain->entry, chainhashentry(chain_key)); +#endif +} + +/* Must be called with the graph lock held. */ +static void remove_class_from_lock_chains(struct pending_free *pf, +					  struct lock_class *class) +{ +	struct lock_chain *chain; +	struct hlist_head *head; +	int i; + +	for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) { +		head = chainhash_table + i; +		hlist_for_each_entry_rcu(chain, head, entry) { +			remove_class_from_lock_chain(pf, chain, class); +		} +	} +} +  /*   * Remove all references to a lock class. The caller must hold the graph lock.   */ -static void zap_class(struct lock_class *class) +static void zap_class(struct pending_free *pf, struct lock_class *class)  { +	struct lock_list *entry;  	int i; +	WARN_ON_ONCE(!class->key); +  	/*  	 * Remove all dependencies this lock is  	 * involved in:  	 */ -	for (i = 0; i < nr_list_entries; i++) { -		if (list_entries[i].class == class) -			list_del_rcu(&list_entries[i].entry); +	for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { +		entry = list_entries + i; +		if (entry->class != class && entry->links_to != class) +			continue; +		__clear_bit(i, list_entries_in_use); +		nr_list_entries--; +		list_del_rcu(&entry->entry); +	} +	if (list_empty(&class->locks_after) && +	    list_empty(&class->locks_before)) { +		list_move_tail(&class->lock_entry, &pf->zapped); +		hlist_del_rcu(&class->hash_entry); +		WRITE_ONCE(class->key, NULL); +		WRITE_ONCE(class->name, NULL); +		nr_lock_classes--; +	} else { +		WARN_ONCE(true, "%s() failed for class %s\n", __func__, +			  class->name);  	} -	/* -	 * Unhash the class and remove it from the all_lock_classes list: -	 */ -	hlist_del_rcu(&class->hash_entry); -	list_del(&class->lock_entry); -	RCU_INIT_POINTER(class->key, NULL); -	RCU_INIT_POINTER(class->name, NULL); +	remove_class_from_lock_chains(pf, class); +} + +static void reinit_class(struct lock_class *class) +{ +	void *const p = class; +	const unsigned int offset = offsetof(struct lock_class, key); + +	WARN_ON_ONCE(!class->lock_entry.next); +	WARN_ON_ONCE(!list_empty(&class->locks_after)); +	WARN_ON_ONCE(!list_empty(&class->locks_before)); +	memset(p + offset, 0, sizeof(*class) - offset); +	WARN_ON_ONCE(!class->lock_entry.next); +	WARN_ON_ONCE(!list_empty(&class->locks_after)); +	WARN_ON_ONCE(!list_empty(&class->locks_before));  }  static inline int within(const void *addr, void *start, unsigned long size) @@ -4153,55 +4617,175 @@ static inline int within(const void *addr, void *start, unsigned long size)  	return addr >= start && addr < start + size;  } +static bool inside_selftest(void) +{ +	return current == lockdep_selftest_task_struct; +} + +/* The caller must hold the graph lock. */ +static struct pending_free *get_pending_free(void) +{ +	return delayed_free.pf + delayed_free.index; +} + +static void free_zapped_rcu(struct rcu_head *cb); +  /* - * Used in module.c to remove lock classes from memory that is going to be - * freed; and possibly re-used by other modules. - * - * We will have had one sync_sched() before getting here, so we're guaranteed - * nobody will look up these exact classes -- they're properly dead but still - * allocated. + * Schedule an RCU callback if no RCU callback is pending. Must be called with + * the graph lock held.   
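call_rcu_zapped() and free_zapped_rcu() below implement a two-element "pending free" scheme: zapped objects collect on the open list, a single RCU callback drains the closed list, and a new callback is only scheduled once the previous one has completed. A condensed sketch of the same pattern for a generic object pool; my_df, my_obj and the single spinlock are illustrative simplifications of the lockdep code:

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct my_obj {
	struct list_head entry;
};

static LIST_HEAD(my_free_pool);
static DEFINE_SPINLOCK(my_lock);

static struct my_df {
	struct rcu_head rcu_head;
	int index;		/* pf[index] is the open list */
	bool scheduled;		/* an RCU callback is in flight */
	struct list_head pf[2];
} my_df = {
	.pf = { LIST_HEAD_INIT(my_df.pf[0]), LIST_HEAD_INIT(my_df.pf[1]) },
};

static void my_free_zapped_rcu(struct rcu_head *ch);

/* Called with my_lock held after moving objects onto pf[index]. */
static void my_call_rcu_zapped(void)
{
	if (list_empty(&my_df.pf[my_df.index]) || my_df.scheduled)
		return;
	my_df.scheduled = true;
	my_df.index ^= 1;	/* close the current list, open the other */
	call_rcu(&my_df.rcu_head, my_free_zapped_rcu);
}

static void my_free_zapped_rcu(struct rcu_head *ch)
{
	unsigned long flags;

	spin_lock_irqsave(&my_lock, flags);
	/* A grace period has elapsed; no reader still sees the closed list. */
	list_splice_init(&my_df.pf[my_df.index ^ 1], &my_free_pool);
	my_df.scheduled = false;
	/* Anything zapped meanwhile sits on the open list; drain it next. */
	my_call_rcu_zapped();
	spin_unlock_irqrestore(&my_lock, flags);
}

A caller zaps an object by moving it onto the open list under my_lock and then calling my_call_rcu_zapped(); at most one callback is ever outstanding, which is why the real code can get away with a single rcu_head.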
*/ -void lockdep_free_key_range(void *start, unsigned long size) +static void call_rcu_zapped(struct pending_free *pf) +{ +	WARN_ON_ONCE(inside_selftest()); + +	if (list_empty(&pf->zapped)) +		return; + +	if (delayed_free.scheduled) +		return; + +	delayed_free.scheduled = true; + +	WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf); +	delayed_free.index ^= 1; + +	call_rcu(&delayed_free.rcu_head, free_zapped_rcu); +} + +/* The caller must hold the graph lock. May be called from RCU context. */ +static void __free_zapped_classes(struct pending_free *pf)  {  	struct lock_class *class; -	struct hlist_head *head; + +	check_data_structures(); + +	list_for_each_entry(class, &pf->zapped, lock_entry) +		reinit_class(class); + +	list_splice_init(&pf->zapped, &free_lock_classes); + +#ifdef CONFIG_PROVE_LOCKING +	bitmap_andnot(lock_chains_in_use, lock_chains_in_use, +		      pf->lock_chains_being_freed, ARRAY_SIZE(lock_chains)); +	bitmap_clear(pf->lock_chains_being_freed, 0, ARRAY_SIZE(lock_chains)); +#endif +} + +static void free_zapped_rcu(struct rcu_head *ch) +{ +	struct pending_free *pf;  	unsigned long flags; -	int i; -	int locked; + +	if (WARN_ON_ONCE(ch != &delayed_free.rcu_head)) +		return;  	raw_local_irq_save(flags); -	locked = graph_lock(); +	if (!graph_lock()) +		goto out_irq; + +	/* closed head */ +	pf = delayed_free.pf + (delayed_free.index ^ 1); +	__free_zapped_classes(pf); +	delayed_free.scheduled = false;  	/* -	 * Unhash all classes that were created by this module: +	 * If there's anything on the open list, close and start a new callback.  	 */ +	call_rcu_zapped(delayed_free.pf + delayed_free.index); + +	graph_unlock(); +out_irq: +	raw_local_irq_restore(flags); +} + +/* + * Remove all lock classes from the class hash table and from the + * all_lock_classes list whose key or name is in the address range [start, + * start + size). Move these lock classes to the zapped_classes list. Must + * be called with the graph lock held. + */ +static void __lockdep_free_key_range(struct pending_free *pf, void *start, +				     unsigned long size) +{ +	struct lock_class *class; +	struct hlist_head *head; +	int i; + +	/* Unhash all classes that were created by a module. */  	for (i = 0; i < CLASSHASH_SIZE; i++) {  		head = classhash_table + i;  		hlist_for_each_entry_rcu(class, head, hash_entry) { -			if (within(class->key, start, size)) -				zap_class(class); -			else if (within(class->name, start, size)) -				zap_class(class); +			if (!within(class->key, start, size) && +			    !within(class->name, start, size)) +				continue; +			zap_class(pf, class);  		}  	} +} -	if (locked) -		graph_unlock(); +/* + * Used in module.c to remove lock classes from memory that is going to be + * freed; and possibly re-used by other modules. + * + * We will have had one synchronize_rcu() before getting here, so we're + * guaranteed nobody will look up these exact classes -- they're properly dead + * but still allocated. + */ +static void lockdep_free_key_range_reg(void *start, unsigned long size) +{ +	struct pending_free *pf; +	unsigned long flags; +	int locked; + +	init_data_structures_once(); + +	raw_local_irq_save(flags); +	locked = graph_lock(); +	if (!locked) +		goto out_irq; + +	pf = get_pending_free(); +	__lockdep_free_key_range(pf, start, size); +	call_rcu_zapped(pf); + +	graph_unlock(); +out_irq:  	raw_local_irq_restore(flags);  	/*  	 * Wait for any possible iterators from look_up_lock_class() to pass  	 * before continuing to free the memory they refer to. 
-	 * -	 * sync_sched() is sufficient because the read-side is IRQ disable.  	 */  	synchronize_rcu(); +} -	/* -	 * XXX at this point we could return the resources to the pool; -	 * instead we leak them. We would need to change to bitmap allocators -	 * instead of the linear allocators we have now. -	 */ +/* + * Free all lockdep keys in the range [start, start+size). Does not sleep. + * Ignores debug_locks. Must only be used by the lockdep selftests. + */ +static void lockdep_free_key_range_imm(void *start, unsigned long size) +{ +	struct pending_free *pf = delayed_free.pf; +	unsigned long flags; + +	init_data_structures_once(); + +	raw_local_irq_save(flags); +	arch_spin_lock(&lockdep_lock); +	__lockdep_free_key_range(pf, start, size); +	__free_zapped_classes(pf); +	arch_spin_unlock(&lockdep_lock); +	raw_local_irq_restore(flags); +} + +void lockdep_free_key_range(void *start, unsigned long size) +{ +	init_data_structures_once(); + +	if (inside_selftest()) +		lockdep_free_key_range_imm(start, size); +	else +		lockdep_free_key_range_reg(start, size);  }  /* @@ -4226,14 +4810,12 @@ static bool lock_class_cache_is_registered(struct lockdep_map *lock)  	return false;  } -void lockdep_reset_lock(struct lockdep_map *lock) +/* The caller must hold the graph lock. Does not sleep. */ +static void __lockdep_reset_lock(struct pending_free *pf, +				 struct lockdep_map *lock)  {  	struct lock_class *class; -	unsigned long flags; -	int j, locked; - -	raw_local_irq_save(flags); -	locked = graph_lock(); +	int j;  	/*  	 * Remove all classes this lock might have: @@ -4244,27 +4826,104 @@ void lockdep_reset_lock(struct lockdep_map *lock)  		 */  		class = look_up_lock_class(lock, j);  		if (class) -			zap_class(class); +			zap_class(pf, class);  	}  	/*  	 * Debug check: in the end all mapped classes should  	 * be gone.  	 */ -	if (unlikely(lock_class_cache_is_registered(lock))) { -		if (debug_locks_off_graph_unlock()) { -			/* -			 * We all just reset everything, how did it match? -			 */ -			WARN_ON(1); +	if (WARN_ON_ONCE(lock_class_cache_is_registered(lock))) +		debug_locks_off(); +} + +/* + * Remove all information lockdep has about a lock if debug_locks == 1. Free + * released data structures from RCU context. + */ +static void lockdep_reset_lock_reg(struct lockdep_map *lock) +{ +	struct pending_free *pf; +	unsigned long flags; +	int locked; + +	raw_local_irq_save(flags); +	locked = graph_lock(); +	if (!locked) +		goto out_irq; + +	pf = get_pending_free(); +	__lockdep_reset_lock(pf, lock); +	call_rcu_zapped(pf); + +	graph_unlock(); +out_irq: +	raw_local_irq_restore(flags); +} + +/* + * Reset a lock. Does not sleep. Ignores debug_locks. Must only be used by the + * lockdep selftests. + */ +static void lockdep_reset_lock_imm(struct lockdep_map *lock) +{ +	struct pending_free *pf = delayed_free.pf; +	unsigned long flags; + +	raw_local_irq_save(flags); +	arch_spin_lock(&lockdep_lock); +	__lockdep_reset_lock(pf, lock); +	__free_zapped_classes(pf); +	arch_spin_unlock(&lockdep_lock); +	raw_local_irq_restore(flags); +} + +void lockdep_reset_lock(struct lockdep_map *lock) +{ +	init_data_structures_once(); + +	if (inside_selftest()) +		lockdep_reset_lock_imm(lock); +	else +		lockdep_reset_lock_reg(lock); +} + +/* Unregister a dynamically allocated key. 
*/ +void lockdep_unregister_key(struct lock_class_key *key) +{ +	struct hlist_head *hash_head = keyhashentry(key); +	struct lock_class_key *k; +	struct pending_free *pf; +	unsigned long flags; +	bool found = false; + +	might_sleep(); + +	if (WARN_ON_ONCE(static_obj(key))) +		return; + +	raw_local_irq_save(flags); +	if (!graph_lock()) +		goto out_irq; + +	pf = get_pending_free(); +	hlist_for_each_entry_rcu(k, hash_head, hash_entry) { +		if (k == key) { +			hlist_del_rcu(&k->hash_entry); +			found = true; +			break;  		} -		goto out_restore;  	} -	if (locked) -		graph_unlock(); - -out_restore: +	WARN_ON_ONCE(!found); +	__lockdep_free_key_range(pf, key, 1); +	call_rcu_zapped(pf); +	graph_unlock(); +out_irq:  	raw_local_irq_restore(flags); + +	/* Wait until is_dynamic_key() has finished accessing k->hash_entry. */ +	synchronize_rcu();  } +EXPORT_SYMBOL_GPL(lockdep_unregister_key);  void __init lockdep_init(void)  { @@ -4278,20 +4937,24 @@ void __init lockdep_init(void)  	printk("... MAX_LOCKDEP_CHAINS:      %lu\n", MAX_LOCKDEP_CHAINS);  	printk("... CHAINHASH_SIZE:          %lu\n", CHAINHASH_SIZE); -	printk(" memory used by lock dependency info: %lu kB\n", -		(sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + -		sizeof(struct list_head) * CLASSHASH_SIZE + -		sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + -		sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + -		sizeof(struct list_head) * CHAINHASH_SIZE +	printk(" memory used by lock dependency info: %zu kB\n", +	       (sizeof(lock_classes) + +		sizeof(classhash_table) + +		sizeof(list_entries) + +		sizeof(list_entries_in_use) + +		sizeof(chainhash_table) + +		sizeof(delayed_free)  #ifdef CONFIG_PROVE_LOCKING -		+ sizeof(struct circular_queue) +		+ sizeof(lock_cq) +		+ sizeof(lock_chains) +		+ sizeof(lock_chains_in_use) +		+ sizeof(chain_hlocks)  #endif  		) / 1024  		); -	printk(" per task-struct memory footprint: %lu bytes\n", -		sizeof(struct held_lock) * MAX_LOCK_DEPTH); +	printk(" per task-struct memory footprint: %zu bytes\n", +	       sizeof(((struct task_struct *)NULL)->held_locks));  }  static void diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 88c847a41c8a..d4c197425f68 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -22,6 +22,10 @@ enum lock_usage_bit {  	LOCK_USAGE_STATES  }; +#define LOCK_USAGE_READ_MASK 1 +#define LOCK_USAGE_DIR_MASK  2 +#define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK)) +  /*   * Usage-state bitmasks:   */ @@ -96,7 +100,8 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);  extern unsigned long nr_lock_classes;  extern unsigned long nr_list_entries; -extern unsigned long nr_lock_chains; +long lockdep_next_lockchain(long i); +unsigned long lock_chain_count(void);  extern int nr_chain_hlocks;  extern unsigned long nr_stack_trace_entries; diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 3d31f9b0059e..9c49ec645d8b 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -104,18 +104,18 @@ static const struct seq_operations lockdep_ops = {  #ifdef CONFIG_PROVE_LOCKING  static void *lc_start(struct seq_file *m, loff_t *pos)  { +	if (*pos < 0) +		return NULL; +  	if (*pos == 0)  		return SEQ_START_TOKEN; -	if (*pos - 1 < nr_lock_chains) -		return lock_chains + (*pos - 1); - -	return NULL; +	return lock_chains + (*pos - 1);  }  static void *lc_next(struct seq_file *m, void *v, loff_t *pos)  { -	(*pos)++; +	*pos = 
lockdep_next_lockchain(*pos - 1) + 1;  	return lc_start(m, pos);  } @@ -268,7 +268,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)  #ifdef CONFIG_PROVE_LOCKING  	seq_printf(m, " dependency chains:             %11lu [max: %lu]\n", -			nr_lock_chains, MAX_LOCKDEP_CHAINS); +			lock_chain_count(), MAX_LOCKDEP_CHAINS);  	seq_printf(m, " dependency chain hlocks:       %11d [max: %lu]\n",  			nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS);  #endif diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 7d0b0ed74404..ad40a2617063 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * Module-based torture test facility for locking   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - *   * Copyright (C) IBM Corporation, 2014   * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com>   *          Davidlohr Bueso <dave@stgolabs.net>   *	Based on kernel/rcu/torture.c.   */ @@ -45,7 +32,7 @@  #include <linux/torture.h>  MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");  torture_param(int, nwriters_stress, -1,  	     "Number of write-locking stress-test threads"); @@ -970,7 +957,7 @@ static int __init lock_torture_init(void)  	/* Prepare torture context. */  	if (onoff_interval > 0) {  		firsterr = torture_onoff_init(onoff_holdoff * HZ, -					      onoff_interval * HZ); +					      onoff_interval * HZ, NULL);  		if (firsterr)  			goto unwind;  	} diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 8a8c3c208c5e..5e9247dc2515 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -124,9 +124,6 @@ static inline __pure u32 encode_tail(int cpu, int idx)  {  	u32 tail; -#ifdef CONFIG_DEBUG_SPINLOCK -	BUG_ON(idx > 3); -#endif  	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;  	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ @@ -412,12 +409,28 @@ pv_queue:  	idx = node->count++;  	tail = encode_tail(smp_processor_id(), idx); +	/* +	 * 4 nodes are allocated based on the assumption that there will +	 * not be nested NMIs taking spinlocks. That may not be true in +	 * some architectures even though the chance of needing more than +	 * 4 nodes will still be extremely unlikely. When that happens, +	 * we fall back to spinning on the lock directly without using +	 * any MCS node. This is not the most elegant solution, but is +	 * simple enough. 
+	 */ +	if (unlikely(idx >= MAX_NODES)) { +		qstat_inc(qstat_lock_no_node, true); +		while (!queued_spin_trylock(lock)) +			cpu_relax(); +		goto release; +	} +  	node = grab_mcs_node(node, idx);  	/*  	 * Keep counts of non-zero index values:  	 */ -	qstat_inc(qstat_lock_idx1 + idx - 1, idx); +	qstat_inc(qstat_lock_use_node2 + idx - 1, idx);  	/*  	 * Ensure that we increment the head node->count before initialising diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 42d3d8dc8f49..d73f85388d5c 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -30,6 +30,13 @@   *   pv_wait_node	- # of vCPU wait's at a non-head queue node   *   lock_pending	- # of locking operations via pending code   *   lock_slowpath	- # of locking operations via MCS lock queue + *   lock_use_node2	- # of locking operations that use 2nd per-CPU node + *   lock_use_node3	- # of locking operations that use 3rd per-CPU node + *   lock_use_node4	- # of locking operations that use 4th per-CPU node + *   lock_no_node	- # of locking operations without using per-CPU node + * + * Subtracting lock_use_node[234] from lock_slowpath will give you + * lock_use_node1.   *   * Writing to the "reset_counters" file will reset all the above counter   * values. @@ -55,9 +62,10 @@ enum qlock_stats {  	qstat_pv_wait_node,  	qstat_lock_pending,  	qstat_lock_slowpath, -	qstat_lock_idx1, -	qstat_lock_idx2, -	qstat_lock_idx3, +	qstat_lock_use_node2, +	qstat_lock_use_node3, +	qstat_lock_use_node4, +	qstat_lock_no_node,  	qstat_num,	/* Total number of statistical counters */  	qstat_reset_cnts = qstat_num,  }; @@ -85,9 +93,10 @@ static const char * const qstat_names[qstat_num + 1] = {  	[qstat_pv_wait_node]       = "pv_wait_node",  	[qstat_lock_pending]       = "lock_pending",  	[qstat_lock_slowpath]      = "lock_slowpath", -	[qstat_lock_idx1]	   = "lock_index1", -	[qstat_lock_idx2]	   = "lock_index2", -	[qstat_lock_idx3]	   = "lock_index3", +	[qstat_lock_use_node2]	   = "lock_use_node2", +	[qstat_lock_use_node3]	   = "lock_use_node3", +	[qstat_lock_use_node4]	   = "lock_use_node4", +	[qstat_lock_no_node]	   = "lock_no_node",  	[qstat_reset_cnts]         = "reset_counters",  }; diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 50d9af615dc4..fbe96341beee 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -211,9 +211,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,  		 * Ensure issuing the wakeup (either by us or someone else)  		 * after setting the reader waiter to nil.  		 
*/ -		wake_q_add(wake_q, tsk); -		/* wake_q_add() already take the task ref */ -		put_task_struct(tsk); +		wake_q_add_safe(wake_q, tsk);  	}  	adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 640b2034edd6..4802b039b89f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1215,14 +1215,16 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)  	if (!pfn_valid(pfn))  		return NULL; -	page = pfn_to_page(pfn); -	if (page_zone(page) != zone) +	page = pfn_to_online_page(pfn); +	if (!page || page_zone(page) != zone)  		return NULL;  	BUG_ON(!PageHighMem(page)); -	if (swsusp_page_is_forbidden(page) ||  swsusp_page_is_free(page) || -	    PageReserved(page)) +	if (swsusp_page_is_forbidden(page) ||  swsusp_page_is_free(page)) +		return NULL; + +	if (PageReserved(page) || PageOffline(page))  		return NULL;  	if (page_is_guard(page)) @@ -1277,8 +1279,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)  	if (!pfn_valid(pfn))  		return NULL; -	page = pfn_to_page(pfn); -	if (page_zone(page) != zone) +	page = pfn_to_online_page(pfn); +	if (!page || page_zone(page) != zone)  		return NULL;  	BUG_ON(PageHighMem(page)); @@ -1286,6 +1288,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)  	if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))  		return NULL; +	if (PageOffline(page)) +		return NULL; +  	if (PageReserved(page)  	    && (!kernel_page_present(page) || pfn_is_nosave(pfn)))  		return NULL; diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 939a2056c87a..37301430970e 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -87,36 +87,6 @@ config RCU_STALL_COMMON  config RCU_NEED_SEGCBLIST  	def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU ) -config CONTEXT_TRACKING -       bool - -config CONTEXT_TRACKING_FORCE -	bool "Force context tracking" -	depends on CONTEXT_TRACKING -	default y if !NO_HZ_FULL -	help -	  The major pre-requirement for full dynticks to work is to -	  support the context tracking subsystem. But there are also -	  other dependencies to provide in order to make the full -	  dynticks working. - -	  This option stands for testing when an arch implements the -	  context tracking backend but doesn't yet fullfill all the -	  requirements to make the full dynticks feature working. -	  Without the full dynticks, there is no way to test the support -	  for context tracking and the subsystems that rely on it: RCU -	  userspace extended quiescent state and tickless cputime -	  accounting. This option copes with the absence of the full -	  dynticks subsystem by forcing the context tracking on all -	  CPUs in the system. - -	  Say Y only if you're working on the development of an -	  architecture backend for the context tracking. - -	  Say N otherwise, this option brings an overhead that you -	  don't want in production. - -  config RCU_FANOUT  	int "Tree-based hierarchical RCU fanout value"  	range 2 64 if 64BIT diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index a393e24a9195..acee72c0b24b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */  /*   * Read-Copy Update definitions shared among RCU implementations.   
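On wake_q_add_safe() in the rwsem hunk above: unlike wake_q_add(), it consumes a task reference that the caller already holds, so the separate put_task_struct() goes away. The calling convention, sketched; in real callers the reference is usually taken earlier, when the waiter is recorded:

#include <linux/sched/task.h>
#include <linux/sched/wake_q.h>

static void my_wake_one(struct task_struct *tsk)
{
	DEFINE_WAKE_Q(wake_q);

	get_task_struct(tsk);
	/*
	 * Transfers our reference to the wake queue; it is dropped after
	 * the wakeup (or immediately, if @tsk was already queued).
	 */
	wake_q_add_safe(&wake_q, tsk);
	wake_up_q(&wake_q);
}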
* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - *   * Copyright IBM Corporation, 2011   * - * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Author: Paul E. McKenney <paulmck@linux.ibm.com>   */  #ifndef __LINUX_RCU_H @@ -30,7 +17,7 @@  #define RCU_TRACE(stmt)  #endif /* #else #ifdef CONFIG_RCU_TRACE */ -/* Offset to allow for unmatched rcu_irq_{enter,exit}(). */ +/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */  #define DYNTICK_IRQ_NONIDLE	((LONG_MAX / 2) + 1) @@ -462,8 +449,6 @@ void rcu_request_urgent_qs_task(struct task_struct *t);  enum rcutorture_type {  	RCU_FLAVOR, -	RCU_BH_FLAVOR, -	RCU_SCHED_FLAVOR,  	RCU_TASKS_FLAVOR,  	SRCU_FLAVOR,  	INVALID_RCU_FLAVOR diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 5aff271adf1e..9bd5f6023c21 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * RCU segmented callback lists, function definitions   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - *   * Copyright IBM Corporation, 2017   * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com>   */  #include <linux/types.h> diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 948470cef385..71b64648464e 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */  /*   * RCU segmented callback lists, internal-to-rcu header file   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. 
- *   * Copyright IBM Corporation, 2017   * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com>   */  #include <linux/rcu_segcblist.h> diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index b459da70b4fc..c29761152874 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * Read-Copy Update module-based performance-test facility   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - *   * Copyright (C) IBM Corporation, 2015   * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com>   */  #define pr_fmt(fmt) fmt @@ -54,7 +41,7 @@  #include "rcu.h"  MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");  #define PERF_FLAG "-perf:"  #define PERFOUT_STRING(s) \ @@ -83,13 +70,19 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");   * Various other use cases may of course be specified.   */ +#ifdef MODULE +# define RCUPERF_SHUTDOWN 0 +#else +# define RCUPERF_SHUTDOWN 1 +#endif +  torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");  torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");  torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");  torture_param(int, holdoff, 10, "Holdoff time before test start (s)");  torture_param(int, nreaders, -1, "Number of RCU reader threads");  torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, !IS_ENABLED(MODULE), +torture_param(bool, shutdown, RCUPERF_SHUTDOWN,  	      "Shutdown at end of performance tests.");  torture_param(int, verbose, 1, "Enable verbose debugging printk()s");  torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f6e85faa4ff4..f14d1b18a74f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * Read-Copy Update module-based torture test facility   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. 
- *   * Copyright (C) IBM Corporation, 2005, 2006   * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com>   *	  Josh Triplett <josh@joshtriplett.org>   *   * See also:  Documentation/RCU/torture.txt @@ -61,7 +48,7 @@  #include "rcu.h"  MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>");  /* Bits for ->extendables field, extendables param, and related definitions. */ @@ -1630,21 +1617,34 @@ static bool rcu_fwd_emergency_stop;  #define MIN_FWD_CB_LAUNDERS	3	/* This many CB invocations to count. */  #define MIN_FWD_CBS_LAUNDERED	100	/* Number of counted CBs. */  #define FWD_CBS_HIST_DIV	10	/* Histogram buckets/second. */ -static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)]; +struct rcu_launder_hist { +	long n_launders; +	unsigned long launder_gp_seq; +}; +#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)) +static struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST]; +static unsigned long rcu_launder_gp_seq_start;  static void rcu_torture_fwd_cb_hist(void)  { +	unsigned long gps; +	unsigned long gps_old;  	int i;  	int j;  	for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) -		if (n_launders_hist[i] > 0) +		if (n_launders_hist[i].n_launders > 0)  			break;  	pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):",  		 __func__, jiffies - rcu_fwd_startat); -	for (j = 0; j <= i; j++) -		pr_cont(" %ds/%d: %ld", -			j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j]); +	gps_old = rcu_launder_gp_seq_start; +	for (j = 0; j <= i; j++) { +		gps = n_launders_hist[j].launder_gp_seq; +		pr_cont(" %ds/%d: %ld:%ld", +			j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j].n_launders, +			rcutorture_seq_diff(gps, gps_old)); +		gps_old = gps; +	}  	pr_cont("\n");  } @@ -1666,7 +1666,8 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)  	i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));  	if (i >= ARRAY_SIZE(n_launders_hist))  		i = ARRAY_SIZE(n_launders_hist) - 1; -	n_launders_hist[i]++; +	n_launders_hist[i].n_launders++; +	n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq();  	spin_unlock_irqrestore(&rcu_fwd_lock, flags);  } @@ -1786,9 +1787,10 @@ static void rcu_torture_fwd_prog_cr(void)  	n_max_cbs = 0;  	n_max_gps = 0;  	for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++) -		n_launders_hist[i] = 0; +		n_launders_hist[i].n_launders = 0;  	cver = READ_ONCE(rcu_torture_current_version);  	gps = cur_ops->get_gp_seq(); +	rcu_launder_gp_seq_start = gps;  	while (time_before(jiffies, stopat) &&  	       !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {  		rfcp = READ_ONCE(rcu_fwd_cb_head); @@ -2228,6 +2230,14 @@ static void rcu_test_debug_objects(void)  #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */  } +static void rcutorture_sync(void) +{ +	static unsigned long n; + +	if (cur_ops->sync && !(++n & 0xfff)) +		cur_ops->sync(); +} +  static int __init  rcu_torture_init(void)  { @@ -2389,7 +2399,8 @@ rcu_torture_init(void)  	firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);  	if (firsterr)  		goto unwind; -	firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval); +	firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, +				      rcutorture_sync);  	if (firsterr)  		goto unwind;  	firsterr = rcu_torture_stall_init(); diff --git a/kernel/rcu/srcutiny.c 
b/kernel/rcu/srcutiny.c index 32dfd6522548..5d4a39a6505a 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -1,24 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * Sleepable Read-Copy Update mechanism for mutual exclusion,   *	tiny version for non-preemptible single-CPU use.   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - *   * Copyright (C) IBM Corporation, 2017   * - * Author: Paul McKenney <paulmck@us.ibm.com> + * Author: Paul McKenney <paulmck@linux.ibm.com>   */  #include <linux/export.h> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 3600d88d8956..a60b8ba9e1ac 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1,24 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * Sleepable Read-Copy Update mechanism for mutual exclusion.   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - *   * Copyright (C) IBM Corporation, 2006   * Copyright (C) Fujitsu, 2012   * - * Author: Paul McKenney <paulmck@us.ibm.com> + * Author: Paul McKenney <paulmck@linux.ibm.com>   *	   Lai Jiangshan <laijs@cn.fujitsu.com>   *   * For detailed explanation of Read-Copy Update mechanism see - @@ -58,6 +45,7 @@ static bool __read_mostly srcu_init_done;  static void srcu_invoke_callbacks(struct work_struct *work);  static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay);  static void process_srcu(struct work_struct *work); +static void srcu_delay_timer(struct timer_list *t);  /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). 
*/  #define spin_lock_rcu_node(p)					\ @@ -156,7 +144,8 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)  			snp->grphi = cpu;  		}  		sdp->cpu = cpu; -		INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); +		INIT_WORK(&sdp->work, srcu_invoke_callbacks); +		timer_setup(&sdp->delay_work, srcu_delay_timer, 0);  		sdp->ssp = ssp;  		sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);  		if (is_static) @@ -386,13 +375,19 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)  	} else {  		flush_delayed_work(&ssp->work);  	} -	for_each_possible_cpu(cpu) +	for_each_possible_cpu(cpu) { +		struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); +  		if (quiesced) { -			if (WARN_ON(delayed_work_pending(&per_cpu_ptr(ssp->sda, cpu)->work))) +			if (WARN_ON(timer_pending(&sdp->delay_work))) +				return; /* Just leak it! */ +			if (WARN_ON(work_pending(&sdp->work)))  				return; /* Just leak it! */  		} else { -			flush_delayed_work(&per_cpu_ptr(ssp->sda, cpu)->work); +			del_timer_sync(&sdp->delay_work); +			flush_work(&sdp->work);  		} +	}  	if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||  	    WARN_ON(srcu_readers_active(ssp))) {  		pr_info("%s: Active srcu_struct %p state: %d\n", @@ -463,39 +458,23 @@ static void srcu_gp_start(struct srcu_struct *ssp)  	WARN_ON_ONCE(state != SRCU_STATE_SCAN1);  } -/* - * Track online CPUs to guide callback workqueue placement. - */ -DEFINE_PER_CPU(bool, srcu_online); -void srcu_online_cpu(unsigned int cpu) +static void srcu_delay_timer(struct timer_list *t)  { -	WRITE_ONCE(per_cpu(srcu_online, cpu), true); -} +	struct srcu_data *sdp = container_of(t, struct srcu_data, delay_work); -void srcu_offline_cpu(unsigned int cpu) -{ -	WRITE_ONCE(per_cpu(srcu_online, cpu), false); +	queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work);  } -/* - * Place the workqueue handler on the specified CPU if online, otherwise - * just run it whereever.  This is useful for placing workqueue handlers - * that are to invoke the specified CPU's callbacks. 
- */ -static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, -				       struct delayed_work *dwork, +static void srcu_queue_delayed_work_on(struct srcu_data *sdp,  				       unsigned long delay)  { -	bool ret; +	if (!delay) { +		queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work); +		return; +	} -	preempt_disable(); -	if (READ_ONCE(per_cpu(srcu_online, cpu))) -		ret = queue_delayed_work_on(cpu, wq, dwork, delay); -	else -		ret = queue_delayed_work(wq, dwork, delay); -	preempt_enable(); -	return ret; +	timer_reduce(&sdp->delay_work, jiffies + delay);  }  /* @@ -504,7 +483,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,   */  static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)  { -	srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay); +	srcu_queue_delayed_work_on(sdp, delay);  }  /* @@ -1186,7 +1165,8 @@ static void srcu_invoke_callbacks(struct work_struct *work)  	struct srcu_data *sdp;  	struct srcu_struct *ssp; -	sdp = container_of(work, struct srcu_data, work.work); +	sdp = container_of(work, struct srcu_data, work); +  	ssp = sdp->ssp;  	rcu_cblist_init(&ready_cbs);  	spin_lock_irq_rcu_node(sdp); diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index be10036fa621..a8304d90573f 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * RCU-based infrastructure for lightweight reader-writer locking   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - *   * Copyright (c) 2015, Red Hat, Inc.   *   * Author: Oleg Nesterov <oleg@redhat.com> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 5f5963ba313e..911bd9076d43 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - *   * Copyright IBM Corporation, 2008   * - * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Author: Paul E. 
McKenney <paulmck@linux.ibm.com>   *   * For detailed explanation of Read-Copy Update mechanism see -   *		Documentation/RCU @@ -76,7 +63,7 @@ void rcu_qs(void)   * be called from hardirq context.  It is normally called from the   * scheduling-clock interrupt.   */ -void rcu_check_callbacks(int user) +void rcu_sched_clock_irq(int user)  {  	if (user) {  		rcu_qs(); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9180158756d2..acd6ccf56faf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1,27 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * Read-Copy Update mechanism for mutual exclusion   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - *   * Copyright IBM Corporation, 2008   *   * Authors: Dipankar Sarma <dipankar@in.ibm.com>   *	    Manfred Spraul <manfred@colorfullife.com> - *	    Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version + *	    Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical version   * - * Based on the original work by Paul McKenney <paulmck@us.ibm.com> + * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>   * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.   *   * For detailed explanation of Read-Copy Update mechanism see - @@ -62,6 +49,8 @@  #include <linux/suspend.h>  #include <linux/ftrace.h>  #include <linux/tick.h> +#include <linux/sysrq.h> +#include <linux/kprobes.h>  #include "tree.h"  #include "rcu.h" @@ -115,6 +104,9 @@ int num_rcu_lvl[] = NUM_RCU_LVL_INIT;  int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */  /* panic() on RCU Stall sysctl. */  int sysctl_panic_on_rcu_stall __read_mostly; +/* Commandeer a sysrq key to dump RCU's tree. */ +static bool sysrq_rcu; +module_param(sysrq_rcu, bool, 0444);  /*   * The rcu_scheduler_active variable is initialized to the value @@ -479,7 +471,6 @@ module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next  module_param(rcu_kick_kthreads, bool, 0644);  static void force_qs_rnp(int (*f)(struct rcu_data *rdp)); -static void force_quiescent_state(void);  static int rcu_pending(void);  /* @@ -504,13 +495,12 @@ unsigned long rcu_exp_batches_completed(void)  EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);  /* - * Force a quiescent state. + * Return the root node of the rcu_state structure.   */ -void rcu_force_quiescent_state(void) +static struct rcu_node *rcu_get_root(void)  { -	force_quiescent_state(); +	return &rcu_state.node[0];  } -EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);  /*   * Convert a ->gp_state value to a character string. 
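The tiny.c hunk above renames rcu_check_callbacks() to rcu_sched_clock_irq(); the function body is untouched, so callers in the scheduling-clock path simply switch to the new name while still passing the same user-mode flag. A minimal sketch of such a call site follows, assuming a stand-in tick handler (example_scheduler_tick and the include below are illustrative assumptions, not taken from this patch):

#include <linux/rcupdate.h>	/* assumed to make rcu_sched_clock_irq() visible */

/*
 * Stand-in for the real scheduling-clock handler, which lives outside
 * this diff.  @user_tick is nonzero if the tick interrupted user mode.
 */
static void example_scheduler_tick(int user_tick)
{
	/* ...timer and scheduler bookkeeping elided... */
	rcu_sched_clock_irq(user_tick);	/* was rcu_check_callbacks(user_tick) */
}
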
@@ -529,19 +519,30 @@ void show_rcu_gp_kthreads(void)  {  	int cpu;  	unsigned long j; +	unsigned long ja; +	unsigned long jr; +	unsigned long jw;  	struct rcu_data *rdp;  	struct rcu_node *rnp; -	j = jiffies - READ_ONCE(rcu_state.gp_activity); -	pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %ld\n", +	j = jiffies; +	ja = j - READ_ONCE(rcu_state.gp_activity); +	jr = j - READ_ONCE(rcu_state.gp_req_activity); +	jw = j - READ_ONCE(rcu_state.gp_wake_time); +	pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",  		rcu_state.name, gp_state_getname(rcu_state.gp_state), -		rcu_state.gp_state, rcu_state.gp_kthread->state, j); +		rcu_state.gp_state, +		rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL, +		ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), +		(long)READ_ONCE(rcu_state.gp_seq), +		(long)READ_ONCE(rcu_get_root()->gp_seq_needed), +		READ_ONCE(rcu_state.gp_flags));  	rcu_for_each_node_breadth_first(rnp) {  		if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed))  			continue; -		pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n", -			rnp->grplo, rnp->grphi, rnp->gp_seq, -			rnp->gp_seq_needed); +		pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", +			rnp->grplo, rnp->grphi, (long)rnp->gp_seq, +			(long)rnp->gp_seq_needed);  		if (!rcu_is_leaf_node(rnp))  			continue;  		for_each_leaf_node_possible_cpu(rnp, cpu) { @@ -550,14 +551,35 @@ void show_rcu_gp_kthreads(void)  			    ULONG_CMP_GE(rcu_state.gp_seq,  					 rdp->gp_seq_needed))  				continue; -			pr_info("\tcpu %d ->gp_seq_needed %lu\n", -				cpu, rdp->gp_seq_needed); +			pr_info("\tcpu %d ->gp_seq_needed %ld\n", +				cpu, (long)rdp->gp_seq_needed);  		}  	}  	/* sched_show_task(rcu_state.gp_kthread); */  }  EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); +/* Dump grace-period-request information due to commandeered sysrq. */ +static void sysrq_show_rcu(int key) +{ +	show_rcu_gp_kthreads(); +} + +static struct sysrq_key_op sysrq_rcudump_op = { +	.handler = sysrq_show_rcu, +	.help_msg = "show-rcu(y)", +	.action_msg = "Show RCU tree", +	.enable_mask = SYSRQ_ENABLE_DUMP, +}; + +static int __init rcu_sysrq_init(void) +{ +	if (sysrq_rcu) +		return register_sysrq_key('y', &sysrq_rcudump_op); +	return 0; +} +early_initcall(rcu_sysrq_init); +  /*   * Send along grace-period-related data for rcutorture diagnostics.   */ @@ -566,8 +588,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,  {  	switch (test_type) {  	case RCU_FLAVOR: -	case RCU_BH_FLAVOR: -	case RCU_SCHED_FLAVOR:  		*flags = READ_ONCE(rcu_state.gp_flags);  		*gp_seq = rcu_seq_current(&rcu_state.gp_seq);  		break; @@ -578,14 +598,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,  EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);  /* - * Return the root node of the rcu_state structure. - */ -static struct rcu_node *rcu_get_root(void) -{ -	return &rcu_state.node[0]; -} - -/*   * Enter an RCU extended quiescent state, which can be either the   * idle loop or adaptive-tickless usermode execution.   * @@ -701,7 +713,6 @@ static __always_inline void rcu_nmi_exit_common(bool irq)  /**   * rcu_nmi_exit - inform RCU of exit from NMI context - * @irq: Is this call from rcu_irq_exit?   *   * If you add or remove a call to rcu_nmi_exit(), be sure to test   * with CONFIG_RCU_EQS_DEBUG=y. 
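The hunk above wires show_rcu_gp_kthreads() to a commandeered sysrq key: a struct sysrq_key_op supplies the handler plus help/action strings, and register_sysrq_key() attaches it to 'y' from an early_initcall when the sysrq_rcu module parameter was set at boot. Once registered, the dump can be requested like any other sysrq action, for example by writing the key character to /proc/sysrq-trigger. Below is a minimal sketch of the same registration pattern for a hypothetical subsystem; everything prefixed with example_ (and the choice of key 'x') is an assumption for illustration only:

#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/printk.h>

/* Dump some diagnostic state when the sysrq key is pressed. */
static void example_sysrq_show(int key)
{
	pr_info("example: dumping diagnostic state (sysrq '%c')\n", key);
}

static struct sysrq_key_op example_sysrq_op = {
	.handler	= example_sysrq_show,
	.help_msg	= "show-example(x)",
	.action_msg	= "Show example state",
	.enable_mask	= SYSRQ_ENABLE_DUMP,
};

static int __init example_sysrq_init(void)
{
	return register_sysrq_key('x', &example_sysrq_op);
}
early_initcall(example_sysrq_init);
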
@@ -872,6 +883,7 @@ void rcu_nmi_enter(void)  {  	rcu_nmi_enter_common(false);  } +NOKPROBE_SYMBOL(rcu_nmi_enter);  /**   * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle @@ -1115,7 +1127,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)  	}  	/* -	 * NO_HZ_FULL CPUs can run in-kernel without rcu_check_callbacks! +	 * NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq!  	 * The above code handles this, but only for straight cond_resched().  	 * And some in-kernel loops check need_resched() before calling  	 * cond_resched(), which defeats the above code for CPUs that are @@ -1181,7 +1193,7 @@ static void rcu_check_gp_kthread_starvation(void)  		pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",  		       rcu_state.name, j,  		       (long)rcu_seq_current(&rcu_state.gp_seq), -		       rcu_state.gp_flags, +		       READ_ONCE(rcu_state.gp_flags),  		       gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,  		       gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);  		if (gpk) { @@ -1310,7 +1322,7 @@ static void print_other_cpu_stall(unsigned long gp_seq)  	panic_on_rcu_stall(); -	force_quiescent_state();  /* Kick them all. */ +	rcu_force_quiescent_state();  /* Kick them all. */  }  static void print_cpu_stall(void) @@ -1557,17 +1569,28 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)  }  /* - * Awaken the grace-period kthread.  Don't do a self-awaken, and don't - * bother awakening when there is nothing for the grace-period kthread - * to do (as in several CPUs raced to awaken, and we lost), and finally - * don't try to awaken a kthread that has not yet been created. + * Awaken the grace-period kthread.  Don't do a self-awaken (unless in + * an interrupt or softirq handler), and don't bother awakening when there + * is nothing for the grace-period kthread to do (as in several CPUs raced + * to awaken, and we lost), and finally don't try to awaken a kthread that + * has not yet been created.  If all those checks are passed, track some + * debug information and awaken. + * + * So why do the self-wakeup when in an interrupt or softirq handler + * in the grace-period kthread's context?  Because the kthread might have + * been interrupted just as it was going to sleep, and just after the final + * pre-sleep check of the awaken condition.  In this case, a wakeup really + * is required, and is therefore supplied.   */  static void rcu_gp_kthread_wake(void)  { -	if (current == rcu_state.gp_kthread || +	if ((current == rcu_state.gp_kthread && +	     !in_interrupt() && !in_serving_softirq()) ||  	    !READ_ONCE(rcu_state.gp_flags) ||  	    !rcu_state.gp_kthread)  		return; +	WRITE_ONCE(rcu_state.gp_wake_time, jiffies); +	WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));  	swake_up_one(&rcu_state.gp_wq);  } @@ -1711,7 +1734,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)  		zero_cpu_stall_ticks(rdp);  	}  	rdp->gp_seq = rnp->gp_seq;  /* Remember new grace-period state. */ -	if (ULONG_CMP_GE(rnp->gp_seq_needed, rdp->gp_seq_needed) || rdp->gpwrap) +	if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)  		rdp->gp_seq_needed = rnp->gp_seq_needed;  	WRITE_ONCE(rdp->gpwrap, false);  	rcu_gpnum_ovf(rnp, rdp); @@ -1939,7 +1962,7 @@ static void rcu_gp_fqs_loop(void)  		if (!ret) {  			rcu_state.jiffies_force_qs = jiffies + j;  			WRITE_ONCE(rcu_state.jiffies_kick_kthreads, -				   jiffies + 3 * j); +				   jiffies + (j ? 
3 * j : 2));  		}  		trace_rcu_grace_period(rcu_state.name,  				       READ_ONCE(rcu_state.gp_seq), @@ -2497,14 +2520,14 @@ static void rcu_do_batch(struct rcu_data *rdp)  }  /* - * Check to see if this CPU is in a non-context-switch quiescent state - * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). - * Also schedule RCU core processing. - * - * This function must be called from hardirq context.  It is normally - * invoked from the scheduling-clock interrupt. + * This function is invoked from each scheduling-clock interrupt, + * and checks to see if this CPU is in a non-context-switch quiescent + * state, for example, user mode or idle loop.  It also schedules RCU + * core processing.  If the current grace period has gone on too long, + * it will ask the scheduler to manufacture a context switch for the sole + * purpose of providing a providing the needed quiescent state.   */ -void rcu_check_callbacks(int user) +void rcu_sched_clock_irq(int user)  {  	trace_rcu_utilization(TPS("Start scheduler-tick"));  	raw_cpu_inc(rcu_data.ticks_this_gp); @@ -2517,7 +2540,7 @@ void rcu_check_callbacks(int user)  		}  		__this_cpu_write(rcu_data.rcu_urgent_qs, false);  	} -	rcu_flavor_check_callbacks(user); +	rcu_flavor_sched_clock_irq(user);  	if (rcu_pending())  		invoke_rcu_core(); @@ -2578,7 +2601,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))   * Force quiescent states on reluctant CPUs, and also detect which   * CPUs are in dyntick-idle mode.   */ -static void force_quiescent_state(void) +void rcu_force_quiescent_state(void)  {  	unsigned long flags;  	bool ret; @@ -2610,6 +2633,7 @@ static void force_quiescent_state(void)  	raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);  	rcu_gp_kthread_wake();  } +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);  /*   * This function checks for grace-period requests that fail to motivate @@ -2657,16 +2681,11 @@ rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,  		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);  		return;  	} -	pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x gs:%d %s->state:%#lx\n", -		 __func__, (long)READ_ONCE(rcu_state.gp_seq), -		 (long)READ_ONCE(rnp_root->gp_seq_needed), -		 j - rcu_state.gp_req_activity, j - rcu_state.gp_activity, -		 rcu_state.gp_flags, rcu_state.gp_state, rcu_state.name, -		 rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL);  	WARN_ON(1);  	if (rnp_root != rnp)  		raw_spin_unlock_rcu_node(rnp_root);  	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +	show_rcu_gp_kthreads();  }  /* @@ -2711,12 +2730,8 @@ void rcu_fwd_progress_check(unsigned long j)  }  EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); -/* - * This does the RCU core processing work for the specified rcu_data - * structures.  This may be called only from the CPU to whom the rdp - * belongs. - */ -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) +/* Perform RCU core processing work for the current CPU.  */ +static __latent_entropy void rcu_core(struct softirq_action *unused)  {  	unsigned long flags;  	struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); @@ -2801,9 +2816,9 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,  	/*  	 * Force the grace period if too many callbacks or too long waiting. -	 * Enforce hysteresis, and don't invoke force_quiescent_state() +	 * Enforce hysteresis, and don't invoke rcu_force_quiescent_state()  	 * if some other CPU has recently done so.  
Also, don't bother -	 * invoking force_quiescent_state() if the newly enqueued callback +	 * invoking rcu_force_quiescent_state() if the newly enqueued callback  	 * is the only one waiting for a grace period to complete.  	 */  	if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) > @@ -2820,7 +2835,7 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,  			rdp->blimit = LONG_MAX;  			if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&  			    rcu_segcblist_first_pend_cb(&rdp->cblist) != head) -				force_quiescent_state(); +				rcu_force_quiescent_state();  			rdp->n_force_qs_snap = rcu_state.n_force_qs;  			rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);  		} @@ -2889,9 +2904,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)  			rcu_segcblist_init(&rdp->cblist);  	}  	rcu_segcblist_enqueue(&rdp->cblist, head, lazy); -	if (!lazy) -		rcu_idle_count_callbacks_posted(); -  	if (__is_kfree_rcu_offset((unsigned long)func))  		trace_rcu_kfree_callback(rcu_state.name, head,  					 (unsigned long)func, @@ -2961,6 +2973,79 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)  }  EXPORT_SYMBOL_GPL(kfree_call_rcu); +/* + * During early boot, any blocking grace-period wait automatically + * implies a grace period.  Later on, this is never the case for PREEMPT. + * + * Howevr, because a context switch is a grace period for !PREEMPT, any + * blocking grace-period wait automatically implies a grace period if + * there is only one CPU online at any point time during execution of + * either synchronize_rcu() or synchronize_rcu_expedited().  It is OK to + * occasionally incorrectly indicate that there are multiple CPUs online + * when there was in fact only one the whole time, as this just adds some + * overhead: RCU still operates correctly. + */ +static int rcu_blocking_is_gp(void) +{ +	int ret; + +	if (IS_ENABLED(CONFIG_PREEMPT)) +		return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; +	might_sleep();  /* Check for RCU read-side critical section. */ +	preempt_disable(); +	ret = num_online_cpus() <= 1; +	preempt_enable(); +	return ret; +} + +/** + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed.  Note, however, that + * upon return from synchronize_rcu(), the caller might well be executing + * concurrently with new RCU read-side critical sections that began while + * synchronize_rcu() was waiting.  RCU read-side critical sections are + * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. + * In addition, regions of code across which interrupts, preemption, or + * softirqs have been disabled also serve as RCU read-side critical + * sections.  This includes hardware interrupt handlers, softirq handlers, + * and NMI handlers. + * + * Note that this guarantee implies further memory-ordering guarantees. + * On systems with more than one CPU, when synchronize_rcu() returns, + * each CPU is guaranteed to have executed a full memory barrier since + * the end of its last RCU read-side critical section whose beginning + * preceded the call to synchronize_rcu().  
In addition, each CPU having + * an RCU read-side critical section that extends beyond the return from + * synchronize_rcu() is guaranteed to have executed a full memory barrier + * after the beginning of synchronize_rcu() and before the beginning of + * that RCU read-side critical section.  Note that these guarantees include + * CPUs that are offline, idle, or executing in user mode, as well as CPUs + * that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_rcu(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). + */ +void synchronize_rcu(void) +{ +	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || +			 lock_is_held(&rcu_lock_map) || +			 lock_is_held(&rcu_sched_lock_map), +			 "Illegal synchronize_rcu() in RCU read-side critical section"); +	if (rcu_blocking_is_gp()) +		return; +	if (rcu_gp_is_expedited()) +		synchronize_rcu_expedited(); +	else +		wait_rcu_gp(call_rcu); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); +  /**   * get_state_synchronize_rcu - Snapshot current RCU state   * @@ -3049,28 +3134,6 @@ static int rcu_pending(void)  }  /* - * Return true if the specified CPU has any callback.  If all_lazy is - * non-NULL, store an indication of whether all callbacks are lazy. - * (If there are no callbacks, all of them are deemed to be lazy.) - */ -static bool rcu_cpu_has_callbacks(bool *all_lazy) -{ -	bool al = true; -	bool hc = false; -	struct rcu_data *rdp; - -	rdp = this_cpu_ptr(&rcu_data); -	if (!rcu_segcblist_empty(&rdp->cblist)) { -		hc = true; -		if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) -			al = false; -	} -	if (all_lazy) -		*all_lazy = al; -	return hc; -} - -/*   * Helper function for rcu_barrier() tracing.  If tracing is disabled,   * the compiler is expected to optimize this away.   */ @@ -3299,7 +3362,7 @@ int rcutree_prepare_cpu(unsigned int cpu)  	trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));  	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);  	rcu_prepare_kthreads(cpu); -	rcu_spawn_all_nocb_kthreads(cpu); +	rcu_spawn_cpu_nocb_kthread(cpu);  	return 0;  } @@ -3329,8 +3392,6 @@ int rcutree_online_cpu(unsigned int cpu)  	raw_spin_lock_irqsave_rcu_node(rnp, flags);  	rnp->ffmask |= rdp->grpmask;  	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -	if (IS_ENABLED(CONFIG_TREE_SRCU)) -		srcu_online_cpu(cpu);  	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)  		return 0; /* Too early in boot for scheduler work. */  	sync_sched_exp_online_cleanup(cpu); @@ -3355,8 +3416,6 @@ int rcutree_offline_cpu(unsigned int cpu)  	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);  	rcutree_affinity_setting(cpu, cpu); -	if (IS_ENABLED(CONFIG_TREE_SRCU)) -		srcu_offline_cpu(cpu);  	return 0;  } @@ -3777,7 +3836,7 @@ void __init rcu_init(void)  	rcu_init_one();  	if (dump_tree)  		rcu_dump_rcu_node_tree(); -	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); +	open_softirq(RCU_SOFTIRQ, rcu_core);  	/*  	 * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d90b02b53c0e..bb4f995f2d3f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -1,25 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0+ */  /*   * Read-Copy Update mechanism for mutual exclusion (tree-based version)   * Internal non-public definitions.   
* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - *   * Copyright IBM Corporation, 2008   *   * Author: Ingo Molnar <mingo@elte.hu> - *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com> + *	   Paul E. McKenney <paulmck@linux.ibm.com>   */  #include <linux/cache.h> @@ -36,7 +23,6 @@  /* Communicate arguments to a workqueue handler. */  struct rcu_exp_work { -	smp_call_func_t rew_func;  	unsigned long rew_s;  	struct work_struct rew_work;  }; @@ -194,10 +180,7 @@ struct rcu_data {  	bool rcu_need_heavy_qs;		/* GP old, so heavy quiescent state! */  	bool rcu_urgent_qs;		/* GP old need light quiescent state. */  #ifdef CONFIG_RCU_FAST_NO_HZ -	bool all_lazy;			/* Are all CPU's CBs lazy? */ -	unsigned long nonlazy_posted;	/* # times non-lazy CB posted to CPU. */ -	unsigned long nonlazy_posted_snap; -					/* Nonlazy_posted snapshot. */ +	bool all_lazy;			/* All CPU's CBs lazy at idle start? */  	unsigned long last_accelerate;	/* Last jiffy CBs were accelerated. */  	unsigned long last_advance_all;	/* Last jiffy CBs were all advanced. */  	int tick_nohz_enabled_snap;	/* Previously seen value from sysfs. */ @@ -234,7 +217,13 @@ struct rcu_data {  					/* Leader CPU takes GP-end wakeups. */  #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ -	/* 6) Diagnostic data, including RCU CPU stall warnings. */ +	/* 6) RCU priority boosting. */ +	struct task_struct *rcu_cpu_kthread_task; +					/* rcuc per-CPU kthread or NULL. */ +	unsigned int rcu_cpu_kthread_status; +	char rcu_cpu_has_work; + +	/* 7) Diagnostic data, including RCU CPU stall warnings. */  	unsigned int softirq_snap;	/* Snapshot of softirq activity. */  	/* ->rcu_iw* fields protected by leaf rcu_node ->lock. */  	struct irq_work rcu_iw;		/* Check for non-irq activity. */ @@ -303,6 +292,8 @@ struct rcu_state {  	struct swait_queue_head gp_wq;		/* Where GP task waits. */  	short gp_flags;				/* Commands for GP task. */  	short gp_state;				/* GP kthread sleep state. */ +	unsigned long gp_wake_time;		/* Last GP kthread wake. */ +	unsigned long gp_wake_seq;		/* ->gp_seq at ^^^. */  	/* End of fields guarded by root rcu_node's lock. 
*/ @@ -402,13 +393,6 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;  int rcu_dynticks_snap(struct rcu_data *rdp); -#ifdef CONFIG_RCU_BOOST -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DECLARE_PER_CPU(char, rcu_cpu_has_work); -#endif /* #ifdef CONFIG_RCU_BOOST */ -  /* Forward declarations for rcutree_plugin.h */  static void rcu_bootup_announce(void);  static void rcu_qs(void); @@ -420,7 +404,7 @@ static void rcu_print_detail_task_stall(void);  static int rcu_print_task_stall(struct rcu_node *rnp);  static int rcu_print_task_exp_stall(struct rcu_node *rnp);  static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); -static void rcu_flavor_check_callbacks(int user); +static void rcu_flavor_sched_clock_irq(int user);  void call_rcu(struct rcu_head *head, rcu_callback_t func);  static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); @@ -431,7 +415,6 @@ static void __init rcu_spawn_boost_kthreads(void);  static void rcu_prepare_kthreads(int cpu);  static void rcu_cleanup_after_idle(void);  static void rcu_prepare_for_idle(void); -static void rcu_idle_count_callbacks_posted(void);  static bool rcu_preempt_has_tasks(struct rcu_node *rnp);  static bool rcu_preempt_need_deferred_qs(struct task_struct *t);  static void rcu_preempt_deferred_qs(struct task_struct *t); @@ -451,7 +434,7 @@ static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,  static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);  static void do_nocb_deferred_wakeup(struct rcu_data *rdp);  static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); -static void rcu_spawn_all_nocb_kthreads(int cpu); +static void rcu_spawn_cpu_nocb_kthread(int cpu);  static void __init rcu_spawn_nocb_kthreads(void);  #ifdef CONFIG_RCU_NOCB_CPU  static void __init rcu_organize_nocb_kthreads(void); @@ -462,11 +445,3 @@ static void rcu_bind_gp_kthread(void);  static bool rcu_nohz_full_cpu(void);  static void rcu_dynticks_task_enter(void);  static void rcu_dynticks_task_exit(void); - -#ifdef CONFIG_SRCU -void srcu_online_cpu(unsigned int cpu); -void srcu_offline_cpu(unsigned int cpu); -#else /* #ifdef CONFIG_SRCU */ -void srcu_online_cpu(unsigned int cpu) { } -void srcu_offline_cpu(unsigned int cpu) { } -#endif /* #else #ifdef CONFIG_SRCU */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 928fe5893a57..4c2a0189e748 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -1,27 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0+ */  /*   * RCU expedited grace periods   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - *   * Copyright IBM Corporation, 2016   * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. 
McKenney <paulmck@linux.ibm.com>   */  #include <linux/lockdep.h> +static void rcu_exp_handler(void *unused); +  /*   * Record the start of an expedited grace period.   */ @@ -344,7 +333,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)  {  	int cpu;  	unsigned long flags; -	smp_call_func_t func;  	unsigned long mask_ofl_test;  	unsigned long mask_ofl_ipi;  	int ret; @@ -352,7 +340,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)  		container_of(wp, struct rcu_exp_work, rew_work);  	struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); -	func = rewp->rew_func;  	raw_spin_lock_irqsave_rcu_node(rnp, flags);  	/* Each pass checks a CPU for identity, offline, and idle. */ @@ -396,7 +383,7 @@ retry_ipi:  			mask_ofl_test |= mask;  			continue;  		} -		ret = smp_call_function_single(cpu, func, NULL, 0); +		ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);  		if (!ret) {  			mask_ofl_ipi &= ~mask;  			continue; @@ -426,7 +413,7 @@ retry_ipi:   * Select the nodes that the upcoming expedited grace period needs   * to wait for.   */ -static void sync_rcu_exp_select_cpus(smp_call_func_t func) +static void sync_rcu_exp_select_cpus(void)  {  	int cpu;  	struct rcu_node *rnp; @@ -440,7 +427,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func)  		rnp->exp_need_flush = false;  		if (!READ_ONCE(rnp->expmask))  			continue; /* Avoid early boot non-existent wq. */ -		rnp->rew.rew_func = func;  		if (!READ_ONCE(rcu_par_gp_wq) ||  		    rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||  		    rcu_is_last_leaf_node(rnp)) { @@ -449,7 +435,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func)  			continue;  		}  		INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); -		preempt_disable();  		cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);  		/* If all offline, queue the work on an unbound CPU. */  		if (unlikely(cpu > rnp->grphi - rnp->grplo)) @@ -457,7 +442,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func)  		else  			cpu += rnp->grplo;  		queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); -		preempt_enable();  		rnp->exp_need_flush = true;  	} @@ -580,10 +564,10 @@ static void rcu_exp_wait_wake(unsigned long s)   * Common code to drive an expedited grace period forward, used by   * workqueues and mid-boot-time tasks.   */ -static void rcu_exp_sel_wait_wake(smp_call_func_t func, unsigned long s) +static void rcu_exp_sel_wait_wake(unsigned long s)  {  	/* Initialize the rcu_node tree in preparation for the wait. */ -	sync_rcu_exp_select_cpus(func); +	sync_rcu_exp_select_cpus();  	/* Wait and clean up, including waking everyone. */  	rcu_exp_wait_wake(s); @@ -597,52 +581,7 @@ static void wait_rcu_exp_gp(struct work_struct *wp)  	struct rcu_exp_work *rewp;  	rewp = container_of(wp, struct rcu_exp_work, rew_work); -	rcu_exp_sel_wait_wake(rewp->rew_func, rewp->rew_s); -} - -/* - * Given a smp_call_function() handler, kick off the specified - * implementation of expedited grace period. - */ -static void _synchronize_rcu_expedited(smp_call_func_t func) -{ -	struct rcu_data *rdp; -	struct rcu_exp_work rew; -	struct rcu_node *rnp; -	unsigned long s; - -	/* If expedited grace periods are prohibited, fall back to normal. */ -	if (rcu_gp_is_normal()) { -		wait_rcu_gp(call_rcu); -		return; -	} - -	/* Take a snapshot of the sequence number.  */ -	s = rcu_exp_gp_seq_snap(); -	if (exp_funnel_lock(s)) -		return;  /* Someone else did our work for us. */ - -	/* Ensure that load happens before action based on it. 
*/ -	if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { -		/* Direct call during scheduler init and early_initcalls(). */ -		rcu_exp_sel_wait_wake(func, s); -	} else { -		/* Marshall arguments & schedule the expedited grace period. */ -		rew.rew_func = func; -		rew.rew_s = s; -		INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); -		queue_work(rcu_gp_wq, &rew.rew_work); -	} - -	/* Wait for expedited grace period to complete. */ -	rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); -	rnp = rcu_get_root(); -	wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], -		   sync_exp_work_done(s)); -	smp_mb(); /* Workqueue actions happen before return. */ - -	/* Let the next expedited grace period start. */ -	mutex_unlock(&rcu_state.exp_mutex); +	rcu_exp_sel_wait_wake(rewp->rew_s);  }  #ifdef CONFIG_PREEMPT_RCU @@ -654,7 +593,7 @@ static void _synchronize_rcu_expedited(smp_call_func_t func)   * ->expmask fields in the rcu_node tree.  Otherwise, immediately   * report the quiescent state.   */ -static void sync_rcu_exp_handler(void *unused) +static void rcu_exp_handler(void *unused)  {  	unsigned long flags;  	struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -697,6 +636,7 @@ static void sync_rcu_exp_handler(void *unused)  			WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true);  		}  		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +		return;  	}  	/* @@ -730,43 +670,10 @@ static void sync_sched_exp_online_cleanup(int cpu)  {  } -/** - * synchronize_rcu_expedited - Brute-force RCU grace period - * - * Wait for an RCU-preempt grace period, but expedite it.  The basic - * idea is to IPI all non-idle non-nohz online CPUs.  The IPI handler - * checks whether the CPU is in an RCU-preempt critical section, and - * if so, it sets a flag that causes the outermost rcu_read_unlock() - * to report the quiescent state.  On the other hand, if the CPU is - * not in an RCU read-side critical section, the IPI handler reports - * the quiescent state immediately. - * - * Although this is a greate improvement over previous expedited - * implementations, it is still unfriendly to real-time workloads, so is - * thus not recommended for any sort of common-case code.  In fact, if - * you are using synchronize_rcu_expedited() in a loop, please restructure - * your code to batch your updates, and then Use a single synchronize_rcu() - * instead. - * - * This has the same semantics as (but is more brutal than) synchronize_rcu(). - */ -void synchronize_rcu_expedited(void) -{ -	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || -			 lock_is_held(&rcu_lock_map) || -			 lock_is_held(&rcu_sched_lock_map), -			 "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); - -	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) -		return; -	_synchronize_rcu_expedited(sync_rcu_exp_handler); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -  #else /* #ifdef CONFIG_PREEMPT_RCU */  /* Invoked on each online non-idle CPU for expedited quiescent state. 
*/ -static void sync_sched_exp_handler(void *unused) +static void rcu_exp_handler(void *unused)  {  	struct rcu_data *rdp;  	struct rcu_node *rnp; @@ -798,44 +705,78 @@ static void sync_sched_exp_online_cleanup(int cpu)  	rnp = rdp->mynode;  	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))  		return; -	ret = smp_call_function_single(cpu, sync_sched_exp_handler, NULL, 0); +	ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);  	WARN_ON_ONCE(ret);  } -/* - * Because a context switch is a grace period for !PREEMPT, any - * blocking grace-period wait automatically implies a grace period if - * there is only one CPU online at any point time during execution of - * either synchronize_rcu() or synchronize_rcu_expedited().  It is OK to - * occasionally incorrectly indicate that there are multiple CPUs online - * when there was in fact only one the whole time, as this just adds some - * overhead: RCU still operates correctly. - */ -static int rcu_blocking_is_gp(void) -{ -	int ret; - -	might_sleep();  /* Check for RCU read-side critical section. */ -	preempt_disable(); -	ret = num_online_cpus() <= 1; -	preempt_enable(); -	return ret; -} +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ -/* PREEMPT=n implementation of synchronize_rcu_expedited(). */ +/** + * synchronize_rcu_expedited - Brute-force RCU grace period + * + * Wait for an RCU grace period, but expedite it.  The basic idea is to + * IPI all non-idle non-nohz online CPUs.  The IPI handler checks whether + * the CPU is in an RCU critical section, and if so, it sets a flag that + * causes the outermost rcu_read_unlock() to report the quiescent state + * for RCU-preempt or asks the scheduler for help for RCU-sched.  On the + * other hand, if the CPU is not in an RCU read-side critical section, + * the IPI handler reports the quiescent state immediately. + * + * Although this is a greate improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code.  In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then Use a single synchronize_rcu() + * instead. + * + * This has the same semantics as (but is more brutal than) synchronize_rcu(). + */  void synchronize_rcu_expedited(void)  { +	struct rcu_data *rdp; +	struct rcu_exp_work rew; +	struct rcu_node *rnp; +	unsigned long s; +  	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||  			 lock_is_held(&rcu_lock_map) ||  			 lock_is_held(&rcu_sched_lock_map),  			 "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); -	/* If only one CPU, this is automatically a grace period. */ +	/* Is the state is such that the call is a grace period? */  	if (rcu_blocking_is_gp())  		return; -	_synchronize_rcu_expedited(sync_sched_exp_handler); +	/* If expedited grace periods are prohibited, fall back to normal. */ +	if (rcu_gp_is_normal()) { +		wait_rcu_gp(call_rcu); +		return; +	} + +	/* Take a snapshot of the sequence number.  */ +	s = rcu_exp_gp_seq_snap(); +	if (exp_funnel_lock(s)) +		return;  /* Someone else did our work for us. */ + +	/* Ensure that load happens before action based on it. */ +	if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { +		/* Direct call during scheduler init and early_initcalls(). */ +		rcu_exp_sel_wait_wake(s); +	} else { +		/* Marshall arguments & schedule the expedited grace period. 
*/ +		rew.rew_s = s; +		INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); +		queue_work(rcu_gp_wq, &rew.rew_work); +	} + +	/* Wait for expedited grace period to complete. */ +	rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); +	rnp = rcu_get_root(); +	wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], +		   sync_exp_work_done(s)); +	smp_mb(); /* Workqueue actions happen before return. */ + +	/* Let the next expedited grace period start. */ +	mutex_unlock(&rcu_state.exp_mutex);  }  EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - -#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1b3dd2fc0cd6..97dba50f6fb2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1,27 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0+ */  /*   * Read-Copy Update mechanism for mutual exclusion (tree-based version)   * Internal non-public definitions that provide either classic   * or preemptible semantics.   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - *   * Copyright Red Hat, 2009   * Copyright IBM Corporation, 2009   *   * Author: Ingo Molnar <mingo@elte.hu> - *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com> + *	   Paul E. McKenney <paulmck@linux.ibm.com>   */  #include <linux/delay.h> @@ -34,17 +21,7 @@  #include "../time/tick-internal.h"  #ifdef CONFIG_RCU_BOOST -  #include "../locking/rtmutex_common.h" - -/* - * Control variables for per-CPU and per-rcu_node kthreads. - */ -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DEFINE_PER_CPU(char, rcu_cpu_has_work); -  #else /* #ifdef CONFIG_RCU_BOOST */  /* @@ -307,7 +284,7 @@ static void rcu_qs(void)  				       __this_cpu_read(rcu_data.gp_seq),  				       TPS("cpuqs"));  		__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); -		barrier(); /* Coordinate with rcu_flavor_check_callbacks(). */ +		barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */  		current->rcu_read_unlock_special.b.need_qs = false;  	}  } @@ -788,13 +765,13 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)  }  /* - * Check for a quiescent state from the current CPU.  When a task blocks, - * the task is recorded in the corresponding CPU's rcu_node structure, - * which is checked elsewhere. - * - * Caller must disable hard irqs. + * Check for a quiescent state from the current CPU, including voluntary + * context switches for Tasks RCU.  When a task blocks, the task is + * recorded in the corresponding CPU's rcu_node structure, which is checked + * elsewhere, hence this function need only check for quiescent states + * related to the current CPU, not to those related to tasks.   
*/ -static void rcu_flavor_check_callbacks(int user) +static void rcu_flavor_sched_clock_irq(int user)  {  	struct task_struct *t = current; @@ -825,54 +802,6 @@ static void rcu_flavor_check_callbacks(int user)  		t->rcu_read_unlock_special.b.need_qs = true;  } -/** - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed.  Note, however, that - * upon return from synchronize_rcu(), the caller might well be executing - * concurrently with new RCU read-side critical sections that began while - * synchronize_rcu() was waiting.  RCU read-side critical sections are - * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. - * In addition, regions of code across which interrupts, preemption, or - * softirqs have been disabled also serve as RCU read-side critical - * sections.  This includes hardware interrupt handlers, softirq handlers, - * and NMI handlers. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_rcu() returns, - * each CPU is guaranteed to have executed a full memory barrier since - * the end of its last RCU read-side critical section whose beginning - * preceded the call to synchronize_rcu().  In addition, each CPU having - * an RCU read-side critical section that extends beyond the return from - * synchronize_rcu() is guaranteed to have executed a full memory barrier - * after the beginning of synchronize_rcu() and before the beginning of - * that RCU read-side critical section.  Note that these guarantees include - * CPUs that are offline, idle, or executing in user mode, as well as CPUs - * that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_rcu(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but - * again only if the system has more than one CPU). - */ -void synchronize_rcu(void) -{ -	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || -			 lock_is_held(&rcu_lock_map) || -			 lock_is_held(&rcu_sched_lock_map), -			 "Illegal synchronize_rcu() in RCU read-side critical section"); -	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) -		return; -	if (rcu_gp_is_expedited()) -		synchronize_rcu_expedited(); -	else -		wait_rcu_gp(call_rcu); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); -  /*   * Check for a task exiting while in a preemptible-RCU read-side   * critical section, clean up if so.  No need to issue warnings, @@ -1088,14 +1017,10 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)  }  /* - * Check to see if this CPU is in a non-context-switch quiescent state - * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). - * Also schedule RCU core processing. - * - * This function must be called from hardirq context.  It is normally - * invoked from the scheduling-clock interrupt. + * Check to see if this CPU is in a non-context-switch quiescent state, + * namely user mode and idle loop.   
*/ -static void rcu_flavor_check_callbacks(int user) +static void rcu_flavor_sched_clock_irq(int user)  {  	if (user || rcu_is_cpu_rrupt_from_idle()) { @@ -1115,22 +1040,6 @@ static void rcu_flavor_check_callbacks(int user)  	}  } -/* PREEMPT=n implementation of synchronize_rcu(). */ -void synchronize_rcu(void) -{ -	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || -			 lock_is_held(&rcu_lock_map) || -			 lock_is_held(&rcu_sched_lock_map), -			 "Illegal synchronize_rcu() in RCU read-side critical section"); -	if (rcu_blocking_is_gp()) -		return; -	if (rcu_gp_is_expedited()) -		synchronize_rcu_expedited(); -	else -		wait_rcu_gp(call_rcu); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); -  /*   * Because preemptible RCU does not exist, tasks cannot possibly exit   * while in preemptible RCU read-side critical sections. @@ -1307,11 +1216,11 @@ static void invoke_rcu_callbacks_kthread(void)  	unsigned long flags;  	local_irq_save(flags); -	__this_cpu_write(rcu_cpu_has_work, 1); -	if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && -	    current != __this_cpu_read(rcu_cpu_kthread_task)) { -		rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), -			      __this_cpu_read(rcu_cpu_kthread_status)); +	__this_cpu_write(rcu_data.rcu_cpu_has_work, 1); +	if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL && +	    current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) { +		rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task), +			      __this_cpu_read(rcu_data.rcu_cpu_kthread_status));  	}  	local_irq_restore(flags);  } @@ -1322,7 +1231,7 @@ static void invoke_rcu_callbacks_kthread(void)   */  static bool rcu_is_callbacks_kthread(void)  { -	return __this_cpu_read(rcu_cpu_kthread_task) == current; +	return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current;  }  #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) @@ -1369,11 +1278,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)  	return 0;  } -static void rcu_kthread_do_work(void) -{ -	rcu_do_batch(this_cpu_ptr(&rcu_data)); -} -  static void rcu_cpu_kthread_setup(unsigned int cpu)  {  	struct sched_param sp; @@ -1384,12 +1288,12 @@ static void rcu_cpu_kthread_setup(unsigned int cpu)  static void rcu_cpu_kthread_park(unsigned int cpu)  { -	per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; +	per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;  }  static int rcu_cpu_kthread_should_run(unsigned int cpu)  { -	return __this_cpu_read(rcu_cpu_has_work); +	return __this_cpu_read(rcu_data.rcu_cpu_has_work);  }  /* @@ -1399,21 +1303,20 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu)   */  static void rcu_cpu_kthread(unsigned int cpu)  { -	unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); -	char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); +	unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); +	char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);  	int spincnt;  	for (spincnt = 0; spincnt < 10; spincnt++) {  		trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));  		local_bh_disable();  		*statusp = RCU_KTHREAD_RUNNING; -		this_cpu_inc(rcu_cpu_kthread_loops);  		local_irq_disable();  		work = *workp;  		*workp = 0;  		local_irq_enable();  		if (work) -			rcu_kthread_do_work(); +			rcu_do_batch(this_cpu_ptr(&rcu_data));  		local_bh_enable();  		if (*workp == 0) {  			trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); @@ -1459,7 +1362,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int 
outgoingcpu)  }  static struct smp_hotplug_thread rcu_cpu_thread_spec = { -	.store			= &rcu_cpu_kthread_task, +	.store			= &rcu_data.rcu_cpu_kthread_task,  	.thread_should_run	= rcu_cpu_kthread_should_run,  	.thread_fn		= rcu_cpu_kthread,  	.thread_comm		= "rcuc/%u", @@ -1476,7 +1379,7 @@ static void __init rcu_spawn_boost_kthreads(void)  	int cpu;  	for_each_possible_cpu(cpu) -		per_cpu(rcu_cpu_has_work, cpu) = 0; +		per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;  	if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__))  		return;  	rcu_for_each_leaf_node(rnp) @@ -1543,7 +1446,7 @@ static void rcu_prepare_kthreads(int cpu)  int rcu_needs_cpu(u64 basemono, u64 *nextevt)  {  	*nextevt = KTIME_MAX; -	return rcu_cpu_has_callbacks(NULL); +	return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist);  }  /* @@ -1562,14 +1465,6 @@ static void rcu_prepare_for_idle(void)  {  } -/* - * Don't bother keeping a running count of the number of RCU callbacks - * posted because CONFIG_RCU_FAST_NO_HZ=n. - */ -static void rcu_idle_count_callbacks_posted(void) -{ -} -  #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */  /* @@ -1652,11 +1547,8 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)  	lockdep_assert_irqs_disabled(); -	/* Snapshot to detect later posting of non-lazy callback. */ -	rdp->nonlazy_posted_snap = rdp->nonlazy_posted; -  	/* If no callbacks, RCU doesn't need the CPU. */ -	if (!rcu_cpu_has_callbacks(&rdp->all_lazy)) { +	if (rcu_segcblist_empty(&rdp->cblist)) {  		*nextevt = KTIME_MAX;  		return 0;  	} @@ -1670,11 +1562,12 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)  	rdp->last_accelerate = jiffies;  	/* Request timer delay depending on laziness, and round. */ -	if (!rdp->all_lazy) { +	rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist); +	if (rdp->all_lazy) { +		dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; +	} else {  		dj = round_up(rcu_idle_gp_delay + jiffies,  			       rcu_idle_gp_delay) - jiffies; -	} else { -		dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;  	}  	*nextevt = basemono + dj * TICK_NSEC;  	return 0; @@ -1704,7 +1597,7 @@ static void rcu_prepare_for_idle(void)  	/* Handle nohz enablement switches conservatively. */  	tne = READ_ONCE(tick_nohz_active);  	if (tne != rdp->tick_nohz_enabled_snap) { -		if (rcu_cpu_has_callbacks(NULL)) +		if (!rcu_segcblist_empty(&rdp->cblist))  			invoke_rcu_core(); /* force nohz to see update. */  		rdp->tick_nohz_enabled_snap = tne;  		return; @@ -1717,10 +1610,8 @@ static void rcu_prepare_for_idle(void)  	 * callbacks, invoke RCU core for the side-effect of recalculating  	 * idle duration on re-entry to idle.  	 */ -	if (rdp->all_lazy && -	    rdp->nonlazy_posted != rdp->nonlazy_posted_snap) { +	if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) {  		rdp->all_lazy = false; -		rdp->nonlazy_posted_snap = rdp->nonlazy_posted;  		invoke_rcu_core();  		return;  	} @@ -1756,19 +1647,6 @@ static void rcu_cleanup_after_idle(void)  		invoke_rcu_core();  } -/* - * Keep a running count of the number of non-lazy callbacks posted - * on this CPU.  This running counter (which is never decremented) allows - * rcu_prepare_for_idle() to detect when something out of the idle loop - * posts a callback, even if an equal number of callbacks are invoked. - * Of course, callbacks should only be posted from within a trace event - * designed to be called from idle or from within RCU_NONIDLE(). 
- */ -static void rcu_idle_count_callbacks_posted(void) -{ -	__this_cpu_add(rcu_data.nonlazy_posted, 1); -} -  #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */  #ifdef CONFIG_RCU_FAST_NO_HZ @@ -1776,13 +1654,12 @@ static void rcu_idle_count_callbacks_posted(void)  static void print_cpu_stall_fast_no_hz(char *cp, int cpu)  {  	struct rcu_data *rdp = &per_cpu(rcu_data, cpu); -	unsigned long nlpd = rdp->nonlazy_posted - rdp->nonlazy_posted_snap; -	sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", +	sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c",  		rdp->last_accelerate & 0xffff, jiffies & 0xffff, -		ulong2long(nlpd), -		rdp->all_lazy ? 'L' : '.', -		rdp->tick_nohz_enabled_snap ? '.' : 'D'); +		".l"[rdp->all_lazy], +		".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], +		".D"[!rdp->tick_nohz_enabled_snap]);  }  #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ @@ -1868,22 +1745,24 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp)  /*   * Offload callback processing from the boot-time-specified set of CPUs - * specified by rcu_nocb_mask.  For each CPU in the set, there is a - * kthread created that pulls the callbacks from the corresponding CPU, - * waits for a grace period to elapse, and invokes the callbacks. - * The no-CBs CPUs do a wake_up() on their kthread when they insert - * a callback into any empty list, unless the rcu_nocb_poll boot parameter - * has been specified, in which case each kthread actively polls its - * CPU.  (Which isn't so great for energy efficiency, but which does - * reduce RCU's overhead on that CPU.) + * specified by rcu_nocb_mask.  For the CPUs in the set, there are kthreads + * created that pull the callbacks from the corresponding CPU, wait for + * a grace period to elapse, and invoke the callbacks.  These kthreads + * are organized into leaders, which manage incoming callbacks, wait for + * grace periods, and awaken followers, and the followers, which only + * invoke callbacks.  Each leader is its own follower.  The no-CBs CPUs + * do a wake_up() on their kthread when they insert a callback into any + * empty list, unless the rcu_nocb_poll boot parameter has been specified, + * in which case each kthread actively polls its CPU.  (Which isn't so great + * for energy efficiency, but which does reduce RCU's overhead on that CPU.)   *   * This is intended to be used in conjunction with Frederic Weisbecker's   * adaptive-idle work, which would seriously reduce OS jitter on CPUs   * running CPU-bound user-mode computations.   * - * Offloading of callback processing could also in theory be used as - * an energy-efficiency measure because CPUs with no RCU callbacks - * queued are more aggressive about entering dyntick-idle mode. + * Offloading of callbacks can also be used as an energy-efficiency + * measure because CPUs with no RCU callbacks queued are more aggressive + * about entering dyntick-idle mode.   */ @@ -1987,10 +1866,7 @@ static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype,  	raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);  } -/* - * Does the specified CPU need an RCU callback for this invocation - * of rcu_barrier()? - */ +/* Does rcu_barrier need to queue an RCU callback on the specified CPU?  */  static bool rcu_nocb_cpu_needs_barrier(int cpu)  {  	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); @@ -2006,8 +1882,8 @@ static bool rcu_nocb_cpu_needs_barrier(int cpu)  	 * callbacks would be posted.  
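
The reworked stall printout above packs three booleans into single status characters by indexing two-character string literals ("xy"[flag] picks 'y' when flag is 1 and 'x' when it is 0). A stand-alone illustration of that idiom with made-up values:

#include <stdio.h>

int main(void)
{
	int all_lazy = 1, have_nonlazy_cbs = 0, tick_nohz_enabled = 1;

	/* Prints "lL." for these values: lazy, no non-lazy callbacks, nohz on. */
	printf("%c%c%c\n",
	       ".l"[all_lazy],
	       ".L"[!have_nonlazy_cbs],
	       ".D"[!tick_nohz_enabled]);
	return 0;
}
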
In the worst case, the first  	 * barrier in rcu_barrier() suffices (but the caller cannot  	 * necessarily rely on this, not a substitute for the caller -	 * getting the concurrency design right!).  There must also be -	 * a barrier between the following load an posting of a callback +	 * getting the concurrency design right!).  There must also be a +	 * barrier between the following load and posting of a callback  	 * (if a callback is in fact needed).  This is associated with an  	 * atomic_inc() in the caller.  	 */ @@ -2517,9 +2393,9 @@ static void rcu_spawn_one_nocb_kthread(int cpu)  /*   * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthreads, spawn them. + * rcuo kthread, spawn it.   */ -static void rcu_spawn_all_nocb_kthreads(int cpu) +static void rcu_spawn_cpu_nocb_kthread(int cpu)  {  	if (rcu_scheduler_fully_active)  		rcu_spawn_one_nocb_kthread(cpu); @@ -2536,7 +2412,7 @@ static void __init rcu_spawn_nocb_kthreads(void)  	int cpu;  	for_each_online_cpu(cpu) -		rcu_spawn_all_nocb_kthreads(cpu); +		rcu_spawn_cpu_nocb_kthread(cpu);  }  /* How many follower CPU IDs per leader?  Default of -1 for sqrt(nr_cpu_ids). */ @@ -2670,7 +2546,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)  {  } -static void rcu_spawn_all_nocb_kthreads(int cpu) +static void rcu_spawn_cpu_nocb_kthread(int cpu)  {  } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1971869c4072..cbaa976c5945 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -1,26 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * Read-Copy Update mechanism for mutual exclusion   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - *   * Copyright IBM Corporation, 2001   *   * Authors: Dipankar Sarma <dipankar@in.ibm.com>   *	    Manfred Spraul <manfred@colorfullife.com>   * - * Based on the original work by Paul McKenney <paulmck@us.ibm.com> + * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>   * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.   * Papers:   * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf @@ -52,6 +39,7 @@  #include <linux/tick.h>  #include <linux/rcupdate_wait.h>  #include <linux/sched/isolation.h> +#include <linux/kprobes.h>  #define CREATE_TRACE_POINTS @@ -249,6 +237,7 @@ int notrace debug_lockdep_rcu_enabled(void)  	       current->lockdep_recursion == 0;  }  EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); +NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled);  /**   * rcu_read_lock_held() - might we be in RCU read-side critical section? 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8d76a65cfdd..ead464a0f2e5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -107,11 +107,12 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)  		 *					[L] ->on_rq  		 *	RELEASE (rq->lock)  		 * -		 * If we observe the old CPU in task_rq_lock, the acquire of +		 * If we observe the old CPU in task_rq_lock(), the acquire of  		 * the old rq->lock will fully serialize against the stores.  		 * -		 * If we observe the new CPU in task_rq_lock, the acquire will -		 * pair with the WMB to ensure we must then also see migrating. +		 * If we observe the new CPU in task_rq_lock(), the address +		 * dependency headed by '[L] rq = task_rq()' and the acquire +		 * will pair with the WMB to ensure we then also see migrating.  		 */  		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {  			rq_pin_lock(rq, rf); @@ -180,6 +181,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)  	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))  		update_irq_load_avg(rq, irq_delta + steal);  #endif +	update_rq_clock_pelt(rq, delta);  }  void update_rq_clock(struct rq *rq) @@ -396,19 +398,7 @@ static bool set_nr_if_polling(struct task_struct *p)  #endif  #endif -/** - * wake_q_add() - queue a wakeup for 'later' waking. - * @head: the wake_q_head to add @task to - * @task: the task to queue for 'later' wakeup - * - * Queue a task for later wakeup, most likely by the wake_up_q() call in the - * same context, _HOWEVER_ this is not guaranteed, the wakeup can come - * instantly. - * - * This function must be used as-if it were wake_up_process(); IOW the task - * must be ready to be woken at this location. - */ -void wake_q_add(struct wake_q_head *head, struct task_struct *task) +static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)  {  	struct wake_q_node *node = &task->wake_q; @@ -421,16 +411,56 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)  	 * state, even in the failed case, an explicit smp_mb() must be used.  	 */  	smp_mb__before_atomic(); -	if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)) -		return; - -	get_task_struct(task); +	if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) +		return false;  	/*  	 * The head is context local, there can be no concurrency.  	 */  	*head->lastp = node;  	head->lastp = &node->next; +	return true; +} + +/** + * wake_q_add() - queue a wakeup for 'later' waking. + * @head: the wake_q_head to add @task to + * @task: the task to queue for 'later' wakeup + * + * Queue a task for later wakeup, most likely by the wake_up_q() call in the + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come + * instantly. + * + * This function must be used as-if it were wake_up_process(); IOW the task + * must be ready to be woken at this location. + */ +void wake_q_add(struct wake_q_head *head, struct task_struct *task) +{ +	if (__wake_q_add(head, task)) +		get_task_struct(task); +} + +/** + * wake_q_add_safe() - safely queue a wakeup for 'later' waking. + * @head: the wake_q_head to add @task to + * @task: the task to queue for 'later' wakeup + * + * Queue a task for later wakeup, most likely by the wake_up_q() call in the + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come + * instantly. + * + * This function must be used as-if it were wake_up_process(); IOW the task + * must be ready to be woken at this location. 
+ * + * This function is essentially a task-safe equivalent to wake_q_add(). Callers + * that already hold reference to @task can call the 'safe' version and trust + * wake_q to do the right thing depending whether or not the @task is already + * queued for wakeup. + */ +void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) +{ +	if (!__wake_q_add(head, task)) +		put_task_struct(task);  }  void wake_up_q(struct wake_q_head *head) @@ -928,7 +958,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,  {  	lockdep_assert_held(&rq->lock); -	p->on_rq = TASK_ON_RQ_MIGRATING; +	WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);  	dequeue_task(rq, p, DEQUEUE_NOCLOCK);  	set_task_cpu(p, new_cpu);  	rq_unlock(rq, rf); @@ -2190,6 +2220,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)  	INIT_HLIST_HEAD(&p->preempt_notifiers);  #endif +#ifdef CONFIG_COMPACTION +	p->capture_control = NULL; +#endif  	init_numa_balancing(clone_flags, p);  } @@ -2431,7 +2464,7 @@ void wake_up_new_task(struct task_struct *p)  #endif  	rq = __task_rq_lock(p, &rf);  	update_rq_clock(rq); -	post_init_entity_util_avg(&p->se); +	post_init_entity_util_avg(p);  	activate_task(rq, p, ENQUEUE_NOCLOCK);  	p->on_rq = TASK_ON_RQ_QUEUED; @@ -5265,9 +5298,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,  }  #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, -		       compat_pid_t, pid, -		       struct old_timespec32 __user *, interval) +SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, +		struct old_timespec32 __user *, interval)  {  	struct timespec64 t;  	int retval = sched_rr_get_interval(pid, &t); @@ -5867,14 +5899,11 @@ void __init sched_init_smp(void)  	/*  	 * There's no userspace yet to cause hotplug operations; hence all the  	 * CPU masks are stable and all blatant races in the below code cannot -	 * happen. The hotplug lock is nevertheless taken to satisfy lockdep, -	 * but there won't be any contention on it. +	 * happen.  	 */ -	cpus_read_lock();  	mutex_lock(&sched_domains_mutex);  	sched_init_domains(cpu_active_mask);  	mutex_unlock(&sched_domains_mutex); -	cpus_read_unlock();  	/* Move init over to a non-isolated CPU */  	if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) @@ -6162,6 +6191,34 @@ void ___might_sleep(const char *file, int line, int preempt_offset)  	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);  }  EXPORT_SYMBOL(___might_sleep); + +void __cant_sleep(const char *file, int line, int preempt_offset) +{ +	static unsigned long prev_jiffy; + +	if (irqs_disabled()) +		return; + +	if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) +		return; + +	if (preempt_count() > preempt_offset) +		return; + +	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) +		return; +	prev_jiffy = jiffies; + +	printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); +	printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", +			in_atomic(), irqs_disabled(), +			current->pid, current->comm); + +	debug_show_held_locks(current); +	dump_stack(); +	add_taint(TAINT_WARN, LOCKDEP_STILL_OK); +} +EXPORT_SYMBOL_GPL(__cant_sleep);  #endif  #ifdef CONFIG_MAGIC_SYSRQ diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 22bd8980f32f..835671f0f917 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -48,8 +48,8 @@ EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);   *   * Clear the update_util_data pointer for the given CPU.   
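
The wake_q_add()/wake_q_add_safe() split earlier in this kernel/sched/core.c hunk comes down to who owns the task reference: wake_q_add() takes a new reference only when it actually queues the task, while wake_q_add_safe() consumes a reference the caller already holds, dropping it if the task turns out to be queued already. A hypothetical caller of the safe variant (not part of the patch):

/* Sketch: transfer an already-held reference to the wake queue. */
static void demo_queue_waiter(struct wake_q_head *wake_q,
			      struct task_struct *waiter)
{
	get_task_struct(waiter);		/* reference owned by this caller */
	wake_q_add_safe(wake_q, waiter);	/* consumed: kept if queued, put if not */
}
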
* - * Callers must use RCU-sched callbacks to free any memory that might be - * accessed via the old update_util_data pointer or invoke synchronize_sched() + * Callers must use RCU callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_rcu()   * right after this function to avoid use-after-free.   */  void cpufreq_remove_update_util_hook(int cpu) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 033ec7c45f13..2efe629425be 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -859,7 +859,7 @@ static void sugov_stop(struct cpufreq_policy *policy)  	for_each_cpu(cpu, policy->cpus)  		cpufreq_remove_update_util_hook(cpu); -	synchronize_sched(); +	synchronize_rcu();  	if (!policy->fast_switch_enabled) {  		irq_work_sync(&sg_policy->irq_work); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fb8b7b5d745d..6a73e41a2016 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1767,7 +1767,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)  	deadline_queue_push_tasks(rq);  	if (rq->curr->sched_class != &dl_sched_class) -		update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); +		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);  	return p;  } @@ -1776,7 +1776,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)  {  	update_curr_dl(rq); -	update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); +	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);  	if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)  		enqueue_pushable_dl_task(rq, p);  } @@ -1793,7 +1793,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)  {  	update_curr_dl(rq); -	update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); +	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);  	/*  	 * Even when we have runtime, update_curr_dl() might have resulted in us  	 * not being the leftmost task anymore. 
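
The reworded cpufreq_remove_update_util_hook() comment above prescribes the usual teardown ordering: clear the per-CPU hook, wait for a grace period, then free. A minimal fragment following that rule (hypothetical caller, not part of the patch):

static void demo_governor_teardown(int cpu, void *hook_data)
{
	cpufreq_remove_update_util_hook(cpu);
	synchronize_rcu();	/* no CPU can still be running the old hook */
	kfree(hook_data);	/* now safe to free what the hook dereferenced */
}
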
In that case NEED_RESCHED will diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index de3de997e245..8039d62ae36e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -315,6 +315,7 @@ void register_sched_domain_sysctl(void)  {  	static struct ctl_table *cpu_entries;  	static struct ctl_table **cpu_idx; +	static bool init_done = false;  	char buf[32];  	int i; @@ -344,7 +345,10 @@ void register_sched_domain_sysctl(void)  	if (!cpumask_available(sd_sysctl_cpus)) {  		if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))  			return; +	} +	if (!init_done) { +		init_done = true;  		/* init to possible to not have holes in @cpu_entries */  		cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);  	} diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 310d0637fe4b..ea74d43924b2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -248,13 +248,6 @@ const struct sched_class fair_sched_class;   */  #ifdef CONFIG_FAIR_GROUP_SCHED - -/* cpu runqueue to which this cfs_rq is attached */ -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ -	return cfs_rq->rq; -} -  static inline struct task_struct *task_of(struct sched_entity *se)  {  	SCHED_WARN_ON(!entity_is_task(se)); @@ -282,79 +275,103 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)  	return grp->my_q;  } -static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)  { -	if (!cfs_rq->on_list) { -		struct rq *rq = rq_of(cfs_rq); -		int cpu = cpu_of(rq); +	struct rq *rq = rq_of(cfs_rq); +	int cpu = cpu_of(rq); + +	if (cfs_rq->on_list) +		return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list; + +	cfs_rq->on_list = 1; + +	/* +	 * Ensure we either appear before our parent (if already +	 * enqueued) or force our parent to appear after us when it is +	 * enqueued. The fact that we always enqueue bottom-up +	 * reduces this to two cases and a special case for the root +	 * cfs_rq. Furthermore, it also means that we will always reset +	 * tmp_alone_branch either when the branch is connected +	 * to a tree or when we reach the top of the tree +	 */ +	if (cfs_rq->tg->parent && +	    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {  		/* -		 * Ensure we either appear before our parent (if already -		 * enqueued) or force our parent to appear after us when it is -		 * enqueued. The fact that we always enqueue bottom-up -		 * reduces this to two cases and a special case for the root -		 * cfs_rq. Furthermore, it also means that we will always reset -		 * tmp_alone_branch either when the branch is connected -		 * to a tree or when we reach the beg of the tree +		 * If parent is already on the list, we add the child +		 * just before. Thanks to circular linked property of +		 * the list, this means to put the child at the tail +		 * of the list that starts by parent.  		 */ -		if (cfs_rq->tg->parent && -		    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { -			/* -			 * If parent is already on the list, we add the child -			 * just before. Thanks to circular linked property of -			 * the list, this means to put the child at the tail -			 * of the list that starts by parent. -			 */ -			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, -				&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); -			/* -			 * The branch is now connected to its tree so we can -			 * reset tmp_alone_branch to the beginning of the -			 * list. 
-			 */ -			rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; -		} else if (!cfs_rq->tg->parent) { -			/* -			 * cfs rq without parent should be put -			 * at the tail of the list. -			 */ -			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, -				&rq->leaf_cfs_rq_list); -			/* -			 * We have reach the beg of a tree so we can reset -			 * tmp_alone_branch to the beginning of the list. -			 */ -			rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; -		} else { -			/* -			 * The parent has not already been added so we want to -			 * make sure that it will be put after us. -			 * tmp_alone_branch points to the beg of the branch -			 * where we will add parent. -			 */ -			list_add_rcu(&cfs_rq->leaf_cfs_rq_list, -				rq->tmp_alone_branch); -			/* -			 * update tmp_alone_branch to points to the new beg -			 * of the branch -			 */ -			rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; -		} +		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, +			&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); +		/* +		 * The branch is now connected to its tree so we can +		 * reset tmp_alone_branch to the beginning of the +		 * list. +		 */ +		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; +		return true; +	} -		cfs_rq->on_list = 1; +	if (!cfs_rq->tg->parent) { +		/* +		 * cfs rq without parent should be put +		 * at the tail of the list. +		 */ +		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, +			&rq->leaf_cfs_rq_list); +		/* +		 * We have reach the top of a tree so we can reset +		 * tmp_alone_branch to the beginning of the list. +		 */ +		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; +		return true;  	} + +	/* +	 * The parent has not already been added so we want to +	 * make sure that it will be put after us. +	 * tmp_alone_branch points to the begin of the branch +	 * where we will add parent. +	 */ +	list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch); +	/* +	 * update tmp_alone_branch to points to the new begin +	 * of the branch +	 */ +	rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; +	return false;  }  static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)  {  	if (cfs_rq->on_list) { +		struct rq *rq = rq_of(cfs_rq); + +		/* +		 * With cfs_rq being unthrottled/throttled during an enqueue, +		 * it can happen the tmp_alone_branch points the a leaf that +		 * we finally want to del. In this case, tmp_alone_branch moves +		 * to the prev element but it will point to rq->leaf_cfs_rq_list +		 * at the end of the enqueue. +		 */ +		if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list) +			rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev; +  		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);  		cfs_rq->on_list = 0;  	}  } -/* Iterate through all leaf cfs_rq's on a runqueue: */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ -	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) +static inline void assert_list_leaf_cfs_rq(struct rq *rq) +{ +	SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); +} + +/* Iterate thr' all leaf cfs_rq's on a runqueue */ +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\ +	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\ +				 leaf_cfs_rq_list)  /* Do the two (enqueued) entities belong to the same group ? 
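
The new for_each_leaf_cfs_rq_safe() iterator defined just above exists so that a walker may unlink the cfs_rq it is currently visiting, as update_blocked_averages() does later in this file. A generic illustration of the underlying safe-iteration idiom, using a hypothetical item type:

#include <linux/list.h>
#include <linux/slab.h>

struct demo_item {
	int			val;
	struct list_head	node;
};

/* Drop zero-valued items: the _safe variant caches the next node first,
 * so deleting the current one does not break the walk. */
static void demo_prune(struct list_head *head)
{
	struct demo_item *it, *pos;

	list_for_each_entry_safe(it, pos, head, node) {
		if (!it->val) {
			list_del(&it->node);
			kfree(it);
		}
	}
}
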
*/  static inline struct cfs_rq * @@ -410,12 +427,6 @@ static inline struct task_struct *task_of(struct sched_entity *se)  	return container_of(se, struct task_struct, se);  } -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ -	return container_of(cfs_rq, struct rq, cfs); -} - -  #define for_each_sched_entity(se) \  		for (; se; se = NULL) @@ -438,16 +449,21 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)  	return NULL;  } -static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)  { +	return true;  }  static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)  {  } -#define for_each_leaf_cfs_rq(rq, cfs_rq)	\ -		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) +static inline void assert_list_leaf_cfs_rq(struct rq *rq) +{ +} + +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\ +		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)  static inline struct sched_entity *parent_entity(struct sched_entity *se)  { @@ -686,9 +702,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)  	return calc_delta_fair(sched_slice(cfs_rq, se), se);  } -#ifdef CONFIG_SMP  #include "pelt.h" -#include "sched-pelt.h" +#ifdef CONFIG_SMP  static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);  static unsigned long task_h_load(struct task_struct *p); @@ -744,8 +759,9 @@ static void attach_entity_cfs_rq(struct sched_entity *se);   * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)   * if util_avg > util_avg_cap.   */ -void post_init_entity_util_avg(struct sched_entity *se) +void post_init_entity_util_avg(struct task_struct *p)  { +	struct sched_entity *se = &p->se;  	struct cfs_rq *cfs_rq = cfs_rq_of(se);  	struct sched_avg *sa = &se->avg;  	long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); @@ -763,22 +779,19 @@ void post_init_entity_util_avg(struct sched_entity *se)  		}  	} -	if (entity_is_task(se)) { -		struct task_struct *p = task_of(se); -		if (p->sched_class != &fair_sched_class) { -			/* -			 * For !fair tasks do: -			 * -			update_cfs_rq_load_avg(now, cfs_rq); -			attach_entity_load_avg(cfs_rq, se, 0); -			switched_from_fair(rq, p); -			 * -			 * such that the next switched_to_fair() has the -			 * expected state. -			 */ -			se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); -			return; -		} +	if (p->sched_class != &fair_sched_class) { +		/* +		 * For !fair tasks do: +		 * +		update_cfs_rq_load_avg(now, cfs_rq); +		attach_entity_load_avg(cfs_rq, se, 0); +		switched_from_fair(rq, p); +		 * +		 * such that the next switched_to_fair() has the +		 * expected state. 
+		 */ +		se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq); +		return;  	}  	attach_entity_cfs_rq(se); @@ -788,7 +801,7 @@ void post_init_entity_util_avg(struct sched_entity *se)  void init_entity_runnable_average(struct sched_entity *se)  {  } -void post_init_entity_util_avg(struct sched_entity *se) +void post_init_entity_util_avg(struct task_struct *p)  {  }  static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) @@ -1035,7 +1048,7 @@ unsigned int sysctl_numa_balancing_scan_size = 256;  unsigned int sysctl_numa_balancing_scan_delay = 1000;  struct numa_group { -	atomic_t refcount; +	refcount_t refcount;  	spinlock_t lock; /* nr_tasks, tasks */  	int nr_tasks; @@ -1104,7 +1117,7 @@ static unsigned int task_scan_start(struct task_struct *p)  		unsigned long shared = group_faults_shared(ng);  		unsigned long private = group_faults_priv(ng); -		period *= atomic_read(&ng->refcount); +		period *= refcount_read(&ng->refcount);  		period *= shared + 1;  		period /= private + shared + 1;  	} @@ -1127,7 +1140,7 @@ static unsigned int task_scan_max(struct task_struct *p)  		unsigned long private = group_faults_priv(ng);  		unsigned long period = smax; -		period *= atomic_read(&ng->refcount); +		period *= refcount_read(&ng->refcount);  		period *= shared + 1;  		period /= private + shared + 1; @@ -1160,7 +1173,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)  	/* New address space, reset the preferred nid */  	if (!(clone_flags & CLONE_VM)) { -		p->numa_preferred_nid = -1; +		p->numa_preferred_nid = NUMA_NO_NODE;  		return;  	} @@ -1180,13 +1193,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)  static void account_numa_enqueue(struct rq *rq, struct task_struct *p)  { -	rq->nr_numa_running += (p->numa_preferred_nid != -1); +	rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);  	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));  }  static void account_numa_dequeue(struct rq *rq, struct task_struct *p)  { -	rq->nr_numa_running -= (p->numa_preferred_nid != -1); +	rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);  	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));  } @@ -1400,7 +1413,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,  	 * two full passes of the "multi-stage node selection" test that is  	 * executed below.  	 
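
The numa_group conversion in this kernel/sched/fair.c hunk swaps a bare atomic_t reference count for refcount_t, whose helpers saturate instead of wrapping and map one-to-one onto the old atomic_inc_not_zero()/atomic_dec_and_test() calls. A minimal sketch of the same get/put pattern on a hypothetical object:

#include <linux/refcount.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_obj {
	refcount_t	refcount;
	struct rcu_head	rcu;
};

static bool demo_get(struct demo_obj *obj)
{
	/* Fails once the count has already dropped to zero. */
	return refcount_inc_not_zero(&obj->refcount);
}

static void demo_put(struct demo_obj *obj)
{
	if (refcount_dec_and_test(&obj->refcount))
		kfree_rcu(obj, rcu);
}
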
*/ -	if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && +	if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&  	    (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))  		return true; @@ -1848,7 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p)  	unsigned long interval = HZ;  	/* This task has no NUMA fault statistics yet */ -	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) +	if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))  		return;  	/* Periodically retry migrating the task to the preferred node */ @@ -2095,7 +2108,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)  static void task_numa_placement(struct task_struct *p)  { -	int seq, nid, max_nid = -1; +	int seq, nid, max_nid = NUMA_NO_NODE;  	unsigned long max_faults = 0;  	unsigned long fault_types[2] = { 0, 0 };  	unsigned long total_faults; @@ -2203,12 +2216,12 @@ static void task_numa_placement(struct task_struct *p)  static inline int get_numa_group(struct numa_group *grp)  { -	return atomic_inc_not_zero(&grp->refcount); +	return refcount_inc_not_zero(&grp->refcount);  }  static inline void put_numa_group(struct numa_group *grp)  { -	if (atomic_dec_and_test(&grp->refcount)) +	if (refcount_dec_and_test(&grp->refcount))  		kfree_rcu(grp, rcu);  } @@ -2229,7 +2242,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,  		if (!grp)  			return; -		atomic_set(&grp->refcount, 1); +		refcount_set(&grp->refcount, 1);  		grp->active_nodes = 1;  		grp->max_faults_cpu = 0;  		spin_lock_init(&grp->lock); @@ -2638,7 +2651,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu)  		 * the preferred node.  		 */  		if (dst_nid == p->numa_preferred_nid || -		    (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) +		    (p->numa_preferred_nid != NUMA_NO_NODE && +			src_nid != p->numa_preferred_nid))  			return;  	} @@ -3122,7 +3136,7 @@ void set_task_rq_fair(struct sched_entity *se,  	p_last_update_time = prev->avg.last_update_time;  	n_last_update_time = next->avg.last_update_time;  #endif -	__update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se); +	__update_load_avg_blocked_se(p_last_update_time, se);  	se->avg.last_update_time = n_last_update_time;  } @@ -3257,11 +3271,11 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf  	/*  	 * runnable_sum can't be lower than running_sum -	 * As running sum is scale with CPU capacity wehreas the runnable sum -	 * is not we rescale running_sum 1st +	 * Rescale running sum to be in the same range as runnable sum +	 * running_sum is in [0 : LOAD_AVG_MAX <<  SCHED_CAPACITY_SHIFT] +	 * runnable_sum is in [0 : LOAD_AVG_MAX]  	 */ -	running_sum = se->avg.util_sum / -		arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); +	running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;  	runnable_sum = max(runnable_sum, running_sum);  	load_sum = (s64)se_weight(se) * runnable_sum; @@ -3364,7 +3378,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum  /**   * update_cfs_rq_load_avg - update the cfs_rq's load/util averages - * @now: current time, as per cfs_rq_clock_task() + * @now: current time, as per cfs_rq_clock_pelt()   * @cfs_rq: cfs_rq to update   *   * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) @@ -3409,7 +3423,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)  		decayed = 1;  	} -	decayed |= 
__update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); +	decayed |= __update_load_avg_cfs_rq(now, cfs_rq);  #ifndef CONFIG_64BIT  	smp_wmb(); @@ -3499,9 +3513,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s  /* Update task and its cfs_rq load average */  static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  { -	u64 now = cfs_rq_clock_task(cfs_rq); -	struct rq *rq = rq_of(cfs_rq); -	int cpu = cpu_of(rq); +	u64 now = cfs_rq_clock_pelt(cfs_rq);  	int decayed;  	/* @@ -3509,7 +3521,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s  	 * track group sched_entity load average for task_h_load calc in migration  	 */  	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) -		__update_load_avg_se(now, cpu, cfs_rq, se); +		__update_load_avg_se(now, cfs_rq, se);  	decayed  = update_cfs_rq_load_avg(now, cfs_rq);  	decayed |= propagate_entity_load_avg(se); @@ -3561,7 +3573,7 @@ void sync_entity_load_avg(struct sched_entity *se)  	u64 last_update_time;  	last_update_time = cfs_rq_last_update_time(cfs_rq); -	__update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se); +	__update_load_avg_blocked_se(last_update_time, se);  }  /* @@ -3577,10 +3589,6 @@ void remove_entity_load_avg(struct sched_entity *se)  	 * tasks cannot exit without having gone through wake_up_new_task() ->  	 * post_init_entity_util_avg() which will have added things to the  	 * cfs_rq, so we can remove unconditionally. -	 * -	 * Similarly for groups, they will have passed through -	 * post_init_entity_util_avg() before unregister_sched_fair_group() -	 * calls this.  	 */  	sync_entity_load_avg(se); @@ -3654,6 +3662,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)  {  	long last_ewma_diff;  	struct util_est ue; +	int cpu;  	if (!sched_feat(UTIL_EST))  		return; @@ -3688,6 +3697,14 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)  		return;  	/* +	 * To avoid overestimation of actual task utilization, skip updates if +	 * we cannot grant there is idle time in this CPU. 
+	 */ +	cpu = cpu_of(rq_of(cfs_rq)); +	if (task_util(p) > capacity_orig_of(cpu)) +		return; + +	/*  	 * Update Task's estimated utilization  	 *  	 * When *p completes an activation we can consolidate another sample @@ -4429,6 +4446,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)  		/* adjust cfs_rq_clock_task() */  		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -  					     cfs_rq->throttled_clock_task; + +		/* Add cfs_rq with already running entity in the list */ +		if (cfs_rq->nr_running >= 1) +			list_add_leaf_cfs_rq(cfs_rq);  	}  	return 0; @@ -4440,8 +4461,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)  	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];  	/* group is entering throttled state, stop time */ -	if (!cfs_rq->throttle_count) +	if (!cfs_rq->throttle_count) {  		cfs_rq->throttled_clock_task = rq_clock_task(rq); +		list_del_leaf_cfs_rq(cfs_rq); +	}  	cfs_rq->throttle_count++;  	return 0; @@ -4544,6 +4567,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)  			break;  	} +	assert_list_leaf_cfs_rq(rq); +  	if (!se)  		add_nr_running(rq, task_delta); @@ -4565,7 +4590,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,  		struct rq *rq = rq_of(cfs_rq);  		struct rq_flags rf; -		rq_lock(rq, &rf); +		rq_lock_irqsave(rq, &rf);  		if (!cfs_rq_throttled(cfs_rq))  			goto next; @@ -4582,7 +4607,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,  			unthrottle_cfs_rq(cfs_rq);  next: -		rq_unlock(rq, &rf); +		rq_unlock_irqrestore(rq, &rf);  		if (!remaining)  			break; @@ -4598,7 +4623,7 @@ next:   * period the timer is deactivated until scheduling resumes; cfs_b->idle is   * used to track this state.   */ -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)  {  	u64 runtime, runtime_expires;  	int throttled; @@ -4640,11 +4665,11 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)  	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {  		runtime = cfs_b->runtime;  		cfs_b->distribute_running = 1; -		raw_spin_unlock(&cfs_b->lock); +		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);  		/* we can't nest cfs_b->lock while distributing bandwidth */  		runtime = distribute_cfs_runtime(cfs_b, runtime,  						 runtime_expires); -		raw_spin_lock(&cfs_b->lock); +		raw_spin_lock_irqsave(&cfs_b->lock, flags);  		cfs_b->distribute_running = 0;  		throttled = !list_empty(&cfs_b->throttled_cfs_rq); @@ -4753,17 +4778,18 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)  static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)  {  	u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); +	unsigned long flags;  	u64 expires;  	/* confirm we're still not at a refresh boundary */ -	raw_spin_lock(&cfs_b->lock); +	raw_spin_lock_irqsave(&cfs_b->lock, flags);  	if (cfs_b->distribute_running) { -		raw_spin_unlock(&cfs_b->lock); +		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);  		return;  	}  	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { -		raw_spin_unlock(&cfs_b->lock); +		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);  		return;  	} @@ -4774,18 +4800,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)  	if (runtime)  		cfs_b->distribute_running = 1; -	raw_spin_unlock(&cfs_b->lock); +	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);  	if (!runtime)  		return;  	runtime = 
distribute_cfs_runtime(cfs_b, runtime, expires); -	raw_spin_lock(&cfs_b->lock); +	raw_spin_lock_irqsave(&cfs_b->lock, flags);  	if (expires == cfs_b->runtime_expires)  		lsub_positive(&cfs_b->runtime, runtime);  	cfs_b->distribute_running = 0; -	raw_spin_unlock(&cfs_b->lock); +	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);  }  /* @@ -4863,20 +4889,21 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)  {  	struct cfs_bandwidth *cfs_b =  		container_of(timer, struct cfs_bandwidth, period_timer); +	unsigned long flags;  	int overrun;  	int idle = 0; -	raw_spin_lock(&cfs_b->lock); +	raw_spin_lock_irqsave(&cfs_b->lock, flags);  	for (;;) {  		overrun = hrtimer_forward_now(timer, cfs_b->period);  		if (!overrun)  			break; -		idle = do_sched_cfs_period_timer(cfs_b, overrun); +		idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);  	}  	if (idle)  		cfs_b->period_active = 0; -	raw_spin_unlock(&cfs_b->lock); +	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);  	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;  } @@ -4986,6 +5013,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)  }  #else /* CONFIG_CFS_BANDWIDTH */ + +static inline bool cfs_bandwidth_used(void) +{ +	return false; +} +  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)  {  	return rq_clock_task(rq_of(cfs_rq)); @@ -5177,6 +5210,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  	} +	if (cfs_bandwidth_used()) { +		/* +		 * When bandwidth control is enabled; the cfs_rq_throttled() +		 * breaks in the above iteration can result in incomplete +		 * leaf list maintenance, resulting in triggering the assertion +		 * below. +		 */ +		for_each_sched_entity(se) { +			cfs_rq = cfs_rq_of(se); + +			if (list_add_leaf_cfs_rq(cfs_rq)) +				break; +		} +	} + +	assert_list_leaf_cfs_rq(rq); +  	hrtick_update(rq);  } @@ -5556,11 +5606,6 @@ static unsigned long capacity_of(int cpu)  	return cpu_rq(cpu)->cpu_capacity;  } -static unsigned long capacity_orig_of(int cpu) -{ -	return cpu_rq(cpu)->cpu_capacity_orig; -} -  static unsigned long cpu_avg_load_per_task(int cpu)  {  	struct rq *rq = cpu_rq(cpu); @@ -6053,7 +6098,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int  		bool idle = true;  		for_each_cpu(cpu, cpu_smt_mask(core)) { -			cpumask_clear_cpu(cpu, cpus); +			__cpumask_clear_cpu(cpu, cpus);  			if (!available_idle_cpu(cpu))  				idle = false;  		} @@ -6073,7 +6118,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int  /*   * Scan the local SMT mask for idle CPUs.   
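
The slack- and period-timer paths above now take cfs_b->lock with the irqsave/irqrestore variants and thread the saved flags through do_sched_cfs_period_timer(). The general pattern, shown on a hypothetical lock that can also be contended from interrupt context:

static DEFINE_RAW_SPINLOCK(demo_lock);

static void demo_update(void)
{
	unsigned long flags;

	/* Disable local interrupts while holding the lock so an interrupt
	 * handler on this CPU cannot spin on it and deadlock. */
	raw_spin_lock_irqsave(&demo_lock, flags);
	/* ... critical section ... */
	raw_spin_unlock_irqrestore(&demo_lock, flags);
}
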
*/ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_smt(struct task_struct *p, int target)  {  	int cpu; @@ -6097,7 +6142,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s  	return -1;  } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static inline int select_idle_smt(struct task_struct *p, int target)  {  	return -1;  } @@ -6202,7 +6247,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)  	if ((unsigned)i < nr_cpumask_bits)  		return i; -	i = select_idle_smt(p, sd, target); +	i = select_idle_smt(p, target);  	if ((unsigned)i < nr_cpumask_bits)  		return i; @@ -6608,7 +6653,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f  	if (sd_flag & SD_BALANCE_WAKE) {  		record_wakee(p); -		if (static_branch_unlikely(&sched_energy_present)) { +		if (sched_energy_enabled()) {  			new_cpu = find_energy_efficient_cpu(p, prev_cpu);  			if (new_cpu >= 0)  				return new_cpu; @@ -7027,6 +7072,12 @@ idle:  	if (new_tasks > 0)  		goto again; +	/* +	 * rq is about to be idle, check if we need to update the +	 * lost_idle_time of clock_pelt +	 */ +	update_idle_rq_clock_pelt(rq); +  	return NULL;  } @@ -7647,10 +7698,27 @@ static inline bool others_have_blocked(struct rq *rq)  #ifdef CONFIG_FAIR_GROUP_SCHED +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ +	if (cfs_rq->load.weight) +		return false; + +	if (cfs_rq->avg.load_sum) +		return false; + +	if (cfs_rq->avg.util_sum) +		return false; + +	if (cfs_rq->avg.runnable_load_sum) +		return false; + +	return true; +} +  static void update_blocked_averages(int cpu)  {  	struct rq *rq = cpu_rq(cpu); -	struct cfs_rq *cfs_rq; +	struct cfs_rq *cfs_rq, *pos;  	const struct sched_class *curr_class;  	struct rq_flags rf;  	bool done = true; @@ -7662,14 +7730,10 @@ static void update_blocked_averages(int cpu)  	 * Iterates the task_group tree in a bottom up fashion, see  	 * list_add_leaf_cfs_rq() for details.  	 */ -	for_each_leaf_cfs_rq(rq, cfs_rq) { +	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {  		struct sched_entity *se; -		/* throttled entities do not contribute to load */ -		if (throttled_hierarchy(cfs_rq)) -			continue; - -		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) +		if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))  			update_tg_load_avg(cfs_rq, 0);  		/* Propagate pending load changes to the parent, if any: */ @@ -7677,14 +7741,21 @@ static void update_blocked_averages(int cpu)  		if (se && !skip_blocked_update(se))  			update_load_avg(cfs_rq_of(se), se, 0); +		/* +		 * There can be a lot of idle CPU cgroups.  Don't let fully +		 * decayed cfs_rqs linger on the list. 
+		 */ +		if (cfs_rq_is_decayed(cfs_rq)) +			list_del_leaf_cfs_rq(cfs_rq); +  		/* Don't need periodic decay once load/util_avg are null */  		if (cfs_rq_has_blocked(cfs_rq))  			done = false;  	}  	curr_class = rq->curr->sched_class; -	update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); -	update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); +	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); +	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);  	update_irq_load_avg(rq, 0);  	/* Don't need periodic decay once load/util_avg are null */  	if (others_have_blocked(rq)) @@ -7754,11 +7825,11 @@ static inline void update_blocked_averages(int cpu)  	rq_lock_irqsave(rq, &rf);  	update_rq_clock(rq); -	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); +	update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);  	curr_class = rq->curr->sched_class; -	update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); -	update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); +	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); +	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);  	update_irq_load_avg(rq, 0);  #ifdef CONFIG_NO_HZ_COMMON  	rq->last_blocked_load_update_tick = jiffies; @@ -8452,9 +8523,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)  	if (sched_asym_prefer(busiest_cpu, env->dst_cpu))  		return 0; -	env->imbalance = DIV_ROUND_CLOSEST( -		sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, -		SCHED_CAPACITY_SCALE); +	env->imbalance = sds->busiest_stat.group_load;  	return 1;  } @@ -8636,7 +8705,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)  	 */  	update_sd_lb_stats(env, &sds); -	if (static_branch_unlikely(&sched_energy_present)) { +	if (sched_energy_enabled()) {  		struct root_domain *rd = env->dst_rq->rd;  		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) @@ -8827,21 +8896,25 @@ static struct rq *find_busiest_queue(struct lb_env *env,   */  #define MAX_PINNED_INTERVAL	512 -static int need_active_balance(struct lb_env *env) +static inline bool +asym_active_balance(struct lb_env *env)  { -	struct sched_domain *sd = env->sd; +	/* +	 * ASYM_PACKING needs to force migrate tasks from busy but +	 * lower priority CPUs in order to pack all tasks in the +	 * highest priority CPUs. +	 */ +	return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && +	       sched_asym_prefer(env->dst_cpu, env->src_cpu); +} -	if (env->idle == CPU_NEWLY_IDLE) { +static inline bool +voluntary_active_balance(struct lb_env *env) +{ +	struct sched_domain *sd = env->sd; -		/* -		 * ASYM_PACKING needs to force migrate tasks from busy but -		 * lower priority CPUs in order to pack all tasks in the -		 * highest priority CPUs. -		 */ -		if ((sd->flags & SD_ASYM_PACKING) && -		    sched_asym_prefer(env->dst_cpu, env->src_cpu)) -			return 1; -	} +	if (asym_active_balance(env)) +		return 1;  	/*  	 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. 
@@ -8859,6 +8932,16 @@ static int need_active_balance(struct lb_env *env)  	if (env->src_grp_type == group_misfit_task)  		return 1; +	return 0; +} + +static int need_active_balance(struct lb_env *env) +{ +	struct sched_domain *sd = env->sd; + +	if (voluntary_active_balance(env)) +		return 1; +  	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);  } @@ -9023,7 +9106,7 @@ more_balance:  		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {  			/* Prevent to re-select dst_cpu via env's CPUs */ -			cpumask_clear_cpu(env.dst_cpu, env.cpus); +			__cpumask_clear_cpu(env.dst_cpu, env.cpus);  			env.dst_rq	 = cpu_rq(env.new_dst_cpu);  			env.dst_cpu	 = env.new_dst_cpu; @@ -9050,7 +9133,7 @@ more_balance:  		/* All tasks on this runqueue were pinned by CPU affinity */  		if (unlikely(env.flags & LBF_ALL_PINNED)) { -			cpumask_clear_cpu(cpu_of(busiest), cpus); +			__cpumask_clear_cpu(cpu_of(busiest), cpus);  			/*  			 * Attempting to continue load balancing at the current  			 * sched_domain level only makes sense if there are @@ -9120,7 +9203,7 @@ more_balance:  	} else  		sd->nr_balance_failed = 0; -	if (likely(!active_balance)) { +	if (likely(!active_balance) || voluntary_active_balance(&env)) {  		/* We were unbalanced, so reset the balancing interval */  		sd->balance_interval = sd->min_interval;  	} else { @@ -9469,15 +9552,8 @@ static void kick_ilb(unsigned int flags)  }  /* - * Current heuristic for kicking the idle load balancer in the presence - * of an idle cpu in the system. - *   - This rq has more than one task. - *   - This rq has at least one CFS task and the capacity of the CPU is - *     significantly reduced because of RT tasks or IRQs. - *   - At parent of LLC scheduler domain level, this cpu's scheduler group has - *     multiple busy cpu. - *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler - *     domain span are idle. + * Current decision point for kicking the idle load balancer in the presence + * of idle CPUs in the system.   */  static void nohz_balancer_kick(struct rq *rq)  { @@ -9519,8 +9595,13 @@ static void nohz_balancer_kick(struct rq *rq)  	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));  	if (sds) {  		/* -		 * XXX: write a coherent comment on why we do this. -		 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com +		 * If there is an imbalance between LLC domains (IOW we could +		 * increase the overall cache use), we need some less-loaded LLC +		 * domain to pull some load. Likewise, we may need to spread +		 * load within the current LLC domain (e.g. packed SMT cores but +		 * other CPUs are idle). We can't really know from here how busy +		 * the others are - so just get a nohz balance going if it looks +		 * like this LLC domain has tasks we could move.  		 
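
Several cpumask updates in this hunk, and in select_idle_core() earlier, switch to the double-underscore helpers, which are plain stores rather than atomic read-modify-writes; that is fine when the mask is private to the current operation. A small illustration with a hypothetical temporary mask:

#include <linux/cpumask.h>
#include <linux/gfp.h>

static int demo_pick_first_allowed(const struct cpumask *allowed, int skip)
{
	cpumask_var_t tmp;
	unsigned int cpu;

	if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
		return -ENOMEM;

	cpumask_copy(tmp, allowed);
	__cpumask_clear_cpu(skip, tmp);	/* tmp is local: non-atomic is fine */
	cpu = cpumask_first(tmp);

	free_cpumask_var(tmp);
	return cpu < nr_cpu_ids ? (int)cpu : -1;
}
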
*/  		nr_busy = atomic_read(&sds->nr_busy_cpus);  		if (nr_busy > 1) { @@ -9533,7 +9614,7 @@ static void nohz_balancer_kick(struct rq *rq)  	sd = rcu_dereference(rq->sd);  	if (sd) {  		if ((rq->cfs.h_nr_running >= 1) && -				check_cpu_capacity(rq, sd)) { +		    check_cpu_capacity(rq, sd)) {  			flags = NOHZ_KICK_MASK;  			goto unlock;  		} @@ -9541,11 +9622,7 @@ static void nohz_balancer_kick(struct rq *rq)  	sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));  	if (sd) { -		for_each_cpu(i, sched_domain_span(sd)) { -			if (i == cpu || -			    !cpumask_test_cpu(i, nohz.idle_cpus_mask)) -				continue; - +		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {  			if (sched_asym_prefer(i, cpu)) {  				flags = NOHZ_KICK_MASK;  				goto unlock; @@ -10546,10 +10623,10 @@ const struct sched_class fair_sched_class = {  #ifdef CONFIG_SCHED_DEBUG  void print_cfs_stats(struct seq_file *m, int cpu)  { -	struct cfs_rq *cfs_rq; +	struct cfs_rq *cfs_rq, *pos;  	rcu_read_lock(); -	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) +	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)  		print_cfs_rq(m, cpu, cfs_rq);  	rcu_read_unlock();  } diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 81faddba9e20..b02d148e7672 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -80,7 +80,7 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)  		cpumask_andnot(housekeeping_mask,  			       cpu_possible_mask, non_housekeeping_mask);  		if (cpumask_empty(housekeeping_mask)) -			cpumask_set_cpu(smp_processor_id(), housekeeping_mask); +			__cpumask_set_cpu(smp_processor_id(), housekeeping_mask);  	} else {  		cpumask_var_t tmp; diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 90fb5bc12ad4..befce29bd882 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -26,7 +26,6 @@  #include <linux/sched.h>  #include "sched.h" -#include "sched-pelt.h"  #include "pelt.h"  /* @@ -106,16 +105,12 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)   *                     n=1   */  static __always_inline u32 -accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, +accumulate_sum(u64 delta, struct sched_avg *sa,  	       unsigned long load, unsigned long runnable, int running)  { -	unsigned long scale_freq, scale_cpu;  	u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */  	u64 periods; -	scale_freq = arch_scale_freq_capacity(cpu); -	scale_cpu = arch_scale_cpu_capacity(NULL, cpu); -  	delta += sa->period_contrib;  	periods = delta / 1024; /* A period is 1024us (~1ms) */ @@ -137,13 +132,12 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,  	}  	sa->period_contrib = delta; -	contrib = cap_scale(contrib, scale_freq);  	if (load)  		sa->load_sum += load * contrib;  	if (runnable)  		sa->runnable_load_sum += runnable * contrib;  	if (running) -		sa->util_sum += contrib * scale_cpu; +		sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;  	return periods;  } @@ -177,7 +171,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,   *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]   */  static __always_inline int -___update_load_sum(u64 now, int cpu, struct sched_avg *sa, +___update_load_sum(u64 now, struct sched_avg *sa,  		  unsigned long load, unsigned long runnable, int running)  {  	u64 delta; @@ -221,7 +215,7 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa,  	 * Step 1: accumulate *_sum since last_update_time. If we haven't  	 * crossed period boundaries, finish.  	 
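
The ASYM_PACKING check in nohz_balancer_kick() above was simplified with for_each_cpu_and(), which walks only the CPUs present in both masks and so replaces the explicit skip test inside the loop. A small stand-alone sketch with hypothetical masks:

/* Return the first CPU that is both in @span and currently idle, or -1. */
static int demo_first_idle_in_span(const struct cpumask *span,
				   const struct cpumask *idle_mask)
{
	int cpu;

	for_each_cpu_and(cpu, span, idle_mask)
		return cpu;

	return -1;
}
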
*/ -	if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) +	if (!accumulate_sum(delta, sa, load, runnable, running))  		return 0;  	return 1; @@ -267,9 +261,9 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna   *   runnable_load_avg = \Sum se->avg.runable_load_avg   */ -int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) +int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)  { -	if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { +	if (___update_load_sum(now, &se->avg, 0, 0, 0)) {  		___update_load_avg(&se->avg, se_weight(se), se_runnable(se));  		return 1;  	} @@ -277,9 +271,9 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)  	return 0;  } -int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) +int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)  { -	if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, +	if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq,  				cfs_rq->curr == se)) {  		___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); @@ -290,9 +284,9 @@ int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_e  	return 0;  } -int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) +int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)  { -	if (___update_load_sum(now, cpu, &cfs_rq->avg, +	if (___update_load_sum(now, &cfs_rq->avg,  				scale_load_down(cfs_rq->load.weight),  				scale_load_down(cfs_rq->runnable_weight),  				cfs_rq->curr != NULL)) { @@ -317,7 +311,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)  int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)  { -	if (___update_load_sum(now, rq->cpu, &rq->avg_rt, +	if (___update_load_sum(now, &rq->avg_rt,  				running,  				running,  				running)) { @@ -340,7 +334,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)  int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)  { -	if (___update_load_sum(now, rq->cpu, &rq->avg_dl, +	if (___update_load_sum(now, &rq->avg_dl,  				running,  				running,  				running)) { @@ -365,22 +359,31 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)  int update_irq_load_avg(struct rq *rq, u64 running)  {  	int ret = 0; + +	/* +	 * We can't use clock_pelt because irq time is not accounted in +	 * clock_task. Instead we directly scale the running time to +	 * reflect the real amount of computation +	 */ +	running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); +	running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq))); +  	/*  	 * We know the time that has been used by interrupt since last update  	 * but we don't when. Let be pessimistic and assume that interrupt has  	 * happened just before the update. This is not so far from reality  	 * because interrupt will most probably wake up task and trig an update -	 * of rq clock during which the metric si updated. +	 * of rq clock during which the metric is updated.  	 * We start to decay with normal context time and then we add the  	 * interrupt context time.  	 
* We can safely remove running from rq->clock because  	 * rq->clock += delta with delta >= running  	 */ -	ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq, +	ret = ___update_load_sum(rq->clock - running, &rq->avg_irq,  				0,  				0,  				0); -	ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq, +	ret += ___update_load_sum(rq->clock, &rq->avg_irq,  				1,  				1,  				1); diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 7e56b489ff32..7489d5f56960 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -1,8 +1,9 @@  #ifdef CONFIG_SMP +#include "sched-pelt.h" -int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se); -int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se); -int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); +int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); +int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); +int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);  int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);  int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); @@ -42,6 +43,101 @@ static inline void cfs_se_util_change(struct sched_avg *avg)  	WRITE_ONCE(avg->util_est.enqueued, enqueued);  } +/* + * The clock_pelt scales the time to reflect the effective amount of + * computation done during the running delta time but then sync back to + * clock_task when rq is idle. + * + * + * absolute time   | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16 + * @ max capacity  ------******---------------******--------------- + * @ half capacity ------************---------************--------- + * clock pelt      | 1| 2|    3|    4| 7| 8| 9|   10|   11|14|15|16 + * + */ +static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) +{ +	if (unlikely(is_idle_task(rq->curr))) { +		/* The rq is idle, we can sync to clock_task */ +		rq->clock_pelt  = rq_clock_task(rq); +		return; +	} + +	/* +	 * When a rq runs at a lower compute capacity, it will need +	 * more time to do the same amount of work than at max +	 * capacity. In order to be invariant, we scale the delta to +	 * reflect how much work has been really done. +	 * Running longer results in stealing idle time that will +	 * disturb the load signal compared to max capacity. This +	 * stolen idle time will be automatically reflected when the +	 * rq will be idle and the clock will be synced with +	 * rq_clock_task. +	 */ + +	/* +	 * Scale the elapsed time to reflect the real amount of +	 * computation +	 */ +	delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq))); +	delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); + +	rq->clock_pelt += delta; +} + +/* + * When rq becomes idle, we have to check if it has lost idle time + * because it was fully busy. A rq is fully used when the /Sum util_sum + * is greater or equal to: + * (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT; + * For optimization and computing rounding purpose, we don't take into account + * the position in the current window (period_contrib) and we use the higher + * bound of util_sum to decide. 
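
A toy model of the scaling that update_rq_clock_pelt() above performs, using made-up capacity values: at half compute capacity the PELT clock advances at half the rate of clock_task, and the difference is given back when the rq goes idle.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024UL

static unsigned long cap_scale(unsigned long delta, unsigned long cap)
{
	return (delta * cap) / SCHED_CAPACITY_SCALE;
}

int main(void)
{
	unsigned long clock_task = 0, clock_pelt = 0;
	unsigned long cpu_cap = 1024, freq_cap = 512;	/* running at half frequency */
	unsigned long delta_us = 2000;			/* 2ms of wall-clock running  */

	clock_task += delta_us;
	clock_pelt += cap_scale(cap_scale(delta_us, cpu_cap), freq_cap);
	printf("clock_task=%luus clock_pelt=%luus\n", clock_task, clock_pelt);
	/* prints 2000 vs 1000: the signal only accounts for work actually done */

	clock_pelt = clock_task;	/* on idle, sync back to clock_task */
	return 0;
}
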
+ */ +static inline void update_idle_rq_clock_pelt(struct rq *rq) +{ +	u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX; +	u32 util_sum = rq->cfs.avg.util_sum; +	util_sum += rq->avg_rt.util_sum; +	util_sum += rq->avg_dl.util_sum; + +	/* +	 * Reflecting stolen time makes sense only if the idle +	 * phase would be present at max capacity. As soon as the +	 * utilization of a rq has reached the maximum value, it is +	 * considered as an always runnig rq without idle time to +	 * steal. This potential idle time is considered as lost in +	 * this case. We keep track of this lost idle time compare to +	 * rq's clock_task. +	 */ +	if (util_sum >= divider) +		rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt; +} + +static inline u64 rq_clock_pelt(struct rq *rq) +{ +	lockdep_assert_held(&rq->lock); +	assert_clock_updated(rq); + +	return rq->clock_pelt - rq->lost_idle_time; +} + +#ifdef CONFIG_CFS_BANDWIDTH +/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ +static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) +{ +	if (unlikely(cfs_rq->throttle_count)) +		return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; + +	return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; +} +#else +static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) +{ +	return rq_clock_pelt(rq_of(cfs_rq)); +} +#endif +  #else  static inline int @@ -67,6 +163,18 @@ update_irq_load_avg(struct rq *rq, u64 running)  {  	return 0;  } + +static inline u64 rq_clock_pelt(struct rq *rq) +{ +	return rq_clock_task(rq); +} + +static inline void +update_rq_clock_pelt(struct rq *rq, s64 delta) { } + +static inline void +update_idle_rq_clock_pelt(struct rq *rq) { } +  #endif diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index c3484785b179..0e97ca9306ef 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -322,7 +322,7 @@ static bool update_stats(struct psi_group *group)  	expires = group->next_update;  	if (now < expires)  		goto out; -	if (now - expires > psi_period) +	if (now - expires >= psi_period)  		missed_periods = div_u64(now - expires, psi_period);  	/* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e4f398ad9e73..90fa23d36565 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1587,7 +1587,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)  	 * rt task  	 */  	if (rq->curr->sched_class != &rt_sched_class) -		update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); +		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);  	return p;  } @@ -1596,7 +1596,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)  {  	update_curr_rt(rq); -	update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); +	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);  	/*  	 * The previous task needs to be made eligible for pushing @@ -2325,7 +2325,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)  	struct sched_rt_entity *rt_se = &p->rt;  	update_curr_rt(rq); -	update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); +	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);  	watchdog(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d04530bf251f..efa686eeff26 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -861,7 +861,10 @@ struct rq {  	unsigned int		clock_update_flags;  	u64			clock; -	u64			clock_task; +	/* Ensure that all clocks are in the same cache line */ +	u64			clock_task ____cacheline_aligned; +	u64			clock_pelt; +	unsigned long		
lost_idle_time;  	atomic_t		nr_iowait; @@ -951,6 +954,22 @@ struct rq {  #endif  }; +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* CPU runqueue to which this cfs_rq is attached */ +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ +	return cfs_rq->rq; +} + +#else + +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ +	return container_of(cfs_rq, struct rq, cfs); +} +#endif +  static inline int cpu_of(struct rq *rq)  {  #ifdef CONFIG_SMP @@ -1260,7 +1279,7 @@ extern void sched_ttwu_pending(void);  /*   * The domain tree (rq->sd) is protected by RCU's quiescent state transition. - * See detach_destroy_domains: synchronize_sched for details. + * See destroy_sched_domains: call_rcu for details.   *   * The domain tree of any CPU may only be accessed from within   * preempt-disabled sections. @@ -1460,9 +1479,9 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)  	 */  	smp_wmb();  #ifdef CONFIG_THREAD_INFO_IN_TASK -	p->cpu = cpu; +	WRITE_ONCE(p->cpu, cpu);  #else -	task_thread_info(p)->cpu = cpu; +	WRITE_ONCE(task_thread_info(p)->cpu, cpu);  #endif  	p->wake_cpu = cpu;  #endif @@ -1563,7 +1582,7 @@ static inline int task_on_rq_queued(struct task_struct *p)  static inline int task_on_rq_migrating(struct task_struct *p)  { -	return p->on_rq == TASK_ON_RQ_MIGRATING; +	return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;  }  /* @@ -1781,7 +1800,7 @@ extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);  unsigned long to_ratio(u64 period, u64 runtime);  extern void init_entity_runnable_average(struct sched_entity *se); -extern void post_init_entity_util_avg(struct sched_entity *se); +extern void post_init_entity_util_avg(struct task_struct *p);  #ifdef CONFIG_NO_HZ_FULL  extern bool sched_can_stop_tick(struct rq *rq); @@ -2211,6 +2230,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}  # define arch_scale_freq_invariant()	false  #endif +#ifdef CONFIG_SMP +static inline unsigned long capacity_orig_of(int cpu) +{ +	return cpu_rq(cpu)->cpu_capacity_orig; +} +#endif +  #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL  /**   * enum schedutil_type - CPU utilization type @@ -2299,11 +2325,19 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned  #endif  #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +  #define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) -#else + +DECLARE_STATIC_KEY_FALSE(sched_energy_present); + +static inline bool sched_energy_enabled(void) +{ +	return static_branch_unlikely(&sched_energy_present); +} + +#else /* ! 
(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ +  #define perf_domain_span(pd) NULL -#endif +static inline bool sched_energy_enabled(void) { return false; } -#ifdef CONFIG_SMP -extern struct static_key_false sched_energy_present; -#endif +#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 3f35ba1d8fde..ab7f371a3a17 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -201,11 +201,37 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)  	return 1;  } -DEFINE_STATIC_KEY_FALSE(sched_energy_present);  #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +DEFINE_STATIC_KEY_FALSE(sched_energy_present); +unsigned int sysctl_sched_energy_aware = 1;  DEFINE_MUTEX(sched_energy_mutex);  bool sched_energy_update; +#ifdef CONFIG_PROC_SYSCTL +int sched_energy_aware_handler(struct ctl_table *table, int write, +			 void __user *buffer, size_t *lenp, loff_t *ppos) +{ +	int ret, state; + +	if (write && !capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); +	if (!ret && write) { +		state = static_branch_unlikely(&sched_energy_present); +		if (state != sysctl_sched_energy_aware) { +			mutex_lock(&sched_energy_mutex); +			sched_energy_update = 1; +			rebuild_sched_domains(); +			sched_energy_update = 0; +			mutex_unlock(&sched_energy_mutex); +		} +	} + +	return ret; +} +#endif +  static void free_pd(struct perf_domain *pd)  {  	struct perf_domain *tmp; @@ -322,6 +348,9 @@ static bool build_perf_domains(const struct cpumask *cpu_map)  	struct cpufreq_policy *policy;  	struct cpufreq_governor *gov; +	if (!sysctl_sched_energy_aware) +		goto free; +  	/* EAS is enabled for asymmetric CPU capacity topologies. */  	if (!per_cpu(sd_asym_cpucapacity, cpu)) {  		if (sched_debug()) { @@ -442,7 +471,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)  	raw_spin_unlock_irqrestore(&rq->lock, flags);  	if (old_rd) -		call_rcu_sched(&old_rd->rcu, free_rootdomain); +		call_rcu(&old_rd->rcu, free_rootdomain);  }  void sched_get_rd(struct root_domain *rd) @@ -455,7 +484,7 @@ void sched_put_rd(struct root_domain *rd)  	if (!atomic_dec_and_test(&rd->refcount))  		return; -	call_rcu_sched(&rd->rcu, free_rootdomain); +	call_rcu(&rd->rcu, free_rootdomain);  }  static int init_rootdomain(struct root_domain *rd) @@ -676,7 +705,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)  }  struct s_data { -	struct sched_domain ** __percpu sd; +	struct sched_domain * __percpu *sd;  	struct root_domain	*rd;  }; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e815781ed751..a43c601ac252 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -267,6 +267,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,  	 * All filters in the list are evaluated and the lowest BPF return  	 * value always takes priority (ignoring the DATA).  	 
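The preempt_disable()/preempt_enable() pair added in this hunk keeps the whole filter walk on one CPU while the BPF programs run, which the per-CPU BPF statistics introduced elsewhere in this series appear to rely on. The "lowest return value wins, data ignored" rule in the comment can be sketched as follows (the action/data split mirrors the seccomp uapi layout, but the constants here are illustrative):

#include <stdio.h>
#include <stdint.h>

/* action lives in the upper bits, data in the low 16 bits; values illustrative */
#define RET_ACTION_FULL	0xffff0000U
#define RET_ERRNO	0x00050000U
#define RET_ALLOW	0x7fff0000U

int main(void)
{
	/* results of three stacked filters for one syscall, made up */
	uint32_t results[] = {
		RET_ALLOW,
		RET_ERRNO | 13,		/* EACCES carried as attached data */
		RET_ERRNO | 1,		/* EPERM carried as attached data */
	};
	uint32_t ret = RET_ALLOW;

	/* lowest action wins; the data half is ignored for the comparison */
	for (unsigned i = 0; i < sizeof(results) / sizeof(results[0]); i++) {
		if ((results[i] & RET_ACTION_FULL) < (ret & RET_ACTION_FULL))
			ret = results[i];
	}

	printf("final action 0x%08x, data %u\n",
	       (unsigned)(ret & RET_ACTION_FULL), (unsigned)(ret & ~RET_ACTION_FULL));
	return 0;
}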
*/ +	preempt_disable();  	for (; f; f = f->prev) {  		u32 cur_ret = BPF_PROG_RUN(f->prog, sd); @@ -275,6 +276,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,  			*match = f;  		}  	} +	preempt_enable();  	return ret;  }  #endif /* CONFIG_SECCOMP_FILTER */ diff --git a/kernel/signal.c b/kernel/signal.c index 57b7771e20d7..5d53183e2705 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3455,7 +3455,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese,  }  #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, +COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese,  		struct compat_siginfo __user *, uinfo,  		struct old_timespec32 __user *, uts, compat_size_t, sigsetsize)  { diff --git a/kernel/softirq.c b/kernel/softirq.c index d28813306b2c..10277429ed84 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -89,7 +89,8 @@ static bool ksoftirqd_running(unsigned long pending)  	if (pending & SOFTIRQ_NOW_MASK)  		return false; -	return tsk && (tsk->state == TASK_RUNNING); +	return tsk && (tsk->state == TASK_RUNNING) && +		!__kthread_should_park(tsk);  }  /* diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index ab9d0e3c6d50..62a6c8707799 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -42,9 +42,11 @@ COND_SYSCALL(io_destroy);  COND_SYSCALL(io_submit);  COND_SYSCALL_COMPAT(io_submit);  COND_SYSCALL(io_cancel); +COND_SYSCALL(io_getevents_time32);  COND_SYSCALL(io_getevents); +COND_SYSCALL(io_pgetevents_time32);  COND_SYSCALL(io_pgetevents); -COND_SYSCALL_COMPAT(io_getevents); +COND_SYSCALL_COMPAT(io_pgetevents_time32);  COND_SYSCALL_COMPAT(io_pgetevents);  /* fs/xattr.c */ @@ -114,9 +116,9 @@ COND_SYSCALL_COMPAT(signalfd4);  /* fs/timerfd.c */  COND_SYSCALL(timerfd_create);  COND_SYSCALL(timerfd_settime); -COND_SYSCALL_COMPAT(timerfd_settime); +COND_SYSCALL(timerfd_settime32);  COND_SYSCALL(timerfd_gettime); -COND_SYSCALL_COMPAT(timerfd_gettime); +COND_SYSCALL(timerfd_gettime32);  /* fs/utimes.c */ @@ -135,7 +137,7 @@ COND_SYSCALL(capset);  /* kernel/futex.c */  COND_SYSCALL(futex); -COND_SYSCALL_COMPAT(futex); +COND_SYSCALL(futex_time32);  COND_SYSCALL(set_robust_list);  COND_SYSCALL_COMPAT(set_robust_list);  COND_SYSCALL(get_robust_list); @@ -187,9 +189,9 @@ COND_SYSCALL(mq_open);  COND_SYSCALL_COMPAT(mq_open);  COND_SYSCALL(mq_unlink);  COND_SYSCALL(mq_timedsend); -COND_SYSCALL_COMPAT(mq_timedsend); +COND_SYSCALL(mq_timedsend_time32);  COND_SYSCALL(mq_timedreceive); -COND_SYSCALL_COMPAT(mq_timedreceive); +COND_SYSCALL(mq_timedreceive_time32);  COND_SYSCALL(mq_notify);  COND_SYSCALL_COMPAT(mq_notify);  COND_SYSCALL(mq_getsetattr); @@ -197,8 +199,10 @@ COND_SYSCALL_COMPAT(mq_getsetattr);  /* ipc/msg.c */  COND_SYSCALL(msgget); +COND_SYSCALL(old_msgctl);  COND_SYSCALL(msgctl);  COND_SYSCALL_COMPAT(msgctl); +COND_SYSCALL_COMPAT(old_msgctl);  COND_SYSCALL(msgrcv);  COND_SYSCALL_COMPAT(msgrcv);  COND_SYSCALL(msgsnd); @@ -206,16 +210,20 @@ COND_SYSCALL_COMPAT(msgsnd);  /* ipc/sem.c */  COND_SYSCALL(semget); +COND_SYSCALL(old_semctl);  COND_SYSCALL(semctl);  COND_SYSCALL_COMPAT(semctl); +COND_SYSCALL_COMPAT(old_semctl);  COND_SYSCALL(semtimedop); -COND_SYSCALL_COMPAT(semtimedop); +COND_SYSCALL(semtimedop_time32);  COND_SYSCALL(semop);  /* ipc/shm.c */  COND_SYSCALL(shmget); +COND_SYSCALL(old_shmctl);  COND_SYSCALL(shmctl);  COND_SYSCALL_COMPAT(shmctl); +COND_SYSCALL_COMPAT(old_shmctl);  COND_SYSCALL(shmat);  COND_SYSCALL_COMPAT(shmat);  COND_SYSCALL(shmdt); @@ 
-285,7 +293,7 @@ COND_SYSCALL(perf_event_open);  COND_SYSCALL(accept4);  COND_SYSCALL(recvmmsg);  COND_SYSCALL(recvmmsg_time32); -COND_SYSCALL_COMPAT(recvmmsg); +COND_SYSCALL_COMPAT(recvmmsg_time32);  COND_SYSCALL_COMPAT(recvmmsg_time64);  /* @@ -366,6 +374,7 @@ COND_SYSCALL(kexec_file_load);  /* s390 */  COND_SYSCALL(s390_pci_mmio_read);  COND_SYSCALL(s390_pci_mmio_write); +COND_SYSCALL(s390_ipc);  COND_SYSCALL_COMPAT(s390_ipc);  /* powerpc */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ba4d9e85feb8..14f30b4a1b64 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -224,6 +224,11 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,  #endif  static int proc_dopipe_max_size(struct ctl_table *table, int write,  		void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_BPF_SYSCALL +static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, +					  void __user *buffer, size_t *lenp, +					  loff_t *ppos); +#endif  #ifdef CONFIG_MAGIC_SYSRQ  /* Note: sysrq code uses its own private copy */ @@ -467,6 +472,17 @@ static struct ctl_table kern_table[] = {  		.extra1		= &one,  	},  #endif +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +	{ +		.procname	= "sched_energy_aware", +		.data		= &sysctl_sched_energy_aware, +		.maxlen		= sizeof(unsigned int), +		.mode		= 0644, +		.proc_handler	= sched_energy_aware_handler, +		.extra1		= &zero, +		.extra2		= &one, +	}, +#endif  #ifdef CONFIG_PROVE_LOCKING  	{  		.procname	= "prove_locking", @@ -1229,6 +1245,15 @@ static struct ctl_table kern_table[] = {  		.extra1		= &one,  		.extra2		= &one,  	}, +	{ +		.procname	= "bpf_stats_enabled", +		.data		= &sysctl_bpf_stats_enabled, +		.maxlen		= sizeof(sysctl_bpf_stats_enabled), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax_bpf_stats, +		.extra1		= &zero, +		.extra2		= &one, +	},  #endif  #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)  	{ @@ -1446,7 +1471,7 @@ static struct ctl_table vm_table[] = {  		.data		= &sysctl_extfrag_threshold,  		.maxlen		= sizeof(int),  		.mode		= 0644, -		.proc_handler	= sysctl_extfrag_handler, +		.proc_handler	= proc_dointvec_minmax,  		.extra1		= &min_extfrag_threshold,  		.extra2		= &max_extfrag_threshold,  	}, @@ -3260,6 +3285,29 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,  #endif /* CONFIG_PROC_SYSCTL */ +#ifdef CONFIG_BPF_SYSCALL +static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, +					  void __user *buffer, size_t *lenp, +					  loff_t *ppos) +{ +	int ret, bpf_stats = *(int *)table->data; +	struct ctl_table tmp = *table; + +	if (write && !capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	tmp.data = &bpf_stats; +	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); +	if (write && !ret) { +		*(int *)table->data = bpf_stats; +		if (bpf_stats) +			static_branch_enable(&bpf_stats_enabled_key); +		else +			static_branch_disable(&bpf_stats_enabled_key); +	} +	return ret; +} +#endif  /*   * No sense putting this after each symbol definition, twice,   * exception granted :-) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 58b981f4bb5d..e2c038d6c13c 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -117,6 +117,35 @@ config NO_HZ_FULL  endchoice +config CONTEXT_TRACKING +       bool + +config CONTEXT_TRACKING_FORCE +	bool "Force context tracking" +	depends on CONTEXT_TRACKING +	default y if !NO_HZ_FULL +	help +	  The major pre-requirement for full dynticks to work is to +	  support the context 
tracking subsystem. But there are also +	  other dependencies to provide in order to make the full +	  dynticks working. + +	  This option stands for testing when an arch implements the +	  context tracking backend but doesn't yet fullfill all the +	  requirements to make the full dynticks feature working. +	  Without the full dynticks, there is no way to test the support +	  for context tracking and the subsystems that rely on it: RCU +	  userspace extended quiescent state and tickless cputime +	  accounting. This option copes with the absence of the full +	  dynticks subsystem by forcing the context tracking on all +	  CPUs in the system. + +	  Say Y only if you're working on the development of an +	  architecture backend for the context tracking. + +	  Say N otherwise, this option brings an overhead that you +	  don't want in production. +  config NO_HZ  	bool "Old Idle dynticks config"  	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f5cfa1b73d6f..41dfff23c1f9 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -364,7 +364,7 @@ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)  	switch (state) {  	case ODEBUG_STATE_ACTIVE:  		WARN_ON(1); - +		/* fall through */  	default:  		return false;  	} @@ -1771,7 +1771,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,  #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(nanosleep, struct old_timespec32 __user *, rqtp, +SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,  		       struct old_timespec32 __user *, rmtp)  {  	struct timespec64 tu; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 36a2bef00125..92a90014a925 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -188,13 +188,13 @@ static inline int is_error_status(int status)  			&& (status & (STA_PPSWANDER|STA_PPSERROR)));  } -static inline void pps_fill_timex(struct timex *txc) +static inline void pps_fill_timex(struct __kernel_timex *txc)  {  	txc->ppsfreq	   = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *  					 PPM_SCALE_INV, NTP_SCALE_SHIFT);  	txc->jitter	   = pps_jitter;  	if (!(time_status & STA_NANO)) -		txc->jitter /= NSEC_PER_USEC; +		txc->jitter = pps_jitter / NSEC_PER_USEC;  	txc->shift	   = pps_shift;  	txc->stabil	   = pps_stabil;  	txc->jitcnt	   = pps_jitcnt; @@ -220,7 +220,7 @@ static inline int is_error_status(int status)  	return status & (STA_UNSYNC|STA_CLOCKERR);  } -static inline void pps_fill_timex(struct timex *txc) +static inline void pps_fill_timex(struct __kernel_timex *txc)  {  	/* PPS is not implemented, so these are zero */  	txc->ppsfreq	   = 0; @@ -633,7 +633,7 @@ void ntp_notify_cmos_timer(void)  /*   * Propagate a new txc->status value into the NTP state:   */ -static inline void process_adj_status(const struct timex *txc) +static inline void process_adj_status(const struct __kernel_timex *txc)  {  	if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {  		time_state = TIME_OK; @@ -656,7 +656,8 @@ static inline void process_adj_status(const struct timex *txc)  } -static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai) +static inline void process_adjtimex_modes(const struct __kernel_timex *txc, +					  s32 *time_tai)  {  	if (txc->modes & ADJ_STATUS)  		process_adj_status(txc); @@ -707,7 +708,8 @@ static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai   * adjtimex mainly allows reading (and writing, if superuser) of   * 
kernel time-keeping variables. used by xntpd.   */ -int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai) +int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, +		  s32 *time_tai)  {  	int result; @@ -729,7 +731,7 @@ int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai)  		txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,  				  NTP_SCALE_SHIFT);  		if (!(time_status & STA_NANO)) -			txc->offset /= NSEC_PER_USEC; +			txc->offset = (u32)txc->offset / NSEC_PER_USEC;  	}  	result = time_state;	/* mostly `TIME_OK' */ @@ -754,7 +756,7 @@ int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai)  	txc->time.tv_sec = (time_t)ts->tv_sec;  	txc->time.tv_usec = ts->tv_nsec;  	if (!(time_status & STA_NANO)) -		txc->time.tv_usec /= NSEC_PER_USEC; +		txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC;  	/* Handle leapsec adjustments */  	if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) { diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index c24b0e13f011..40e6122e634e 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -8,6 +8,6 @@ extern void ntp_clear(void);  extern u64 ntp_tick_length(void);  extern ktime_t ntp_get_next_leap(void);  extern int second_overflow(time64_t secs); -extern int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai); +extern int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai);  extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts);  #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 425bbfce6819..ec960bb939fd 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -228,7 +228,7 @@ static void put_clock_desc(struct posix_clock_desc *cd)  	fput(cd->fp);  } -static int pc_clock_adjtime(clockid_t id, struct timex *tx) +static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx)  {  	struct posix_clock_desc cd;  	int err; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 80f955210861..0a426f4e3125 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -67,13 +67,13 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now)  	int i;  	u64 delta, incr; -	if (timer->it.cpu.incr == 0) +	if (!timer->it_interval)  		return;  	if (now < timer->it.cpu.expires)  		return; -	incr = timer->it.cpu.incr; +	incr = timer->it_interval;  	delta = now + incr - timer->it.cpu.expires;  	/* Don't use (incr*2 < delta), incr*2 might overflow. */ @@ -520,7 +520,7 @@ static void cpu_timer_fire(struct k_itimer *timer)  		 */  		wake_up_process(timer->it_process);  		timer->it.cpu.expires = 0; -	} else if (timer->it.cpu.incr == 0) { +	} else if (!timer->it_interval) {  		/*  		 * One-shot timer.  Clear it as soon as it's fired.  		 */ @@ -606,7 +606,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,  	 */  	ret = 0; -	old_incr = timer->it.cpu.incr; +	old_incr = timer->it_interval;  	old_expires = timer->it.cpu.expires;  	if (unlikely(timer->it.cpu.firing)) {  		timer->it.cpu.firing = -1; @@ -684,8 +684,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,  	 * Install the new reload setting, and  	 * set up the signal and overrun bookkeeping.  	 
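The posix-cpu-timers hunks above drop the separate nanosecond copy in it.cpu.incr and keep the reload period only in the generic timer->it_interval. A tiny model of why the single timespec64_to_ktime() conversion is equivalent to the old two-step version, treating ktime_t as signed nanoseconds and ignoring the overflow clamping the real helpers perform:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000LL

struct timespec64 { int64_t tv_sec; long tv_nsec; };
typedef int64_t ktime;	/* ktime_t is signed nanoseconds */

static int64_t timespec64_to_ns(struct timespec64 ts)
{
	return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}

int main(void)
{
	struct timespec64 it_interval = { .tv_sec = 2, .tv_nsec = 500000000 };	/* 2.5 s, sample */

	/* old scheme: nanosecond copy in it.cpu.incr, then a second conversion */
	int64_t incr     = timespec64_to_ns(it_interval);
	ktime   two_step = incr;				/* ns_to_ktime()        */

	/* new scheme: convert once and store only in timer->it_interval */
	ktime   one_step = timespec64_to_ns(it_interval);	/* timespec64_to_ktime() */

	printf("two_step=%lld one_step=%lld identical=%d\n",
	       (long long)two_step, (long long)one_step, two_step == one_step);
	return 0;
}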
*/ -	timer->it.cpu.incr = timespec64_to_ns(&new->it_interval); -	timer->it_interval = ns_to_ktime(timer->it.cpu.incr); +	timer->it_interval = timespec64_to_ktime(new->it_interval);  	/*  	 * This acts as a modification timestamp for the timer, @@ -724,7 +723,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp  	/*  	 * Easy part: convert the reload time.  	 */ -	itp->it_interval = ns_to_timespec64(timer->it.cpu.incr); +	itp->it_interval = ktime_to_timespec64(timer->it_interval);  	if (!timer->it.cpu.expires)  		return; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index a51895486e5e..67df65f887ac 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -45,6 +45,7 @@ SYS_NI(timer_delete);  SYS_NI(clock_adjtime);  SYS_NI(getitimer);  SYS_NI(setitimer); +SYS_NI(clock_adjtime32);  #ifdef __ARCH_WANT_SYS_ALARM  SYS_NI(alarm);  #endif @@ -150,16 +151,16 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,  #ifdef CONFIG_COMPAT  COMPAT_SYS_NI(timer_create); -COMPAT_SYS_NI(clock_adjtime); -COMPAT_SYS_NI(timer_settime); -COMPAT_SYS_NI(timer_gettime);  COMPAT_SYS_NI(getitimer);  COMPAT_SYS_NI(setitimer);  #endif  #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, -		       struct old_timespec32 __user *, tp) +SYS_NI(timer_settime32); +SYS_NI(timer_gettime32); + +SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock, +		struct old_timespec32 __user *, tp)  {  	struct timespec64 new_tp; @@ -171,8 +172,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,  	return do_sys_settimeofday64(&new_tp, NULL);  } -COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, -		       struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, +		struct old_timespec32 __user *, tp)  {  	int ret;  	struct timespec64 kernel_tp; @@ -186,8 +187,8 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,  	return 0;  } -COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, -		       struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, +		struct old_timespec32 __user *, tp)  {  	struct timespec64 rtn_tp = {  		.tv_sec = 0, @@ -206,9 +207,9 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,  	}  } -COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, -		       struct old_timespec32 __user *, rqtp, -		       struct old_timespec32 __user *, rmtp) +SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, +		struct old_timespec32 __user *, rqtp, +		struct old_timespec32 __user *, rmtp)  {  	struct timespec64 t; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 0e84bb72a3da..29176635991f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -179,7 +179,7 @@ static int posix_clock_realtime_set(const clockid_t which_clock,  }  static int posix_clock_realtime_adj(const clockid_t which_clock, -				    struct timex *t) +				    struct __kernel_timex *t)  {  	return do_adjtimex(t);  } @@ -730,8 +730,8 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,  #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, -		       struct old_itimerspec32 __user *, setting) +SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, +		struct old_itimerspec32 __user *, setting)  {  	struct itimerspec64 cur_setting; @@ -903,9 +903,9 @@ 
SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,  }  #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, -		       struct old_itimerspec32 __user *, new, -		       struct old_itimerspec32 __user *, old) +SYSCALL_DEFINE4(timer_settime32, timer_t, timer_id, int, flags, +		struct old_itimerspec32 __user *, new, +		struct old_itimerspec32 __user *, old)  {  	struct itimerspec64 new_spec, old_spec;  	struct itimerspec64 *rtn = old ? &old_spec : NULL; @@ -1047,22 +1047,28 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,  	return error;  } -SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, -		struct timex __user *, utx) +int do_clock_adjtime(const clockid_t which_clock, struct __kernel_timex * ktx)  {  	const struct k_clock *kc = clockid_to_kclock(which_clock); -	struct timex ktx; -	int err;  	if (!kc)  		return -EINVAL;  	if (!kc->clock_adj)  		return -EOPNOTSUPP; +	return kc->clock_adj(which_clock, ktx); +} + +SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, +		struct __kernel_timex __user *, utx) +{ +	struct __kernel_timex ktx; +	int err; +  	if (copy_from_user(&ktx, utx, sizeof(ktx)))  		return -EFAULT; -	err = kc->clock_adj(which_clock, &ktx); +	err = do_clock_adjtime(which_clock, &ktx);  	if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))  		return -EFAULT; @@ -1090,8 +1096,8 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,  #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, -		       struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_settime32, clockid_t, which_clock, +		struct old_timespec32 __user *, tp)  {  	const struct k_clock *kc = clockid_to_kclock(which_clock);  	struct timespec64 ts; @@ -1105,8 +1111,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,  	return kc->clock_set(which_clock, &ts);  } -COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, -		       struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, +		struct old_timespec32 __user *, tp)  {  	const struct k_clock *kc = clockid_to_kclock(which_clock);  	struct timespec64 ts; @@ -1123,40 +1129,26 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,  	return err;  } -#endif - -#ifdef CONFIG_COMPAT - -COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, -		       struct compat_timex __user *, utp) +SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock, +		struct old_timex32 __user *, utp)  { -	const struct k_clock *kc = clockid_to_kclock(which_clock); -	struct timex ktx; +	struct __kernel_timex ktx;  	int err; -	if (!kc) -		return -EINVAL; -	if (!kc->clock_adj) -		return -EOPNOTSUPP; - -	err = compat_get_timex(&ktx, utp); +	err = get_old_timex32(&ktx, utp);  	if (err)  		return err; -	err = kc->clock_adj(which_clock, &ktx); +	err = do_clock_adjtime(which_clock, &ktx);  	if (err >= 0) -		err = compat_put_timex(utp, &ktx); +		err = put_old_timex32(utp, &ktx);  	return err;  } -#endif - -#ifdef CONFIG_COMPAT_32BIT_TIME - -COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, -		       struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, +		struct old_timespec32 __user *, tp)  {  	const struct k_clock *kc = clockid_to_kclock(which_clock);  	struct timespec64 ts; @@ -1212,9 +1204,9 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,  #ifdef CONFIG_COMPAT_32BIT_TIME 
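With the refactoring above, the native clock_adjtime() and the new clock_adjtime32() both funnel into do_clock_adjtime(), differing only in how the user structure is copied in and converted. A compressed userspace model of that shape (the structures are trimmed stand-ins, not the real __kernel_timex/old_timex32 layouts):

#include <stdio.h>

/* trimmed-down stand-ins for __kernel_timex and old_timex32 */
struct kernel_timex { long long offset; int modes; };
struct old_timex32  { int offset; int modes; };

/* shared core: clockid lookup and method dispatch would live here */
static int do_clock_adjtime(int which_clock, struct kernel_timex *ktx)
{
	ktx->offset += 1;	/* pretend the clock driver adjusted something */
	return 0;
}

/* native entry point: copy the full structure in and out */
static int sys_clock_adjtime(int which_clock, struct kernel_timex *utx)
{
	struct kernel_timex ktx = *utx;		/* copy_from_user() */
	int err = do_clock_adjtime(which_clock, &ktx);

	if (err >= 0)
		*utx = ktx;			/* copy_to_user() */
	return err;
}

/* 32-bit time entry point: widen on the way in, narrow on the way out */
static int sys_clock_adjtime32(int which_clock, struct old_timex32 *utp)
{
	struct kernel_timex ktx = { .offset = utp->offset, .modes = utp->modes };
	int err = do_clock_adjtime(which_clock, &ktx);

	if (err >= 0) {
		utp->offset = (int)ktx.offset;
		utp->modes  = ktx.modes;
	}
	return err;
}

int main(void)
{
	struct kernel_timex tx   = { .offset = 10 };
	struct old_timex32  tx32 = { .offset = 10 };

	sys_clock_adjtime(0, &tx);
	sys_clock_adjtime32(0, &tx32);
	printf("native offset=%lld, time32 offset=%d\n", tx.offset, tx32.offset);
	return 0;
}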
-COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, -		       struct old_timespec32 __user *, rqtp, -		       struct old_timespec32 __user *, rmtp) +SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, +		struct old_timespec32 __user *, rqtp, +		struct old_timespec32 __user *, rmtp)  {  	const struct k_clock *kc = clockid_to_kclock(which_clock);  	struct timespec64 t; diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index ddb21145211a..de5daa6d975a 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -8,7 +8,7 @@ struct k_clock {  			     const struct timespec64 *tp);  	int	(*clock_get)(const clockid_t which_clock,  			     struct timespec64 *tp); -	int	(*clock_adj)(const clockid_t which_clock, struct timex *tx); +	int	(*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx);  	int	(*timer_create)(struct k_itimer *timer);  	int	(*nsleep)(const clockid_t which_clock, int flags,  			  const struct timespec64 *); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 803fa67aace9..ee834d4fb814 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -375,6 +375,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)  	switch (mode) {  	case TICK_BROADCAST_FORCE:  		tick_broadcast_forced = 1; +		/* fall through */  	case TICK_BROADCAST_ON:  		cpumask_set_cpu(cpu, tick_broadcast_on);  		if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { diff --git a/kernel/time/time.c b/kernel/time/time.c index 2edb5088a70b..c3f756f8534b 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -98,11 +98,11 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr)  #endif /* __ARCH_WANT_SYS_TIME */ -#ifdef CONFIG_COMPAT -#ifdef __ARCH_WANT_COMPAT_SYS_TIME +#ifdef CONFIG_COMPAT_32BIT_TIME +#ifdef __ARCH_WANT_SYS_TIME32  /* old_time32_t is a 32 bit "long" and needs to get converted. */ -COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc) +SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc)  {  	old_time32_t i; @@ -116,7 +116,7 @@ COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc)  	return i;  } -COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr) +SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr)  {  	struct timespec64 tv;  	int err; @@ -134,7 +134,7 @@ COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr)  	return 0;  } -#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ +#endif /* __ARCH_WANT_SYS_TIME32 */  #endif  SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, @@ -263,35 +263,99 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv,  }  #endif -SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) +#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) +SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p)  { -	struct timex txc;		/* Local copy of parameter */ +	struct __kernel_timex txc;		/* Local copy of parameter */  	int ret;  	/* Copy the user data space into the kernel copy  	 * structure. But bear in mind that the structures  	 * may change  	 */ -	if (copy_from_user(&txc, txc_p, sizeof(struct timex))) +	if (copy_from_user(&txc, txc_p, sizeof(struct __kernel_timex)))  		return -EFAULT;  	ret = do_adjtimex(&txc); -	return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; +	return copy_to_user(txc_p, &txc, sizeof(struct __kernel_timex)) ? 
-EFAULT : ret;  } +#endif -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME +int get_old_timex32(struct __kernel_timex *txc, const struct old_timex32 __user *utp) +{ +	struct old_timex32 tx32; + +	memset(txc, 0, sizeof(struct __kernel_timex)); +	if (copy_from_user(&tx32, utp, sizeof(struct old_timex32))) +		return -EFAULT; + +	txc->modes = tx32.modes; +	txc->offset = tx32.offset; +	txc->freq = tx32.freq; +	txc->maxerror = tx32.maxerror; +	txc->esterror = tx32.esterror; +	txc->status = tx32.status; +	txc->constant = tx32.constant; +	txc->precision = tx32.precision; +	txc->tolerance = tx32.tolerance; +	txc->time.tv_sec = tx32.time.tv_sec; +	txc->time.tv_usec = tx32.time.tv_usec; +	txc->tick = tx32.tick; +	txc->ppsfreq = tx32.ppsfreq; +	txc->jitter = tx32.jitter; +	txc->shift = tx32.shift; +	txc->stabil = tx32.stabil; +	txc->jitcnt = tx32.jitcnt; +	txc->calcnt = tx32.calcnt; +	txc->errcnt = tx32.errcnt; +	txc->stbcnt = tx32.stbcnt; + +	return 0; +} + +int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex *txc) +{ +	struct old_timex32 tx32; + +	memset(&tx32, 0, sizeof(struct old_timex32)); +	tx32.modes = txc->modes; +	tx32.offset = txc->offset; +	tx32.freq = txc->freq; +	tx32.maxerror = txc->maxerror; +	tx32.esterror = txc->esterror; +	tx32.status = txc->status; +	tx32.constant = txc->constant; +	tx32.precision = txc->precision; +	tx32.tolerance = txc->tolerance; +	tx32.time.tv_sec = txc->time.tv_sec; +	tx32.time.tv_usec = txc->time.tv_usec; +	tx32.tick = txc->tick; +	tx32.ppsfreq = txc->ppsfreq; +	tx32.jitter = txc->jitter; +	tx32.shift = txc->shift; +	tx32.stabil = txc->stabil; +	tx32.jitcnt = txc->jitcnt; +	tx32.calcnt = txc->calcnt; +	tx32.errcnt = txc->errcnt; +	tx32.stbcnt = txc->stbcnt; +	tx32.tai = txc->tai; +	if (copy_to_user(utp, &tx32, sizeof(struct old_timex32))) +		return -EFAULT; +	return 0; +} -COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) +SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)  { -	struct timex txc; +	struct __kernel_timex txc;  	int err, ret; -	err = compat_get_timex(&txc, utp); +	err = get_old_timex32(&txc, utp);  	if (err)  		return err;  	ret = do_adjtimex(&txc); -	err = compat_put_timex(utp, &txc); +	err = put_old_timex32(utp, &txc);  	if (err)  		return err; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ac5dbf2cd4a2..f986e1918d12 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2234,7 +2234,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,  /**   * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex   */ -static int timekeeping_validate_timex(const struct timex *txc) +static int timekeeping_validate_timex(const struct __kernel_timex *txc)  {  	if (txc->modes & ADJ_ADJTIME) {  		/* singleshot must not be used with any other mode bits */ @@ -2300,7 +2300,7 @@ static int timekeeping_validate_timex(const struct timex *txc)  /**   * do_adjtimex() - Accessor function to NTP __do_adjtimex function   */ -int do_adjtimex(struct timex *txc) +int do_adjtimex(struct __kernel_timex *txc)  {  	struct timekeeper *tk = &tk_core.timekeeper;  	unsigned long flags; diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 86489950d690..b73e8850e58d 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -37,15 +37,8 @@ DEFINE_SHOW_ATTRIBUTE(tk_debug_sleep_time);  static int __init tk_debug_sleep_time_init(void)  { -	struct dentry *d; - -	d = 
debugfs_create_file("sleep_time", 0444, NULL, NULL, -		&tk_debug_sleep_time_fops); -	if (!d) { -		pr_err("Failed to create sleep_time debug file\n"); -		return -ENOMEM; -	} - +	debugfs_create_file("sleep_time", 0444, NULL, NULL, +			    &tk_debug_sleep_time_fops);  	return 0;  }  late_initcall(tk_debug_sleep_time_init); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 444156debfa0..2fce056f8a49 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -647,7 +647,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state)  	case ODEBUG_STATE_ACTIVE:  		WARN_ON(1); - +		/* fall through */  	default:  		return false;  	} @@ -1632,7 +1632,7 @@ void update_process_times(int user_tick)  	/* Note: this timer irq context must be accounted for as well. */  	account_process_tick(p, user_tick);  	run_local_timers(); -	rcu_check_callbacks(user_tick); +	rcu_sched_clock_irq(user_tick);  #ifdef CONFIG_IRQ_WORK  	if (in_irq())  		irq_work_tick(); diff --git a/kernel/torture.c b/kernel/torture.c index bbf6d473e50c..8faa1a9aaeb9 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * Common functions for in-kernel torture tests.   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - *   * Copyright (C) IBM Corporation, 2014   * - * Author: Paul E. McKenney <paulmck@us.ibm.com> + * Author: Paul E. McKenney <paulmck@linux.ibm.com>   *	Based on kernel/rcu/torture.c.   */ @@ -53,7 +40,7 @@  #include "rcu/rcu.h"  MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");  static char *torture_type;  static int verbose; @@ -75,6 +62,7 @@ static DEFINE_MUTEX(fullstop_mutex);  static struct task_struct *onoff_task;  static long onoff_holdoff;  static long onoff_interval; +static torture_ofl_func *onoff_f;  static long n_offline_attempts;  static long n_offline_successes;  static unsigned long sum_offline; @@ -118,6 +106,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,  			pr_alert("%s" TORTURE_FLAG  				 "torture_onoff task: offlined %d\n",  				 torture_type, cpu); +		if (onoff_f) +			onoff_f();  		(*n_offl_successes)++;  		delta = jiffies - starttime;  		*sum_offl += delta; @@ -243,11 +233,12 @@ stop:  /*   * Initiate online-offline handling.   
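The /* fall through */ comments added to the hrtimer, tick-broadcast and timer switch statements are the annotation GCC's -Wimplicit-fallthrough recognizes at its default level, so an intentional drop-through no longer looks like a missing break. A self-contained example of the convention:

#include <stdio.h>

/* build with: gcc -Wextra -Wimplicit-fallthrough fallthrough.c */
static const char *classify(int state)
{
	switch (state) {
	case 2:
		puts("forcing");	/* extra work, then intentionally drop through */
		/* fall through */
	case 1:
		return "on";
	case 0:
		return "off";
	default:
		return "unknown";
	}
}

int main(void)
{
	printf("%s %s %s\n", classify(2), classify(1), classify(0));
	return 0;
}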
*/ -int torture_onoff_init(long ooholdoff, long oointerval) +int torture_onoff_init(long ooholdoff, long oointerval, torture_ofl_func *f)  {  #ifdef CONFIG_HOTPLUG_CPU  	onoff_holdoff = ooholdoff;  	onoff_interval = oointerval; +	onoff_f = f;  	if (onoff_interval <= 0)  		return 0;  	return torture_create_kthread(torture_onoff, NULL, onoff_task); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f1a86a0d881d..d64c00afceb5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -431,8 +431,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,  	if (unlikely(event->oncpu != cpu))  		return -EOPNOTSUPP; -	perf_event_output(event, sd, regs); -	return 0; +	return perf_event_output(event, sd, regs);  }  BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c521b7347482..c4238b441624 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3384,6 +3384,8 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file  	const char tgid_space[] = "          ";  	const char space[] = "  "; +	print_event_info(buf, m); +  	seq_printf(m, "#                          %s  _-----=> irqs-off\n",  		   tgid ? tgid_space : space);  	seq_printf(m, "#                          %s / _----=> need-resched\n", diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d3294721f119..d42a473b8240 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -14,6 +14,7 @@  #include <linux/uaccess.h>  #include <linux/module.h>  #include <linux/ftrace.h> +#include <linux/kprobes.h>  #include "trace.h" @@ -365,7 +366,7 @@ out:  	__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);  } -static inline void +static nokprobe_inline void  start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)  {  	int cpu; @@ -401,7 +402,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)  	atomic_dec(&data->disabled);  } -static inline void +static nokprobe_inline void  stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)  {  	int cpu; @@ -443,6 +444,7 @@ void start_critical_timings(void)  		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);  }  EXPORT_SYMBOL_GPL(start_critical_timings); +NOKPROBE_SYMBOL(start_critical_timings);  void stop_critical_timings(void)  { @@ -452,6 +454,7 @@ void stop_critical_timings(void)  		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);  }  EXPORT_SYMBOL_GPL(stop_critical_timings); +NOKPROBE_SYMBOL(stop_critical_timings);  #ifdef CONFIG_FUNCTION_TRACER  static bool function_enabled; @@ -611,6 +614,7 @@ void tracer_hardirqs_on(unsigned long a0, unsigned long a1)  	if (!preempt_trace(pc) && irq_trace())  		stop_critical_timing(a0, a1, pc);  } +NOKPROBE_SYMBOL(tracer_hardirqs_on);  void tracer_hardirqs_off(unsigned long a0, unsigned long a1)  { @@ -619,6 +623,7 @@ void tracer_hardirqs_off(unsigned long a0, unsigned long a1)  	if (!preempt_trace(pc) && irq_trace())  		start_critical_timing(a0, a1, pc);  } +NOKPROBE_SYMBOL(tracer_hardirqs_off);  static int irqsoff_tracer_init(struct trace_array *tr)  { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d5fb09ebba8b..9eaf07f99212 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -861,22 +861,14 @@ static const struct file_operations kprobe_profile_ops = {  static nokprobe_inline int  fetch_store_strlen(unsigned long addr)  { -	
mm_segment_t old_fs;  	int ret, len = 0;  	u8 c; -	old_fs = get_fs(); -	set_fs(KERNEL_DS); -	pagefault_disable(); -  	do { -		ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); +		ret = probe_mem_read(&c, (u8 *)addr + len, 1);  		len++;  	} while (c && ret == 0 && len < MAX_STRING_SIZE); -	pagefault_enable(); -	set_fs(old_fs); -  	return (ret < 0) ? ret : len;  } diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 71f553cceb3c..4d8e99fdbbbe 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -9,6 +9,7 @@  #include <linux/uaccess.h>  #include <linux/module.h>  #include <linux/ftrace.h> +#include <linux/kprobes.h>  #include "trace.h"  #define CREATE_TRACE_POINTS @@ -30,6 +31,7 @@ void trace_hardirqs_on(void)  	lockdep_hardirqs_on(CALLER_ADDR0);  }  EXPORT_SYMBOL(trace_hardirqs_on); +NOKPROBE_SYMBOL(trace_hardirqs_on);  void trace_hardirqs_off(void)  { @@ -43,6 +45,7 @@ void trace_hardirqs_off(void)  	lockdep_hardirqs_off(CALLER_ADDR0);  }  EXPORT_SYMBOL(trace_hardirqs_off); +NOKPROBE_SYMBOL(trace_hardirqs_off);  __visible void trace_hardirqs_on_caller(unsigned long caller_addr)  { @@ -56,6 +59,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr)  	lockdep_hardirqs_on(CALLER_ADDR0);  }  EXPORT_SYMBOL(trace_hardirqs_on_caller); +NOKPROBE_SYMBOL(trace_hardirqs_on_caller);  __visible void trace_hardirqs_off_caller(unsigned long caller_addr)  { @@ -69,6 +73,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr)  	lockdep_hardirqs_off(CALLER_ADDR0);  }  EXPORT_SYMBOL(trace_hardirqs_off_caller); +NOKPROBE_SYMBOL(trace_hardirqs_off_caller);  #endif /* CONFIG_TRACE_IRQFLAGS */  #ifdef CONFIG_TRACE_PREEMPT_TOGGLE diff --git a/kernel/workqueue.c b/kernel/workqueue.c index fc5d23d752a5..e163e7a7f5e5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -259,6 +259,8 @@ struct workqueue_struct {  	struct wq_device	*wq_dev;	/* I: for sysfs interface */  #endif  #ifdef CONFIG_LOCKDEP +	char			*lock_name; +	struct lock_class_key	key;  	struct lockdep_map	lockdep_map;  #endif  	char			name[WQ_NAME_LEN]; /* I: workqueue name */ @@ -3337,11 +3339,49 @@ static int init_worker_pool(struct worker_pool *pool)  	return 0;  } +#ifdef CONFIG_LOCKDEP +static void wq_init_lockdep(struct workqueue_struct *wq) +{ +	char *lock_name; + +	lockdep_register_key(&wq->key); +	lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name); +	if (!lock_name) +		lock_name = wq->name; +	lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0); +} + +static void wq_unregister_lockdep(struct workqueue_struct *wq) +{ +	lockdep_unregister_key(&wq->key); +} + +static void wq_free_lockdep(struct workqueue_struct *wq) +{ +	if (wq->lock_name != wq->name) +		kfree(wq->lock_name); +} +#else +static void wq_init_lockdep(struct workqueue_struct *wq) +{ +} + +static void wq_unregister_lockdep(struct workqueue_struct *wq) +{ +} + +static void wq_free_lockdep(struct workqueue_struct *wq) +{ +} +#endif +  static void rcu_free_wq(struct rcu_head *rcu)  {  	struct workqueue_struct *wq =  		container_of(rcu, struct workqueue_struct, rcu); +	wq_free_lockdep(wq); +  	if (!(wq->flags & WQ_UNBOUND))  		free_percpu(wq->cpu_pwqs);  	else @@ -3532,8 +3572,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work)  	 * If we're the last pwq going away, @wq is already dead and no one  	 * is gonna access it anymore.  Schedule RCU free.  	 
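The wq_init_lockdep()/wq_free_lockdep() pair above builds a per-workqueue lockdep class name of the form "(wq_completion)<name>", falling back to the plain workqueue name if the allocation fails, so the free path must only release the string when it is not that fallback alias. A standalone sketch of the same allocate-with-fallback pattern (asprintf() standing in for kasprintf(), and the lockdep calls reduced to a comment):

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

/* trimmed stand-in for the workqueue bookkeeping touched by this hunk */
struct wq {
	char name[24];
	char *lock_name;	/* either a heap string or an alias of name */
};

static void wq_init_lockdep(struct wq *wq)
{
	char *lock_name = NULL;

	/* kasprintf() equivalent; on failure fall back to the plain wq name */
	if (asprintf(&lock_name, "(wq_completion)%s", wq->name) < 0)
		lock_name = wq->name;
	wq->lock_name = lock_name;
	/* lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0); */
}

static void wq_free_lockdep(struct wq *wq)
{
	/* only free what the allocation produced, never the fallback alias */
	if (wq->lock_name != wq->name)
		free(wq->lock_name);
}

int main(void)
{
	struct wq wq = { .name = "events_unbound" };

	wq_init_lockdep(&wq);
	printf("lockdep class name: %s\n", wq.lock_name);
	wq_free_lockdep(&wq);
	return 0;
}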
*/ -	if (is_last) +	if (is_last) { +		wq_unregister_lockdep(wq);  		call_rcu(&wq->rcu, rcu_free_wq); +	}  }  /** @@ -4067,11 +4109,9 @@ static int init_rescuer(struct workqueue_struct *wq)  	return 0;  } -struct workqueue_struct *__alloc_workqueue_key(const char *fmt, -					       unsigned int flags, -					       int max_active, -					       struct lock_class_key *key, -					       const char *lock_name, ...) +struct workqueue_struct *alloc_workqueue(const char *fmt, +					 unsigned int flags, +					 int max_active, ...)  {  	size_t tbl_size = 0;  	va_list args; @@ -4106,7 +4146,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,  			goto err_free_wq;  	} -	va_start(args, lock_name); +	va_start(args, max_active);  	vsnprintf(wq->name, sizeof(wq->name), fmt, args);  	va_end(args); @@ -4123,7 +4163,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,  	INIT_LIST_HEAD(&wq->flusher_overflow);  	INIT_LIST_HEAD(&wq->maydays); -	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); +	wq_init_lockdep(wq);  	INIT_LIST_HEAD(&wq->list);  	if (alloc_and_link_pwqs(wq) < 0) @@ -4161,7 +4201,7 @@ err_destroy:  	destroy_workqueue(wq);  	return NULL;  } -EXPORT_SYMBOL_GPL(__alloc_workqueue_key); +EXPORT_SYMBOL_GPL(alloc_workqueue);  /**   * destroy_workqueue - safely terminate a workqueue @@ -4214,6 +4254,7 @@ void destroy_workqueue(struct workqueue_struct *wq)  		kthread_stop(wq->rescuer->task);  	if (!(wq->flags & WQ_UNBOUND)) { +		wq_unregister_lockdep(wq);  		/*  		 * The base ref is never dropped on per-cpu pwqs.  Directly  		 * schedule RCU free. | 
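With __alloc_workqueue_key() folded away, alloc_workqueue() is the exported entry point and sets up its own lockdep key and "(wq_completion)" name internally; callers keep the familiar three-argument form, now calling a real function instead of a key-injecting macro. A caller-side sketch of typical use after this change (module and symbol names below are invented for illustration):

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;

static void demo_fn(struct work_struct *work)
{
	pr_info("demo work ran\n");
}
static DECLARE_WORK(demo_work, demo_fn);

static int __init demo_init(void)
{
	/* lockdep key and "(wq_completion)demo_wq" name are registered internally */
	demo_wq = alloc_workqueue("demo_wq", WQ_UNBOUND, 0);
	if (!demo_wq)
		return -ENOMEM;
	queue_work(demo_wq, &demo_work);
	return 0;
}

static void __exit demo_exit(void)
{
	destroy_workqueue(demo_wq);	/* drains pending work, then drops the lockdep key */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");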
