| field | value | date |
|---|---|---|
| author | Thomas Gleixner <tglx@linutronix.de> | 2020-12-15 10:48:07 +0100 |
| committer | Thomas Gleixner <tglx@linutronix.de> | 2020-12-15 10:48:07 +0100 |
| commit | 3c41e57a1e168d879e923c5583adeae47eec9f64 (patch) | |
| tree | e6272012c4b766189be2821316a3d23d115f5195 /kernel | |
| parent | d14ce74f1fb376ccbbc0b05ded477ada51253729 (diff) | |
| parent | 2f5fbc4305d07725bfebaedb09e57271315691ef (diff) | |
Merge tag 'irqchip-5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms into irq/core
Pull irqchip updates for 5.11 from Marc Zyngier:
  - Preliminary support for managed interrupts on platform devices
  - Correctly identify allocation of MSIs proxied by another device
  - Remove the fasteoi IPI flow which has been proved useless
  - Generalise the Ocelot support to new SoCs
  - Improve GICv4.1 vcpu entry, matching the corresponding KVM optimisation
  - Work around spurious interrupts on Qualcomm PDC
  - Random fixes and cleanups
Link: https://lore.kernel.org/r/20201212135626.1479884-1-maz@kernel.org
Diffstat (limited to 'kernel')
44 files changed, 566 insertions, 320 deletions
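As context for the kernel/irq/manage.c hunk below, which adds irq_update_affinity_desc(), here is a minimal, hypothetical driver-side sketch of how the new interface might be called. It is not part of this series; the struct irq_affinity_desc layout (mask, is_managed) is assumed from how the hunk dereferences it, and the declaration is assumed to be reachable via <linux/interrupt.h>.

```c
/*
 * Hypothetical sketch only -- not part of this merge. Reconfigure an
 * already-allocated (but not yet started) interrupt as managed, with a
 * preset affinity mask, using the interface added in kernel/irq/manage.c.
 */
#include <linux/cpumask.h>
#include <linux/interrupt.h>	/* assumed home of the declaration */

static int example_make_irq_managed(unsigned int irq, const struct cpumask *mask)
{
	struct irq_affinity_desc desc = { .is_managed = 1 };

	cpumask_copy(&desc.mask, mask);

	/*
	 * Per the checks in the hunk below, this returns -EBUSY if the
	 * interrupt is already started or already managed, and -EOPNOTSUPP
	 * when GENERIC_IRQ_RESERVATION_MODE is enabled.
	 */
	return irq_update_affinity_desc(irq, &desc);
}
```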
| diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index bdc8cd1b6767..c1b9f71ee6aa 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,6 +1,10 @@  # SPDX-License-Identifier: GPL-2.0  obj-y := core.o -CFLAGS_core.o += $(call cc-disable-warning, override-init) +ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y) +# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details +cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse +endif +CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)  obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o  obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 78ea8a7bd27f..56cc5a915f67 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -13,6 +13,7 @@  #include <linux/bpf_verifier.h>  #include <net/bpf_sk_storage.h>  #include <linux/bpf_local_storage.h> +#include <linux/btf_ids.h>  /* For every LSM hook that allows attachment of BPF programs, declare a nop   * function where a BPF program can be attached. @@ -26,7 +27,11 @@ noinline RET bpf_lsm_##NAME(__VA_ARGS__)	\  #include <linux/lsm_hook_defs.h>  #undef LSM_HOOK -#define BPF_LSM_SYM_PREFX  "bpf_lsm_" +#define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME) +BTF_SET_START(bpf_lsm_hooks) +#include <linux/lsm_hook_defs.h> +#undef LSM_HOOK +BTF_SET_END(bpf_lsm_hooks)  int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,  			const struct bpf_prog *prog) @@ -37,8 +42,7 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,  		return -EINVAL;  	} -	if (strncmp(BPF_LSM_SYM_PREFX, prog->aux->attach_func_name, -		    sizeof(BPF_LSM_SYM_PREFX) - 1)) { +	if (!btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id)) {  		bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n",  			prog->aux->attach_btf_id, prog->aux->attach_func_name);  		return -EINVAL; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9268d77898b7..55454d2278b1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1369,7 +1369,7 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)   *   * Decode and execute eBPF instructions.   */ -static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)  {  #define BPF_INSN_2_LBL(x, y)    [BPF_##x | BPF_##y] = &&x##_##y  #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 1815e97d4c9c..1fccba6e88c4 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -821,6 +821,32 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,  	}  } +static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, +			    void *value, bool onallcpus) +{ +	/* When using prealloc and not setting the initial value on all cpus, +	 * zero-fill element values for other cpus (just as what happens when +	 * not using prealloc). Otherwise, bpf program has no way to ensure +	 * known initial values for cpus other than current one +	 * (onallcpus=false always when coming from bpf prog). 
+	 */ +	if (htab_is_prealloc(htab) && !onallcpus) { +		u32 size = round_up(htab->map.value_size, 8); +		int current_cpu = raw_smp_processor_id(); +		int cpu; + +		for_each_possible_cpu(cpu) { +			if (cpu == current_cpu) +				bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value, +						size); +			else +				memset(per_cpu_ptr(pptr, cpu), 0, size); +		} +	} else { +		pcpu_copy_value(htab, pptr, value, onallcpus); +	} +} +  static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)  {  	return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && @@ -891,7 +917,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,  			}  		} -		pcpu_copy_value(htab, pptr, value, onallcpus); +		pcpu_init_value(htab, pptr, value, onallcpus);  		if (!prealloc)  			htab_elem_set_ptr(l_new, key_size, pptr); @@ -1183,7 +1209,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,  		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),  				value, onallcpus);  	} else { -		pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), +		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),  				value, onallcpus);  		hlist_nulls_add_head_rcu(&l_new->hash_node, head);  		l_new = NULL; diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index ace49111d3a3..26bced262473 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -6,6 +6,7 @@ config USERMODE_DRIVER  menuconfig BPF_PRELOAD  	bool "Preload BPF file system with kernel specific program and map iterators"  	depends on BPF +	depends on BPF_SYSCALL  	# The dependency on !COMPILE_TEST prevents it from being enabled  	# in allmodconfig or allyesconfig configurations  	depends on !COMPILE_TEST diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6200519582a6..1388bf733071 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7786,9 +7786,11 @@ static int check_return_code(struct bpf_verifier_env *env)  	struct tnum range = tnum_range(0, 1);  	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);  	int err; +	const bool is_subprog = env->cur_state->frame[0]->subprogno;  	/* LSM and struct_ops func-ptr's return type could be "void" */ -	if ((prog_type == BPF_PROG_TYPE_STRUCT_OPS || +	if (!is_subprog && +	    (prog_type == BPF_PROG_TYPE_STRUCT_OPS ||  	     prog_type == BPF_PROG_TYPE_LSM) &&  	    !prog->aux->attach_func_proto->type)  		return 0; @@ -7808,6 +7810,16 @@ static int check_return_code(struct bpf_verifier_env *env)  		return -EACCES;  	} +	reg = cur_regs(env) + BPF_REG_0; +	if (is_subprog) { +		if (reg->type != SCALAR_VALUE) { +			verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", +				reg_type_str[reg->type]); +			return -EINVAL; +		} +		return 0; +	} +  	switch (prog_type) {  	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:  		if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || @@ -7861,7 +7873,6 @@ static int check_return_code(struct bpf_verifier_env *env)  		return 0;  	} -	reg = cur_regs(env) + BPF_REG_0;  	if (reg->type != SCALAR_VALUE) {  		verbose(env, "At program exit the register R0 is not a known value (%s)\n",  			reg_type_str[reg->type]); @@ -9572,12 +9583,13 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,  			       struct bpf_insn *insn,  			       struct bpf_insn_aux_data *aux)  { -	u32 datasec_id, type, id = insn->imm;  	const struct btf_var_secinfo *vsi;  	const struct btf_type *datasec;  	const struct btf_type *t;  	const char *sym_name;  	bool percpu = false; +	u32 type, 
id = insn->imm; +	s32 datasec_id;  	u64 addr;  	int i; diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b4eea0abc3f0..781b9dca197c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -229,6 +229,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)  		io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;  	}  	io_tlb_index = 0; +	no_iotlb_memory = false;  	if (verbose)  		swiotlb_print_info(); @@ -260,9 +261,11 @@ swiotlb_init(int verbose)  	if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose))  		return; -	if (io_tlb_start) +	if (io_tlb_start) {  		memblock_free_early(io_tlb_start,  				    PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); +		io_tlb_start = 0; +	}  	pr_warn("Cannot allocate buffer");  	no_iotlb_memory = true;  } @@ -360,6 +363,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)  		io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;  	}  	io_tlb_index = 0; +	no_iotlb_memory = false;  	swiotlb_print_info(); @@ -441,14 +445,11 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,  	}  } -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, -				   dma_addr_t tbl_dma_addr, -				   phys_addr_t orig_addr, -				   size_t mapping_size, -				   size_t alloc_size, -				   enum dma_data_direction dir, -				   unsigned long attrs) +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, +		size_t mapping_size, size_t alloc_size, +		enum dma_data_direction dir, unsigned long attrs)  { +	dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start);  	unsigned long flags;  	phys_addr_t tlb_addr;  	unsigned int nslots, stride, index, wrap; @@ -667,9 +668,8 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,  	trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size,  			      swiotlb_force); -	swiotlb_addr = swiotlb_tbl_map_single(dev, -			phys_to_dma_unencrypted(dev, io_tlb_start), -			paddr, size, size, dir, attrs); +	swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir, +			attrs);  	if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)  		return DMA_MAPPING_ERROR; diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 2b8366693d5c..e9e2df3f3f9e 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -337,10 +337,10 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)  	 * already contains a warning when RCU is not watching, so no point  	 * in having another one here.  	 
*/ +	lockdep_hardirqs_off(CALLER_ADDR0);  	instrumentation_begin();  	rcu_irq_enter_check_tick(); -	/* Use the combo lockdep/tracing function */ -	trace_hardirqs_off(); +	trace_hardirqs_off_finish();  	instrumentation_end();  	return ret; diff --git a/kernel/events/core.c b/kernel/events/core.c index da467e1dd49a..dc568ca295bd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2312,9 +2312,6 @@ group_sched_out(struct perf_event *group_event,  		event_sched_out(event, cpuctx, ctx);  	perf_pmu_enable(ctx->pmu); - -	if (group_event->attr.exclusive) -		cpuctx->exclusive = 0;  }  #define DETACH_GROUP	0x01UL @@ -2583,11 +2580,8 @@ group_sched_in(struct perf_event *group_event,  	pmu->start_txn(pmu, PERF_PMU_TXN_ADD); -	if (event_sched_in(group_event, cpuctx, ctx)) { -		pmu->cancel_txn(pmu); -		perf_mux_hrtimer_restart(cpuctx); -		return -EAGAIN; -	} +	if (event_sched_in(group_event, cpuctx, ctx)) +		goto error;  	/*  	 * Schedule in siblings as one group (if any): @@ -2616,10 +2610,8 @@ group_error:  	}  	event_sched_out(group_event, cpuctx, ctx); +error:  	pmu->cancel_txn(pmu); - -	perf_mux_hrtimer_restart(cpuctx); -  	return -EAGAIN;  } @@ -2645,7 +2637,7 @@ static int group_can_go_on(struct perf_event *event,  	 * If this group is exclusive and there are already  	 * events on the CPU, it can't go on.  	 */ -	if (event->attr.exclusive && cpuctx->active_oncpu) +	if (event->attr.exclusive && !list_empty(get_event_list(event)))  		return 0;  	/*  	 * Otherwise, try to add it if all previous groups were able @@ -3679,6 +3671,7 @@ static int merge_sched_in(struct perf_event *event, void *data)  		*can_add_hw = 0;  		ctx->rotate_necessary = 1; +		perf_mux_hrtimer_restart(cpuctx);  	}  	return 0; @@ -6374,14 +6367,13 @@ perf_output_sample_regs(struct perf_output_handle *handle,  }  static void perf_sample_regs_user(struct perf_regs *regs_user, -				  struct pt_regs *regs, -				  struct pt_regs *regs_user_copy) +				  struct pt_regs *regs)  {  	if (user_mode(regs)) {  		regs_user->abi = perf_reg_abi(current);  		regs_user->regs = regs;  	} else if (!(current->flags & PF_KTHREAD)) { -		perf_get_regs_user(regs_user, regs, regs_user_copy); +		perf_get_regs_user(regs_user, regs);  	} else {  		regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;  		regs_user->regs = NULL; @@ -7083,8 +7075,7 @@ void perf_prepare_sample(struct perf_event_header *header,  	}  	if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) -		perf_sample_regs_user(&data->regs_user, regs, -				      &data->regs_user_copy); +		perf_sample_regs_user(&data->regs_user, regs);  	if (sample_type & PERF_SAMPLE_REGS_USER) {  		/* regs dump ABI info */ @@ -7186,6 +7177,7 @@ __perf_event_output(struct perf_event *event,  		    struct perf_sample_data *data,  		    struct pt_regs *regs,  		    int (*output_begin)(struct perf_output_handle *, +					struct perf_sample_data *,  					struct perf_event *,  					unsigned int))  { @@ -7198,7 +7190,7 @@ __perf_event_output(struct perf_event *event,  	perf_prepare_sample(&header, data, event, regs); -	err = output_begin(&handle, event, header.size); +	err = output_begin(&handle, data, event, header.size);  	if (err)  		goto exit; @@ -7264,7 +7256,7 @@ perf_event_read_event(struct perf_event *event,  	int ret;  	perf_event_header__init_id(&read_event.header, &sample, event); -	ret = perf_output_begin(&handle, event, read_event.header.size); +	ret = perf_output_begin(&handle, &sample, event, read_event.header.size);  	if (ret)  		return; @@ -7533,7 +7525,7 @@ static void 
perf_event_task_output(struct perf_event *event,  	perf_event_header__init_id(&task_event->event_id.header, &sample, event); -	ret = perf_output_begin(&handle, event, +	ret = perf_output_begin(&handle, &sample, event,  				task_event->event_id.header.size);  	if (ret)  		goto out; @@ -7636,7 +7628,7 @@ static void perf_event_comm_output(struct perf_event *event,  		return;  	perf_event_header__init_id(&comm_event->event_id.header, &sample, event); -	ret = perf_output_begin(&handle, event, +	ret = perf_output_begin(&handle, &sample, event,  				comm_event->event_id.header.size);  	if (ret) @@ -7736,7 +7728,7 @@ static void perf_event_namespaces_output(struct perf_event *event,  	perf_event_header__init_id(&namespaces_event->event_id.header,  				   &sample, event); -	ret = perf_output_begin(&handle, event, +	ret = perf_output_begin(&handle, &sample, event,  				namespaces_event->event_id.header.size);  	if (ret)  		goto out; @@ -7863,7 +7855,7 @@ static void perf_event_cgroup_output(struct perf_event *event, void *data)  	perf_event_header__init_id(&cgroup_event->event_id.header,  				   &sample, event); -	ret = perf_output_begin(&handle, event, +	ret = perf_output_begin(&handle, &sample, event,  				cgroup_event->event_id.header.size);  	if (ret)  		goto out; @@ -7989,7 +7981,7 @@ static void perf_event_mmap_output(struct perf_event *event,  	}  	perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); -	ret = perf_output_begin(&handle, event, +	ret = perf_output_begin(&handle, &sample, event,  				mmap_event->event_id.header.size);  	if (ret)  		goto out; @@ -8299,7 +8291,7 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head,  	int ret;  	perf_event_header__init_id(&rec.header, &sample, event); -	ret = perf_output_begin(&handle, event, rec.header.size); +	ret = perf_output_begin(&handle, &sample, event, rec.header.size);  	if (ret)  		return; @@ -8333,7 +8325,7 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost)  	perf_event_header__init_id(&lost_samples_event.header, &sample, event); -	ret = perf_output_begin(&handle, event, +	ret = perf_output_begin(&handle, &sample, event,  				lost_samples_event.header.size);  	if (ret)  		return; @@ -8388,7 +8380,7 @@ static void perf_event_switch_output(struct perf_event *event, void *data)  	perf_event_header__init_id(&se->event_id.header, &sample, event); -	ret = perf_output_begin(&handle, event, se->event_id.header.size); +	ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);  	if (ret)  		return; @@ -8463,7 +8455,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)  	perf_event_header__init_id(&throttle_event.header, &sample, event); -	ret = perf_output_begin(&handle, event, +	ret = perf_output_begin(&handle, &sample, event,  				throttle_event.header.size);  	if (ret)  		return; @@ -8506,7 +8498,7 @@ static void perf_event_ksymbol_output(struct perf_event *event, void *data)  	perf_event_header__init_id(&ksymbol_event->event_id.header,  				   &sample, event); -	ret = perf_output_begin(&handle, event, +	ret = perf_output_begin(&handle, &sample, event,  				ksymbol_event->event_id.header.size);  	if (ret)  		return; @@ -8596,7 +8588,7 @@ static void perf_event_bpf_output(struct perf_event *event, void *data)  	perf_event_header__init_id(&bpf_event->event_id.header,  				   &sample, event); -	ret = perf_output_begin(&handle, event, +	ret = perf_output_begin(&handle, data, event,  				bpf_event->event_id.header.size);  	if (ret)  		return; @@ -8705,7 
+8697,8 @@ static void perf_event_text_poke_output(struct perf_event *event, void *data)  	perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); -	ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size); +	ret = perf_output_begin(&handle, &sample, event, +				text_poke_event->event_id.header.size);  	if (ret)  		return; @@ -8786,7 +8779,7 @@ static void perf_log_itrace_start(struct perf_event *event)  	rec.tid	= perf_event_tid(event, current);  	perf_event_header__init_id(&rec.header, &sample, event); -	ret = perf_output_begin(&handle, event, rec.header.size); +	ret = perf_output_begin(&handle, &sample, event, rec.header.size);  	if (ret)  		return; @@ -10085,6 +10078,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,  			if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {  				int fpos = token == IF_SRC_FILE ? 2 : 1; +				kfree(filename);  				filename = match_strdup(&args[fpos]);  				if (!filename) {  					ret = -ENOMEM; @@ -10131,16 +10125,13 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,  				 */  				ret = -EOPNOTSUPP;  				if (!event->ctx->task) -					goto fail_free_name; +					goto fail;  				/* look up the path and grab its inode */  				ret = kern_path(filename, LOOKUP_FOLLOW,  						&filter->path);  				if (ret) -					goto fail_free_name; - -				kfree(filename); -				filename = NULL; +					goto fail;  				ret = -EINVAL;  				if (!filter->path.dentry || @@ -10160,13 +10151,13 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,  	if (state != IF_STATE_ACTION)  		goto fail; +	kfree(filename);  	kfree(orig);  	return 0; -fail_free_name: -	kfree(filename);  fail: +	kfree(filename);  	free_filters_list(filters);  	kfree(orig); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index fcbf5616a441..228801e20788 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -205,16 +205,12 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)  static inline int get_recursion_context(int *recursion)  { -	int rctx; - -	if (unlikely(in_nmi())) -		rctx = 3; -	else if (in_irq()) -		rctx = 2; -	else if (in_softirq()) -		rctx = 1; -	else -		rctx = 0; +	unsigned int pc = preempt_count(); +	unsigned char rctx = 0; + +	rctx += !!(pc & (NMI_MASK)); +	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK)); +	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));  	if (recursion[rctx])  		return -1; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 192b8abc6330..ef91ae75ca56 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -147,6 +147,7 @@ ring_buffer_has_space(unsigned long head, unsigned long tail,  static __always_inline int  __perf_output_begin(struct perf_output_handle *handle, +		    struct perf_sample_data *data,  		    struct perf_event *event, unsigned int size,  		    bool backward)  { @@ -237,18 +238,16 @@ __perf_output_begin(struct perf_output_handle *handle,  	handle->size = (1UL << page_shift) - offset;  	if (unlikely(have_lost)) { -		struct perf_sample_data sample_data; -  		lost_event.header.size = sizeof(lost_event);  		lost_event.header.type = PERF_RECORD_LOST;  		lost_event.header.misc = 0;  		lost_event.id          = event->id;  		lost_event.lost        = local_xchg(&rb->lost, 0); -		perf_event_header__init_id(&lost_event.header, -					   &sample_data, event); +		/* XXX mostly redundant; @data is already fully initializes */ +		perf_event_header__init_id(&lost_event.header, data, 
event);  		perf_output_put(handle, lost_event); -		perf_event__output_id_sample(event, handle, &sample_data); +		perf_event__output_id_sample(event, handle, data);  	}  	return 0; @@ -263,22 +262,25 @@ out:  }  int perf_output_begin_forward(struct perf_output_handle *handle, -			     struct perf_event *event, unsigned int size) +			      struct perf_sample_data *data, +			      struct perf_event *event, unsigned int size)  { -	return __perf_output_begin(handle, event, size, false); +	return __perf_output_begin(handle, data, event, size, false);  }  int perf_output_begin_backward(struct perf_output_handle *handle, +			       struct perf_sample_data *data,  			       struct perf_event *event, unsigned int size)  { -	return __perf_output_begin(handle, event, size, true); +	return __perf_output_begin(handle, data, event, size, true);  }  int perf_output_begin(struct perf_output_handle *handle, +		      struct perf_sample_data *data,  		      struct perf_event *event, unsigned int size)  { -	return __perf_output_begin(handle, event, size, +	return __perf_output_begin(handle, data, event, size,  				   unlikely(is_write_backward(event)));  } diff --git a/kernel/exit.c b/kernel/exit.c index 87a2d515de0d..1f236ed375f8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -454,7 +454,10 @@ static void exit_mm(void)  		mmap_read_unlock(mm);  		self.task = current; -		self.next = xchg(&core_state->dumper.next, &self); +		if (self.task->flags & PF_SIGNALED) +			self.next = xchg(&core_state->dumper.next, &self); +		else +			self.task = NULL;  		/*  		 * Implies mb(), the result of xchg() must be visible  		 * to core_state->dumper. diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 63b349168da7..b0b1ad93fa95 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -253,7 +253,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,  	if (copy_from_user(buf, buffer, count)) {  		ret = -EFAULT; -		goto out; +		goto out_free;  	}  	buf[count] = '\0';  	sym = strstrip(buf); @@ -307,8 +307,9 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,  		ret = count;  	}  out: -	kfree(buf);  	mutex_unlock(&fei_lock); +out_free: +	kfree(buf);  	return ret;  } diff --git a/kernel/fork.c b/kernel/fork.c index 32083db7a2a2..6d266388d380 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2167,14 +2167,9 @@ static __latent_entropy struct task_struct *copy_process(  	/* ok, now we should be set up.. 
*/  	p->pid = pid_nr(pid);  	if (clone_flags & CLONE_THREAD) { -		p->exit_signal = -1;  		p->group_leader = current->group_leader;  		p->tgid = current->tgid;  	} else { -		if (clone_flags & CLONE_PARENT) -			p->exit_signal = current->group_leader->exit_signal; -		else -			p->exit_signal = args->exit_signal;  		p->group_leader = p;  		p->tgid = p->pid;  	} @@ -2218,9 +2213,14 @@ static __latent_entropy struct task_struct *copy_process(  	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {  		p->real_parent = current->real_parent;  		p->parent_exec_id = current->parent_exec_id; +		if (clone_flags & CLONE_THREAD) +			p->exit_signal = -1; +		else +			p->exit_signal = current->group_leader->exit_signal;  	} else {  		p->real_parent = current;  		p->parent_exec_id = current->self_exec_id; +		p->exit_signal = args->exit_signal;  	}  	klp_copy_process(p); diff --git a/kernel/futex.c b/kernel/futex.c index f8614ef4ff31..00259c7e288e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -788,8 +788,9 @@ static void put_pi_state(struct futex_pi_state *pi_state)  	 */  	if (pi_state->owner) {  		struct task_struct *owner; +		unsigned long flags; -		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); +		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);  		owner = pi_state->owner;  		if (owner) {  			raw_spin_lock(&owner->pi_lock); @@ -797,7 +798,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)  			raw_spin_unlock(&owner->pi_lock);  		}  		rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); -		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); +		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);  	}  	if (current->pi_state_cache) { @@ -2380,10 +2381,22 @@ retry:  		}  		/* -		 * Since we just failed the trylock; there must be an owner. +		 * The trylock just failed, so either there is an owner or +		 * there is a higher priority waiter than this one.  		 */  		newowner = rt_mutex_owner(&pi_state->pi_mutex); -		BUG_ON(!newowner); +		/* +		 * If the higher priority waiter has not yet taken over the +		 * rtmutex then newowner is NULL. We can't return here with +		 * that state because it's inconsistent vs. the user space +		 * state. So drop the locks and try again. It's a valid +		 * situation and not any different from the other retry +		 * conditions. 
+		 */ +		if (unlikely(!newowner)) { +			err = -EAGAIN; +			goto handle_err; +		}  	} else {  		WARN_ON_ONCE(argowner != current);  		if (oldowner == current) { diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ce76f490126c..396ebaebea3f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -225,8 +225,7 @@ static long hung_timeout_jiffies(unsigned long last_checked,   * Process updating of timeout sysctl   */  int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, -				  void __user *buffer, -				  size_t *lenp, loff_t *ppos) +				  void *buffer, size_t *lenp, loff_t *ppos)  {  	int ret; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index f2cda6b0057f..d79ef2493a28 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -77,6 +77,7 @@ config IRQ_FASTEOI_HIERARCHY_HANDLERS  # Generic IRQ IPI support  config GENERIC_IRQ_IPI  	bool +	select IRQ_DOMAIN_HIERARCHY  # Generic MSI interrupt support  config GENERIC_MSI_IRQ diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index df75c3573dcb..6d89e33fe3aa 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -945,33 +945,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc)  }  /** - * handle_percpu_devid_fasteoi_ipi - Per CPU local IPI handler with per cpu - *				     dev ids - * @desc:	the interrupt description structure for this irq - * - * The biggest difference with the IRQ version is that the interrupt is - * EOIed early, as the IPI could result in a context switch, and we need to - * make sure the IPI can fire again. We also assume that the arch code has - * registered an action. If not, we are positively doomed. - */ -void handle_percpu_devid_fasteoi_ipi(struct irq_desc *desc) -{ -	struct irq_chip *chip = irq_desc_get_chip(desc); -	struct irqaction *action = desc->action; -	unsigned int irq = irq_desc_get_irq(desc); -	irqreturn_t res; - -	__kstat_incr_irqs_this_cpu(desc); - -	if (chip->irq_eoi) -		chip->irq_eoi(&desc->irq_data); - -	trace_irq_handler_entry(irq, action); -	res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); -	trace_irq_handler_exit(irq, action, res); -} - -/**   * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu   *				     dev ids   * @desc:	the interrupt description structure for this irq diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c460e0496006..c826ba4141fe 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -371,6 +371,76 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,  	return ret;  } +/** + * irq_update_affinity_desc - Update affinity management for an interrupt + * @irq:	The interrupt number to update + * @affinity:	Pointer to the affinity descriptor + * + * This interface can be used to configure the affinity management of + * interrupts which have been allocated already. + * + * There are certain limitations on when it may be used - attempts to use it + * for when the kernel is configured for generic IRQ reservation mode (in + * config GENERIC_IRQ_RESERVATION_MODE) will fail, as it may conflict with + * managed/non-managed interrupt accounting. In addition, attempts to use it on + * an interrupt which is already started or which has already been configured + * as managed will also fail, as these mean invalid init state or double init. 
+ */ +int irq_update_affinity_desc(unsigned int irq, +			     struct irq_affinity_desc *affinity) +{ +	struct irq_desc *desc; +	unsigned long flags; +	bool activated; +	int ret = 0; + +	/* +	 * Supporting this with the reservation scheme used by x86 needs +	 * some more thought. Fail it for now. +	 */ +	if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) +		return -EOPNOTSUPP; + +	desc = irq_get_desc_buslock(irq, &flags, 0); +	if (!desc) +		return -EINVAL; + +	/* Requires the interrupt to be shut down */ +	if (irqd_is_started(&desc->irq_data)) { +		ret = -EBUSY; +		goto out_unlock; +	} + +	/* Interrupts which are already managed cannot be modified */ +	if (irqd_affinity_is_managed(&desc->irq_data)) { +		ret = -EBUSY; +		goto out_unlock; +	} + +	/* +	 * Deactivate the interrupt. That's required to undo +	 * anything an earlier activation has established. +	 */ +	activated = irqd_is_activated(&desc->irq_data); +	if (activated) +		irq_domain_deactivate_irq(&desc->irq_data); + +	if (affinity->is_managed) { +		irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); +		irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN); +	} + +	cpumask_copy(desc->irq_common_data.affinity, &affinity->mask); + +	/* Restore the activation state */ +	if (activated) +		irq_domain_activate_irq(&desc->irq_data, false); + +out_unlock: +	irq_put_desc_busunlock(desc, flags); +	return ret; +} +  int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force)  {  	struct irq_desc *desc = irq_to_desc(irq); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 8a12a25fa40d..41fdbb7953c6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1249,7 +1249,13 @@ __acquires(hlist_lock)  	*head = &kretprobe_inst_table[hash];  	hlist_lock = kretprobe_table_lock_ptr(hash); -	raw_spin_lock_irqsave(hlist_lock, *flags); +	/* +	 * Nested is a workaround that will soon not be needed. +	 * There's other protections that make sure the same lock +	 * is not taken on the same CPU that lockdep is unaware of. +	 * Differentiate when it is taken in NMI context. +	 */ +	raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi());  }  NOKPROBE_SYMBOL(kretprobe_hash_lock); @@ -1258,7 +1264,13 @@ static void kretprobe_table_lock(unsigned long hash,  __acquires(hlist_lock)  {  	raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); -	raw_spin_lock_irqsave(hlist_lock, *flags); +	/* +	 * Nested is a workaround that will soon not be needed. +	 * There's other protections that make sure the same lock +	 * is not taken on the same CPU that lockdep is unaware of. +	 * Differentiate when it is taken in NMI context. +	 */ +	raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi());  }  NOKPROBE_SYMBOL(kretprobe_table_lock); @@ -2028,7 +2040,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)  	/* TODO: consider to only swap the RA after the last pre_handler fired */  	hash = hash_ptr(current, KPROBE_HASH_BITS); -	raw_spin_lock_irqsave(&rp->lock, flags); +	/* +	 * Nested is a workaround that will soon not be needed. +	 * There's other protections that make sure the same lock +	 * is not taken on the same CPU that lockdep is unaware of. 
+	 */ +	raw_spin_lock_irqsave_nested(&rp->lock, flags, 1);  	if (!hlist_empty(&rp->free_instances)) {  		ri = hlist_entry(rp->free_instances.first,  				struct kretprobe_instance, hlist); @@ -2039,7 +2056,7 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)  		ri->task = current;  		if (rp->entry_handler && rp->entry_handler(ri, regs)) { -			raw_spin_lock_irqsave(&rp->lock, flags); +			raw_spin_lock_irqsave_nested(&rp->lock, flags, 1);  			hlist_add_head(&ri->hlist, &rp->free_instances);  			raw_spin_unlock_irqrestore(&rp->lock, flags);  			return 0; diff --git a/kernel/kthread.c b/kernel/kthread.c index e29773c82b70..933a625621b8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -897,7 +897,8 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)  	/* Move the work from worker->delayed_work_list. */  	WARN_ON_ONCE(list_empty(&work->node));  	list_del_init(&work->node); -	kthread_insert_work(worker, work, &worker->work_list); +	if (!work->canceling) +		kthread_insert_work(worker, work, &worker->work_list);  	raw_spin_unlock_irqrestore(&worker->lock, flags);  } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b71ad8d9f1c9..c1418b47f625 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -108,19 +108,21 @@ static inline void lockdep_lock(void)  {  	DEBUG_LOCKS_WARN_ON(!irqs_disabled()); +	__this_cpu_inc(lockdep_recursion);  	arch_spin_lock(&__lock);  	__owner = current; -	__this_cpu_inc(lockdep_recursion);  }  static inline void lockdep_unlock(void)  { +	DEBUG_LOCKS_WARN_ON(!irqs_disabled()); +  	if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current))  		return; -	__this_cpu_dec(lockdep_recursion);  	__owner = NULL;  	arch_spin_unlock(&__lock); +	__this_cpu_dec(lockdep_recursion);  }  static inline bool lockdep_assert_locked(void) @@ -2765,7 +2767,9 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,   * (Note that this has to be done separately, because the graph cannot   * detect such classes of deadlocks.)   * - * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read + * Returns: 0 on deadlock detected, 1 on OK, 2 if another lock with the same + * lock class is held but nest_lock is also held, i.e. we rely on the + * nest_lock to avoid the deadlock.   */  static int  check_deadlock(struct task_struct *curr, struct held_lock *next) @@ -2788,7 +2792,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next)  		 * lock class (i.e. read_lock(lock)+read_lock(lock)):  		 */  		if ((next->read == 2) && prev->read) -			return 2; +			continue;  		/*  		 * We're holding the nest_lock, which serializes this lock's @@ -3593,15 +3597,12 @@ static int validate_chain(struct task_struct *curr,  		if (!ret)  			return 0;  		/* -		 * Mark recursive read, as we jump over it when -		 * building dependencies (just like we jump over -		 * trylock entries): -		 */ -		if (ret == 2) -			hlock->read = 2; -		/*  		 * Add dependency only if this lock is not the head -		 * of the chain, and if it's not a secondary read-lock: +		 * of the chain, and if the new lock introduces no more +		 * lock dependency (because we already hold a lock with the +		 * same lock class) nor deadlock (because the nest_lock +		 * serializes nesting locks), see the comments for +		 * check_deadlock().  		 
*/  		if (!chain_head && ret != 2) {  			if (!check_prevs_add(curr, hlock)) diff --git a/kernel/panic.c b/kernel/panic.c index 396142ee43fd..332736a72a58 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -605,7 +605,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,  		panic("panic_on_warn set ...\n");  	} -	dump_stack(); +	if (!regs) +		dump_stack();  	print_irqtrace_events(current); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fe64a49344bf..bc1e3b5a97bd 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -528,8 +528,8 @@ static int log_store(u32 caller_id, int facility, int level,  	if (dev_info)  		memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); -	/* insert message */ -	if ((flags & LOG_CONT) || !(flags & LOG_NEWLINE)) +	/* A message without a trailing newline can be continued. */ +	if (!(flags & LOG_NEWLINE))  		prb_commit(&e);  	else  		prb_final_commit(&e); diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 6b1525685277..74e25a1704f2 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -882,8 +882,6 @@ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out)  	head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */  	do { -		desc = to_desc(desc_ring, head_id); -  		id = DESC_ID(head_id + 1);  		id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 43d6179508d6..79de1294f8eb 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -264,17 +264,11 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)  	return ret;  } -static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns, -			   unsigned int mode) +static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode)  { -	int ret; -  	if (mode & PTRACE_MODE_NOAUDIT) -		ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); -	else -		ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE); - -	return ret == 0; +		return ns_capable_noaudit(ns, CAP_SYS_PTRACE); +	return ns_capable(ns, CAP_SYS_PTRACE);  }  /* Returns 0 on success, -errno on denial. */ @@ -326,7 +320,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)  	    gid_eq(caller_gid, tcred->sgid) &&  	    gid_eq(caller_gid, tcred->gid))  		goto ok; -	if (ptrace_has_cap(cred, tcred->user_ns, mode)) +	if (ptrace_has_cap(tcred->user_ns, mode))  		goto ok;  	rcu_read_unlock();  	return -EPERM; @@ -345,7 +339,7 @@ ok:  	mm = task->mm;  	if (mm &&  	    ((get_dumpable(mm) != SUID_DUMP_USER) && -	     !ptrace_has_cap(cred, mm->user_ns, mode))) +	     !ptrace_has_cap(mm->user_ns, mode)))  	    return -EPERM;  	return security_ptrace_access_check(task, mode); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2a52f42f64b6..bd04b09b84b3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4077,7 +4077,6 @@ void rcu_cpu_starting(unsigned int cpu)  	smp_mb(); /* Ensure RCU read-side usage follows above initialization. */  } -#ifdef CONFIG_HOTPLUG_CPU  /*   * The outgoing function has no further need of RCU, so remove it from   * the rcu_node tree's ->qsmaskinitnext bit masks. 
@@ -4117,6 +4116,7 @@ void rcu_report_dead(unsigned int cpu)  	rdp->cpu_started = false;  } +#ifdef CONFIG_HOTPLUG_CPU  /*   * The outgoing CPU has just passed through the dying-idle state, and we   * are being invoked from the CPU that was IPIed to continue the offline diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 0fde39b8daab..ca21d28a0f98 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -249,13 +249,16 @@ static bool check_slow_task(struct task_struct *t, void *arg)  /*   * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each. + * sections, printing out the tid of each of the first few of them.   */ -static int rcu_print_task_stall(struct rcu_node *rnp) +static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) +	__releases(rnp->lock)  { +	int i = 0;  	int ndetected = 0;  	struct rcu_stall_chk_rdr rscr;  	struct task_struct *t; +	struct task_struct *ts[8];  	if (!rcu_preempt_blocked_readers_cgp(rnp))  		return 0; @@ -264,6 +267,14 @@ static int rcu_print_task_stall(struct rcu_node *rnp)  	t = list_entry(rnp->gp_tasks->prev,  		       struct task_struct, rcu_node_entry);  	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { +		get_task_struct(t); +		ts[i++] = t; +		if (i >= ARRAY_SIZE(ts)) +			break; +	} +	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +	for (i--; i; i--) { +		t = ts[i];  		if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr))  			pr_cont(" P%d", t->pid);  		else @@ -273,6 +284,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)  				".q"[rscr.rs.b.need_qs],  				".e"[rscr.rs.b.exp_hint],  				".l"[rscr.on_blkd_list]); +		put_task_struct(t);  		ndetected++;  	}  	pr_cont("\n"); @@ -293,8 +305,9 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)   * Because preemptible RCU does not exist, we never have to check for   * tasks blocked within RCU read-side critical sections.   */ -static int rcu_print_task_stall(struct rcu_node *rnp) +static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)  { +	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);  	return 0;  }  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ @@ -472,7 +485,6 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)  	pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name);  	rcu_for_each_leaf_node(rnp) {  		raw_spin_lock_irqsave_rcu_node(rnp, flags); -		ndetected += rcu_print_task_stall(rnp);  		if (rnp->qsmask != 0) {  			for_each_leaf_node_possible_cpu(rnp, cpu)  				if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { @@ -480,7 +492,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)  					ndetected++;  				}  		} -		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +		ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock.  	
}  	for_each_possible_cpu(cpu) diff --git a/kernel/reboot.c b/kernel/reboot.c index e7b78d5ae1ab..af6f23d8bea1 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -551,22 +551,22 @@ static int __init reboot_setup(char *str)  			break;  		case 's': -		{ -			int rc; - -			if (isdigit(*(str+1))) { -				rc = kstrtoint(str+1, 0, &reboot_cpu); -				if (rc) -					return rc; -			} else if (str[1] == 'm' && str[2] == 'p' && -				   isdigit(*(str+3))) { -				rc = kstrtoint(str+3, 0, &reboot_cpu); -				if (rc) -					return rc; -			} else +			if (isdigit(*(str+1))) +				reboot_cpu = simple_strtoul(str+1, NULL, 0); +			else if (str[1] == 'm' && str[2] == 'p' && +							isdigit(*(str+3))) +				reboot_cpu = simple_strtoul(str+3, NULL, 0); +			else  				*mode = REBOOT_SOFT; +			if (reboot_cpu >= num_possible_cpus()) { +				pr_err("Ignoring the CPU number in reboot= option. " +				       "CPU %d exceeds possible cpu number %d\n", +				       reboot_cpu, num_possible_cpus()); +				reboot_cpu = 0; +				break; +			}  			break; -		} +  		case 'g':  			*mode = REBOOT_GPIO;  			break; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2003a7d5ab5..e7e453492cff 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2501,7 +2501,12 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,  #ifdef CONFIG_SMP  	if (wake_flags & WF_MIGRATED)  		en_flags |= ENQUEUE_MIGRATED; +	else  #endif +	if (p->in_iowait) { +		delayacct_blkio_end(p); +		atomic_dec(&task_rq(p)->nr_iowait); +	}  	activate_task(rq, p, en_flags);  	ttwu_do_wakeup(rq, p, wake_flags, rf); @@ -2888,11 +2893,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)  	if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))  		goto unlock; -	if (p->in_iowait) { -		delayacct_blkio_end(p); -		atomic_dec(&task_rq(p)->nr_iowait); -	} -  #ifdef CONFIG_SMP  	/*  	 * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be @@ -2963,6 +2963,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)  	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);  	if (task_cpu(p) != cpu) { +		if (p->in_iowait) { +			delayacct_blkio_end(p); +			atomic_dec(&task_rq(p)->nr_iowait); +		} +  		wake_flags |= WF_MIGRATED;  		psi_ttwu_dequeue(p);  		set_task_cpu(p, cpu); @@ -4907,20 +4912,21 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)  		if (!dl_prio(p->normal_prio) ||  		    (pi_task && dl_prio(pi_task->prio) &&  		     dl_entity_preempt(&pi_task->dl, &p->dl))) { -			p->dl.dl_boosted = 1; +			p->dl.pi_se = pi_task->dl.pi_se;  			queue_flag |= ENQUEUE_REPLENISH; -		} else -			p->dl.dl_boosted = 0; +		} else { +			p->dl.pi_se = &p->dl; +		}  		p->sched_class = &dl_sched_class;  	} else if (rt_prio(prio)) {  		if (dl_prio(oldprio)) -			p->dl.dl_boosted = 0; +			p->dl.pi_se = &p->dl;  		if (oldprio < prio)  			queue_flag |= ENQUEUE_HEAD;  		p->sched_class = &rt_sched_class;  	} else {  		if (dl_prio(oldprio)) -			p->dl.dl_boosted = 0; +			p->dl.pi_se = &p->dl;  		if (rt_prio(oldprio))  			p->rt.timeout = 0;  		p->sched_class = &fair_sched_class; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index c03a5775d019..97d318b0cd0c 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -102,9 +102,12 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)  static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,  				   unsigned int next_freq)  { -	
if (sg_policy->next_freq == next_freq && -	    !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) -		return false; +	if (!sg_policy->need_freq_update) { +		if (sg_policy->next_freq == next_freq) +			return false; +	} else { +		sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); +	}  	sg_policy->next_freq = next_freq;  	sg_policy->last_freq_update_time = time; @@ -162,11 +165,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,  	freq = map_util_freq(util, freq, max); -	if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update && -	    !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) +	if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)  		return sg_policy->next_freq; -	sg_policy->need_freq_update = false;  	sg_policy->cached_raw_freq = freq;  	return cpufreq_driver_resolve_freq(policy, freq);  } @@ -442,7 +443,6 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,  	struct sugov_policy *sg_policy = sg_cpu->sg_policy;  	unsigned long util, max;  	unsigned int next_f; -	bool busy;  	unsigned int cached_freq = sg_policy->cached_raw_freq;  	sugov_iowait_boost(sg_cpu, time, flags); @@ -453,9 +453,6 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,  	if (!sugov_should_update_freq(sg_policy, time))  		return; -	/* Limits may have changed, don't skip frequency update */ -	busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu); -  	util = sugov_get_util(sg_cpu);  	max = sg_cpu->max;  	util = sugov_iowait_apply(sg_cpu, time, util, max); @@ -464,7 +461,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,  	 * Do not reduce the frequency if the CPU has not been idle  	 * recently, as the reduction is likely to be premature then.  	 
*/ -	if (busy && next_f < sg_policy->next_freq) { +	if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {  		next_f = sg_policy->next_freq;  		/* Restore cached freq as next_freq has changed */ @@ -829,9 +826,10 @@ static int sugov_start(struct cpufreq_policy *policy)  	sg_policy->next_freq			= 0;  	sg_policy->work_in_progress		= false;  	sg_policy->limits_changed		= false; -	sg_policy->need_freq_update		= false;  	sg_policy->cached_raw_freq		= 0; +	sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); +  	for_each_cpu(cpu, policy->cpus) {  		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); @@ -883,7 +881,7 @@ static void sugov_limits(struct cpufreq_policy *policy)  struct cpufreq_governor schedutil_gov = {  	.name			= "schedutil",  	.owner			= THIS_MODULE, -	.dynamic_switching	= true, +	.flags			= CPUFREQ_GOV_DYNAMIC_SWITCHING,  	.init			= sugov_init,  	.exit			= sugov_exit,  	.start			= sugov_start, diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f232305dcefe..1d3c97268ec0 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)  	return !RB_EMPTY_NODE(&dl_se->rb_node);  } +#ifdef CONFIG_RT_MUTEXES +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ +	return dl_se->pi_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ +	return pi_of(dl_se) != dl_se; +} +#else +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ +	return dl_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ +	return false; +} +#endif +  #ifdef CONFIG_SMP  static inline struct dl_bw *dl_bw_of(int i)  { @@ -698,7 +720,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)  	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);  	struct rq *rq = rq_of_dl_rq(dl_rq); -	WARN_ON(dl_se->dl_boosted); +	WARN_ON(is_dl_boosted(dl_se));  	WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));  	/* @@ -736,21 +758,20 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)   * could happen are, typically, a entity voluntarily trying to overcome its   * runtime, or it just underestimated it during sched_setattr().   */ -static void replenish_dl_entity(struct sched_dl_entity *dl_se, -				struct sched_dl_entity *pi_se) +static void replenish_dl_entity(struct sched_dl_entity *dl_se)  {  	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);  	struct rq *rq = rq_of_dl_rq(dl_rq); -	BUG_ON(pi_se->dl_runtime <= 0); +	BUG_ON(pi_of(dl_se)->dl_runtime <= 0);  	/*  	 * This could be the case for a !-dl task that is boosted.  	 * Just go with full inherited parameters.  	 */  	if (dl_se->dl_deadline == 0) { -		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; -		dl_se->runtime = pi_se->dl_runtime; +		dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; +		dl_se->runtime = pi_of(dl_se)->dl_runtime;  	}  	if (dl_se->dl_yielded && dl_se->runtime > 0) @@ -763,8 +784,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,  	 * arbitrary large.  	 
*/  	while (dl_se->runtime <= 0) { -		dl_se->deadline += pi_se->dl_period; -		dl_se->runtime += pi_se->dl_runtime; +		dl_se->deadline += pi_of(dl_se)->dl_period; +		dl_se->runtime += pi_of(dl_se)->dl_runtime;  	}  	/* @@ -778,8 +799,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,  	 */  	if (dl_time_before(dl_se->deadline, rq_clock(rq))) {  		printk_deferred_once("sched: DL replenish lagged too much\n"); -		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; -		dl_se->runtime = pi_se->dl_runtime; +		dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; +		dl_se->runtime = pi_of(dl_se)->dl_runtime;  	}  	if (dl_se->dl_yielded) @@ -812,8 +833,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,   * task with deadline equal to period this is the same of using   * dl_period instead of dl_deadline in the equation above.   */ -static bool dl_entity_overflow(struct sched_dl_entity *dl_se, -			       struct sched_dl_entity *pi_se, u64 t) +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t)  {  	u64 left, right; @@ -835,9 +855,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,  	 * of anything below microseconds resolution is actually fiction  	 * (but still we want to give the user that illusion >;).  	 */ -	left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); +	left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);  	right = ((dl_se->deadline - t) >> DL_SCALE) * -		(pi_se->dl_runtime >> DL_SCALE); +		(pi_of(dl_se)->dl_runtime >> DL_SCALE);  	return dl_time_before(right, left);  } @@ -922,24 +942,23 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)   * Please refer to the comments update_dl_revised_wakeup() function to find   * more about the Revised CBS rule.   */ -static void update_dl_entity(struct sched_dl_entity *dl_se, -			     struct sched_dl_entity *pi_se) +static void update_dl_entity(struct sched_dl_entity *dl_se)  {  	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);  	struct rq *rq = rq_of_dl_rq(dl_rq);  	if (dl_time_before(dl_se->deadline, rq_clock(rq)) || -	    dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { +	    dl_entity_overflow(dl_se, rq_clock(rq))) {  		if (unlikely(!dl_is_implicit(dl_se) &&  			     !dl_time_before(dl_se->deadline, rq_clock(rq)) && -			     !dl_se->dl_boosted)){ +			     !is_dl_boosted(dl_se))) {  			update_dl_revised_wakeup(dl_se, rq);  			return;  		} -		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; -		dl_se->runtime = pi_se->dl_runtime; +		dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; +		dl_se->runtime = pi_of(dl_se)->dl_runtime;  	}  } @@ -1038,7 +1057,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)  	 * The task might have been boosted by someone else and might be in the  	 * boosting/deboosting path, its not throttled.  	 */ -	if (dl_se->dl_boosted) +	if (is_dl_boosted(dl_se))  		goto unlock;  	/* @@ -1066,7 +1085,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)  	 * but do not enqueue -- wait for our wakeup to do that.  	 
*/  	if (!task_on_rq_queued(p)) { -		replenish_dl_entity(dl_se, dl_se); +		replenish_dl_entity(dl_se);  		goto unlock;  	} @@ -1156,7 +1175,7 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)  	if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&  	    dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { -		if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) +		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p)))  			return;  		dl_se->dl_throttled = 1;  		if (dl_se->runtime > 0) @@ -1287,7 +1306,7 @@ throttle:  			dl_se->dl_overrun = 1;  		__dequeue_task_dl(rq, curr, 0); -		if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) +		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr)))  			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);  		if (!is_leftmost(curr, &rq->dl)) @@ -1481,8 +1500,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)  }  static void -enqueue_dl_entity(struct sched_dl_entity *dl_se, -		  struct sched_dl_entity *pi_se, int flags) +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)  {  	BUG_ON(on_dl_rq(dl_se)); @@ -1493,9 +1511,9 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,  	 */  	if (flags & ENQUEUE_WAKEUP) {  		task_contending(dl_se, flags); -		update_dl_entity(dl_se, pi_se); +		update_dl_entity(dl_se);  	} else if (flags & ENQUEUE_REPLENISH) { -		replenish_dl_entity(dl_se, pi_se); +		replenish_dl_entity(dl_se);  	} else if ((flags & ENQUEUE_RESTORE) &&  		  dl_time_before(dl_se->deadline,  				 rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { @@ -1512,19 +1530,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)  static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)  { -	struct task_struct *pi_task = rt_mutex_get_top_task(p); -	struct sched_dl_entity *pi_se = &p->dl; - -	/* -	 * Use the scheduling parameters of the top pi-waiter task if: -	 * - we have a top pi-waiter which is a SCHED_DEADLINE task AND -	 * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is -	 *   smaller than our deadline OR we are a !SCHED_DEADLINE task getting -	 *   boosted due to a SCHED_DEADLINE pi-waiter). -	 * Otherwise we keep our runtime and deadline. -	 */ -	if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { -		pi_se = &pi_task->dl; +	if (is_dl_boosted(&p->dl)) {  		/*  		 * Because of delays in the detection of the overrun of a  		 * thread's runtime, it might be the case that a thread @@ -1557,7 +1563,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)  		 * the throttle.  		 
*/  		p->dl.dl_throttled = 0; -		BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); +		BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH);  		return;  	} @@ -1594,7 +1600,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)  		return;  	} -	enqueue_dl_entity(&p->dl, pi_se, flags); +	enqueue_dl_entity(&p->dl, flags);  	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)  		enqueue_pushable_dl_task(rq, p); @@ -2787,11 +2793,14 @@ void __dl_clear_params(struct task_struct *p)  	dl_se->dl_bw			= 0;  	dl_se->dl_density		= 0; -	dl_se->dl_boosted		= 0;  	dl_se->dl_throttled		= 0;  	dl_se->dl_yielded		= 0;  	dl_se->dl_non_contending	= 0;  	dl_se->dl_overrun		= 0; + +#ifdef CONFIG_RT_MUTEXES +	dl_se->pi_se			= dl_se; +#endif  }  bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0655524700d2..2357921580f9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -251,7 +251,7 @@ static int sd_ctl_doflags(struct ctl_table *table, int write,  	unsigned long flags = *(unsigned long *)table->data;  	size_t data_size = 0;  	size_t len = 0; -	char *tmp; +	char *tmp, *buf;  	int idx;  	if (write) @@ -269,17 +269,17 @@ static int sd_ctl_doflags(struct ctl_table *table, int write,  		return 0;  	} -	tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL); -	if (!tmp) +	buf = kcalloc(data_size + 1, sizeof(*buf), GFP_KERNEL); +	if (!buf)  		return -ENOMEM;  	for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {  		char *name = sd_flag_debug[idx].name; -		len += snprintf(tmp + len, strlen(name) + 2, "%s ", name); +		len += snprintf(buf + len, strlen(name) + 2, "%s ", name);  	} -	tmp += *ppos; +	tmp = buf + *ppos;  	len -= *ppos;  	if (len > *lenp) @@ -294,7 +294,7 @@ static int sd_ctl_doflags(struct ctl_table *table, int write,  	*lenp = len;  	*ppos += len; -	kfree(tmp); +	kfree(buf);  	return 0;  } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 290f9e38378c..ae7ceba8fd4f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5477,6 +5477,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  	struct cfs_rq *cfs_rq;  	struct sched_entity *se = &p->se;  	int idle_h_nr_running = task_has_idle_policy(p); +	int task_new = !(flags & ENQUEUE_WAKEUP);  	/*  	 * The code below (indirectly) updates schedutil which looks at @@ -5549,7 +5550,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  	 * into account, but that is not straightforward to implement,  	 * and the following generally works well enough in practice.  	 
*/ -	if (flags & ENQUEUE_WAKEUP) +	if (!task_new)  		update_overutilized_status(rq);  enqueue_throttle: @@ -6172,21 +6173,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t  static int  select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)  { -	unsigned long best_cap = 0; +	unsigned long task_util, best_cap = 0;  	int cpu, best_cpu = -1;  	struct cpumask *cpus; -	sync_entity_load_avg(&p->se); -  	cpus = this_cpu_cpumask_var_ptr(select_idle_mask);  	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); +	task_util = uclamp_task_util(p); +  	for_each_cpu_wrap(cpu, cpus, target) {  		unsigned long cpu_cap = capacity_of(cpu);  		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))  			continue; -		if (task_fits_capacity(p, cpu_cap)) +		if (fits_capacity(task_util, cpu_cap))  			return cpu;  		if (cpu_cap > best_cap) { @@ -6198,44 +6199,42 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)  	return best_cpu;  } +static inline bool asym_fits_capacity(int task_util, int cpu) +{ +	if (static_branch_unlikely(&sched_asym_cpucapacity)) +		return fits_capacity(task_util, capacity_of(cpu)); + +	return true; +} +  /*   * Try and locate an idle core/thread in the LLC cache domain.   */  static int select_idle_sibling(struct task_struct *p, int prev, int target)  {  	struct sched_domain *sd; +	unsigned long task_util;  	int i, recent_used_cpu;  	/* -	 * For asymmetric CPU capacity systems, our domain of interest is -	 * sd_asym_cpucapacity rather than sd_llc. +	 * On asymmetric system, update task utilization because we will check +	 * that the task fits with cpu's capacity.  	 */  	if (static_branch_unlikely(&sched_asym_cpucapacity)) { -		sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); -		/* -		 * On an asymmetric CPU capacity system where an exclusive -		 * cpuset defines a symmetric island (i.e. one unique -		 * capacity_orig value through the cpuset), the key will be set -		 * but the CPUs within that cpuset will not have a domain with -		 * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric -		 * capacity path. -		 */ -		if (!sd) -			goto symmetric; - -		i = select_idle_capacity(p, sd, target); -		return ((unsigned)i < nr_cpumask_bits) ? i : target; +		sync_entity_load_avg(&p->se); +		task_util = uclamp_task_util(p);  	} -symmetric: -	if (available_idle_cpu(target) || sched_idle_cpu(target)) +	if ((available_idle_cpu(target) || sched_idle_cpu(target)) && +	    asym_fits_capacity(task_util, target))  		return target;  	/*  	 * If the previous CPU is cache affine and idle, don't be stupid:  	 */  	if (prev != target && cpus_share_cache(prev, target) && -	    (available_idle_cpu(prev) || sched_idle_cpu(prev))) +	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) && +	    asym_fits_capacity(task_util, prev))  		return prev;  	/* @@ -6258,7 +6257,8 @@ symmetric:  	    recent_used_cpu != target &&  	    cpus_share_cache(recent_used_cpu, target) &&  	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && -	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { +	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && +	    asym_fits_capacity(task_util, recent_used_cpu)) {  		/*  		 * Replace recent_used_cpu with prev as it is a potential  		 * candidate for the next wake: @@ -6267,6 +6267,26 @@ symmetric:  		return recent_used_cpu;  	} +	/* +	 * For asymmetric CPU capacity systems, our domain of interest is +	 * sd_asym_cpucapacity rather than sd_llc. 
+	 */ +	if (static_branch_unlikely(&sched_asym_cpucapacity)) { +		sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); +		/* +		 * On an asymmetric CPU capacity system where an exclusive +		 * cpuset defines a symmetric island (i.e. one unique +		 * capacity_orig value through the cpuset), the key will be set +		 * but the CPUs within that cpuset will not have a domain with +		 * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric +		 * capacity path. +		 */ +		if (sd) { +			i = select_idle_capacity(p, sd, target); +			return ((unsigned)i < nr_cpumask_bits) ? i : target; +		} +	} +  	sd = rcu_dereference(per_cpu(sd_llc, target));  	if (!sd)  		return target; @@ -9031,7 +9051,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s  	 * emptying busiest.  	 */  	if (local->group_type == group_has_spare) { -		if (busiest->group_type > group_fully_busy) { +		if ((busiest->group_type > group_fully_busy) && +		    !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {  			/*  			 * If busiest is overloaded, try to fill spare  			 * capacity. This might end up creating spare capacity diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 24d0ee26377d..c6932b8f4467 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -78,7 +78,7 @@ void __weak arch_cpu_idle_dead(void) { }  void __weak arch_cpu_idle(void)  {  	cpu_idle_force_poll = 1; -	local_irq_enable(); +	raw_local_irq_enable();  }  /** @@ -94,9 +94,35 @@ void __cpuidle default_idle_call(void)  		trace_cpu_idle(1, smp_processor_id());  		stop_critical_timings(); + +		/* +		 * arch_cpu_idle() is supposed to enable IRQs, however +		 * we can't do that because of RCU and tracing. +		 * +		 * Trace IRQs enable here, then switch off RCU, and have +		 * arch_cpu_idle() use raw_local_irq_enable(). Note that +		 * rcu_idle_enter() relies on lockdep IRQ state, so switch that +		 * last -- this is very similar to the entry code. +		 */ +		trace_hardirqs_on_prepare(); +		lockdep_hardirqs_on_prepare(_THIS_IP_);  		rcu_idle_enter(); +		lockdep_hardirqs_on(_THIS_IP_); +  		arch_cpu_idle(); + +		/* +		 * OK, so IRQs are enabled here, but RCU needs them disabled to +		 * turn itself back on.. funny thing is that disabling IRQs +		 * will cause tracing, which needs RCU. Jump through hoops to +		 * make it 'work'. +		 */ +		raw_local_irq_disable(); +		lockdep_hardirqs_off(_THIS_IP_);  		rcu_idle_exit(); +		lockdep_hardirqs_on(_THIS_IP_); +		raw_local_irq_enable(); +  		start_critical_timings();  		trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());  	} diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 8ad7a293255a..53a7d1512dd7 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -38,7 +38,7 @@  #include <linux/filter.h>  #include <linux/pid.h>  #include <linux/ptrace.h> -#include <linux/security.h> +#include <linux/capability.h>  #include <linux/tracehook.h>  #include <linux/uaccess.h>  #include <linux/anon_inodes.h> @@ -558,8 +558,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)  	 * behavior of privileged children.  	 
*/  	if (!task_no_new_privs(current) && -	    security_capable(current_cred(), current_user_ns(), -				     CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0) +			!ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))  		return ERR_PTR(-EACCES);  	/* Allocate a new seccomp_filter */ diff --git a/kernel/signal.c b/kernel/signal.c index a38b3edc6851..ef8f2a28d37c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -391,16 +391,17 @@ static bool task_participate_group_stop(struct task_struct *task)  void task_join_group_stop(struct task_struct *task)  { +	unsigned long mask = current->jobctl & JOBCTL_STOP_SIGMASK; +	struct signal_struct *sig = current->signal; + +	if (sig->group_stop_count) { +		sig->group_stop_count++; +		mask |= JOBCTL_STOP_CONSUME; +	} else if (!(sig->flags & SIGNAL_STOP_STOPPED)) +		return; +  	/* Have the new thread join an on-going signal group stop */ -	unsigned long jobctl = current->jobctl; -	if (jobctl & JOBCTL_STOP_PENDING) { -		struct signal_struct *sig = current->signal; -		unsigned long signr = jobctl & JOBCTL_STOP_SIGMASK; -		unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; -		if (task_set_jobctl_pending(task, signr | gstop)) { -			sig->group_stop_count++; -		} -	} +	task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING);  }  /* diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4517c8b66518..048c655315f1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -181,6 +181,16 @@ bpf_probe_read_user_str_common(void *dst, u32 size,  {  	int ret; +	/* +	 * NB: We rely on strncpy_from_user() not copying junk past the NUL +	 * terminator into `dst`. +	 * +	 * strncpy_from_user() does long-sized strides in the fast path. If the +	 * strncpy does not mask out the bytes after the NUL in `unsafe_ptr`, +	 * then there could be junk after the NUL in `dst`. If user takes `dst` +	 * and keys a hash map with it, then semantically identical strings can +	 * occupy multiple entries in the map. +	 */  	ret = strncpy_from_user_nofault(dst, unsafe_ptr, size);  	if (unlikely(ret < 0))  		memset(dst, 0, size); @@ -1198,7 +1208,7 @@ static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,  	*btf = bpf_get_btf_vmlinux();  	if (IS_ERR_OR_NULL(*btf)) -		return PTR_ERR(*btf); +		return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL;  	if (ptr->type_id > 0)  		*btf_id = ptr->type_id; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7f45fd9d5a45..dc83b3fa9fe7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -438,14 +438,16 @@ enum {  };  /*   * Used for which event context the event is in. - *  NMI     = 0 - *  IRQ     = 1 - *  SOFTIRQ = 2 - *  NORMAL  = 3 + *  TRANSITION = 0 + *  NMI     = 1 + *  IRQ     = 2 + *  SOFTIRQ = 3 + *  NORMAL  = 4   *   * See trace_recursive_lock() comment below for more details.   */  enum { +	RB_CTX_TRANSITION,  	RB_CTX_NMI,  	RB_CTX_IRQ,  	RB_CTX_SOFTIRQ, @@ -3014,10 +3016,10 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)   * a bit of overhead in something as critical as function tracing,   * we use a bitmask trick.   * - *  bit 0 =  NMI context - *  bit 1 =  IRQ context - *  bit 2 =  SoftIRQ context - *  bit 3 =  normal context. + *  bit 1 =  NMI context + *  bit 2 =  IRQ context + *  bit 3 =  SoftIRQ context + *  bit 4 =  normal context.   *   * This works because this is the order of contexts that can   * preempt other contexts. 
A SoftIRQ never preempts an IRQ @@ -3040,6 +3042,30 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)   * The least significant bit can be cleared this way, and it   * just so happens that it is the same bit corresponding to   * the current context. + * + * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit + * is set when a recursion is detected at the current context, and if + * the TRANSITION bit is already set, it will fail the recursion. + * This is needed because there's a lag between the changing of + * interrupt context and updating the preempt count. In this case, + * a false positive will be found. To handle this, one extra recursion + * is allowed, and this is done by the TRANSITION bit. If the TRANSITION + * bit is already set, then it is considered a recursion and the function + * ends. Otherwise, the TRANSITION bit is set, and that bit is returned. + * + * On the trace_recursive_unlock(), the TRANSITION bit will be the first + * to be cleared. Even if it wasn't the context that set it. That is, + * if an interrupt comes in while NORMAL bit is set and the ring buffer + * is called before preempt_count() is updated, since the check will + * be on the NORMAL bit, the TRANSITION bit will then be set. If an + * NMI then comes in, it will set the NMI bit, but when the NMI code + * does the trace_recursive_unlock() it will clear the TRANSITION bit + * and leave the NMI bit set. But this is fine, because the interrupt + * code that set the TRANSITION bit will then clear the NMI bit when it + * calls trace_recursive_unlock(). If another NMI comes in, it will + * set the TRANSITION bit and continue. + * + * Note: The TRANSITION bit only handles a single transition between contexts.   */  static __always_inline int @@ -3055,8 +3081,16 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)  		bit = pc & NMI_MASK ? RB_CTX_NMI :  			pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; -	if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) -		return 1; +	if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) { +		/* +		 * It is possible that this was called by transitioning +		 * between interrupt context, and preempt_count() has not +		 * been updated yet. In this case, use the TRANSITION bit. +		 */ +		bit = RB_CTX_TRANSITION; +		if (val & (1 << (bit + cpu_buffer->nest))) +			return 1; +	}  	val |= (1 << (bit + cpu_buffer->nest));  	cpu_buffer->current_context = val; @@ -3071,8 +3105,8 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)  		cpu_buffer->current_context - (1 << cpu_buffer->nest);  } -/* The recursive locking above uses 4 bits */ -#define NESTED_BITS 4 +/* The recursive locking above uses 5 bits */ +#define NESTED_BITS 5  /**   * ring_buffer_nest_start - Allow to trace while nested diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 528971714fc6..410cfeb16db5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2750,7 +2750,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,  	/*  	 * If tracing is off, but we have triggers enabled  	 * we still need to look at the event data. Use the temp_buffer -	 * to store the trace event for the tigger to use. It's recusive +	 * to store the trace event for the trigger to use. It's recursive  	 * safe and will not be recorded anywhere.  	 
*/  	if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { @@ -2952,7 +2952,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,  	stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;  	/* This should never happen. If it does, yell once and skip */ -	if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING)) +	if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING))  		goto out;  	/* @@ -3132,7 +3132,7 @@ static char *get_trace_buf(void)  	/* Interrupts must see nesting incremented before we use the buffer */  	barrier(); -	return &buffer->buffer[buffer->nesting][0]; +	return &buffer->buffer[buffer->nesting - 1][0];  }  static void put_trace_buf(void) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f3f5e77123ad..1dadef445cd1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -637,6 +637,12 @@ enum {  	 * function is called to clear it.  	 */  	TRACE_GRAPH_NOTRACE_BIT, + +	/* +	 * When transitioning between context, the preempt_count() may +	 * not be correct. Allow for a single recursion to cover this case. +	 */ +	TRACE_TRANSITION_BIT,  };  #define trace_recursion_set(bit)	do { (current)->trace_recursion |= (1<<(bit)); } while (0) @@ -691,14 +697,27 @@ static __always_inline int trace_test_and_set_recursion(int start, int max)  		return 0;  	bit = trace_get_context_bit() + start; -	if (unlikely(val & (1 << bit))) -		return -1; +	if (unlikely(val & (1 << bit))) { +		/* +		 * It could be that preempt_count has not been updated during +		 * a switch between contexts. Allow for a single recursion. +		 */ +		bit = TRACE_TRANSITION_BIT; +		if (trace_recursion_test(bit)) +			return -1; +		trace_recursion_set(bit); +		barrier(); +		return bit + 1; +	} + +	/* Normal check passed, clear the transition to allow it again */ +	trace_recursion_clear(TRACE_TRANSITION_BIT);  	val |= 1 << bit;  	current->trace_recursion = val;  	barrier(); -	return bit; +	return bit + 1;  }  static __always_inline void trace_clear_recursion(int bit) @@ -708,6 +727,7 @@ static __always_inline void trace_clear_recursion(int bit)  	if (!bit)  		return; +	bit--;  	bit = 1 << bit;  	val &= ~bit; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 84b7cab55291..881df991742a 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -584,7 +584,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,  {  	struct synth_field *field;  	const char *prefix = NULL, *field_type = argv[0], *field_name, *array; -	int len, ret = 0; +	int len, ret = -ENOMEM;  	struct seq_buf s;  	ssize_t size; @@ -617,10 +617,9 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,  		len--;  	field->name = kmemdup_nul(field_name, len, GFP_KERNEL); -	if (!field->name) { -		ret = -ENOMEM; +	if (!field->name)  		goto free; -	} +  	if (!is_good_name(field->name)) {  		synth_err(SYNTH_ERR_BAD_NAME, errpos(field_name));  		ret = -EINVAL; @@ -638,10 +637,9 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,  		len += strlen(prefix);  	field->type = kzalloc(len, GFP_KERNEL); -	if (!field->type) { -		ret = -ENOMEM; +	if (!field->type)  		goto free; -	} +  	seq_buf_init(&s, field->type, len);  	if (prefix)  		seq_buf_puts(&s, prefix); @@ -653,6 +651,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,  	}  	if (WARN_ON_ONCE(!seq_buf_buffer_left(&s)))  		goto free; +  	s.buffer[s.len] = '\0';  	size = synth_field_size(field->type); @@ -666,10 +665,8 @@ 
static struct synth_field *parse_synth_field(int argc, const char **argv,  			len = sizeof("__data_loc ") + strlen(field->type) + 1;  			type = kzalloc(len, GFP_KERNEL); -			if (!type) { -				ret = -ENOMEM; +			if (!type)  				goto free; -			}  			seq_buf_init(&s, type, len);  			seq_buf_puts(&s, "__data_loc "); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b5e3496cf803..4738ad48a667 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -492,8 +492,13 @@ trace_selftest_function_recursion(void)  	unregister_ftrace_function(&test_rec_probe);  	ret = -1; -	if (trace_selftest_recursion_cnt != 1) { -		pr_cont("*callback not called once (%d)* ", +	/* +	 * Recursion allows for transitions between context, +	 * and may call the callback twice. +	 */ +	if (trace_selftest_recursion_cnt != 1 && +	    trace_selftest_recursion_cnt != 2) { +		pr_cont("*callback not called once (or twice) (%d)* ",  			trace_selftest_recursion_cnt);  		goto out;  	} diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5abb5b22ad13..71109065bd8e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -44,8 +44,6 @@ int __read_mostly soft_watchdog_user_enabled = 1;  int __read_mostly watchdog_thresh = 10;  static int __read_mostly nmi_watchdog_available; -static struct cpumask watchdog_allowed_mask __read_mostly; -  struct cpumask watchdog_cpumask __read_mostly;  unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -162,6 +160,8 @@ static void lockup_detector_update_enable(void)  int __read_mostly sysctl_softlockup_all_cpu_backtrace;  #endif +static struct cpumask watchdog_allowed_mask __read_mostly; +  /* Global variables, exported for sysctl */  unsigned int __read_mostly softlockup_panic =  			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; | 
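The ring-buffer comment above describes a per-context recursion guard: one bit each for NMI, IRQ, SoftIRQ and normal context, plus a TRANSITION bit that tolerates exactly one extra nesting level while preempt_count() lags behind a context change. Below is a minimal user-space sketch of that bitmask idea, not the kernel implementation; recursive_lock(), recursive_unlock(), the enum names and the single global word are invented for illustration (the real code is trace_recursive_lock()/trace_recursive_unlock() in kernel/trace/ring_buffer.c, which also shifts the bits by cpu_buffer->nest).

/* Illustrative user-space analogy only -- simplified from the kernel's logic. */
#include <stdio.h>

enum ctx {
        CTX_TRANSITION,         /* spare slot used while a context change is in flight */
        CTX_NMI,
        CTX_IRQ,
        CTX_SOFTIRQ,
        CTX_NORMAL,
};

static unsigned int current_context;   /* one bit per context */

/* Returns the bit taken on success, -1 on a genuine recursion. */
static int recursive_lock(enum ctx bit)
{
        if (current_context & (1U << bit)) {
                /*
                 * The bit for this context is already set: either a real
                 * recursion, or the context changed before preempt_count()
                 * (here: our own bookkeeping) caught up. Allow exactly one
                 * extra level via the TRANSITION bit.
                 */
                bit = CTX_TRANSITION;
                if (current_context & (1U << bit))
                        return -1;
        }
        current_context |= 1U << bit;
        return bit;
}

static void recursive_unlock(void)
{
        /* Clear the least significant set bit: TRANSITION before any context bit. */
        current_context &= current_context - 1;
}

int main(void)
{
        printf("%d\n", recursive_lock(CTX_NORMAL));    /* 4: takes the NORMAL bit      */
        printf("%d\n", recursive_lock(CTX_NORMAL));    /* 0: takes the TRANSITION bit  */
        printf("%d\n", recursive_lock(CTX_NORMAL));    /* -1: real recursion, rejected */
        recursive_unlock();                             /* releases TRANSITION first    */
        recursive_unlock();                             /* then NORMAL                  */
        return 0;
}

Clearing the lowest set bit on unlock is what lets the TRANSITION bit be released first even when a different context took it, which is exactly the nested-NMI case the comment in the hunk walks through.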

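The one-line change to bpf_btf_printf_prepare() above guards against a classic ERR_PTR pitfall: IS_ERR_OR_NULL() accepts a NULL pointer, but PTR_ERR(NULL) evaluates to 0, so the old code reported success when bpf_get_btf_vmlinux() yielded NULL. The snippet below mocks the helpers in user space to make that failure mode visible; ERR_PTR()/PTR_ERR()/IS_ERR() here are simplified stand-ins for <linux/err.h>, and prepare() is a hypothetical reduction of the kernel function's error handling.

/* Simplified user-space stand-ins for the <linux/err.h> helpers -- illustration only. */
#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(long error) { return (void *)error; }
static long PTR_ERR(const void *ptr) { return (long)ptr; }
static int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}
static int IS_ERR_OR_NULL(const void *ptr) { return !ptr || IS_ERR(ptr); }

/* Hypothetical reduction of the patched error path. */
static int prepare(const void *btf)
{
        if (IS_ERR_OR_NULL(btf))
                /* Old code did "return PTR_ERR(btf);", which is 0 for NULL. */
                return IS_ERR(btf) ? PTR_ERR(btf) : -EINVAL;
        return 0;
}

int main(void)
{
        printf("%d\n", prepare(ERR_PTR(-ENOENT)));      /* -2           */
        printf("%d\n", prepare(NULL));                  /* -22, was 0   */
        return 0;
}

The same pattern, checking IS_ERR() explicitly before calling PTR_ERR(), appears throughout the kernel wherever NULL and ERR_PTR() values can both reach one error path.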