Diffstat (limited to 'kernel')
61 files changed, 882 insertions, 884 deletions
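A recurring change in the diff below is the constification of sysctl proc handlers: the table argument becomes const struct ctl_table *, so handlers that need to tweak fields (data, extra1, extra2) work on a local copy, as kernel/fork.c's sysctl_max_threads() already does. The following is an illustrative sketch only, not part of the patch; the names example_value and example_sysctl_handler are hypothetical, and it assumes proc_dointvec_minmax() accepts the const-qualified pointer as in this series.

	#include <linux/sysctl.h>

	static int example_value;
	static int example_min = 0;
	static int example_max = 100;

	/* New prototype: the table itself is read-only for the handler. */
	static int example_sysctl_handler(const struct ctl_table *table, int write,
					  void *buffer, size_t *lenp, loff_t *ppos)
	{
		struct ctl_table t = *table;	/* writable copy of the const table */

		t.data = &example_value;
		t.extra1 = &example_min;
		t.extra2 = &example_max;

		return proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
	}
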
| diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index a4181234232b..2dfe66b9ed76 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -74,5 +74,6 @@ static void exitf(void)  module_init(backtrace_regression_test);  module_exit(exitf); +MODULE_DESCRIPTION("Simple stack backtrace regression test module");  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index a546aba46d5d..dec892ded031 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -155,12 +155,9 @@ static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)  static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)  { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG  	if (c->objcg)  		return get_mem_cgroup_from_objcg(c->objcg); -#endif - -#ifdef CONFIG_MEMCG  	return root_mem_cgroup;  #else  	return NULL; @@ -534,7 +531,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)  			size += LLIST_NODE_SZ; /* room for llist_node */  		unit_size = size; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG  		if (memcg_bpf_enabled())  			objcg = get_obj_cgroup_from_current();  #endif @@ -556,7 +553,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)  	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);  	if (!pcc)  		return -ENOMEM; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG  	objcg = get_obj_cgroup_from_current();  #endif  	ma->objcg = objcg; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 869265852d51..bf6c5f685ea2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -385,7 +385,7 @@ void bpf_map_free_id(struct bpf_map *map)  	spin_unlock_irqrestore(&map_idr_lock, flags);  } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG  static void bpf_map_save_memcg(struct bpf_map *map)  {  	/* Currently if a map is created by a process belonging to the root @@ -486,7 +486,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,  	unsigned long i, j;  	struct page *pg;  	int ret = 0; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG  	struct mem_cgroup *memcg, *old_memcg;  	memcg = bpf_map_get_memcg(map); @@ -505,7 +505,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,  		break;  	} -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG  	set_active_memcg(old_memcg);  	mem_cgroup_put(memcg);  #endif @@ -5983,7 +5983,7 @@ const struct bpf_prog_ops bpf_syscall_prog_ops = {  };  #ifdef CONFIG_SYSCTL -static int bpf_stats_handler(struct ctl_table *table, int write, +static int bpf_stats_handler(const struct ctl_table *table, int write,  			     void *buffer, size_t *lenp, loff_t *ppos)  {  	struct static_key *key = (struct static_key *)table->data; @@ -6018,7 +6018,7 @@ void __weak unpriv_ebpf_notify(int new_state)  {  } -static int bpf_unpriv_handler(struct ctl_table *table, int write, +static int bpf_unpriv_handler(const struct ctl_table *table, int write,  			      void *buffer, size_t *lenp, loff_t *ppos)  {  	int ret, unpriv_enable = *(int *)table->data; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8da132a1ef28..4cb5441ad75f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21132,8 +21132,12 @@ BTF_SET_START(btf_non_sleepable_error_inject)   * Assume non-sleepable from bpf safety point of view.   
*/  BTF_ID(func, __filemap_add_folio) +#ifdef CONFIG_FAIL_PAGE_ALLOC  BTF_ID(func, should_fail_alloc_page) +#endif +#ifdef CONFIG_FAILSLAB  BTF_ID(func, should_failslab) +#endif  BTF_SET_END(btf_non_sleepable_error_inject)  static int check_non_sleepable_error_inject(u32 btf_id) diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index 5b2722a93a48..d3b4cd12bdd1 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -13,7 +13,6 @@  #include <linux/memory.h>  #include <linux/cpuhotplug.h>  #include <linux/memblock.h> -#include <linux/kexec.h>  #include <linux/kmemleak.h>  #include <asm/page.h> diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 10b454554ab0..137ba73f56fc 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -144,7 +144,7 @@ kdb_bt(int argc, const char **argv)  			kdb_ps_suppressed();  		/* Run the active tasks first */  		for_each_online_cpu(cpu) { -			p = kdb_curr_task(cpu); +			p = curr_task(cpu);  			if (kdb_bt1(p, mask, btaprompt))  				return 0;  		} diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 3131334d7a81..6a77f1c779c4 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -206,7 +206,7 @@ char kdb_getchar(void)   */  static void kdb_position_cursor(char *prompt, char *buffer, char *cp)  { -	kdb_printf("\r%s", kdb_prompt_str); +	kdb_printf("\r%s", prompt);  	if (cp > buffer)  		kdb_printf("%.*s", (int)(cp - buffer), buffer);  } @@ -362,7 +362,7 @@ poll_again:  			if (i >= dtab_count)  				kdb_printf("...");  			kdb_printf("\n"); -			kdb_printf(kdb_prompt_str); +			kdb_printf("%s",  kdb_prompt_str);  			kdb_printf("%s", buffer);  			if (cp != lastchar)  				kdb_position_cursor(kdb_prompt_str, buffer, cp); @@ -453,7 +453,7 @@ char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt)  {  	if (prompt && kdb_prompt_str != prompt)  		strscpy(kdb_prompt_str, prompt, CMD_BUFLEN); -	kdb_printf(kdb_prompt_str); +	kdb_printf("%s", kdb_prompt_str);  	kdb_nextline = 1;	/* Prompt and input resets line number */  	return kdb_read(buffer, bufsize);  } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 664bae55f2c9..f5f7d7fb5936 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -155,16 +155,6 @@ static char *__env[31] = {  static const int __nenv = ARRAY_SIZE(__env); -struct task_struct *kdb_curr_task(int cpu) -{ -	struct task_struct *p = curr_task(cpu); -#ifdef	_TIF_MCA_INIT -	if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu)) -		p = krp->p; -#endif -	return p; -} -  /*   * Update the permissions flags (kdb_cmd_enabled) to match the   * current lockdown state. @@ -1228,7 +1218,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,  	char *cmdbuf;  	int diag;  	struct task_struct *kdb_current = -		kdb_curr_task(raw_smp_processor_id()); +		curr_task(raw_smp_processor_id());  	KDB_DEBUG_STATE("kdb_local 1", reason); @@ -2278,7 +2268,7 @@ void kdb_ps_suppressed(void)  	unsigned long cpu;  	const struct task_struct *p, *g;  	for_each_online_cpu(cpu) { -		p = kdb_curr_task(cpu); +		p = curr_task(cpu);  		if (kdb_task_state(p, "-"))  			++idle;  	} @@ -2314,7 +2304,7 @@ void kdb_ps1(const struct task_struct *p)  		   kdb_task_has_cpu(p), kdb_process_cpu(p),  		   kdb_task_state_char(p),  		   (void *)(&p->thread), -		   p == kdb_curr_task(raw_smp_processor_id()) ? '*' : ' ', +		   p == curr_task(raw_smp_processor_id()) ? 
'*' : ' ',  		   p->comm);  	if (kdb_task_has_cpu(p)) {  		if (!KDB_TSK(cpu)) { @@ -2350,7 +2340,7 @@ static int kdb_ps(int argc, const char **argv)  	for_each_online_cpu(cpu) {  		if (KDB_FLAG(CMD_INTERRUPT))  			return 0; -		p = kdb_curr_task(cpu); +		p = curr_task(cpu);  		if (kdb_task_state(p, mask))  			kdb_ps1(p);  	} diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 548fd4059bf9..d2520d72b1f5 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -210,8 +210,6 @@ extern void kdb_gdb_state_pass(char *buf);  #define KDB_TSK(cpu) kgdb_info[cpu].task  #define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo -extern struct task_struct *kdb_curr_task(int); -  #define kdb_task_has_cpu(p) (task_curr(p))  #define GFP_KDB (in_dbg_master() ? GFP_ATOMIC : GFP_KERNEL) diff --git a/kernel/delayacct.c b/kernel/delayacct.c index e039b0f99a0b..dead51de8eb5 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -44,7 +44,7 @@ void delayacct_init(void)  }  #ifdef CONFIG_PROC_SYSCTL -static int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, +static int sysctl_delayacct(const struct ctl_table *table, int write, void *buffer,  		     size_t *lenp, loff_t *ppos)  {  	int state = delayacct_on; diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 81de84318ccc..b1c18058d55f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -67,8 +67,8 @@ void dmam_free_coherent(struct device *dev, size_t size, void *vaddr,  {  	struct dma_devres match_data = { size, vaddr, dma_handle }; -	dma_free_coherent(dev, size, vaddr, dma_handle);  	WARN_ON(devres_destroy(dev, dmam_release, dmam_match, &match_data)); +	dma_free_coherent(dev, size, vaddr, dma_handle);  }  EXPORT_SYMBOL(dmam_free_coherent); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 8d57255e5b29..8a47e52a454f 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -270,7 +270,7 @@ exit_put:   * Used for sysctl_perf_event_max_stack and   * sysctl_perf_event_max_contexts_per_stack.   
*/ -int perf_event_max_stack_handler(struct ctl_table *table, int write, +int perf_event_max_stack_handler(const struct ctl_table *table, int write,  				 void *buffer, size_t *lenp, loff_t *ppos)  {  	int *value = table->data; diff --git a/kernel/events/core.c b/kernel/events/core.c index ab6c4c942f79..aa3450bdc227 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -450,7 +450,7 @@ static void update_perf_cpu_limits(void)  static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); -int perf_event_max_sample_rate_handler(struct ctl_table *table, int write, +int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,  				       void *buffer, size_t *lenp, loff_t *ppos)  {  	int ret; @@ -474,7 +474,7 @@ int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,  int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; -int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, +int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,  		void *buffer, size_t *lenp, loff_t *ppos)  {  	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -534,7 +534,7 @@ void perf_sample_event_took(u64 sample_len_ns)  	__this_cpu_write(running_sample_length, running_len);  	/* -	 * Note: this will be biased artifically low until we have +	 * Note: this will be biased artificially low until we have  	 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us  	 * from having to maintain a count.  	 */ @@ -596,10 +596,10 @@ static inline u64 perf_event_clock(struct perf_event *event)   *   * Event groups make things a little more complicated, but not terribly so. The   * rules for a group are that if the group leader is OFF the entire group is - * OFF, irrespecive of what the group member states are. This results in + * OFF, irrespective of what the group member states are. This results in   * __perf_effective_state().   * - * A futher ramification is that when a group leader flips between OFF and + * A further ramification is that when a group leader flips between OFF and   * !OFF, we need to update all group member times.   *   * @@ -891,7 +891,7 @@ static int perf_cgroup_ensure_storage(struct perf_event *event,  	int cpu, heap_size, ret = 0;  	/* -	 * Allow storage to have sufficent space for an iterator for each +	 * Allow storage to have sufficient space for an iterator for each  	 * possibly nested cgroup plus an iterator for events with no cgroup.  	 
*/  	for (heap_size = 1; css; css = css->parent) @@ -3671,7 +3671,7 @@ void __perf_event_task_sched_out(struct task_struct *task,  	perf_cgroup_switch(next);  } -static bool perf_less_group_idx(const void *l, const void *r) +static bool perf_less_group_idx(const void *l, const void *r, void __always_unused *args)  {  	const struct perf_event *le = *(const struct perf_event **)l;  	const struct perf_event *re = *(const struct perf_event **)r; @@ -3679,20 +3679,21 @@ static bool perf_less_group_idx(const void *l, const void *r)  	return le->group_index < re->group_index;  } -static void swap_ptr(void *l, void *r) +static void swap_ptr(void *l, void *r, void __always_unused *args)  {  	void **lp = l, **rp = r;  	swap(*lp, *rp);  } +DEFINE_MIN_HEAP(struct perf_event *, perf_event_min_heap); +  static const struct min_heap_callbacks perf_min_heap = { -	.elem_size = sizeof(struct perf_event *),  	.less = perf_less_group_idx,  	.swp = swap_ptr,  }; -static void __heap_add(struct min_heap *heap, struct perf_event *event) +static void __heap_add(struct perf_event_min_heap *heap, struct perf_event *event)  {  	struct perf_event **itrs = heap->data; @@ -3726,7 +3727,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,  	struct perf_cpu_context *cpuctx = NULL;  	/* Space for per CPU and/or any CPU event iterators. */  	struct perf_event *itrs[2]; -	struct min_heap event_heap; +	struct perf_event_min_heap event_heap;  	struct perf_event **evt;  	int ret; @@ -3735,7 +3736,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,  	if (!ctx->task) {  		cpuctx = this_cpu_ptr(&perf_cpu_context); -		event_heap = (struct min_heap){ +		event_heap = (struct perf_event_min_heap){  			.data = cpuctx->heap,  			.nr = 0,  			.size = cpuctx->heap_size, @@ -3748,7 +3749,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,  			css = &cpuctx->cgrp->css;  #endif  	} else { -		event_heap = (struct min_heap){ +		event_heap = (struct perf_event_min_heap){  			.data = itrs,  			.nr = 0,  			.size = ARRAY_SIZE(itrs), @@ -3770,7 +3771,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,  		perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);  	} -	min_heapify_all(&event_heap, &perf_min_heap); +	min_heapify_all(&event_heap, &perf_min_heap, NULL);  	while (event_heap.nr) {  		ret = func(*evt, data); @@ -3779,9 +3780,9 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,  		*evt = perf_event_groups_next(*evt, pmu);  		if (*evt) -			min_heapify(&event_heap, 0, &perf_min_heap); +			min_heap_sift_down(&event_heap, 0, &perf_min_heap, NULL);  		else -			min_heap_pop(&event_heap, &perf_min_heap); +			min_heap_pop(&event_heap, &perf_min_heap, NULL);  	}  	return 0; @@ -7634,7 +7635,7 @@ again:  	pte = ptep_get_lockless(ptep);  	if (pte_present(pte)) -		size = pte_leaf_size(pte); +		size = __pte_leaf_size(pmd, pte);  	pte_unmap(ptep);  #endif /* CONFIG_HAVE_GUP_FAST */ @@ -9327,21 +9328,19 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,  	bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;  	int i; -	if (prog->aux->func_cnt == 0) { -		perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, -				   (u64)(unsigned long)prog->bpf_func, -				   prog->jited_len, unregister, -				   prog->aux->ksym.name); -	} else { -		for (i = 0; i < prog->aux->func_cnt; i++) { -			struct bpf_prog *subprog = prog->aux->func[i]; - -			perf_event_ksymbol( -				PERF_RECORD_KSYMBOL_TYPE_BPF, -				(u64)(unsigned long)subprog->bpf_func, -				
subprog->jited_len, unregister, -				subprog->aux->ksym.name); -		} +	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, +			   (u64)(unsigned long)prog->bpf_func, +			   prog->jited_len, unregister, +			   prog->aux->ksym.name); + +	for (i = 1; i < prog->aux->func_cnt; i++) { +		struct bpf_prog *subprog = prog->aux->func[i]; + +		perf_event_ksymbol( +			PERF_RECORD_KSYMBOL_TYPE_BPF, +			(u64)(unsigned long)subprog->bpf_func, +			subprog->jited_len, unregister, +			subprog->aux->ksym.name);  	}  } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 99be2adedbc0..73cc47708679 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -181,7 +181,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,  	if (new_page) {  		folio_get(new_folio); -		folio_add_new_anon_rmap(new_folio, vma, addr); +		folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE);  		folio_add_lru_vma(new_folio, vma);  	} else  		/* no new page, just dec_mm_counter for old_page */ diff --git a/kernel/exit.c b/kernel/exit.c index be81342caf1b..7430852a8571 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -438,14 +438,46 @@ static void coredump_task_exit(struct task_struct *tsk)  }  #ifdef CONFIG_MEMCG +/* drops tasklist_lock if succeeds */ +static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) +{ +	bool ret = false; + +	task_lock(tsk); +	if (likely(tsk->mm == mm)) { +		/* tsk can't pass exit_mm/exec_mmap and exit */ +		read_unlock(&tasklist_lock); +		WRITE_ONCE(mm->owner, tsk); +		lru_gen_migrate_mm(mm); +		ret = true; +	} +	task_unlock(tsk); +	return ret; +} + +static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm) +{ +	struct task_struct *t; + +	for_each_thread(g, t) { +		struct mm_struct *t_mm = READ_ONCE(t->mm); +		if (t_mm == mm) { +			if (__try_to_set_owner(t, mm)) +				return true; +		} else if (t_mm) +			break; +	} + +	return false; +} +  /*   * A task is exiting.   If it owned this mm, find a new owner for the mm.   */  void mm_update_next_owner(struct mm_struct *mm)  { -	struct task_struct *c, *g, *p = current; +	struct task_struct *g, *p = current; -retry:  	/*  	 * If the exiting or execing task is not the owner, it's  	 * someone else's problem. @@ -466,19 +498,17 @@ retry:  	/*  	 * Search in the children  	 */ -	list_for_each_entry(c, &p->children, sibling) { -		if (c->mm == mm) -			goto assign_new_owner; +	list_for_each_entry(g, &p->children, sibling) { +		if (try_to_set_owner(g, mm)) +			goto ret;  	} -  	/*  	 * Search in the siblings  	 */ -	list_for_each_entry(c, &p->real_parent->children, sibling) { -		if (c->mm == mm) -			goto assign_new_owner; +	list_for_each_entry(g, &p->real_parent->children, sibling) { +		if (try_to_set_owner(g, mm)) +			goto ret;  	} -  	/*  	 * Search through everything else, we should not get here often.  	 */ @@ -487,12 +517,8 @@ retry:  			break;  		if (g->flags & PF_KTHREAD)  			continue; -		for_each_thread(g, c) { -			if (c->mm == mm) -				goto assign_new_owner; -			if (c->mm) -				break; -		} +		if (try_to_set_owner(g, mm)) +			goto ret;  	}  	read_unlock(&tasklist_lock);  	/* @@ -501,30 +527,9 @@ retry:  	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL.  	 */  	WRITE_ONCE(mm->owner, NULL); + ret:  	return; -assign_new_owner: -	BUG_ON(c == p); -	get_task_struct(c); -	/* -	 * The task_lock protects c->mm from changing. 
-	 * We always want mm->owner->mm == mm -	 */ -	task_lock(c); -	/* -	 * Delay read_unlock() till we have the task_lock() -	 * to ensure that c does not slip away underneath us -	 */ -	read_unlock(&tasklist_lock); -	if (c->mm != mm) { -		task_unlock(c); -		put_task_struct(c); -		goto retry; -	} -	WRITE_ONCE(mm->owner, c); -	lru_gen_migrate_mm(mm); -	task_unlock(c); -	put_task_struct(c);  }  #endif /* CONFIG_MEMCG */ diff --git a/kernel/fork.c b/kernel/fork.c index 942e3d8617bf..cc760491f201 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -44,6 +44,7 @@  #include <linux/fs.h>  #include <linux/mm.h>  #include <linux/mm_inline.h> +#include <linux/memblock.h>  #include <linux/nsproxy.h>  #include <linux/capability.h>  #include <linux/cpu.h> @@ -207,9 +208,10 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm)  	unsigned int i;  	for (i = 0; i < NR_CACHED_STACKS; i++) { -		if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL) -			continue; -		return true; +		struct vm_struct *tmp = NULL; + +		if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm)) +			return true;  	}  	return false;  } @@ -992,10 +994,10 @@ void __init __weak arch_task_cache_init(void) { }  /*   * set_max_threads   */ -static void set_max_threads(unsigned int max_threads_suggested) +static void __init set_max_threads(unsigned int max_threads_suggested)  {  	u64 threads; -	unsigned long nr_pages = totalram_pages(); +	unsigned long nr_pages = PHYS_PFN(memblock_phys_mem_size() - memblock_reserved_size());  	/*  	 * The number of threads shall be limited such that the thread @@ -1018,7 +1020,7 @@ static void set_max_threads(unsigned int max_threads_suggested)  int arch_task_struct_size __read_mostly;  #endif -static void task_struct_whitelist(unsigned long *offset, unsigned long *size) +static void __init task_struct_whitelist(unsigned long *offset, unsigned long *size)  {  	/* Fetch thread_struct whitelist for the architecture. 
*/  	arch_thread_struct_whitelist(offset, size); @@ -1519,14 +1521,13 @@ struct mm_struct *get_task_mm(struct task_struct *task)  {  	struct mm_struct *mm; +	if (task->flags & PF_KTHREAD) +		return NULL; +  	task_lock(task);  	mm = task->mm; -	if (mm) { -		if (task->flags & PF_KTHREAD) -			mm = NULL; -		else -			mmget(mm); -	} +	if (mm) +		mmget(mm);  	task_unlock(task);  	return mm;  } @@ -3403,7 +3404,7 @@ int unshare_files(void)  	return 0;  } -int sysctl_max_threads(struct ctl_table *table, int write, +int sysctl_max_threads(const struct ctl_table *table, int write,  		       void *buffer, size_t *lenp, loff_t *ppos)  {  	struct ctl_table t; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 1d92016b0b3c..959d99583d1c 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -127,7 +127,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)  	 * Ok, the task did not get scheduled for more than 2 minutes,  	 * complain:  	 */ -	if (sysctl_hung_task_warnings) { +	if (sysctl_hung_task_warnings || hung_task_call_panic) {  		if (sysctl_hung_task_warnings > 0)  			sysctl_hung_task_warnings--;  		pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", @@ -239,7 +239,7 @@ static long hung_timeout_jiffies(unsigned long last_checked,  /*   * Process updating of timeout sysctl   */ -static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, +static int proc_dohung_task_timeout_secs(const struct ctl_table *table, int write,  				  void *buffer,  				  size_t *lenp, loff_t *ppos)  { diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index aae0402507ed..c6ffb97966be 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -9,14 +9,8 @@  static struct dentry *irq_dir; -struct irq_bit_descr { -	unsigned int	mask; -	char		*name; -}; -#define BIT_MASK_DESCR(m)	{ .mask = m, .name = #m } - -static void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state, -				const struct irq_bit_descr *sd, int size) +void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state, +			 const struct irq_bit_descr *sd, int size)  {  	int i; diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index f6e5515ee077..b3e98668f4dd 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -1,6 +1,7 @@  // SPDX-License-Identifier: GPL-2.0  #include <linux/module.h>  #include <linux/interrupt.h> +#include <linux/irqdomain.h>  #include <linux/device.h>  #include <linux/gfp.h>  #include <linux/irq.h> @@ -282,3 +283,43 @@ int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc,  }  EXPORT_SYMBOL_GPL(devm_irq_setup_generic_chip);  #endif /* CONFIG_GENERIC_IRQ_CHIP */ + +#ifdef CONFIG_IRQ_DOMAIN +static void devm_irq_domain_remove(struct device *dev, void *res) +{ +	struct irq_domain **domain = res; + +	irq_domain_remove(*domain); +} + +/** + * devm_irq_domain_instantiate() - Instantiate a new irq domain data for a + *                                 managed device. + * @dev:	Device to instantiate the domain for + * @info:	Domain information pointer pointing to the information for this + *		domain + * + * Return: A pointer to the instantiated irq domain or an ERR_PTR value. 
+ */ +struct irq_domain *devm_irq_domain_instantiate(struct device *dev, +					       const struct irq_domain_info *info) +{ +	struct irq_domain *domain; +	struct irq_domain **dr; + +	dr = devres_alloc(devm_irq_domain_remove, sizeof(*dr), GFP_KERNEL); +	if (!dr) +		return ERR_PTR(-ENOMEM); + +	domain = irq_domain_instantiate(info); +	if (!IS_ERR(domain)) { +		*dr = domain; +		devres_add(dev, dr); +	} else { +		devres_free(dr); +	} + +	return domain; +} +EXPORT_SYMBOL_GPL(devm_irq_domain_instantiate); +#endif /* CONFIG_IRQ_DOMAIN */ diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index d39a40bc542b..32ffcbb87fa1 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -276,21 +276,14 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)  }  /** - * __irq_alloc_domain_generic_chips - Allocate generic chips for an irq domain - * @d:			irq domain for which to allocate chips - * @irqs_per_chip:	Number of interrupts each chip handles (max 32) - * @num_ct:		Number of irq_chip_type instances associated with this - * @name:		Name of the irq chip - * @handler:		Default flow handler associated with these chips - * @clr:		IRQ_* bits to clear in the mapping function - * @set:		IRQ_* bits to set in the mapping function - * @gcflags:		Generic chip specific setup flags + * irq_domain_alloc_generic_chips - Allocate generic chips for an irq domain + * @d:		irq domain for which to allocate chips + * @info:	Generic chip information + * + * Return: 0 on success, negative error code on failure   */ -int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, -				     int num_ct, const char *name, -				     irq_flow_handler_t handler, -				     unsigned int clr, unsigned int set, -				     enum irq_gc_flags gcflags) +int irq_domain_alloc_generic_chips(struct irq_domain *d, +				   const struct irq_domain_chip_generic_info *info)  {  	struct irq_domain_chip_generic *dgc;  	struct irq_chip_generic *gc; @@ -300,27 +293,29 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,  	size_t gc_sz;  	size_t sz;  	void *tmp; +	int ret;  	if (d->gc)  		return -EBUSY; -	numchips = DIV_ROUND_UP(d->revmap_size, irqs_per_chip); +	numchips = DIV_ROUND_UP(d->revmap_size, info->irqs_per_chip);  	if (!numchips)  		return -EINVAL;  	/* Allocate a pointer, generic chip and chiptypes for each chip */ -	gc_sz = struct_size(gc, chip_types, num_ct); +	gc_sz = struct_size(gc, chip_types, info->num_ct);  	dgc_sz = struct_size(dgc, gc, numchips);  	sz = dgc_sz + numchips * gc_sz;  	tmp = dgc = kzalloc(sz, GFP_KERNEL);  	if (!dgc)  		return -ENOMEM; -	dgc->irqs_per_chip = irqs_per_chip; +	dgc->irqs_per_chip = info->irqs_per_chip;  	dgc->num_chips = numchips; -	dgc->irq_flags_to_set = set; -	dgc->irq_flags_to_clear = clr; -	dgc->gc_flags = gcflags; +	dgc->irq_flags_to_set = info->irq_flags_to_set; +	dgc->irq_flags_to_clear = info->irq_flags_to_clear; +	dgc->gc_flags = info->gc_flags; +	dgc->exit = info->exit;  	d->gc = dgc;  	/* Calc pointer to the first generic chip */ @@ -328,15 +323,22 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,  	for (i = 0; i < numchips; i++) {  		/* Store the pointer to the generic chip */  		dgc->gc[i] = gc = tmp; -		irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip, -				      NULL, handler); +		irq_init_generic_chip(gc, info->name, info->num_ct, +				      i * dgc->irqs_per_chip, NULL, +				      info->handler);  		gc->domain = d; -		if (gcflags & IRQ_GC_BE_IO) { +		
if (dgc->gc_flags & IRQ_GC_BE_IO) {  			gc->reg_readl = &irq_readl_be;  			gc->reg_writel = &irq_writel_be;  		} +		if (info->init) { +			ret = info->init(gc); +			if (ret) +				goto err; +		} +  		raw_spin_lock_irqsave(&gc_lock, flags);  		list_add_tail(&gc->list, &gc_list);  		raw_spin_unlock_irqrestore(&gc_lock, flags); @@ -344,6 +346,69 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,  		tmp += gc_sz;  	}  	return 0; + +err: +	while (i--) { +		if (dgc->exit) +			dgc->exit(dgc->gc[i]); +		irq_remove_generic_chip(dgc->gc[i], ~0U, 0, 0); +	} +	d->gc = NULL; +	kfree(dgc); +	return ret; +} +EXPORT_SYMBOL_GPL(irq_domain_alloc_generic_chips); + +/** + * irq_domain_remove_generic_chips - Remove generic chips from an irq domain + * @d: irq domain for which generic chips are to be removed + */ +void irq_domain_remove_generic_chips(struct irq_domain *d) +{ +	struct irq_domain_chip_generic *dgc = d->gc; +	unsigned int i; + +	if (!dgc) +		return; + +	for (i = 0; i < dgc->num_chips; i++) { +		if (dgc->exit) +			dgc->exit(dgc->gc[i]); +		irq_remove_generic_chip(dgc->gc[i], ~0U, 0, 0); +	} +	d->gc = NULL; +	kfree(dgc); +} +EXPORT_SYMBOL_GPL(irq_domain_remove_generic_chips); + +/** + * __irq_alloc_domain_generic_chips - Allocate generic chips for an irq domain + * @d:			irq domain for which to allocate chips + * @irqs_per_chip:	Number of interrupts each chip handles (max 32) + * @num_ct:		Number of irq_chip_type instances associated with this + * @name:		Name of the irq chip + * @handler:		Default flow handler associated with these chips + * @clr:		IRQ_* bits to clear in the mapping function + * @set:		IRQ_* bits to set in the mapping function + * @gcflags:		Generic chip specific setup flags + */ +int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, +				     int num_ct, const char *name, +				     irq_flow_handler_t handler, +				     unsigned int clr, unsigned int set, +				     enum irq_gc_flags gcflags) +{ +	struct irq_domain_chip_generic_info info = { +		.irqs_per_chip		= irqs_per_chip, +		.num_ct			= num_ct, +		.name			= name, +		.handler		= handler, +		.irq_flags_to_clear	= clr, +		.irq_flags_to_set	= set, +		.gc_flags		= gcflags, +	}; + +	return irq_domain_alloc_generic_chips(d, &info);  }  EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ed28059e9849..fe0272cd84a5 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -501,6 +501,16 @@ static inline struct irq_data *irqd_get_parent_data(struct irq_data *irqd)  #ifdef CONFIG_GENERIC_IRQ_DEBUGFS  #include <linux/debugfs.h> +struct irq_bit_descr { +	unsigned int	mask; +	char		*name; +}; + +#define BIT_MASK_DESCR(m)	{ .mask = m, .name = #m } + +void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state, +			 const struct irq_bit_descr *sd, int size); +  void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc);  static inline void irq_remove_debugfs_entry(struct irq_desc *desc)  { diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index aadc8891cc16..cea8f6874b1f 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -111,6 +111,7 @@ EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode);  /**   * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle + * @fwnode: fwnode_handle to free   *   * Free a fwnode_handle allocated with irq_domain_alloc_fwnode.   
*/ @@ -127,27 +128,12 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)  }  EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); -static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode, -					      unsigned int size, -					      irq_hw_number_t hwirq_max, -					      int direct_max, -					      const struct irq_domain_ops *ops, -					      void *host_data) +static int irq_domain_set_name(struct irq_domain *domain, +			       const struct fwnode_handle *fwnode, +			       enum irq_domain_bus_token bus_token)  { -	struct irqchip_fwid *fwid; -	struct irq_domain *domain; -  	static atomic_t unknown_domains; - -	if (WARN_ON((size && direct_max) || -		    (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && direct_max) || -		    (direct_max && (direct_max != hwirq_max)))) -		return NULL; - -	domain = kzalloc_node(struct_size(domain, revmap, size), -			      GFP_KERNEL, of_node_to_nid(to_of_node(fwnode))); -	if (!domain) -		return NULL; +	struct irqchip_fwid *fwid;  	if (is_fwnode_irqchip(fwnode)) {  		fwid = container_of(fwnode, struct irqchip_fwid, fwnode); @@ -155,17 +141,23 @@ static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode,  		switch (fwid->type) {  		case IRQCHIP_FWNODE_NAMED:  		case IRQCHIP_FWNODE_NAMED_ID: -			domain->fwnode = fwnode; -			domain->name = kstrdup(fwid->name, GFP_KERNEL); -			if (!domain->name) { -				kfree(domain); -				return NULL; -			} +			domain->name = bus_token ? +					kasprintf(GFP_KERNEL, "%s-%d", +						  fwid->name, bus_token) : +					kstrdup(fwid->name, GFP_KERNEL); +			if (!domain->name) +				return -ENOMEM;  			domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;  			break;  		default: -			domain->fwnode = fwnode;  			domain->name = fwid->name; +			if (bus_token) { +				domain->name = kasprintf(GFP_KERNEL, "%s-%d", +							 fwid->name, bus_token); +				if (!domain->name) +					return -ENOMEM; +				domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; +			}  			break;  		}  	} else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) || @@ -177,42 +169,68 @@ static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode,  		 * unhappy about. Replace them with ':', which does  		 * the trick and is not as offensive as '\'...  		 */ -		name = kasprintf(GFP_KERNEL, "%pfw", fwnode); -		if (!name) { -			kfree(domain); -			return NULL; -		} +		name = bus_token ? +			kasprintf(GFP_KERNEL, "%pfw-%d", fwnode, bus_token) : +			kasprintf(GFP_KERNEL, "%pfw", fwnode); +		if (!name) +			return -ENOMEM;  		domain->name = strreplace(name, '/', ':'); -		domain->fwnode = fwnode;  		domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;  	}  	if (!domain->name) {  		if (fwnode)  			pr_err("Invalid fwnode type for irqdomain\n"); -		domain->name = kasprintf(GFP_KERNEL, "unknown-%d", -					 atomic_inc_return(&unknown_domains)); -		if (!domain->name) { -			kfree(domain); -			return NULL; -		} +		domain->name = bus_token ? 
+				kasprintf(GFP_KERNEL, "unknown-%d-%d", +					  atomic_inc_return(&unknown_domains), +					  bus_token) : +				kasprintf(GFP_KERNEL, "unknown-%d", +					  atomic_inc_return(&unknown_domains)); +		if (!domain->name) +			return -ENOMEM;  		domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;  	} -	fwnode_handle_get(fwnode); -	fwnode_dev_initialized(fwnode, true); +	return 0; +} + +static struct irq_domain *__irq_domain_create(const struct irq_domain_info *info) +{ +	struct irq_domain *domain; +	int err; + +	if (WARN_ON((info->size && info->direct_max) || +		    (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && info->direct_max) || +		    (info->direct_max && info->direct_max != info->hwirq_max))) +		return ERR_PTR(-EINVAL); + +	domain = kzalloc_node(struct_size(domain, revmap, info->size), +			      GFP_KERNEL, of_node_to_nid(to_of_node(info->fwnode))); +	if (!domain) +		return ERR_PTR(-ENOMEM); + +	err = irq_domain_set_name(domain, info->fwnode, info->bus_token); +	if (err) { +		kfree(domain); +		return ERR_PTR(err); +	} + +	domain->fwnode = fwnode_handle_get(info->fwnode); +	fwnode_dev_initialized(domain->fwnode, true);  	/* Fill structure */  	INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); -	domain->ops = ops; -	domain->host_data = host_data; -	domain->hwirq_max = hwirq_max; +	domain->ops = info->ops; +	domain->host_data = info->host_data; +	domain->bus_token = info->bus_token; +	domain->hwirq_max = info->hwirq_max; -	if (direct_max) +	if (info->direct_max)  		domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP; -	domain->revmap_size = size; +	domain->revmap_size = info->size;  	/*  	 * Hierarchical domains use the domain lock of the root domain @@ -240,34 +258,64 @@ static void __irq_domain_publish(struct irq_domain *domain)  	pr_debug("Added domain %s\n", domain->name);  } +static void irq_domain_free(struct irq_domain *domain) +{ +	fwnode_dev_initialized(domain->fwnode, false); +	fwnode_handle_put(domain->fwnode); +	if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) +		kfree(domain->name); +	kfree(domain); +} +  /** - * __irq_domain_add() - Allocate a new irq_domain data structure - * @fwnode: firmware node for the interrupt controller - * @size: Size of linear map; 0 for radix mapping only - * @hwirq_max: Maximum number of interrupts supported by controller - * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no - *              direct mapping - * @ops: domain callbacks - * @host_data: Controller private data pointer + * irq_domain_instantiate() - Instantiate a new irq domain data structure + * @info: Domain information pointer pointing to the information for this domain   * - * Allocates and initializes an irq_domain structure. - * Returns pointer to IRQ domain, or NULL on failure. + * Return: A pointer to the instantiated irq domain or an ERR_PTR value.   
*/ -struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, -				    irq_hw_number_t hwirq_max, int direct_max, -				    const struct irq_domain_ops *ops, -				    void *host_data) +struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info)  {  	struct irq_domain *domain; +	int err; + +	domain = __irq_domain_create(info); +	if (IS_ERR(domain)) +		return domain; -	domain = __irq_domain_create(fwnode, size, hwirq_max, direct_max, -				     ops, host_data); -	if (domain) -		__irq_domain_publish(domain); +	domain->flags |= info->domain_flags; +	domain->exit = info->exit; + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +	if (info->parent) { +		domain->root = info->parent->root; +		domain->parent = info->parent; +	} +#endif + +	if (info->dgc_info) { +		err = irq_domain_alloc_generic_chips(domain, info->dgc_info); +		if (err) +			goto err_domain_free; +	} + +	if (info->init) { +		err = info->init(domain); +		if (err) +			goto err_domain_gc_remove; +	} + +	__irq_domain_publish(domain);  	return domain; + +err_domain_gc_remove: +	if (info->dgc_info) +		irq_domain_remove_generic_chips(domain); +err_domain_free: +	irq_domain_free(domain); +	return ERR_PTR(err);  } -EXPORT_SYMBOL_GPL(__irq_domain_add); +EXPORT_SYMBOL_GPL(irq_domain_instantiate);  /**   * irq_domain_remove() - Remove an irq domain. @@ -279,6 +327,9 @@ EXPORT_SYMBOL_GPL(__irq_domain_add);   */  void irq_domain_remove(struct irq_domain *domain)  { +	if (domain->exit) +		domain->exit(domain); +  	mutex_lock(&irq_domain_mutex);  	debugfs_remove_domain_dir(domain); @@ -294,13 +345,11 @@ void irq_domain_remove(struct irq_domain *domain)  	mutex_unlock(&irq_domain_mutex); -	pr_debug("Removed domain %s\n", domain->name); +	if (domain->flags & IRQ_DOMAIN_FLAG_DESTROY_GC) +		irq_domain_remove_generic_chips(domain); -	fwnode_dev_initialized(domain->fwnode, false); -	fwnode_handle_put(domain->fwnode); -	if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) -		kfree(domain->name); -	kfree(domain); +	pr_debug("Removed domain %s\n", domain->name); +	irq_domain_free(domain);  }  EXPORT_SYMBOL_GPL(irq_domain_remove); @@ -360,10 +409,17 @@ struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode,  					    const struct irq_domain_ops *ops,  					    void *host_data)  { +	struct irq_domain_info info = { +		.fwnode		= fwnode, +		.size		= size, +		.hwirq_max	= size, +		.ops		= ops, +		.host_data	= host_data, +	};  	struct irq_domain *domain; -	domain = __irq_domain_add(fwnode, size, size, 0, ops, host_data); -	if (!domain) +	domain = irq_domain_instantiate(&info); +	if (IS_ERR(domain))  		return NULL;  	if (first_irq > 0) { @@ -416,11 +472,20 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode,  					 const struct irq_domain_ops *ops,  					 void *host_data)  { +	struct irq_domain_info info = { +		.fwnode		= fwnode, +		.size		= first_hwirq + size, +		.hwirq_max	= first_hwirq + size, +		.ops		= ops, +		.host_data	= host_data, +	};  	struct irq_domain *domain; -	domain = __irq_domain_add(fwnode, first_hwirq + size, first_hwirq + size, 0, ops, host_data); -	if (domain) -		irq_domain_associate_many(domain, first_irq, first_hwirq, size); +	domain = irq_domain_instantiate(&info); +	if (IS_ERR(domain)) +		return NULL; + +	irq_domain_associate_many(domain, first_irq, first_hwirq, size);  	return domain;  } @@ -438,7 +503,8 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,  	struct fwnode_handle *fwnode = fwspec->fwnode;  	int rc; -	/* We might want to match the legacy 
controller last since +	/* +	 * We might want to match the legacy controller last since  	 * it might potentially be set to match all interrupts in  	 * the absence of a device node. This isn't a problem so far  	 * yet though... @@ -982,6 +1048,12 @@ EXPORT_SYMBOL_GPL(__irq_resolve_mapping);  /**   * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings + * @d:		Interrupt domain involved in the translation + * @ctrlr:	The device tree node for the device whose interrupt is translated + * @intspec:	The interrupt specifier data from the device tree + * @intsize:	The number of entries in @intspec + * @out_hwirq:	Pointer to storage for the hardware interrupt number + * @out_type:	Pointer to storage for the interrupt type   *   * Device Tree IRQ specifier translation function which works with one cell   * bindings where the cell value maps directly to the hwirq number. @@ -1000,6 +1072,12 @@ EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell);  /**   * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings + * @d:		Interrupt domain involved in the translation + * @ctrlr:	The device tree node for the device whose interrupt is translated + * @intspec:	The interrupt specifier data from the device tree + * @intsize:	The number of entries in @intspec + * @out_hwirq:	Pointer to storage for the hardware interrupt number + * @out_type:	Pointer to storage for the interrupt type   *   * Device Tree IRQ specifier translation function which works with two cell   * bindings where the cell values map directly to the hwirq number @@ -1018,6 +1096,12 @@ EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);  /**   * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings + * @d:		Interrupt domain involved in the translation + * @ctrlr:	The device tree node for the device whose interrupt is translated + * @intspec:	The interrupt specifier data from the device tree + * @intsize:	The number of entries in @intspec + * @out_hwirq:	Pointer to storage for the hardware interrupt number + * @out_type:	Pointer to storage for the interrupt type   *   * Device Tree IRQ specifier translation function which works with either one   * or two cell bindings where the cell values map directly to the hwirq number @@ -1051,6 +1135,10 @@ EXPORT_SYMBOL_GPL(irq_domain_simple_ops);  /**   * irq_domain_translate_onecell() - Generic translate for direct one cell   * bindings + * @d:		Interrupt domain involved in the translation + * @fwspec:	The firmware interrupt specifier to translate + * @out_hwirq:	Pointer to storage for the hardware interrupt number + * @out_type:	Pointer to storage for the interrupt type   */  int irq_domain_translate_onecell(struct irq_domain *d,  				 struct irq_fwspec *fwspec, @@ -1068,6 +1156,10 @@ EXPORT_SYMBOL_GPL(irq_domain_translate_onecell);  /**   * irq_domain_translate_twocell() - Generic translate for direct two cell   * bindings + * @d:		Interrupt domain involved in the translation + * @fwspec:	The firmware interrupt specifier to translate + * @out_hwirq:	Pointer to storage for the hardware interrupt number + * @out_type:	Pointer to storage for the interrupt type   *   * Device Tree IRQ specifier translation function which works with two cell   * bindings where the cell values map directly to the hwirq number @@ -1144,23 +1236,22 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,  					    const struct irq_domain_ops *ops,  					    void *host_data)  { -	struct irq_domain *domain; - -	if (size) -		domain = __irq_domain_create(fwnode, size, 
size, 0, ops, host_data); -	else -		domain = __irq_domain_create(fwnode, 0, ~0, 0, ops, host_data); - -	if (domain) { -		if (parent) -			domain->root = parent->root; -		domain->parent = parent; -		domain->flags |= flags; +	struct irq_domain_info info = { +		.fwnode		= fwnode, +		.size		= size, +		.hwirq_max	= size, +		.ops		= ops, +		.host_data	= host_data, +		.domain_flags	= flags, +		.parent		= parent, +	}; +	struct irq_domain *d; -		__irq_domain_publish(domain); -	} +	if (!info.size) +		info.hwirq_max = ~0U; -	return domain; +	d = irq_domain_instantiate(&info); +	return IS_ERR(d) ? NULL : d;  }  EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy); @@ -1932,13 +2023,26 @@ static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq  static struct dentry *domain_dir; -static void -irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind) +static const struct irq_bit_descr irqdomain_flags[] = { +	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_HIERARCHY), +	BIT_MASK_DESCR(IRQ_DOMAIN_NAME_ALLOCATED), +	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_IPI_PER_CPU), +	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_IPI_SINGLE), +	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI), +	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_ISOLATED_MSI), +	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_NO_MAP), +	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI_PARENT), +	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI_DEVICE), +	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_NONCORE), +}; + +static void irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind)  {  	seq_printf(m, "%*sname:   %s\n", ind, "", d->name);  	seq_printf(m, "%*ssize:   %u\n", ind + 1, "", d->revmap_size);  	seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount);  	seq_printf(m, "%*sflags:  0x%08x\n", ind +1 , "", d->flags); +	irq_debug_show_bits(m, ind, d->flags, irqdomain_flags, ARRAY_SIZE(irqdomain_flags));  	if (d->ops && d->ops->debug_show)  		d->ops->debug_show(m, d, NULL, ind + 1);  #ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 2024f89baea4..5fa0547ece0c 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -8,18 +8,34 @@   * This file contains common code to support Message Signaled Interrupts for   * PCI compatible and non PCI compatible devices.   
*/ -#include <linux/types.h>  #include <linux/device.h>  #include <linux/irq.h>  #include <linux/irqdomain.h>  #include <linux/msi.h> +#include <linux/mutex.h> +#include <linux/pci.h>  #include <linux/slab.h>  #include <linux/sysfs.h> -#include <linux/pci.h> +#include <linux/types.h> +#include <linux/xarray.h>  #include "internals.h"  /** + * struct msi_device_data - MSI per device data + * @properties:		MSI properties which are interesting to drivers + * @mutex:		Mutex protecting the MSI descriptor store + * @__domains:		Internal data for per device MSI domains + * @__iter_idx:		Index to search the next entry for iterators + */ +struct msi_device_data { +	unsigned long			properties; +	struct mutex			mutex; +	struct msi_dev_domain		__domains[MSI_MAX_DEVICE_IRQDOMAINS]; +	unsigned long			__iter_idx; +}; + +/**   * struct msi_ctrl - MSI internal management control structure   * @domid:	ID of the domain on which management operations should be done   * @first:	First (hardware) slot index to operate on @@ -1088,8 +1104,8 @@ bool msi_match_device_irq_domain(struct device *dev, unsigned int domid,  	return ret;  } -int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, -			    int nvec, msi_alloc_info_t *arg) +static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, +				   int nvec, msi_alloc_info_t *arg)  {  	struct msi_domain_info *info = domain->host_data;  	struct msi_domain_ops *ops = info->ops; @@ -1097,77 +1113,6 @@ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,  	return ops->msi_prepare(domain, dev, nvec, arg);  } -int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, -			     int virq_base, int nvec, msi_alloc_info_t *arg) -{ -	struct msi_domain_info *info = domain->host_data; -	struct msi_domain_ops *ops = info->ops; -	struct msi_ctrl ctrl = { -		.domid	= MSI_DEFAULT_DOMAIN, -		.first  = virq_base, -		.last	= virq_base + nvec - 1, -	}; -	struct msi_desc *desc; -	struct xarray *xa; -	int ret, virq; - -	msi_lock_descs(dev); - -	if (!msi_ctrl_valid(dev, &ctrl)) { -		ret = -EINVAL; -		goto unlock; -	} - -	ret = msi_domain_add_simple_msi_descs(dev, &ctrl); -	if (ret) -		goto unlock; - -	xa = &dev->msi.data->__domains[ctrl.domid].store; - -	for (virq = virq_base; virq < virq_base + nvec; virq++) { -		desc = xa_load(xa, virq); -		desc->irq = virq; - -		ops->set_desc(arg, desc); -		ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg); -		if (ret) -			goto fail; - -		irq_set_msi_desc(virq, desc); -	} -	msi_unlock_descs(dev); -	return 0; - -fail: -	for (--virq; virq >= virq_base; virq--) { -		msi_domain_depopulate_descs(dev, virq, 1); -		irq_domain_free_irqs_common(domain, virq, 1); -	} -	msi_domain_free_descs(dev, &ctrl); -unlock: -	msi_unlock_descs(dev); -	return ret; -} - -void msi_domain_depopulate_descs(struct device *dev, int virq_base, int nvec) -{ -	struct msi_ctrl ctrl = { -		.domid	= MSI_DEFAULT_DOMAIN, -		.first  = virq_base, -		.last	= virq_base + nvec - 1, -	}; -	struct msi_desc *desc; -	struct xarray *xa; -	unsigned long idx; - -	if (!msi_ctrl_valid(dev, &ctrl)) -		return; - -	xa = &dev->msi.data->__domains[ctrl.domid].store; -	xa_for_each_range(xa, idx, desc, ctrl.first, ctrl.last) -		desc->irq = 0; -} -  /*   * Carefully check whether the device can use reservation mode. 
If   * reservation mode is enabled then the early activation will assign a diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 5c320c3f10a7..8cccdf40725a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -461,10 +461,10 @@ int show_interrupts(struct seq_file *p, void *v)  {  	static int prec; -	unsigned long flags, any_count = 0;  	int i = *(loff_t *) v, j;  	struct irqaction *action;  	struct irq_desc *desc; +	unsigned long flags;  	if (i > ACTUAL_NR_IRQS)  		return 0; @@ -488,10 +488,7 @@ int show_interrupts(struct seq_file *p, void *v)  	if (!desc || irq_settings_is_hidden(desc))  		goto outsparse; -	if (desc->kstat_irqs) -		any_count = kstat_irqs_desc(desc, cpu_online_mask); - -	if ((!desc->action || irq_desc_is_chained(desc)) && !any_count) +	if (!desc->action || irq_desc_is_chained(desc) || !desc->kstat_irqs)  		goto outsparse;  	seq_printf(p, "%*d: ", prec, i); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 98b9622d372e..fb2c77368d18 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -148,9 +148,6 @@ static unsigned int get_symbol_offset(unsigned long pos)  unsigned long kallsyms_sym_address(int idx)  { -	if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) -		return kallsyms_addresses[idx]; -  	/* values are unsigned offsets if --absolute-percpu is not in effect */  	if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU))  		return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; @@ -325,7 +322,7 @@ static unsigned long get_symbol_pos(unsigned long addr,  	unsigned long symbol_start = 0, symbol_end = 0;  	unsigned long i, low, high, mid; -	/* Do a binary search on the sorted kallsyms_addresses array. */ +	/* Do a binary search on the sorted kallsyms_offsets array. */  	low = 0;  	high = kallsyms_num_syms; diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h index 85480274fc8f..9633782f8250 100644 --- a/kernel/kallsyms_internal.h +++ b/kernel/kallsyms_internal.h @@ -4,12 +4,6 @@  #include <linux/types.h> -/* - * These will be re-linked against their real values during the second link - * stage. Preliminary values must be provided in the linker script using the - * PROVIDE() directive so that the first link stage can complete successfully. 
- */ -extern const unsigned long kallsyms_addresses[];  extern const int kallsyms_offsets[];  extern const u8 kallsyms_names[]; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 9112d69d68b0..c0caa14880c3 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -888,7 +888,7 @@ struct kimage *kexec_crash_image;  static int kexec_load_disabled;  #ifdef CONFIG_SYSCTL -static int kexec_limit_handler(struct ctl_table *table, int write, +static int kexec_limit_handler(const struct ctl_table *table, int write,  			       void *buffer, size_t *lenp, loff_t *ppos)  {  	struct kexec_load_limit *limit = table->data; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6a76a8100073..e85de37d9e1e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -939,7 +939,7 @@ static void unoptimize_all_kprobes(void)  static DEFINE_MUTEX(kprobe_sysctl_mutex);  static int sysctl_kprobes_optimization; -static int proc_kprobes_optimization_handler(struct ctl_table *table, +static int proc_kprobes_optimization_handler(const struct ctl_table *table,  					     int write, void *buffer,  					     size_t *length, loff_t *ppos)  { diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 07fb5987b42b..1bab21b4718f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -92,7 +92,14 @@ static ssize_t profiling_store(struct kobject *kobj,  				   const char *buf, size_t count)  {  	int ret; +	static DEFINE_MUTEX(lock); +	/* +	 * We need serialization, for profile_setup() initializes prof_on +	 * value and profile_init() must not reallocate prof_buffer after +	 * once allocated. +	 */ +	guard(mutex)(&lock);  	if (prof_on)  		return -EEXIST;  	/* diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 84c53285f499..7a75eab9c179 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -65,7 +65,7 @@ static struct latency_record latency_record[MAXLR];  int latencytop_enabled;  #ifdef CONFIG_SYSCTL -static int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, +static int sysctl_latencytop(const struct ctl_table *table, int write, void *buffer,  		size_t *lenp, loff_t *ppos)  {  	int err; diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 52426665eecc..3c21c31796db 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -346,6 +346,7 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,   * /sys/kernel/livepatch/<patch>/enabled   * /sys/kernel/livepatch/<patch>/transition   * /sys/kernel/livepatch/<patch>/force + * /sys/kernel/livepatch/<patch>/replace   * /sys/kernel/livepatch/<patch>/<object>   * /sys/kernel/livepatch/<patch>/<object>/patched   * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> @@ -401,7 +402,7 @@ static ssize_t enabled_show(struct kobject *kobj,  	struct klp_patch *patch;  	patch = container_of(kobj, struct klp_patch, kobj); -	return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->enabled); +	return sysfs_emit(buf, "%d\n", patch->enabled);  }  static ssize_t transition_show(struct kobject *kobj, @@ -410,8 +411,7 @@ static ssize_t transition_show(struct kobject *kobj,  	struct klp_patch *patch;  	patch = container_of(kobj, struct klp_patch, kobj); -	return snprintf(buf, PAGE_SIZE-1, "%d\n", -			patch == klp_transition_patch); +	return sysfs_emit(buf, "%d\n", patch == klp_transition_patch);  }  static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -443,13 +443,24 @@ static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,  	return count;  } +static ssize_t 
replace_show(struct kobject *kobj, +			    struct kobj_attribute *attr, char *buf) +{ +	struct klp_patch *patch; + +	patch = container_of(kobj, struct klp_patch, kobj); +	return sysfs_emit(buf, "%d\n", patch->replace); +} +  static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);  static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition);  static struct kobj_attribute force_kobj_attr = __ATTR_WO(force); +static struct kobj_attribute replace_kobj_attr = __ATTR_RO(replace);  static struct attribute *klp_patch_attrs[] = {  	&enabled_kobj_attr.attr,  	&transition_kobj_attr.attr,  	&force_kobj_attr.attr, +	&replace_kobj_attr.attr,  	NULL  };  ATTRIBUTE_GROUPS(klp_patch); diff --git a/kernel/panic.c b/kernel/panic.c index 8bff183d6180..f861bedc1925 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -35,6 +35,7 @@  #include <linux/debugfs.h>  #include <linux/sysfs.h>  #include <linux/context_tracking.h> +#include <linux/seq_buf.h>  #include <trace/events/error_report.h>  #include <asm/sections.h> @@ -470,32 +471,83 @@ void panic(const char *fmt, ...)  EXPORT_SYMBOL(panic); +#define TAINT_FLAG(taint, _c_true, _c_false, _module)			\ +	[ TAINT_##taint ] = {						\ +		.c_true = _c_true, .c_false = _c_false,			\ +		.module = _module,					\ +		.desc = #taint,						\ +	} +  /*   * TAINT_FORCED_RMMOD could be a per-module flag but the module   * is being removed anyway.   */  const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { -	[ TAINT_PROPRIETARY_MODULE ]	= { 'P', 'G', true }, -	[ TAINT_FORCED_MODULE ]		= { 'F', ' ', true }, -	[ TAINT_CPU_OUT_OF_SPEC ]	= { 'S', ' ', false }, -	[ TAINT_FORCED_RMMOD ]		= { 'R', ' ', false }, -	[ TAINT_MACHINE_CHECK ]		= { 'M', ' ', false }, -	[ TAINT_BAD_PAGE ]		= { 'B', ' ', false }, -	[ TAINT_USER ]			= { 'U', ' ', false }, -	[ TAINT_DIE ]			= { 'D', ' ', false }, -	[ TAINT_OVERRIDDEN_ACPI_TABLE ]	= { 'A', ' ', false }, -	[ TAINT_WARN ]			= { 'W', ' ', false }, -	[ TAINT_CRAP ]			= { 'C', ' ', true }, -	[ TAINT_FIRMWARE_WORKAROUND ]	= { 'I', ' ', false }, -	[ TAINT_OOT_MODULE ]		= { 'O', ' ', true }, -	[ TAINT_UNSIGNED_MODULE ]	= { 'E', ' ', true }, -	[ TAINT_SOFTLOCKUP ]		= { 'L', ' ', false }, -	[ TAINT_LIVEPATCH ]		= { 'K', ' ', true }, -	[ TAINT_AUX ]			= { 'X', ' ', true }, -	[ TAINT_RANDSTRUCT ]		= { 'T', ' ', true }, -	[ TAINT_TEST ]			= { 'N', ' ', true }, +	TAINT_FLAG(PROPRIETARY_MODULE,		'P', 'G', true), +	TAINT_FLAG(FORCED_MODULE,		'F', ' ', true), +	TAINT_FLAG(CPU_OUT_OF_SPEC,		'S', ' ', false), +	TAINT_FLAG(FORCED_RMMOD,		'R', ' ', false), +	TAINT_FLAG(MACHINE_CHECK,		'M', ' ', false), +	TAINT_FLAG(BAD_PAGE,			'B', ' ', false), +	TAINT_FLAG(USER,			'U', ' ', false), +	TAINT_FLAG(DIE,				'D', ' ', false), +	TAINT_FLAG(OVERRIDDEN_ACPI_TABLE,	'A', ' ', false), +	TAINT_FLAG(WARN,			'W', ' ', false), +	TAINT_FLAG(CRAP,			'C', ' ', true), +	TAINT_FLAG(FIRMWARE_WORKAROUND,		'I', ' ', false), +	TAINT_FLAG(OOT_MODULE,			'O', ' ', true), +	TAINT_FLAG(UNSIGNED_MODULE,		'E', ' ', true), +	TAINT_FLAG(SOFTLOCKUP,			'L', ' ', false), +	TAINT_FLAG(LIVEPATCH,			'K', ' ', true), +	TAINT_FLAG(AUX,				'X', ' ', true), +	TAINT_FLAG(RANDSTRUCT,			'T', ' ', true), +	TAINT_FLAG(TEST,			'N', ' ', true),  }; +#undef TAINT_FLAG + +static void print_tainted_seq(struct seq_buf *s, bool verbose) +{ +	const char *sep = ""; +	int i; + +	if (!tainted_mask) { +		seq_buf_puts(s, "Not tainted"); +		return; +	} + +	seq_buf_printf(s, "Tainted: "); +	for (i = 0; i < TAINT_FLAGS_COUNT; i++) { +		const struct taint_flag *t = &taint_flags[i]; +		bool is_set = 
test_bit(i, &tainted_mask); +		char c = is_set ? t->c_true : t->c_false; + +		if (verbose) { +			if (is_set) { +				seq_buf_printf(s, "%s[%c]=%s", sep, c, t->desc); +				sep = ", "; +			} +		} else { +			seq_buf_putc(s, c); +		} +	} +} + +static const char *_print_tainted(bool verbose) +{ +	/* FIXME: what should the size be? */ +	static char buf[sizeof(taint_flags)]; +	struct seq_buf s; + +	BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT); + +	seq_buf_init(&s, buf, sizeof(buf)); + +	print_tainted_seq(&s, verbose); + +	return seq_buf_str(&s); +} +  /**   * print_tainted - return a string to represent the kernel taint state.   * @@ -506,25 +558,15 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {   */  const char *print_tainted(void)  { -	static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")]; - -	BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT); - -	if (tainted_mask) { -		char *s; -		int i; - -		s = buf + sprintf(buf, "Tainted: "); -		for (i = 0; i < TAINT_FLAGS_COUNT; i++) { -			const struct taint_flag *t = &taint_flags[i]; -			*s++ = test_bit(i, &tainted_mask) ? -					t->c_true : t->c_false; -		} -		*s = 0; -	} else -		snprintf(buf, sizeof(buf), "Not tainted"); +	return _print_tainted(false); +} -	return buf; +/** + * print_tainted_verbose - A more verbose version of print_tainted() + */ +const char *print_tainted_verbose(void) +{ +	return _print_tainted(true);  }  int test_taint(unsigned flag) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index bdf0087d6442..d70ab49d5b4a 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -261,7 +261,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)  }  #ifdef CONFIG_CHECKPOINT_RESTORE -static int pid_ns_ctl_handler(struct ctl_table *table, int write, +static int pid_ns_ctl_handler(const struct ctl_table *table, int write,  		void *buffer, size_t *lenp, loff_t *ppos)  {  	struct pid_namespace *pid_ns = task_active_pid_ns(current); diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h index fe9fb991dc42..18ecaef6be41 100644 --- a/kernel/pid_sysctl.h +++ b/kernel/pid_sysctl.h @@ -5,7 +5,7 @@  #include <linux/pid_namespace.h>  #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) -static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table, +static int pid_mfd_noexec_dointvec_minmax(const struct ctl_table *table,  	int write, void *buf, size_t *lenp, loff_t *ppos)  {  	struct pid_namespace *ns = task_active_pid_ns(current); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 753b8dd42a59..82b884b67152 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -200,12 +200,11 @@ void free_all_swap_pages(int swap)  	while ((node = swsusp_extents.rb_node)) {  		struct swsusp_extent *ext; -		unsigned long offset;  		ext = rb_entry(node, struct swsusp_extent, node);  		rb_erase(node, &swsusp_extents); -		for (offset = ext->start; offset <= ext->end; offset++) -			swap_free(swp_entry(swap, offset)); +		swap_free_nr(swp_entry(swap, ext->start), +			     ext->end - ext->start + 1);  		kfree(ext);  	} diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 6c2afee5ef62..19dcc5832651 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -8,7 +8,7 @@  #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)  void __init printk_sysctl_init(void); -int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, +int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write,  			      void *buffer, size_t *lenp, loff_t *ppos);  #else 
 #define printk_sysctl_init() do { } while (0) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 7d91593f0ecf..054c0e7784fd 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -197,7 +197,7 @@ __setup("printk.devkmsg=", control_devkmsg);  char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";  #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) -int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, +int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write,  			      void *buffer, size_t *lenp, loff_t *ppos)  {  	char old_str[DEVKMSG_STR_MAX_SIZE]; @@ -4372,15 +4372,15 @@ void kmsg_dump_rewind(struct kmsg_dump_iter *iter)  EXPORT_SYMBOL_GPL(kmsg_dump_rewind);  /** - * console_replay_all - replay kernel log on consoles + * console_try_replay_all - try to replay kernel log on consoles   *   * Try to obtain lock on console subsystem and replay all   * available records in printk buffer on the consoles.   * Does nothing if lock is not obtained.   * - * Context: Any context. + * Context: Any, except for NMI.   */ -void console_replay_all(void) +void console_try_replay_all(void)  {  	if (console_trylock()) {  		__console_rewind_all(); diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c index 3e47dedce9e5..f5072dc85f7a 100644 --- a/kernel/printk/sysctl.c +++ b/kernel/printk/sysctl.c @@ -11,7 +11,7 @@  static const int ten_thousand = 10000; -static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, +static int proc_dointvec_minmax_sysadmin(const struct ctl_table *table, int write,  				void *buffer, size_t *lenp, loff_t *ppos)  {  	if (write && !capable(CAP_SYS_ADMIN)) diff --git a/kernel/profile.c b/kernel/profile.c index 2b775cc5c28f..ff68d3816182 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -47,13 +47,6 @@ static unsigned short int prof_shift;  int prof_on __read_mostly;  EXPORT_SYMBOL_GPL(prof_on); -static cpumask_var_t prof_cpu_mask; -#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) -static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); -static DEFINE_PER_CPU(int, cpu_profile_flip); -static DEFINE_MUTEX(profile_flip_mutex); -#endif /* CONFIG_SMP */ -  int profile_setup(char *str)  {  	static const char schedstr[] = "schedule"; @@ -114,11 +107,6 @@ int __ref profile_init(void)  	buffer_bytes = prof_len*sizeof(atomic_t); -	if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) -		return -ENOMEM; - -	cpumask_copy(prof_cpu_mask, cpu_possible_mask); -  	prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);  	if (prof_buffer)  		return 0; @@ -132,195 +120,16 @@ int __ref profile_init(void)  	if (prof_buffer)  		return 0; -	free_cpumask_var(prof_cpu_mask);  	return -ENOMEM;  } -#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) -/* - * Each cpu has a pair of open-addressed hashtables for pending - * profile hits. read_profile() IPI's all cpus to request them - * to flip buffers and flushes their contents to prof_buffer itself. - * Flip requests are serialized by the profile_flip_mutex. The sole - * use of having a second hashtable is for avoiding cacheline - * contention that would otherwise happen during flushes of pending - * profile hits required for the accuracy of reported profile hits - * and so resurrect the interrupt livelock issue. - * - * The open-addressed hashtables are indexed by profile buffer slot - * and hold the number of pending hits to that profile buffer slot on - * a cpu in an entry. 
When the hashtable overflows, all pending hits - * are accounted to their corresponding profile buffer slots with - * atomic_add() and the hashtable emptied. As numerous pending hits - * may be accounted to a profile buffer slot in a hashtable entry, - * this amortizes a number of atomic profile buffer increments likely - * to be far larger than the number of entries in the hashtable, - * particularly given that the number of distinct profile buffer - * positions to which hits are accounted during short intervals (e.g. - * several seconds) is usually very small. Exclusion from buffer - * flipping is provided by interrupt disablement (note that for - * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from - * process context). - * The hash function is meant to be lightweight as opposed to strong, - * and was vaguely inspired by ppc64 firmware-supported inverted - * pagetable hash functions, but uses a full hashtable full of finite - * collision chains, not just pairs of them. - * - * -- nyc - */ -static void __profile_flip_buffers(void *unused) -{ -	int cpu = smp_processor_id(); - -	per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu); -} - -static void profile_flip_buffers(void) -{ -	int i, j, cpu; - -	mutex_lock(&profile_flip_mutex); -	j = per_cpu(cpu_profile_flip, get_cpu()); -	put_cpu(); -	on_each_cpu(__profile_flip_buffers, NULL, 1); -	for_each_online_cpu(cpu) { -		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; -		for (i = 0; i < NR_PROFILE_HIT; ++i) { -			if (!hits[i].hits) { -				if (hits[i].pc) -					hits[i].pc = 0; -				continue; -			} -			atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); -			hits[i].hits = hits[i].pc = 0; -		} -	} -	mutex_unlock(&profile_flip_mutex); -} - -static void profile_discard_flip_buffers(void) -{ -	int i, cpu; - -	mutex_lock(&profile_flip_mutex); -	i = per_cpu(cpu_profile_flip, get_cpu()); -	put_cpu(); -	on_each_cpu(__profile_flip_buffers, NULL, 1); -	for_each_online_cpu(cpu) { -		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; -		memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); -	} -	mutex_unlock(&profile_flip_mutex); -} - -static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) -{ -	unsigned long primary, secondary, flags, pc = (unsigned long)__pc; -	int i, j, cpu; -	struct profile_hit *hits; - -	pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); -	i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; -	secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; -	cpu = get_cpu(); -	hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)]; -	if (!hits) { -		put_cpu(); -		return; -	} -	/* -	 * We buffer the global profiler buffer into a per-CPU -	 * queue and thus reduce the number of global (and possibly -	 * NUMA-alien) accesses. 
The write-queue is self-coalescing: -	 */ -	local_irq_save(flags); -	do { -		for (j = 0; j < PROFILE_GRPSZ; ++j) { -			if (hits[i + j].pc == pc) { -				hits[i + j].hits += nr_hits; -				goto out; -			} else if (!hits[i + j].hits) { -				hits[i + j].pc = pc; -				hits[i + j].hits = nr_hits; -				goto out; -			} -		} -		i = (i + secondary) & (NR_PROFILE_HIT - 1); -	} while (i != primary); - -	/* -	 * Add the current hit(s) and flush the write-queue out -	 * to the global buffer: -	 */ -	atomic_add(nr_hits, &prof_buffer[pc]); -	for (i = 0; i < NR_PROFILE_HIT; ++i) { -		atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); -		hits[i].pc = hits[i].hits = 0; -	} -out: -	local_irq_restore(flags); -	put_cpu(); -} - -static int profile_dead_cpu(unsigned int cpu) -{ -	struct page *page; -	int i; - -	if (cpumask_available(prof_cpu_mask)) -		cpumask_clear_cpu(cpu, prof_cpu_mask); - -	for (i = 0; i < 2; i++) { -		if (per_cpu(cpu_profile_hits, cpu)[i]) { -			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]); -			per_cpu(cpu_profile_hits, cpu)[i] = NULL; -			__free_page(page); -		} -	} -	return 0; -} - -static int profile_prepare_cpu(unsigned int cpu) -{ -	int i, node = cpu_to_mem(cpu); -	struct page *page; - -	per_cpu(cpu_profile_flip, cpu) = 0; - -	for (i = 0; i < 2; i++) { -		if (per_cpu(cpu_profile_hits, cpu)[i]) -			continue; - -		page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); -		if (!page) { -			profile_dead_cpu(cpu); -			return -ENOMEM; -		} -		per_cpu(cpu_profile_hits, cpu)[i] = page_address(page); - -	} -	return 0; -} - -static int profile_online_cpu(unsigned int cpu) -{ -	if (cpumask_available(prof_cpu_mask)) -		cpumask_set_cpu(cpu, prof_cpu_mask); - -	return 0; -} - -#else /* !CONFIG_SMP */ -#define profile_flip_buffers()		do { } while (0) -#define profile_discard_flip_buffers()	do { } while (0) -  static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)  {  	unsigned long pc;  	pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; -	atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); +	if (pc < prof_len) +		atomic_add(nr_hits, &prof_buffer[pc]);  } -#endif /* !CONFIG_SMP */  void profile_hits(int type, void *__pc, unsigned int nr_hits)  { @@ -334,8 +143,8 @@ void profile_tick(int type)  {  	struct pt_regs *regs = get_irq_regs(); -	if (!user_mode(regs) && cpumask_available(prof_cpu_mask) && -	    cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) +	/* This is the old kernel-only legacy profiling */ +	if (!user_mode(regs))  		profile_hit(type, (void *)profile_pc(regs));  } @@ -358,7 +167,6 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)  	char *pnt;  	unsigned long sample_step = 1UL << prof_shift; -	profile_flip_buffers();  	if (p >= (prof_len+1)*sizeof(unsigned int))  		return 0;  	if (count > (prof_len+1)*sizeof(unsigned int) - p) @@ -404,7 +212,6 @@ static ssize_t write_profile(struct file *file, const char __user *buf,  			return -EINVAL;  	}  #endif -	profile_discard_flip_buffers();  	memset(prof_buffer, 0, prof_len * sizeof(atomic_t));  	return count;  } @@ -418,40 +225,14 @@ static const struct proc_ops profile_proc_ops = {  int __ref create_proc_profile(void)  {  	struct proc_dir_entry *entry; -#ifdef CONFIG_SMP -	enum cpuhp_state online_state; -#endif -  	int err = 0;  	if (!prof_on)  		return 0; -#ifdef CONFIG_SMP -	err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE", -				profile_prepare_cpu, profile_dead_cpu); -	if (err) -		return err; - -	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, 
"AP_PROFILE_ONLINE", -				profile_online_cpu, NULL); -	if (err < 0) -		goto err_state_prep; -	online_state = err; -	err = 0; -#endif  	entry = proc_create("profile", S_IWUSR | S_IRUGO,  			    NULL, &profile_proc_ops); -	if (!entry) -		goto err_state_onl; -	proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); - -	return err; -err_state_onl: -#ifdef CONFIG_SMP -	cpuhp_remove_state(online_state); -err_state_prep: -	cpuhp_remove_state(CPUHP_PROFILE_PREPARE); -#endif +	if (entry) +		proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));  	return err;  }  subsys_initcall(create_proc_profile); diff --git a/kernel/resource_kunit.c b/kernel/resource_kunit.c index 58ab9f914602..0e509985a44a 100644 --- a/kernel/resource_kunit.c +++ b/kernel/resource_kunit.c @@ -149,4 +149,5 @@ static struct kunit_suite resource_test_suite = {  };  kunit_test_suite(resource_test_suite); +MODULE_DESCRIPTION("I/O Port & Memory Resource manager unit tests");  MODULE_LICENSE("GPL"); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ae5ef3013a55..a9f655025607 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1806,7 +1806,7 @@ static void uclamp_sync_util_min_rt_default(void)  		uclamp_update_util_min_rt_default(p);  } -static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, +static int sysctl_sched_uclamp_handler(const struct ctl_table *table, int write,  				void *buffer, size_t *lenp, loff_t *ppos)  {  	bool update_root_tg = false; @@ -4392,7 +4392,7 @@ static void reset_memory_tiering(void)  	}  } -static int sysctl_numa_balancing(struct ctl_table *table, int write, +static int sysctl_numa_balancing(const struct ctl_table *table, int write,  			  void *buffer, size_t *lenp, loff_t *ppos)  {  	struct ctl_table t; @@ -4461,7 +4461,7 @@ out:  __setup("schedstats=", setup_schedstats);  #ifdef CONFIG_PROC_SYSCTL -static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, +static int sysctl_schedstats(const struct ctl_table *table, int write, void *buffer,  		size_t *lenp, loff_t *ppos)  {  	struct ctl_table t; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 63e49c8ffc4d..310523c1b9e3 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -26,9 +26,9 @@ int sysctl_sched_rt_runtime = 950000;  #ifdef CONFIG_SYSCTL  static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ; -static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,  		size_t *lenp, loff_t *ppos); -static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,  		size_t *lenp, loff_t *ppos);  static struct ctl_table sched_rt_sysctls[] = {  	{ @@ -2952,7 +2952,7 @@ static void sched_rt_do_global(void)  	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);  } -static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,  		size_t *lenp, loff_t *ppos)  {  	int old_period, old_runtime; @@ -2991,7 +2991,7 @@ undo:  	return ret;  } -static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,  		size_t *lenp, loff_t *ppos)  {  	int ret; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 784a0be81e84..76504b776d03 100644 --- 
a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -285,7 +285,7 @@ void rebuild_sched_domains_energy(void)  }  #ifdef CONFIG_PROC_SYSCTL -static int sched_energy_aware_handler(struct ctl_table *table, int write, +static int sched_energy_aware_handler(const struct ctl_table *table, int write,  		void *buffer, size_t *lenp, loff_t *ppos)  {  	int ret, state; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index dc51e521bc1d..385d48293a5f 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -2431,7 +2431,7 @@ static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged,  	return audit_seccomp_actions_logged(new, old, !ret);  } -static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, +static int seccomp_actions_logged_handler(const struct ctl_table *ro_table, int write,  					  void *buffer, size_t *lenp,  					  loff_t *ppos)  { diff --git a/kernel/stackleak.c b/kernel/stackleak.c index 0f9712584913..39fd620a7db6 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -21,7 +21,7 @@  static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass);  #ifdef CONFIG_SYSCTL -static int stack_erasing_sysctl(struct ctl_table *table, int write, +static int stack_erasing_sysctl(const struct ctl_table *table, int write,  			void __user *buffer, size_t *lenp, loff_t *ppos)  {  	int ret = 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e4421594fc25..79e6cb1d5c48 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -256,7 +256,7 @@ static bool proc_first_pos_non_zero_ignore(loff_t *ppos,   *   * Returns 0 on success.   */ -int proc_dostring(struct ctl_table *table, int write, +int proc_dostring(const struct ctl_table *table, int write,  		  void *buffer, size_t *lenp, loff_t *ppos)  {  	if (write) @@ -702,7 +702,7 @@ int do_proc_douintvec(const struct ctl_table *table, int write,   *   * Returns 0 on success.   */ -int proc_dobool(struct ctl_table *table, int write, void *buffer, +int proc_dobool(const struct ctl_table *table, int write, void *buffer,  		size_t *lenp, loff_t *ppos)  {  	struct ctl_table tmp; @@ -739,7 +739,7 @@ int proc_dobool(struct ctl_table *table, int write, void *buffer,   *   * Returns 0 on success.   */ -int proc_dointvec(struct ctl_table *table, int write, void *buffer, +int proc_dointvec(const struct ctl_table *table, int write, void *buffer,  		  size_t *lenp, loff_t *ppos)  {  	return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); @@ -758,7 +758,7 @@ int proc_dointvec(struct ctl_table *table, int write, void *buffer,   *   * Returns 0 on success.   */ -int proc_douintvec(struct ctl_table *table, int write, void *buffer, +int proc_douintvec(const struct ctl_table *table, int write, void *buffer,  		size_t *lenp, loff_t *ppos)  {  	return do_proc_douintvec(table, write, buffer, lenp, ppos, @@ -769,7 +769,7 @@ int proc_douintvec(struct ctl_table *table, int write, void *buffer,   * Taint values can only be increased   * This means we can safely use a temporary.   */ -static int proc_taint(struct ctl_table *table, int write, +static int proc_taint(const struct ctl_table *table, int write,  			       void *buffer, size_t *lenp, loff_t *ppos)  {  	struct ctl_table t; @@ -864,7 +864,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,   *   * Returns 0 on success or -EINVAL on write when the range check fails.   
*/ -int proc_dointvec_minmax(struct ctl_table *table, int write, +int proc_dointvec_minmax(const struct ctl_table *table, int write,  		  void *buffer, size_t *lenp, loff_t *ppos)  {  	struct do_proc_dointvec_minmax_conv_param param = { @@ -933,7 +933,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,   *   * Returns 0 on success or -ERANGE on write when the range check fails.   */ -int proc_douintvec_minmax(struct ctl_table *table, int write, +int proc_douintvec_minmax(const struct ctl_table *table, int write,  			  void *buffer, size_t *lenp, loff_t *ppos)  {  	struct do_proc_douintvec_minmax_conv_param param = { @@ -961,7 +961,7 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,   *   * Returns 0 on success or an error on write when the range check fails.   */ -int proc_dou8vec_minmax(struct ctl_table *table, int write, +int proc_dou8vec_minmax(const struct ctl_table *table, int write,  			void *buffer, size_t *lenp, loff_t *ppos)  {  	struct ctl_table tmp; @@ -998,7 +998,7 @@ int proc_dou8vec_minmax(struct ctl_table *table, int write,  EXPORT_SYMBOL_GPL(proc_dou8vec_minmax);  #ifdef CONFIG_MAGIC_SYSRQ -static int sysrq_sysctl_handler(struct ctl_table *table, int write, +static int sysrq_sysctl_handler(const struct ctl_table *table, int write,  				void *buffer, size_t *lenp, loff_t *ppos)  {  	int tmp, ret; @@ -1115,7 +1115,7 @@ static int do_proc_doulongvec_minmax(const struct ctl_table *table, int write,   *   * Returns 0 on success.   */ -int proc_doulongvec_minmax(struct ctl_table *table, int write, +int proc_doulongvec_minmax(const struct ctl_table *table, int write,  			   void *buffer, size_t *lenp, loff_t *ppos)  {      return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); @@ -1138,7 +1138,7 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write,   *   * Returns 0 on success.   */ -int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, +int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int write,  				      void *buffer, size_t *lenp, loff_t *ppos)  {      return do_proc_doulongvec_minmax(table, write, buffer, @@ -1259,14 +1259,14 @@ static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lv   *   * Returns 0 on success.   */ -int proc_dointvec_jiffies(struct ctl_table *table, int write, +int proc_dointvec_jiffies(const struct ctl_table *table, int write,  			  void *buffer, size_t *lenp, loff_t *ppos)  {      return do_proc_dointvec(table,write,buffer,lenp,ppos,  		    	    do_proc_dointvec_jiffies_conv,NULL);  } -int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, +int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write,  			  void *buffer, size_t *lenp, loff_t *ppos)  {  	struct do_proc_dointvec_minmax_conv_param param = { @@ -1292,7 +1292,7 @@ int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write,   *   * Returns 0 on success.   */ -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, +int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int write,  				 void *buffer, size_t *lenp, loff_t *ppos)  {  	return do_proc_dointvec(table, write, buffer, lenp, ppos, @@ -1315,14 +1315,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,   *   * Returns 0 on success.   
*/ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void *buffer, +int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, void *buffer,  		size_t *lenp, loff_t *ppos)  {  	return do_proc_dointvec(table, write, buffer, lenp, ppos,  				do_proc_dointvec_ms_jiffies_conv, NULL);  } -static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer, +static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffer,  		size_t *lenp, loff_t *ppos)  {  	struct pid *new_pid; @@ -1361,7 +1361,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer,   *   * Returns 0 on success.   */ -int proc_do_large_bitmap(struct ctl_table *table, int write, +int proc_do_large_bitmap(const struct ctl_table *table, int write,  			 void *buffer, size_t *lenp, loff_t *ppos)  {  	int err = 0; @@ -1493,85 +1493,85 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,  #else /* CONFIG_PROC_SYSCTL */ -int proc_dostring(struct ctl_table *table, int write, +int proc_dostring(const struct ctl_table *table, int write,  		  void *buffer, size_t *lenp, loff_t *ppos)  {  	return -ENOSYS;  } -int proc_dobool(struct ctl_table *table, int write, +int proc_dobool(const struct ctl_table *table, int write,  		void *buffer, size_t *lenp, loff_t *ppos)  {  	return -ENOSYS;  } -int proc_dointvec(struct ctl_table *table, int write, +int proc_dointvec(const struct ctl_table *table, int write,  		  void *buffer, size_t *lenp, loff_t *ppos)  {  	return -ENOSYS;  } -int proc_douintvec(struct ctl_table *table, int write, +int proc_douintvec(const struct ctl_table *table, int write,  		  void *buffer, size_t *lenp, loff_t *ppos)  {  	return -ENOSYS;  } -int proc_dointvec_minmax(struct ctl_table *table, int write, +int proc_dointvec_minmax(const struct ctl_table *table, int write,  		    void *buffer, size_t *lenp, loff_t *ppos)  {  	return -ENOSYS;  } -int proc_douintvec_minmax(struct ctl_table *table, int write, +int proc_douintvec_minmax(const struct ctl_table *table, int write,  			  void *buffer, size_t *lenp, loff_t *ppos)  {  	return -ENOSYS;  } -int proc_dou8vec_minmax(struct ctl_table *table, int write, +int proc_dou8vec_minmax(const struct ctl_table *table, int write,  			void *buffer, size_t *lenp, loff_t *ppos)  {  	return -ENOSYS;  } -int proc_dointvec_jiffies(struct ctl_table *table, int write, +int proc_dointvec_jiffies(const struct ctl_table *table, int write,  		    void *buffer, size_t *lenp, loff_t *ppos)  {  	return -ENOSYS;  } -int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, +int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write,  				    void *buffer, size_t *lenp, loff_t *ppos)  {  	return -ENOSYS;  } -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, +int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int write,  		    void *buffer, size_t *lenp, loff_t *ppos)  {  	return -ENOSYS;  } -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, +int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write,  			     void *buffer, size_t *lenp, loff_t *ppos)  {  	return -ENOSYS;  } -int proc_doulongvec_minmax(struct ctl_table *table, int write, +int proc_doulongvec_minmax(const struct ctl_table *table, int write,  		    void *buffer, size_t *lenp, loff_t *ppos)  {  	return -ENOSYS;  } -int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, +int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int write,  	
			      void *buffer, size_t *lenp, loff_t *ppos)  {  	return -ENOSYS;  } -int proc_do_large_bitmap(struct ctl_table *table, int write, +int proc_do_large_bitmap(const struct ctl_table *table, int write,  			 void *buffer, size_t *lenp, loff_t *ppos)  {  	return -ENOSYS; @@ -1580,7 +1580,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,  #endif /* CONFIG_PROC_SYSCTL */  #if defined(CONFIG_SYSCTL) -int proc_do_static_key(struct ctl_table *table, int write, +int proc_do_static_key(const struct ctl_table *table, int write,  		       void *buffer, size_t *lenp, loff_t *ppos)  {  	struct static_key *key = (struct static_key *)table->data; diff --git a/kernel/task_work.c b/kernel/task_work.c index 5c2daa7ad3f9..5d14d639ac71 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -6,12 +6,14 @@  static struct callback_head work_exited; /* all we need is ->next == NULL */ +#ifdef CONFIG_IRQ_WORK  static void task_work_set_notify_irq(struct irq_work *entry)  {  	test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);  }  static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =  	IRQ_WORK_INIT_HARD(task_work_set_notify_irq); +#endif  /**   * task_work_add - ask the @task to execute @work->func() @@ -57,6 +59,8 @@ int task_work_add(struct task_struct *task, struct callback_head *work,  	if (notify == TWA_NMI_CURRENT) {  		if (WARN_ON_ONCE(task != current))  			return -EINVAL; +		if (!IS_ENABLED(CONFIG_IRQ_WORK)) +			return -EINVAL;  	} else {  		/* record the work call stack in order to print it in KASAN reports */  		kasan_record_aux_stack(work); @@ -81,9 +85,11 @@ int task_work_add(struct task_struct *task, struct callback_head *work,  	case TWA_SIGNAL_NO_IPI:  		__set_notify_signal(task);  		break; +#ifdef CONFIG_IRQ_WORK  	case TWA_NMI_CURRENT:  		irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));  		break; +#endif  	default:  		WARN_ON_ONCE(1);  		break; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 48288dd4a102..64b0d8a0aa0f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -289,7 +289,7 @@ static void timers_update_migration(void)  }  #ifdef CONFIG_SYSCTL -static int timer_migration_handler(struct ctl_table *table, int write, +static int timer_migration_handler(const struct ctl_table *table, int write,  			    void *buffer, size_t *lenp, loff_t *ppos)  {  	int ret; diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 84413114db5c..8d57f7686bb0 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -475,9 +475,54 @@ static bool tmigr_check_lonely(struct tmigr_group *group)  	return bitmap_weight(&active, BIT_CNT) <= 1;  } -typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, void *); +/** + * struct tmigr_walk - data required for walking the hierarchy + * @nextexp:		Next CPU event expiry information which is handed into + *			the timer migration code by the timer code + *			(get_next_timer_interrupt()) + * @firstexp:		Contains the first event expiry information when + *			hierarchy is completely idle.  When CPU itself was the + *			last going idle, information makes sure, that CPU will + *			be back in time. When using this value in the remote + *			expiry case, firstexp is stored in the per CPU tmigr_cpu + *			struct of CPU which expires remote timers. It is updated + *			in top level group only. 
Be aware, there could occur a + *			new top level of the hierarchy between the 'top level + *			call' in tmigr_update_events() and the check for the + *			parent group in walk_groups(). Then @firstexp might + *			contain a value != KTIME_MAX even if it was not the + *			final top level. This is not a problem, as the worst + *			outcome is a CPU which might wake up a little early. + * @evt:		Pointer to tmigr_event which needs to be queued (of idle + *			child group) + * @childmask:		groupmask of child group + * @remote:		Is set, when the new timer path is executed in + *			tmigr_handle_remote_cpu() + * @basej:		timer base in jiffies + * @now:		timer base monotonic + * @check:		is set if there is the need to handle remote timers; + *			required in tmigr_requires_handle_remote() only + * @tmc_active:		this flag indicates, whether the CPU which triggers + *			the hierarchy walk is !idle in the timer migration + *			hierarchy. When the CPU is idle and the whole hierarchy is + *			idle, only the first event of the top level has to be + *			considered. + */ +struct tmigr_walk { +	u64			nextexp; +	u64			firstexp; +	struct tmigr_event	*evt; +	u8			childmask; +	bool			remote; +	unsigned long		basej; +	u64			now; +	bool			check; +	bool			tmc_active; +}; + +typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *); -static void __walk_groups(up_f up, void *data, +static void __walk_groups(up_f up, struct tmigr_walk *data,  			  struct tmigr_cpu *tmc)  {  	struct tmigr_group *child = NULL, *group = tmc->tmgroup; @@ -490,64 +535,17 @@ static void __walk_groups(up_f up, void *data,  		child = group;  		group = group->parent; +		data->childmask = child->groupmask;  	} while (group);  } -static void walk_groups(up_f up, void *data, struct tmigr_cpu *tmc) +static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc)  {  	lockdep_assert_held(&tmc->lock);  	__walk_groups(up, data, tmc);  } -/** - * struct tmigr_walk - data required for walking the hierarchy - * @nextexp:		Next CPU event expiry information which is handed into - *			the timer migration code by the timer code - *			(get_next_timer_interrupt()) - * @firstexp:		Contains the first event expiry information when last - *			active CPU of hierarchy is on the way to idle to make - *			sure CPU will be back in time. - * @evt:		Pointer to tmigr_event which needs to be queued (of idle - *			child group) - * @childmask:		childmask of child group - * @remote:		Is set, when the new timer path is executed in - *			tmigr_handle_remote_cpu() - */ -struct tmigr_walk { -	u64			nextexp; -	u64			firstexp; -	struct tmigr_event	*evt; -	u8			childmask; -	bool			remote; -}; - -/** - * struct tmigr_remote_data - data required for remote expiry hierarchy walk - * @basej:		timer base in jiffies - * @now:		timer base monotonic - * @firstexp:		returns expiry of the first timer in the idle timer - *			migration hierarchy to make sure the timer is handled in - *			time; it is stored in the per CPU tmigr_cpu struct of - *			CPU which expires remote timers - * @childmask:		childmask of child group - * @check:		is set if there is the need to handle remote timers; - *			required in tmigr_requires_handle_remote() only - * @tmc_active:		this flag indicates, whether the CPU which triggers - *			the hierarchy walk is !idle in the timer migration - *			hierarchy. When the CPU is idle and the whole hierarchy is - *			idle, only the first event of the top level has to be - *			considered. 
- */ -struct tmigr_remote_data { -	unsigned long	basej; -	u64		now; -	u64		firstexp; -	u8		childmask; -	bool		check; -	bool		tmc_active; -}; -  /*   * Returns the next event of the timerqueue @group->events   * @@ -618,10 +616,9 @@ static u64 tmigr_next_groupevt_expires(struct tmigr_group *group)  static bool tmigr_active_up(struct tmigr_group *group,  			    struct tmigr_group *child, -			    void *ptr) +			    struct tmigr_walk *data)  {  	union tmigr_state curstate, newstate; -	struct tmigr_walk *data = ptr;  	bool walk_done;  	u8 childmask; @@ -649,8 +646,7 @@ static bool tmigr_active_up(struct tmigr_group *group,  	} while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)); -	if ((walk_done == false) && group->parent) -		data->childmask = group->childmask; +	trace_tmigr_group_set_cpu_active(group, newstate, childmask);  	/*  	 * The group is active (again). The group event might be still queued @@ -666,8 +662,6 @@ static bool tmigr_active_up(struct tmigr_group *group,  	 */  	group->groupevt.ignore = true; -	trace_tmigr_group_set_cpu_active(group, newstate, childmask); -  	return walk_done;  } @@ -675,7 +669,7 @@ static void __tmigr_cpu_activate(struct tmigr_cpu *tmc)  {  	struct tmigr_walk data; -	data.childmask = tmc->childmask; +	data.childmask = tmc->groupmask;  	trace_tmigr_cpu_active(tmc); @@ -860,10 +854,8 @@ unlock:  static bool tmigr_new_timer_up(struct tmigr_group *group,  			       struct tmigr_group *child, -			       void *ptr) +			       struct tmigr_walk *data)  { -	struct tmigr_walk *data = ptr; -  	return tmigr_update_events(group, child, data);  } @@ -995,9 +987,8 @@ unlock:  static bool tmigr_handle_remote_up(struct tmigr_group *group,  				   struct tmigr_group *child, -				   void *ptr) +				   struct tmigr_walk *data)  { -	struct tmigr_remote_data *data = ptr;  	struct tmigr_event *evt;  	unsigned long jif;  	u8 childmask; @@ -1034,12 +1025,10 @@ again:  	}  	/* -	 * Update of childmask for the next level and keep track of the expiry -	 * of the first event that needs to be handled (group->next_expiry was -	 * updated by tmigr_next_expired_groupevt(), next was set by -	 * tmigr_handle_remote_cpu()). +	 * Keep track of the expiry of the first event that needs to be handled +	 * (group->next_expiry was updated by tmigr_next_expired_groupevt(), +	 * next was set by tmigr_handle_remote_cpu()).  	 */ -	data->childmask = group->childmask;  	data->firstexp = group->next_expiry;  	raw_spin_unlock_irq(&group->lock); @@ -1055,12 +1044,12 @@ again:  void tmigr_handle_remote(void)  {  	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); -	struct tmigr_remote_data data; +	struct tmigr_walk data;  	if (tmigr_is_not_available(tmc))  		return; -	data.childmask = tmc->childmask; +	data.childmask = tmc->groupmask;  	data.firstexp = KTIME_MAX;  	/* @@ -1068,7 +1057,7 @@ void tmigr_handle_remote(void)  	 * in tmigr_handle_remote_up() anyway. Keep this check to speed up the  	 * return when nothing has to be done.  	 */ -	if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask)) { +	if (!tmigr_check_migrator(tmc->tmgroup, tmc->groupmask)) {  		/*  		 * If this CPU was an idle migrator, make sure to clear its wakeup  		 * value so it won't chase timers that have already expired elsewhere. 
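The timer_migration.c hunks above all follow one refactoring: the hierarchy-walk callbacks lose their untyped void pointer argument in favour of a shared struct tmigr_walk, and the per-level childmask update moves out of the individual callbacks into __walk_groups() itself. Below is a greatly simplified, userspace-only sketch of that typed walk-up pattern; the names (walk_data, print_up, and so on) are invented for illustration, and the kernel's locking, atomics and event bookkeeping are omitted.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* loosely analogous to struct tmigr_walk: one blob of walk state */
struct walk_data {
	uint8_t  childmask;   /* mask of the child we arrived from */
	uint64_t firstexp;
};

struct group {
	struct group *parent;
	uint8_t groupmask;    /* bit of this group inside its parent */
};

/* typed callback; returning true ends the walk early */
typedef bool (*up_f)(struct group *group, struct group *child,
		     struct walk_data *data);

static void walk_groups(up_f up, struct walk_data *data, struct group *start)
{
	struct group *child = NULL, *group = start;

	do {
		if (up(group, child, data))
			break;

		child = group;
		group = group->parent;
		/* childmask is updated centrally, not in every callback */
		data->childmask = child->groupmask;
	} while (group);
}

static bool print_up(struct group *group, struct group *child,
		     struct walk_data *data)
{
	printf("group %p reached via child mask %#x\n",
	       (void *)group, (unsigned int)data->childmask);
	return false;	/* keep walking up to the top level */
}

int main(void)
{
	struct group top  = { .parent = NULL, .groupmask = 0 };
	struct group lvl0 = { .parent = &top, .groupmask = 1 };
	struct walk_data data = { .childmask = 1, .firstexp = UINT64_MAX };

	walk_groups(print_up, &data, &lvl0);
	return 0;
}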
@@ -1097,9 +1086,8 @@ void tmigr_handle_remote(void)  static bool tmigr_requires_handle_remote_up(struct tmigr_group *group,  					    struct tmigr_group *child, -					    void *ptr) +					    struct tmigr_walk *data)  { -	struct tmigr_remote_data *data = ptr;  	u8 childmask;  	childmask = data->childmask; @@ -1118,7 +1106,7 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group,  	 * group before reading the next_expiry value.  	 */  	if (group->parent && !data->tmc_active) -		goto out; +		return false;  	/*  	 * The lock is required on 32bit architectures to read the variable @@ -1143,9 +1131,6 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group,  		raw_spin_unlock(&group->lock);  	} -out: -	/* Update of childmask for the next level */ -	data->childmask = group->childmask;  	return false;  } @@ -1157,7 +1142,7 @@ out:  bool tmigr_requires_handle_remote(void)  {  	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); -	struct tmigr_remote_data data; +	struct tmigr_walk data;  	unsigned long jif;  	bool ret = false; @@ -1165,7 +1150,7 @@ bool tmigr_requires_handle_remote(void)  		return ret;  	data.now = get_jiffies_update(&jif); -	data.childmask = tmc->childmask; +	data.childmask = tmc->groupmask;  	data.firstexp = KTIME_MAX;  	data.tmc_active = !tmc->idle;  	data.check = false; @@ -1230,14 +1215,13 @@ u64 tmigr_cpu_new_timer(u64 nextexp)  		if (nextexp != tmc->cpuevt.nextevt.expires ||  		    tmc->cpuevt.ignore) {  			ret = tmigr_new_timer(tmc, nextexp); +			/* +			 * Make sure the reevaluation of timers in idle path +			 * will not miss an event. +			 */ +			WRITE_ONCE(tmc->wakeup, ret);  		}  	} -	/* -	 * Make sure the reevaluation of timers in idle path will not miss an -	 * event. -	 */ -	WRITE_ONCE(tmc->wakeup, ret); -  	trace_tmigr_cpu_new_timer_idle(tmc, nextexp);  	raw_spin_unlock(&tmc->lock);  	return ret; @@ -1245,10 +1229,9 @@ u64 tmigr_cpu_new_timer(u64 nextexp)  static bool tmigr_inactive_up(struct tmigr_group *group,  			      struct tmigr_group *child, -			      void *ptr) +			      struct tmigr_walk *data)  {  	union tmigr_state curstate, newstate, childstate; -	struct tmigr_walk *data = ptr;  	bool walk_done;  	u8 childmask; @@ -1299,9 +1282,10 @@ static bool tmigr_inactive_up(struct tmigr_group *group,  		WARN_ON_ONCE((newstate.migrator != TMIGR_NONE) && !(newstate.active)); -		if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, -				       newstate.state)) +		if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)) { +			trace_tmigr_group_set_cpu_inactive(group, newstate, childmask);  			break; +		}  		/*  		 * The memory barrier is paired with the cmpxchg() in @@ -1317,22 +1301,6 @@ static bool tmigr_inactive_up(struct tmigr_group *group,  	/* Event Handling */  	tmigr_update_events(group, child, data); -	if (group->parent && (walk_done == false)) -		data->childmask = group->childmask; - -	/* -	 * data->firstexp was set by tmigr_update_events() and contains the -	 * expiry of the first global event which needs to be handled. 
It -	 * differs from KTIME_MAX if: -	 * - group is the top level group and -	 * - group is idle (which means CPU was the last active CPU in the -	 *   hierarchy) and -	 * - there is a pending event in the hierarchy -	 */ -	WARN_ON_ONCE(data->firstexp != KTIME_MAX && group->parent); - -	trace_tmigr_group_set_cpu_inactive(group, newstate, childmask); -  	return walk_done;  } @@ -1341,7 +1309,7 @@ static u64 __tmigr_cpu_deactivate(struct tmigr_cpu *tmc, u64 nextexp)  	struct tmigr_walk data = { .nextexp = nextexp,  				   .firstexp = KTIME_MAX,  				   .evt = &tmc->cpuevt, -				   .childmask = tmc->childmask }; +				   .childmask = tmc->groupmask };  	/*  	 * If nextexp is KTIME_MAX, the CPU event will be ignored because the @@ -1400,7 +1368,7 @@ u64 tmigr_cpu_deactivate(u64 nextexp)   *			  the only one in the level 0 group; and if it is the   *			  only one in level 0 group, but there are more than a   *			  single group active on the way to top level) - * * nextevt		- when CPU is offline and has to handle timer on his own + * * nextevt		- when CPU is offline and has to handle timer on its own   *			  or when on the way to top in every group only a single   *			  child is active but @nextevt is before the lowest   *			  next_expiry encountered while walking up to top level. @@ -1419,7 +1387,7 @@ u64 tmigr_quick_check(u64 nextevt)  	if (WARN_ON_ONCE(tmc->idle))  		return nextevt; -	if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->childmask)) +	if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->groupmask))  		return KTIME_MAX;  	do { @@ -1442,6 +1410,66 @@ u64 tmigr_quick_check(u64 nextevt)  	return KTIME_MAX;  } +/* + * tmigr_trigger_active() - trigger a CPU to become active again + * + * This function is executed on a CPU which is part of cpu_online_mask, when the + * last active CPU in the hierarchy is offlining. With this, it is ensured that + * the other CPU is active and takes over the migrator duty. 
+ */ +static long tmigr_trigger_active(void *unused) +{ +	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + +	WARN_ON_ONCE(!tmc->online || tmc->idle); + +	return 0; +} + +static int tmigr_cpu_offline(unsigned int cpu) +{ +	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); +	int migrator; +	u64 firstexp; + +	raw_spin_lock_irq(&tmc->lock); +	tmc->online = false; +	WRITE_ONCE(tmc->wakeup, KTIME_MAX); + +	/* +	 * CPU has to handle the local events on his own, when on the way to +	 * offline; Therefore nextevt value is set to KTIME_MAX +	 */ +	firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); +	trace_tmigr_cpu_offline(tmc); +	raw_spin_unlock_irq(&tmc->lock); + +	if (firstexp != KTIME_MAX) { +		migrator = cpumask_any_but(cpu_online_mask, cpu); +		work_on_cpu(migrator, tmigr_trigger_active, NULL); +	} + +	return 0; +} + +static int tmigr_cpu_online(unsigned int cpu) +{ +	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + +	/* Check whether CPU data was successfully initialized */ +	if (WARN_ON_ONCE(!tmc->tmgroup)) +		return -EINVAL; + +	raw_spin_lock_irq(&tmc->lock); +	trace_tmigr_cpu_online(tmc); +	tmc->idle = timer_base_is_idle(); +	if (!tmc->idle) +		__tmigr_cpu_activate(tmc); +	tmc->online = true; +	raw_spin_unlock_irq(&tmc->lock); +	return 0; +} +  static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,  			     int node)  { @@ -1514,21 +1542,25 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node,  }  static void tmigr_connect_child_parent(struct tmigr_group *child, -				       struct tmigr_group *parent) +				       struct tmigr_group *parent, +				       bool activate)  { -	union tmigr_state childstate; +	struct tmigr_walk data;  	raw_spin_lock_irq(&child->lock);  	raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);  	child->parent = parent; -	child->childmask = BIT(parent->num_children++); +	child->groupmask = BIT(parent->num_children++);  	raw_spin_unlock(&parent->lock);  	raw_spin_unlock_irq(&child->lock);  	trace_tmigr_connect_child_parent(child); +	if (!activate) +		return; +  	/*  	 * To prevent inconsistent states, active children need to be active in  	 * the new parent as well. Inactive children are already marked inactive @@ -1544,21 +1576,24 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,  	 *   child to the new parent. So tmigr_connect_child_parent() is  	 *   executed with the formerly top level group (child) and the newly  	 *   created group (parent). +	 * +	 * * It is ensured that the child is active, as this setup path is +	 *   executed in hotplug prepare callback. This is exectued by an +	 *   already connected and !idle CPU. Even if all other CPUs go idle, +	 *   the CPU executing the setup will be responsible up to current top +	 *   level group. And the next time it goes inactive, it will release +	 *   the new childmask and parent to subsequent walkers through this +	 *   @child. Therefore propagate active state unconditionally.  	 */ -	childstate.state = atomic_read(&child->migr_state); -	if (childstate.migrator != TMIGR_NONE) { -		struct tmigr_walk data; - -		data.childmask = child->childmask; +	data.childmask = child->groupmask; -		/* -		 * There is only one new level per time. When connecting the -		 * child and the parent and set the child active when the parent -		 * is inactive, the parent needs to be the uppermost -		 * level. Otherwise there went something wrong! 
-		 */ -		WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); -	} +	/* +	 * There is only one new level per time (which is protected by +	 * tmigr_mutex). When connecting the child and the parent and set the +	 * child active when the parent is inactive, the parent needs to be the +	 * uppermost level. Otherwise there went something wrong! +	 */ +	WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent);  }  static int tmigr_setup_groups(unsigned int cpu, unsigned int node) @@ -1611,12 +1646,12 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)  		 * Update tmc -> group / child -> group connection  		 */  		if (i == 0) { -			struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); +			struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu);  			raw_spin_lock_irq(&group->lock);  			tmc->tmgroup = group; -			tmc->childmask = BIT(group->num_children++); +			tmc->groupmask = BIT(group->num_children++);  			raw_spin_unlock_irq(&group->lock); @@ -1626,7 +1661,8 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)  			continue;  		} else {  			child = stack[i - 1]; -			tmigr_connect_child_parent(child, group); +			/* Will be activated at online time */ +			tmigr_connect_child_parent(child, group, false);  		}  		/* check if uppermost level was newly created */ @@ -1637,12 +1673,21 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)  		lvllist = &tmigr_level_list[top];  		if (group->num_children == 1 && list_is_singular(lvllist)) { +			/* +			 * The target CPU must never do the prepare work, except +			 * on early boot when the boot CPU is the target. Otherwise +			 * it may spuriously activate the old top level group inside +			 * the new one (nevertheless whether old top level group is +			 * active or not) and/or release an uninitialized childmask. +			 */ +			WARN_ON_ONCE(cpu == raw_smp_processor_id()); +  			lvllist = &tmigr_level_list[top - 1];  			list_for_each_entry(child, lvllist, list) {  				if (child->parent)  					continue; -				tmigr_connect_child_parent(child, group); +				tmigr_connect_child_parent(child, group, true);  			}  		}  	} @@ -1664,80 +1709,31 @@ static int tmigr_add_cpu(unsigned int cpu)  	return ret;  } -static int tmigr_cpu_online(unsigned int cpu) -{ -	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); -	int ret; - -	/* First online attempt? Initialize CPU data */ -	if (!tmc->tmgroup) { -		raw_spin_lock_init(&tmc->lock); - -		ret = tmigr_add_cpu(cpu); -		if (ret < 0) -			return ret; - -		if (tmc->childmask == 0) -			return -EINVAL; - -		timerqueue_init(&tmc->cpuevt.nextevt); -		tmc->cpuevt.nextevt.expires = KTIME_MAX; -		tmc->cpuevt.ignore = true; -		tmc->cpuevt.cpu = cpu; - -		tmc->remote = false; -		WRITE_ONCE(tmc->wakeup, KTIME_MAX); -	} -	raw_spin_lock_irq(&tmc->lock); -	trace_tmigr_cpu_online(tmc); -	tmc->idle = timer_base_is_idle(); -	if (!tmc->idle) -		__tmigr_cpu_activate(tmc); -	tmc->online = true; -	raw_spin_unlock_irq(&tmc->lock); -	return 0; -} - -/* - * tmigr_trigger_active() - trigger a CPU to become active again - * - * This function is executed on a CPU which is part of cpu_online_mask, when the - * last active CPU in the hierarchy is offlining. With this, it is ensured that - * the other CPU is active and takes over the migrator duty. 
- */ -static long tmigr_trigger_active(void *unused) +static int tmigr_cpu_prepare(unsigned int cpu)  { -	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); +	struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); +	int ret = 0; -	WARN_ON_ONCE(!tmc->online || tmc->idle); - -	return 0; -} - -static int tmigr_cpu_offline(unsigned int cpu) -{ -	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); -	int migrator; -	u64 firstexp; +	/* Not first online attempt? */ +	if (tmc->tmgroup) +		return ret; -	raw_spin_lock_irq(&tmc->lock); -	tmc->online = false; +	raw_spin_lock_init(&tmc->lock); +	timerqueue_init(&tmc->cpuevt.nextevt); +	tmc->cpuevt.nextevt.expires = KTIME_MAX; +	tmc->cpuevt.ignore = true; +	tmc->cpuevt.cpu = cpu; +	tmc->remote = false;  	WRITE_ONCE(tmc->wakeup, KTIME_MAX); -	/* -	 * CPU has to handle the local events on his own, when on the way to -	 * offline; Therefore nextevt value is set to KTIME_MAX -	 */ -	firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); -	trace_tmigr_cpu_offline(tmc); -	raw_spin_unlock_irq(&tmc->lock); +	ret = tmigr_add_cpu(cpu); +	if (ret < 0) +		return ret; -	if (firstexp != KTIME_MAX) { -		migrator = cpumask_any_but(cpu_online_mask, cpu); -		work_on_cpu(migrator, tmigr_trigger_active, NULL); -	} +	if (tmc->groupmask == 0) +		return -EINVAL; -	return 0; +	return ret;  }  static int __init tmigr_init(void) @@ -1796,6 +1792,11 @@ static int __init tmigr_init(void)  		tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP,  		tmigr_crossnode_level); +	ret = cpuhp_setup_state(CPUHP_TMIGR_PREPARE, "tmigr:prepare", +				tmigr_cpu_prepare, NULL); +	if (ret) +		goto err; +  	ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online",  				tmigr_cpu_online, tmigr_cpu_offline);  	if (ret) @@ -1807,4 +1808,4 @@ err:  	pr_err("Timer migration setup failed\n");  	return ret;  } -late_initcall(tmigr_init); +early_initcall(tmigr_init); diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index 6c37d94a37d9..154accc7a543 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -22,7 +22,17 @@ struct tmigr_event {   * struct tmigr_group - timer migration hierarchy group   * @lock:		Lock protecting the event information and group hierarchy   *			information during setup - * @parent:		Pointer to the parent group + * @parent:		Pointer to the parent group. Pointer is updated when a + *			new hierarchy level is added because of a CPU coming + *			online the first time. Once it is set, the pointer will + *			not be removed or updated. When accessing parent pointer + *			lock less to decide whether to abort a propagation or + *			not, it is not a problem. The worst outcome is an + *			unnecessary/early CPU wake up. But do not access parent + *			pointer several times in the same 'action' (like + *			activation, deactivation, check for remote expiry,...) + *			without holding the lock as it is not ensured that value + *			will not change.   * @groupevt:		Next event of the group which is only used when the   *			group is !active. The group event is then queued into   *			the parent timer queue. 
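The timer_migration.h hunks that follow rename childmask to groupmask, which matches what the field actually is: every child owns exactly one bit inside its parent, assigned once when the child is connected, and a parent's view of its active children is simply the OR of those bits. The userspace sketch below shows only that bit bookkeeping; the names are invented, and the kernel's atomic migr_state handling, sequence counter and migrator selection are left out.

#include <stdint.h>
#include <stdio.h>

#define BIT(n)	(1u << (n))

struct grp {
	struct grp *parent;
	unsigned int num_children;
	uint8_t groupmask;	/* our bit inside the parent, handed out once */
	uint8_t active;		/* OR of the groupmask bits of active children */
};

/* roughly what tmigr_connect_child_parent() does: assign the next free bit */
static void connect_child(struct grp *child, struct grp *parent)
{
	child->parent = parent;
	child->groupmask = (uint8_t)BIT(parent->num_children++);
}

static void child_set_active(struct grp *child, int active)
{
	struct grp *parent = child->parent;

	if (active)
		parent->active |= child->groupmask;
	else
		parent->active &= (uint8_t)~child->groupmask;
}

int main(void)
{
	struct grp top = { 0 };
	struct grp a = { 0 }, b = { 0 };

	connect_child(&a, &top);	/* gets bit 0 */
	connect_child(&b, &top);	/* gets bit 1 */

	child_set_active(&a, 1);
	child_set_active(&b, 1);
	child_set_active(&a, 0);

	printf("top.active = %#x (only b's bit remains)\n",
	       (unsigned int)top.active);
	return 0;
}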
@@ -41,9 +51,8 @@ struct tmigr_event {   * @num_children:	Counter of group children to make sure the group is only   *			filled with TMIGR_CHILDREN_PER_GROUP; Required for setup   *			only - * @childmask:		childmask of the group in the parent group; is set - *			during setup and will never change; can be read - *			lockless + * @groupmask:		mask of the group in the parent group; is set during + *			setup and will never change; can be read lockless   * @list:		List head that is added to the per level   *			tmigr_level_list; is required during setup when a   *			new group needs to be connected to the existing @@ -59,7 +68,7 @@ struct tmigr_group {  	unsigned int		level;  	int			numa_node;  	unsigned int		num_children; -	u8			childmask; +	u8			groupmask;  	struct list_head	list;  }; @@ -79,7 +88,7 @@ struct tmigr_group {   *			hierarchy   * @remote:		Is set when timers of the CPU are expired remotely   * @tmgroup:		Pointer to the parent group - * @childmask:		childmask of tmigr_cpu in the parent group + * @groupmask:		mask of tmigr_cpu in the parent group   * @wakeup:		Stores the first timer when the timer migration   *			hierarchy is completely idle and remote expiry was done;   *			is returned to timer code in the idle path and is only @@ -92,7 +101,7 @@ struct tmigr_cpu {  	bool			idle;  	bool			remote;  	struct tmigr_group	*tmgroup; -	u8			childmask; +	u8			groupmask;  	u64			wakeup;  	struct tmigr_event	cpuevt;  }; @@ -108,8 +117,8 @@ union tmigr_state {  	u32 state;  	/**  	 * struct - split state of tmigr_group -	 * @active:	Contains each childmask bit of the active children -	 * @migrator:	Contains childmask of the child which is migrator +	 * @active:	Contains each mask bit of the active children +	 * @migrator:	Contains mask of the child which is migrator  	 * @seq:	Sequence counter needs to be increased when an update  	 *		to the tmigr_state is done. It prevents a race when  	 *		updates in the child groups are propagated in changed diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e5d6a4ab433b..4c28dd177ca6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7920,6 +7920,7 @@ out:  void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,  			       struct ftrace_ops *op, struct ftrace_regs *fregs)  { +	kmsan_unpoison_memory(fregs, sizeof(*fregs));  	__ftrace_ops_list_func(ip, parent_ip, NULL, fregs);  }  #else @@ -8734,7 +8735,7 @@ static bool is_permanent_ops_registered(void)  }  static int -ftrace_enable_sysctl(struct ctl_table *table, int write, +ftrace_enable_sysctl(const struct ctl_table *table, int write,  		     void *buffer, size_t *lenp, loff_t *ppos)  {  	int ret = -ENODEV; diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index cb0871fbdb07..314ffc143039 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -34,8 +34,6 @@ MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on");  static struct completion done; -#define MIN(x, y) ((x) < (y) ? 
(x) : (y)) -  static void busy_wait(ulong time)  {  	u64 start, end; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 578a49ff5c32..10cd38bce2f1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2767,7 +2767,7 @@ static void output_printk(struct trace_event_buffer *fbuffer)  	raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags);  } -int tracepoint_printk_sysctl(struct ctl_table *table, int write, +int tracepoint_printk_sysctl(const struct ctl_table *table, int write,  			     void *buffer, size_t *lenp,  			     loff_t *ppos)  { diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 3a2b46847c8b..42b0d998d103 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -2885,7 +2885,7 @@ err:  	return -ENODEV;  } -static int set_max_user_events_sysctl(struct ctl_table *table, int write, +static int set_max_user_events_sysctl(const struct ctl_table *table, int write,  				      void *buffer, size_t *lenp, loff_t *ppos)  {  	int ret; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 5a48dba912ea..7f9572a37333 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -514,7 +514,7 @@ static const struct file_operations stack_trace_filter_fops = {  #endif /* CONFIG_DYNAMIC_FTRACE */  int -stack_trace_sysctl(struct ctl_table *table, int write, void *buffer, +stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer,  		   size_t *lenp, loff_t *ppos)  {  	int was_enabled; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 4252f0645b9e..16b283f9d831 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -76,7 +76,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,  	stats->ac_minflt = tsk->min_flt;  	stats->ac_majflt = tsk->maj_flt; -	strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm)); +	strscpy_pad(stats->ac_comm, tsk->comm);  } diff --git a/kernel/umh.c b/kernel/umh.c index 598b3ffe1522..ff1f13a27d29 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -495,7 +495,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait)  EXPORT_SYMBOL(call_usermodehelper);  #if defined(CONFIG_SYSCTL) -static int proc_cap_handler(struct ctl_table *table, int write, +static int proc_cap_handler(const struct ctl_table *table, int write,  			 void *buffer, size_t *lenp, loff_t *ppos)  {  	struct ctl_table t; diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 04e4513f2985..7282f61a8650 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -30,7 +30,7 @@ static void *get_uts(const struct ctl_table *table)   *	Special case of dostring for the UTS structure. This has locks   *	to observe. Should this be in kernel/sys.c ????   
*/ -static int proc_do_uts_string(struct ctl_table *table, int write, +static int proc_do_uts_string(const struct ctl_table *table, int write,  		  void *buffer, size_t *lenp, loff_t *ppos)  {  	struct ctl_table uts_table; diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index 1d5eadd9dd61..8b4f8cc2e0ec 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -216,12 +216,8 @@ static int __init crash_save_vmcoreinfo_init(void)  	VMCOREINFO_SYMBOL(kallsyms_num_syms);  	VMCOREINFO_SYMBOL(kallsyms_token_table);  	VMCOREINFO_SYMBOL(kallsyms_token_index); -#ifdef CONFIG_KALLSYMS_BASE_RELATIVE  	VMCOREINFO_SYMBOL(kallsyms_offsets);  	VMCOREINFO_SYMBOL(kallsyms_relative_base); -#else -	VMCOREINFO_SYMBOL(kallsyms_addresses); -#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */  #endif /* CONFIG_KALLSYMS */  	arch_crash_save_vmcoreinfo(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 51915b44ac73..830a83895493 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -983,7 +983,7 @@ static void proc_watchdog_update(void)   * -------------------|----------------------------------|-------------------------------   * proc_soft_watchdog | watchdog_softlockup_user_enabled | WATCHDOG_SOFTOCKUP_ENABLED   */ -static int proc_watchdog_common(int which, struct ctl_table *table, int write, +static int proc_watchdog_common(int which, const struct ctl_table *table, int write,  				void *buffer, size_t *lenp, loff_t *ppos)  {  	int err, old, *param = table->data; @@ -1010,7 +1010,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,  /*   * /proc/sys/kernel/watchdog   */ -static int proc_watchdog(struct ctl_table *table, int write, +static int proc_watchdog(const struct ctl_table *table, int write,  			 void *buffer, size_t *lenp, loff_t *ppos)  {  	return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED | @@ -1021,7 +1021,7 @@ static int proc_watchdog(struct ctl_table *table, int write,  /*   * /proc/sys/kernel/nmi_watchdog   */ -static int proc_nmi_watchdog(struct ctl_table *table, int write, +static int proc_nmi_watchdog(const struct ctl_table *table, int write,  			     void *buffer, size_t *lenp, loff_t *ppos)  {  	if (!watchdog_hardlockup_available && write) @@ -1034,7 +1034,7 @@ static int proc_nmi_watchdog(struct ctl_table *table, int write,  /*   * /proc/sys/kernel/soft_watchdog   */ -static int proc_soft_watchdog(struct ctl_table *table, int write, +static int proc_soft_watchdog(const struct ctl_table *table, int write,  			      void *buffer, size_t *lenp, loff_t *ppos)  {  	return proc_watchdog_common(WATCHDOG_SOFTOCKUP_ENABLED, @@ -1045,7 +1045,7 @@ static int proc_soft_watchdog(struct ctl_table *table, int write,  /*   * /proc/sys/kernel/watchdog_thresh   */ -static int proc_watchdog_thresh(struct ctl_table *table, int write, +static int proc_watchdog_thresh(const struct ctl_table *table, int write,  				void *buffer, size_t *lenp, loff_t *ppos)  {  	int err, old; @@ -1068,7 +1068,7 @@ static int proc_watchdog_thresh(struct ctl_table *table, int write,   * user to specify a mask that will include cpus that have not yet   * been brought online, if desired.   
*/ -static int proc_watchdog_cpumask(struct ctl_table *table, int write, +static int proc_watchdog_cpumask(const struct ctl_table *table, int write,  				 void *buffer, size_t *lenp, loff_t *ppos)  {  	int err; diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index d577c4a8321e..59c1d86a73a2 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -75,11 +75,15 @@ static bool watchdog_check_timestamp(void)  	__this_cpu_write(last_timestamp, now);  	return true;  } -#else -static inline bool watchdog_check_timestamp(void) + +static void watchdog_init_timestamp(void)  { -	return true; +	__this_cpu_write(nmi_rearmed, 0); +	__this_cpu_write(last_timestamp, ktime_get_mono_fast_ns());  } +#else +static inline bool watchdog_check_timestamp(void) { return true; } +static inline void watchdog_init_timestamp(void) { }  #endif  static struct perf_event_attr wd_hw_attr = { @@ -161,6 +165,7 @@ void watchdog_hardlockup_enable(unsigned int cpu)  	if (!atomic_fetch_inc(&watchdog_cpus))  		pr_info("Enabled. Permanently consumes one hw-PMU counter.\n"); +	watchdog_init_timestamp();  	perf_event_enable(this_cpu_read(watchdog_ev));  } | 
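A pattern that recurs throughout the hunks above (ftrace_enable_sysctl, tracepoint_printk_sysctl, set_max_user_events_sysctl, stack_trace_sysctl, proc_cap_handler, proc_do_uts_string, the watchdog handlers) is the switch of sysctl proc handlers from "struct ctl_table *table" to "const struct ctl_table *table". Handlers that previously redirected table->data now do so on a local copy, as the umh.c and utsname_sysctl.c hunks already hint with their local ctl_table variables. The following is a minimal hedged sketch of that copy-to-local idiom, not code from the patch; "example_handler" and "example_value" are invented names for illustration only.

/*
 * Illustration only: a proc handler using the const-qualified
 * ctl_table signature. It copies the table so it can point ->data
 * at its own variable without writing through the const pointer.
 */
#include <linux/sysctl.h>

static int example_value;

static int example_handler(const struct ctl_table *table, int write,
			   void *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table tmp = *table;	/* writable copy of the const table */

	tmp.data = &example_value;	/* redirect data; *table stays untouched */
	return proc_dointvec(&tmp, write, buffer, lenp, ppos);
}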
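The tsacct.c hunk replaces strncpy() with the two-argument strscpy_pad(), which always NUL-terminates, zero-fills the rest of the destination, and derives the bound from the destination array type. Below is a hedged sketch of the same conversion on an invented structure ("struct demo" and "demo_set_name" are not from the patch), purely to show the before/after semantics.

/*
 * Illustration only: strncpy() may leave the buffer unterminated when
 * the source fills it; strscpy_pad() guarantees termination and pads
 * the remaining bytes with zeros.
 */
#include <linux/string.h>

struct demo {
	char name[16];
};

static void demo_set_name(struct demo *d, const char *src)
{
	/* Old style (commented out): no guaranteed NUL terminator. */
	/* strncpy(d->name, src, sizeof(d->name)); */

	/* New style: bounded by the array, terminated, trailing bytes zeroed. */
	strscpy_pad(d->name, src);
}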
