Diffstat (limited to 'kernel')
82 files changed, 2630 insertions, 1468 deletions
| diff --git a/kernel/Makefile b/kernel/Makefile index 4cb8e8b23c6e..9c323a6daa46 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -108,7 +108,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o  obj-$(CONFIG_JUMP_LABEL) += jump_label.o  obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o  obj-$(CONFIG_TORTURE_TEST) += torture.o -obj-$(CONFIG_MEMBARRIER) += membarrier.o  obj-$(CONFIG_HAS_IOMEM) += memremap.o diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 62d686d96581..9eb8b3511636 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -66,7 +66,7 @@ static struct fsnotify_group *audit_watch_group;  /* fsnotify events we care about. */  #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ -			FS_MOVE_SELF | FS_EVENT_ON_CHILD) +			FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT)  static void audit_free_parent(struct audit_parent *parent)  { @@ -457,13 +457,15 @@ void audit_remove_watch_rule(struct audit_krule *krule)  	list_del(&krule->rlist);  	if (list_empty(&watch->rules)) { +		/* +		 * audit_remove_watch() drops our reference to 'parent' which +		 * can get freed. Grab our own reference to be safe. +		 */ +		audit_get_parent(parent);  		audit_remove_watch(watch); - -		if (list_empty(&parent->watches)) { -			audit_get_parent(parent); +		if (list_empty(&parent->watches))  			fsnotify_destroy_mark(&parent->mark, audit_watch_group); -			audit_put_parent(parent); -		} +		audit_put_parent(parent);  	}  } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4fb463172aa8..d11c8181f4c5 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -652,12 +652,27 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,  	}  } +static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) +{ +	return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && +	       BITS_PER_LONG == 64; +} + +static u32 htab_size_value(const struct bpf_htab *htab, bool percpu) +{ +	u32 size = htab->map.value_size; + +	if (percpu || fd_htab_map_needs_adjust(htab)) +		size = round_up(size, 8); +	return size; +} +  static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,  					 void *value, u32 key_size, u32 hash,  					 bool percpu, bool onallcpus,  					 struct htab_elem *old_elem)  { -	u32 size = htab->map.value_size; +	u32 size = htab_size_value(htab, percpu);  	bool prealloc = htab_is_prealloc(htab);  	struct htab_elem *l_new, **pl_new;  	void __percpu *pptr; @@ -696,9 +711,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,  	memcpy(l_new->key, key, key_size);  	if (percpu) { -		/* round up value_size to 8 bytes */ -		size = round_up(size, 8); -  		if (prealloc) {  			pptr = htab_elem_get_ptr(l_new, key_size);  		} else { @@ -1209,17 +1221,9 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {  static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)  { -	struct bpf_map *map; -  	if (attr->value_size != sizeof(u32))  		return ERR_PTR(-EINVAL); - -	/* pointer is stored internally */ -	attr->value_size = sizeof(void *); -	map = htab_map_alloc(attr); -	attr->value_size = sizeof(u32); - -	return map; +	return htab_map_alloc(attr);  }  static void fd_htab_map_free(struct bpf_map *map) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 8d5151688504..2f4039bafebb 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -577,6 +577,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,  	rcu_read_unlock();  } +/* Must be called with cpuset_mutex 
held.  */ +static inline int nr_cpusets(void) +{ +	/* jump label reference count + the top-level cpuset */ +	return static_key_count(&cpusets_enabled_key.key) + 1; +} +  /*   * generate_sched_domains()   * @@ -1892,6 +1899,7 @@ static struct cftype files[] = {  	{  		.name = "memory_pressure",  		.read_u64 = cpuset_read_u64, +		.private = FILE_MEMORY_PRESSURE,  	},  	{ @@ -2343,13 +2351,7 @@ void cpuset_update_active_cpus(void)  	 * We're inside cpu hotplug critical region which usually nests  	 * inside cgroup synchronization.  Bounce actual hotplug processing  	 * to a work item to avoid reverse locking order. -	 * -	 * We still need to do partition_sched_domains() synchronously; -	 * otherwise, the scheduler will get confused and put tasks to the -	 * dead CPU.  Fall back to the default single domain. -	 * cpuset_hotplug_workfn() will rebuild it as necessary.  	 */ -	partition_sched_domains(1, NULL, NULL);  	schedule_work(&cpuset_hotplug_work);  } diff --git a/kernel/cpu.c b/kernel/cpu.c index eee033134262..bfbd649ccdc8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -650,6 +650,7 @@ static int takedown_cpu(unsigned int cpu)  	__cpu_die(cpu);  	tick_cleanup_dead_cpu(cpu); +	rcutree_migrate_callbacks(cpu);  	return 0;  } diff --git a/kernel/events/core.c b/kernel/events/core.c index 426c2ffba16d..ce64f3fed5c6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1249,26 +1249,31 @@ unclone_ctx(struct perf_event_context *ctx)  	return parent_ctx;  } -static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, +				enum pid_type type)  { +	u32 nr;  	/*  	 * only top level events have the pid namespace they were created in  	 */  	if (event->parent)  		event = event->parent; -	return task_tgid_nr_ns(p, event->ns); +	nr = __task_pid_nr_ns(p, type, event->ns); +	/* avoid -1 if it is idle thread or runs in another ns */ +	if (!nr && !pid_alive(p)) +		nr = -1; +	return nr;  } -static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)  { -	/* -	 * only top level events have the pid namespace they were created in -	 */ -	if (event->parent) -		event = event->parent; +	return perf_event_pid_type(event, p, __PIDTYPE_TGID); +} -	return task_pid_nr_ns(p, event->ns); +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ +	return perf_event_pid_type(event, p, PIDTYPE_PID);  }  /* @@ -1570,6 +1575,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)  	if (sample_type & PERF_SAMPLE_TRANSACTION)  		size += sizeof(data->txn); +	if (sample_type & PERF_SAMPLE_PHYS_ADDR) +		size += sizeof(data->phys_addr); +  	event->header_size = size;  } @@ -2217,6 +2225,33 @@ static int group_can_go_on(struct perf_event *event,  	return can_add_hw;  } +/* + * Complement to update_event_times(). This computes the tstamp_* values to + * continue 'enabled' state from @now, and effectively discards the time + * between the prior tstamp_stopped and now (as we were in the OFF state, or + * just switched (context) time base). + * + * This further assumes '@event->state == INACTIVE' (we just came from OFF) and + * cannot have been scheduled in yet. And going into INACTIVE state means + * '@event->tstamp_stopped = @now'. 
+ * + * Thus given the rules of update_event_times(): + * + *   total_time_enabled = tstamp_stopped - tstamp_enabled + *   total_time_running = tstamp_stopped - tstamp_running + * + * We can insert 'tstamp_stopped == now' and reverse them to compute new + * tstamp_* values. + */ +static void __perf_event_enable_time(struct perf_event *event, u64 now) +{ +	WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE); + +	event->tstamp_stopped = now; +	event->tstamp_enabled = now - event->total_time_enabled; +	event->tstamp_running = now - event->total_time_running; +} +  static void add_event_to_ctx(struct perf_event *event,  			       struct perf_event_context *ctx)  { @@ -2224,9 +2259,12 @@ static void add_event_to_ctx(struct perf_event *event,  	list_add_event(event, ctx);  	perf_group_attach(event); -	event->tstamp_enabled = tstamp; -	event->tstamp_running = tstamp; -	event->tstamp_stopped = tstamp; +	/* +	 * We can be called with event->state == STATE_OFF when we create with +	 * .disabled = 1. In that case the IOC_ENABLE will call this function. +	 */ +	if (event->state == PERF_EVENT_STATE_INACTIVE) +		__perf_event_enable_time(event, tstamp);  }  static void ctx_sched_out(struct perf_event_context *ctx, @@ -2471,10 +2509,11 @@ static void __perf_event_mark_enabled(struct perf_event *event)  	u64 tstamp = perf_event_time(event);  	event->state = PERF_EVENT_STATE_INACTIVE; -	event->tstamp_enabled = tstamp - event->total_time_enabled; +	__perf_event_enable_time(event, tstamp);  	list_for_each_entry(sub, &event->sibling_list, group_entry) { +		/* XXX should not be > INACTIVE if event isn't */  		if (sub->state >= PERF_EVENT_STATE_INACTIVE) -			sub->tstamp_enabled = tstamp - sub->total_time_enabled; +			__perf_event_enable_time(sub, tstamp);  	}  } @@ -3180,6 +3219,13 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,  		return;  	perf_ctx_lock(cpuctx, ctx); +	/* +	 * We must check ctx->nr_events while holding ctx->lock, such +	 * that we serialize against perf_install_in_context(). 
+	 */ +	if (!ctx->nr_events) +		goto unlock; +  	perf_pmu_disable(ctx->pmu);  	/*  	 * We want to keep the following priority order: @@ -3193,6 +3239,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,  		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);  	perf_event_sched_in(cpuctx, ctx, task);  	perf_pmu_enable(ctx->pmu); + +unlock:  	perf_ctx_unlock(cpuctx, ctx);  } @@ -5090,7 +5138,7 @@ static void perf_mmap_open(struct vm_area_struct *vma)  		atomic_inc(&event->rb->aux_mmap_count);  	if (event->pmu->event_mapped) -		event->pmu->event_mapped(event); +		event->pmu->event_mapped(event, vma->vm_mm);  }  static void perf_pmu_output_stop(struct perf_event *event); @@ -5113,7 +5161,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)  	unsigned long size = perf_data_size(rb);  	if (event->pmu->event_unmapped) -		event->pmu->event_unmapped(event); +		event->pmu->event_unmapped(event, vma->vm_mm);  	/*  	 * rb->aux_mmap_count will always drop before rb->mmap_count and @@ -5411,7 +5459,7 @@ aux_unlock:  	vma->vm_ops = &perf_mmap_vmops;  	if (event->pmu->event_mapped) -		event->pmu->event_mapped(event); +		event->pmu->event_mapped(event, vma->vm_mm);  	return ret;  } @@ -5972,6 +6020,9 @@ void perf_output_sample(struct perf_output_handle *handle,  		}  	} +	if (sample_type & PERF_SAMPLE_PHYS_ADDR) +		perf_output_put(handle, data->phys_addr); +  	if (!event->attr.watermark) {  		int wakeup_events = event->attr.wakeup_events; @@ -5987,6 +6038,38 @@ void perf_output_sample(struct perf_output_handle *handle,  	}  } +static u64 perf_virt_to_phys(u64 virt) +{ +	u64 phys_addr = 0; +	struct page *p = NULL; + +	if (!virt) +		return 0; + +	if (virt >= TASK_SIZE) { +		/* If it's vmalloc()d memory, leave phys_addr as 0 */ +		if (virt_addr_valid((void *)(uintptr_t)virt) && +		    !(virt >= VMALLOC_START && virt < VMALLOC_END)) +			phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); +	} else { +		/* +		 * Walking the pages tables for user address. +		 * Interrupts are disabled, so it prevents any tear down +		 * of the page tables. +		 * Try IRQ-safe __get_user_pages_fast first. +		 * If failed, leave phys_addr as 0. 
+		 */ +		if ((current->mm != NULL) && +		    (__get_user_pages_fast(virt, 1, 0, &p) == 1)) +			phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + +		if (p) +			put_page(p); +	} + +	return phys_addr; +} +  void perf_prepare_sample(struct perf_event_header *header,  			 struct perf_sample_data *data,  			 struct perf_event *event, @@ -6105,6 +6188,9 @@ void perf_prepare_sample(struct perf_event_header *header,  		header->size += size;  	} + +	if (sample_type & PERF_SAMPLE_PHYS_ADDR) +		data->phys_addr = perf_virt_to_phys(data->addr);  }  static void __always_inline @@ -7256,6 +7342,11 @@ static void perf_log_throttle(struct perf_event *event, int enable)  	perf_output_end(&handle);  } +void perf_event_itrace_started(struct perf_event *event) +{ +	event->attach_state |= PERF_ATTACH_ITRACE; +} +  static void perf_log_itrace_start(struct perf_event *event)  {  	struct perf_output_handle handle; @@ -7271,7 +7362,7 @@ static void perf_log_itrace_start(struct perf_event *event)  		event = event->parent;  	if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || -	    event->hw.itrace_started) +	    event->attach_state & PERF_ATTACH_ITRACE)  		return;  	rec.header.type	= PERF_RECORD_ITRACE_START; @@ -7875,16 +7966,15 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,  		}  	}  	perf_tp_event(call->event.type, count, raw_data, size, regs, head, -		      rctx, task); +		      rctx, task, NULL);  }  EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);  void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,  		   struct pt_regs *regs, struct hlist_head *head, int rctx, -		   struct task_struct *task) +		   struct task_struct *task, struct perf_event *event)  {  	struct perf_sample_data data; -	struct perf_event *event;  	struct perf_raw_record raw = {  		.frag = { @@ -7898,9 +7988,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,  	perf_trace_buf_update(record, event_type); -	hlist_for_each_entry_rcu(event, head, hlist_entry) { +	/* Use the given event instead of the hlist */ +	if (event) {  		if (perf_tp_event_match(event, &data, regs))  			perf_swevent_event(event, count, &data, regs); +	} else { +		hlist_for_each_entry_rcu(event, head, hlist_entry) { +			if (perf_tp_event_match(event, &data, regs)) +				perf_swevent_event(event, count, &data, regs); +		}  	}  	/* @@ -9580,6 +9676,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,  	if (ret)  		return -EFAULT; +	attr->size = size; +  	if (attr->__reserved_1)  		return -EINVAL; @@ -9852,6 +9950,11 @@ SYSCALL_DEFINE5(perf_event_open,  			return -EINVAL;  	} +	/* Only privileged users can get physical addresses */ +	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) && +	    perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) +		return -EACCES; +  	if (!attr.sample_max_stack)  		attr.sample_max_stack = sysctl_perf_event_max_stack; @@ -10001,28 +10104,27 @@ SYSCALL_DEFINE5(perf_event_open,  			goto err_context;  		/* -		 * Do not allow to attach to a group in a different -		 * task or CPU context: +		 * Make sure we're both events for the same CPU; +		 * grouping events for different CPUs is broken; since +		 * you can never concurrently schedule them anyhow.  		 */ -		if (move_group) { -			/* -			 * Make sure we're both on the same task, or both -			 * per-cpu events. 
-			 */ -			if (group_leader->ctx->task != ctx->task) -				goto err_context; +		if (group_leader->cpu != event->cpu) +			goto err_context; -			/* -			 * Make sure we're both events for the same CPU; -			 * grouping events for different CPUs is broken; since -			 * you can never concurrently schedule them anyhow. -			 */ -			if (group_leader->cpu != event->cpu) -				goto err_context; -		} else { -			if (group_leader->ctx != ctx) -				goto err_context; -		} +		/* +		 * Make sure we're both on the same task, or both +		 * per-CPU events. +		 */ +		if (group_leader->ctx->task != ctx->task) +			goto err_context; + +		/* +		 * Do not allow to attach to a group in a different task +		 * or CPU context. If we're moving SW events, we'll fix +		 * this up later, so allow that. +		 */ +		if (!move_group && group_leader->ctx != ctx) +			goto err_context;  		/*  		 * Only a group leader can be exclusive or pinned diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 486fd78eb8d5..843e97047335 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -38,9 +38,9 @@ struct ring_buffer {  	struct user_struct		*mmap_user;  	/* AUX area */ -	local_t				aux_head; +	long				aux_head;  	local_t				aux_nest; -	local_t				aux_wakeup; +	long				aux_wakeup;	/* last aux_watermark boundary crossed by aux_head */  	unsigned long			aux_pgoff;  	int				aux_nr_pages;  	int				aux_overwrite; @@ -208,7 +208,7 @@ static inline int get_recursion_context(int *recursion)  {  	int rctx; -	if (in_nmi()) +	if (unlikely(in_nmi()))  		rctx = 3;  	else if (in_irq())  		rctx = 2; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ee97196bb151..af71a84e12ee 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -367,7 +367,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,  	if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))  		goto err_put; -	aux_head = local_read(&rb->aux_head); +	aux_head = rb->aux_head;  	handle->rb = rb;  	handle->event = event; @@ -382,7 +382,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,  	 */  	if (!rb->aux_overwrite) {  		aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); -		handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark; +		handle->wakeup = rb->aux_wakeup + rb->aux_watermark;  		if (aux_head - aux_tail < perf_aux_size(rb))  			handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); @@ -433,12 +433,12 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)  		handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;  		aux_head = handle->head; -		local_set(&rb->aux_head, aux_head); +		rb->aux_head = aux_head;  	} else {  		handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE; -		aux_head = local_read(&rb->aux_head); -		local_add(size, &rb->aux_head); +		aux_head = rb->aux_head; +		rb->aux_head += size;  	}  	if (size || handle->aux_flags) { @@ -450,11 +450,10 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)  		                     handle->aux_flags);  	} -	aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); - -	if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { +	rb->user_page->aux_head = rb->aux_head; +	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {  		wakeup = true; -		local_add(rb->aux_watermark, &rb->aux_wakeup); +		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);  	}  	if (wakeup) { @@ -478,22 +477,20 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned 
long size)  int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)  {  	struct ring_buffer *rb = handle->rb; -	unsigned long aux_head;  	if (size > handle->size)  		return -ENOSPC; -	local_add(size, &rb->aux_head); +	rb->aux_head += size; -	aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); -	if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { +	rb->user_page->aux_head = rb->aux_head; +	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {  		perf_output_wakeup(handle); -		local_add(rb->aux_watermark, &rb->aux_wakeup); -		handle->wakeup = local_read(&rb->aux_wakeup) + -				 rb->aux_watermark; +		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); +		handle->wakeup = rb->aux_wakeup + rb->aux_watermark;  	} -	handle->head = aux_head; +	handle->head = rb->aux_head;  	handle->size -= size;  	return 0; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0e137f98a50c..267f6ef91d97 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1262,8 +1262,6 @@ void uprobe_end_dup_mmap(void)  void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)  { -	newmm->uprobes_state.xol_area = NULL; -  	if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {  		set_bit(MMF_HAS_UPROBES, &newmm->flags);  		/* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ diff --git a/kernel/exit.c b/kernel/exit.c index c5548faa9f37..a35d8a17e01f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -764,7 +764,6 @@ void __noreturn do_exit(long code)  {  	struct task_struct *tsk = current;  	int group_dead; -	TASKS_RCU(int tasks_rcu_i);  	profile_task_exit(tsk);  	kcov_task_exit(tsk); @@ -819,7 +818,8 @@ void __noreturn do_exit(long code)  	 * Ensure that we must observe the pi_state in exit_mm() ->  	 * mm_release() -> exit_pi_state_list().  	 
*/ -	raw_spin_unlock_wait(&tsk->pi_lock); +	raw_spin_lock_irq(&tsk->pi_lock); +	raw_spin_unlock_irq(&tsk->pi_lock);  	if (unlikely(in_atomic())) {  		pr_info("note: %s[%d] exited with preempt_count %d\n", @@ -881,9 +881,7 @@ void __noreturn do_exit(long code)  	 */  	flush_ptrace_hw_breakpoint(tsk); -	TASKS_RCU(preempt_disable()); -	TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); -	TASKS_RCU(preempt_enable()); +	exit_tasks_rcu_start();  	exit_notify(tsk, group_dead);  	proc_exit_connector(tsk);  	mpol_put_task_policy(tsk); @@ -918,8 +916,9 @@ void __noreturn do_exit(long code)  	if (tsk->nr_dirtied)  		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);  	exit_rcu(); -	TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); +	exit_tasks_rcu_finish(); +	lockdep_free_task(tsk);  	do_task_dead();  }  EXPORT_SYMBOL_GPL(do_exit); diff --git a/kernel/fork.c b/kernel/fork.c index e075b7780421..dab73d18bc4d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -484,6 +484,8 @@ void __init fork_init(void)  	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",  			  NULL, free_vm_stack_cache);  #endif + +	lockdep_init_task(&init_task);  }  int __weak arch_dup_task_struct(struct task_struct *dst, @@ -785,6 +787,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)  #endif  } +static void mm_init_uprobes_state(struct mm_struct *mm) +{ +#ifdef CONFIG_UPROBES +	mm->uprobes_state.xol_area = NULL; +#endif +} +  static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,  	struct user_namespace *user_ns)  { @@ -806,11 +815,13 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,  	mm_init_cpumask(mm);  	mm_init_aio(mm);  	mm_init_owner(mm, p); +	RCU_INIT_POINTER(mm->exe_file, NULL);  	mmu_notifier_mm_init(mm);  	init_tlb_flush_pending(mm);  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS  	mm->pmd_huge_pte = NULL;  #endif +	mm_init_uprobes_state(mm);  	if (current->mm) {  		mm->flags = current->mm->flags & MMF_INIT_MASK; @@ -1691,6 +1702,7 @@ static __latent_entropy struct task_struct *copy_process(  	p->lockdep_depth = 0; /* no locks held yet */  	p->curr_chain_key = 0;  	p->lockdep_recursion = 0; +	lockdep_init_task(p);  #endif  #ifdef CONFIG_DEBUG_MUTEXES @@ -1949,6 +1961,7 @@ bad_fork_cleanup_audit:  bad_fork_cleanup_perf:  	perf_event_free_task(p);  bad_fork_cleanup_policy: +	lockdep_free_task(p);  #ifdef CONFIG_NUMA  	mpol_put(p->mempolicy);  bad_fork_cleanup_threadgroup_lock: diff --git a/kernel/futex.c b/kernel/futex.c index f50b434756c1..3d38eaf05492 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -876,6 +876,8 @@ static struct task_struct *futex_find_get_task(pid_t pid)  	return p;  } +#ifdef CONFIG_FUTEX_PI +  /*   * This task is holding PI mutexes at exit time => bad.   * Kernel cleans up PI-state, but userspace is likely hosed. 
@@ -933,6 +935,8 @@ void exit_pi_state_list(struct task_struct *curr)  	raw_spin_unlock_irq(&curr->pi_lock);  } +#endif +  /*   * We need to check the following states:   * @@ -1547,6 +1551,45 @@ out:  	return ret;  } +static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) +{ +	unsigned int op =	  (encoded_op & 0x70000000) >> 28; +	unsigned int cmp =	  (encoded_op & 0x0f000000) >> 24; +	int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12); +	int cmparg = sign_extend32(encoded_op & 0x00000fff, 12); +	int oldval, ret; + +	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { +		if (oparg < 0 || oparg > 31) +			return -EINVAL; +		oparg = 1 << oparg; +	} + +	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) +		return -EFAULT; + +	ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); +	if (ret) +		return ret; + +	switch (cmp) { +	case FUTEX_OP_CMP_EQ: +		return oldval == cmparg; +	case FUTEX_OP_CMP_NE: +		return oldval != cmparg; +	case FUTEX_OP_CMP_LT: +		return oldval < cmparg; +	case FUTEX_OP_CMP_GE: +		return oldval >= cmparg; +	case FUTEX_OP_CMP_LE: +		return oldval <= cmparg; +	case FUTEX_OP_CMP_GT: +		return oldval > cmparg; +	default: +		return -ENOSYS; +	} +} +  /*   * Wake up all waiters hashed on the physical page that is mapped   * to this virtual address: @@ -1800,6 +1843,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,  	struct futex_q *this, *next;  	DEFINE_WAKE_Q(wake_q); +	/* +	 * When PI not supported: return -ENOSYS if requeue_pi is true, +	 * consequently the compiler knows requeue_pi is always false past +	 * this point which will optimize away all the conditional code +	 * further down. +	 */ +	if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) +		return -ENOSYS; +  	if (requeue_pi) {  		/*  		 * Requeue PI only works on two distinct uaddrs. 
This @@ -2595,6 +2647,9 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,  	struct futex_q q = futex_q_init;  	int res, ret; +	if (!IS_ENABLED(CONFIG_FUTEX_PI)) +		return -ENOSYS; +  	if (refill_pi_state_cache())  		return -ENOMEM; @@ -2774,6 +2829,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)  	struct futex_q *top_waiter;  	int ret; +	if (!IS_ENABLED(CONFIG_FUTEX_PI)) +		return -ENOSYS; +  retry:  	if (get_user(uval, uaddr))  		return -EFAULT; @@ -2984,6 +3042,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,  	struct futex_q q = futex_q_init;  	int res, ret; +	if (!IS_ENABLED(CONFIG_FUTEX_PI)) +		return -ENOSYS; +  	if (uaddr == uaddr2)  		return -EINVAL; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a3cc37c0c85e..3675c6004f2a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1000,7 +1000,7 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);  void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)  { -	unsigned long flags; +	unsigned long flags, trigger, tmp;  	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);  	if (!desc) @@ -1014,6 +1014,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)  	irq_settings_clr_and_set(desc, clr, set); +	trigger = irqd_get_trigger_type(&desc->irq_data); +  	irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |  		   IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);  	if (irq_settings_has_no_balance_set(desc)) @@ -1025,7 +1027,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)  	if (irq_settings_is_level(desc))  		irqd_set(&desc->irq_data, IRQD_LEVEL); -	irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); +	tmp = irq_settings_get_trigger_mask(desc); +	if (tmp != IRQ_TYPE_NONE) +		trigger = tmp; + +	irqd_set(&desc->irq_data, trigger);  	irq_put_desc_unlock(desc, flags);  } diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 1a9abc1c8ea0..259a22aa9934 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -165,7 +165,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)  	struct irq_data *data = irq_get_irq_data(irq);  	struct cpumask *ipimask = data ? 
irq_data_get_affinity_mask(data) : NULL; -	if (!data || !ipimask || cpu > nr_cpu_ids) +	if (!data || !ipimask || cpu >= nr_cpu_ids)  		return INVALID_HWIRQ;  	if (!cpumask_test_cpu(cpu, ipimask)) @@ -195,7 +195,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,  	if (!chip->ipi_send_single && !chip->ipi_send_mask)  		return -EINVAL; -	if (cpu > nr_cpu_ids) +	if (cpu >= nr_cpu_ids)  		return -EINVAL;  	if (dest) { diff --git a/kernel/jump_label.c b/kernel/jump_label.c index d11c506a6ac3..0bf2e8f5244a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -79,29 +79,7 @@ int static_key_count(struct static_key *key)  }  EXPORT_SYMBOL_GPL(static_key_count); -void static_key_enable(struct static_key *key) -{ -	int count = static_key_count(key); - -	WARN_ON_ONCE(count < 0 || count > 1); - -	if (!count) -		static_key_slow_inc(key); -} -EXPORT_SYMBOL_GPL(static_key_enable); - -void static_key_disable(struct static_key *key) -{ -	int count = static_key_count(key); - -	WARN_ON_ONCE(count < 0 || count > 1); - -	if (count) -		static_key_slow_dec(key); -} -EXPORT_SYMBOL_GPL(static_key_disable); - -void static_key_slow_inc(struct static_key *key) +static void static_key_slow_inc_cpuslocked(struct static_key *key)  {  	int v, v1; @@ -125,24 +103,87 @@ void static_key_slow_inc(struct static_key *key)  			return;  	} -	cpus_read_lock();  	jump_label_lock();  	if (atomic_read(&key->enabled) == 0) {  		atomic_set(&key->enabled, -1);  		jump_label_update(key); -		atomic_set(&key->enabled, 1); +		/* +		 * Ensure that if the above cmpxchg loop observes our positive +		 * value, it must also observe all the text changes. +		 */ +		atomic_set_release(&key->enabled, 1);  	} else {  		atomic_inc(&key->enabled);  	}  	jump_label_unlock(); +} + +void static_key_slow_inc(struct static_key *key) +{ +	cpus_read_lock(); +	static_key_slow_inc_cpuslocked(key);  	cpus_read_unlock();  }  EXPORT_SYMBOL_GPL(static_key_slow_inc); -static void __static_key_slow_dec(struct static_key *key, -		unsigned long rate_limit, struct delayed_work *work) +void static_key_enable_cpuslocked(struct static_key *key) +{ +	STATIC_KEY_CHECK_USE(); + +	if (atomic_read(&key->enabled) > 0) { +		WARN_ON_ONCE(atomic_read(&key->enabled) != 1); +		return; +	} + +	jump_label_lock(); +	if (atomic_read(&key->enabled) == 0) { +		atomic_set(&key->enabled, -1); +		jump_label_update(key); +		/* +		 * See static_key_slow_inc(). 
+		 */ +		atomic_set_release(&key->enabled, 1); +	} +	jump_label_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked); + +void static_key_enable(struct static_key *key) +{ +	cpus_read_lock(); +	static_key_enable_cpuslocked(key); +	cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_enable); + +void static_key_disable_cpuslocked(struct static_key *key) +{ +	STATIC_KEY_CHECK_USE(); + +	if (atomic_read(&key->enabled) != 1) { +		WARN_ON_ONCE(atomic_read(&key->enabled) != 0); +		return; +	} + +	jump_label_lock(); +	if (atomic_cmpxchg(&key->enabled, 1, 0)) +		jump_label_update(key); +	jump_label_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked); + +void static_key_disable(struct static_key *key)  {  	cpus_read_lock(); +	static_key_disable_cpuslocked(key); +	cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_disable); + +static void static_key_slow_dec_cpuslocked(struct static_key *key, +					   unsigned long rate_limit, +					   struct delayed_work *work) +{  	/*  	 * The negative count check is valid even when a negative  	 * key->enabled is in use by static_key_slow_inc(); a @@ -153,7 +194,6 @@ static void __static_key_slow_dec(struct static_key *key,  	if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {  		WARN(atomic_read(&key->enabled) < 0,  		     "jump label: negative count!\n"); -		cpus_read_unlock();  		return;  	} @@ -164,6 +204,14 @@ static void __static_key_slow_dec(struct static_key *key,  		jump_label_update(key);  	}  	jump_label_unlock(); +} + +static void __static_key_slow_dec(struct static_key *key, +				  unsigned long rate_limit, +				  struct delayed_work *work) +{ +	cpus_read_lock(); +	static_key_slow_dec_cpuslocked(key, rate_limit, work);  	cpus_read_unlock();  } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1ae7c41c33c1..20fef1a38602 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -301,7 +301,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)  {  	struct page *pages; -	pages = alloc_pages(gfp_mask, order); +	pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);  	if (pages) {  		unsigned int count, i; @@ -310,6 +310,13 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)  		count = 1 << order;  		for (i = 0; i < count; i++)  			SetPageReserved(pages + i); + +		arch_kexec_post_alloc_pages(page_address(pages), count, +					    gfp_mask); + +		if (gfp_mask & __GFP_ZERO) +			for (i = 0; i < count; i++) +				clear_highpage(pages + i);  	}  	return pages; @@ -321,6 +328,9 @@ static void kimage_free_pages(struct page *page)  	order = page_private(page);  	count = 1 << order; + +	arch_kexec_pre_free_pages(page_address(page), count); +  	for (i = 0; i < count; i++)  		ClearPageReserved(page + i);  	__free_pages(page, order); diff --git a/kernel/kmod.c b/kernel/kmod.c index 6d016c5d97c8..2f37acde640b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -71,6 +71,18 @@ static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);  static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);  /* + * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads + * running at the same time without returning. When this happens we + * believe you've somehow ended up with a recursive module dependency + * creating a loop. + * + * We have no option but to fail. + * + * Userspace should proactively try to detect and prevent these. + */ +#define MAX_KMOD_ALL_BUSY_TIMEOUT 5 + +/*  	modprobe_path is set via /proc/sys.  
*/  char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; @@ -167,8 +179,17 @@ int __request_module(bool wait, const char *fmt, ...)  		pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",  				    atomic_read(&kmod_concurrent_max),  				    MAX_KMOD_CONCURRENT, module_name); -		wait_event_interruptible(kmod_wq, -					 atomic_dec_if_positive(&kmod_concurrent_max) >= 0); +		ret = wait_event_killable_timeout(kmod_wq, +						  atomic_dec_if_positive(&kmod_concurrent_max) >= 0, +						  MAX_KMOD_ALL_BUSY_TIMEOUT * HZ); +		if (!ret) { +			pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now", +					    module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT); +			return -ETIME; +		} else if (ret == -ERESTARTSYS) { +			pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name); +			return ret; +		}  	}  	trace_module_request(module_name, wait, _RET_IP_); diff --git a/kernel/kthread.c b/kernel/kthread.c index 26db528c1d88..1c19edf82427 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -637,6 +637,7 @@ repeat:  		schedule();  	try_to_freeze(); +	cond_resched();  	goto repeat;  }  EXPORT_SYMBOL_GPL(kthread_worker_fn); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7d2499bec5fe..44c8d0d17170 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -58,6 +58,10 @@  #define CREATE_TRACE_POINTS  #include <trace/events/lock.h> +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +#include <linux/slab.h> +#endif +  #ifdef CONFIG_PROVE_LOCKING  int prove_locking = 1;  module_param(prove_locking, int, 0644); @@ -344,14 +348,12 @@ EXPORT_SYMBOL(lockdep_on);  #if VERBOSE  # define HARDIRQ_VERBOSE	1  # define SOFTIRQ_VERBOSE	1 -# define RECLAIM_VERBOSE	1  #else  # define HARDIRQ_VERBOSE	0  # define SOFTIRQ_VERBOSE	0 -# define RECLAIM_VERBOSE	0  #endif -#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE +#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE  /*   * Quick filtering for interesting events:   */ @@ -726,6 +728,18 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)  	return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);  } +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +static void cross_init(struct lockdep_map *lock, int cross); +static int cross_lock(struct lockdep_map *lock); +static int lock_acquire_crosslock(struct held_lock *hlock); +static int lock_release_crosslock(struct lockdep_map *lock); +#else +static inline void cross_init(struct lockdep_map *lock, int cross) {} +static inline int cross_lock(struct lockdep_map *lock) { return 0; } +static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; } +static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; } +#endif +  /*   * Register a lock's class in the hash-table, if the class is not present   * yet. Otherwise we look it up. 
We cache the result in the lock object @@ -1125,22 +1139,41 @@ print_circular_lock_scenario(struct held_lock *src,  		printk(KERN_CONT "\n\n");  	} -	printk(" Possible unsafe locking scenario:\n\n"); -	printk("       CPU0                    CPU1\n"); -	printk("       ----                    ----\n"); -	printk("  lock("); -	__print_lock_name(target); -	printk(KERN_CONT ");\n"); -	printk("                               lock("); -	__print_lock_name(parent); -	printk(KERN_CONT ");\n"); -	printk("                               lock("); -	__print_lock_name(target); -	printk(KERN_CONT ");\n"); -	printk("  lock("); -	__print_lock_name(source); -	printk(KERN_CONT ");\n"); -	printk("\n *** DEADLOCK ***\n\n"); +	if (cross_lock(tgt->instance)) { +		printk(" Possible unsafe locking scenario by crosslock:\n\n"); +		printk("       CPU0                    CPU1\n"); +		printk("       ----                    ----\n"); +		printk("  lock("); +		__print_lock_name(parent); +		printk(KERN_CONT ");\n"); +		printk("  lock("); +		__print_lock_name(target); +		printk(KERN_CONT ");\n"); +		printk("                               lock("); +		__print_lock_name(source); +		printk(KERN_CONT ");\n"); +		printk("                               unlock("); +		__print_lock_name(target); +		printk(KERN_CONT ");\n"); +		printk("\n *** DEADLOCK ***\n\n"); +	} else { +		printk(" Possible unsafe locking scenario:\n\n"); +		printk("       CPU0                    CPU1\n"); +		printk("       ----                    ----\n"); +		printk("  lock("); +		__print_lock_name(target); +		printk(KERN_CONT ");\n"); +		printk("                               lock("); +		__print_lock_name(parent); +		printk(KERN_CONT ");\n"); +		printk("                               lock("); +		__print_lock_name(target); +		printk(KERN_CONT ");\n"); +		printk("  lock("); +		__print_lock_name(source); +		printk(KERN_CONT ");\n"); +		printk("\n *** DEADLOCK ***\n\n"); +	}  }  /* @@ -1165,7 +1198,12 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,  	pr_warn("%s/%d is trying to acquire lock:\n",  		curr->comm, task_pid_nr(curr));  	print_lock(check_src); -	pr_warn("\nbut task is already holding lock:\n"); + +	if (cross_lock(check_tgt->instance)) +		pr_warn("\nbut now in release context of a crosslock acquired at the following:\n"); +	else +		pr_warn("\nbut task is already holding lock:\n"); +  	print_lock(check_tgt);  	pr_warn("\nwhich lock already depends on the new lock.\n\n");  	pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); @@ -1183,7 +1221,8 @@ static inline int class_equal(struct lock_list *entry, void *data)  static noinline int print_circular_bug(struct lock_list *this,  				struct lock_list *target,  				struct held_lock *check_src, -				struct held_lock *check_tgt) +				struct held_lock *check_tgt, +				struct stack_trace *trace)  {  	struct task_struct *curr = current;  	struct lock_list *parent; @@ -1193,7 +1232,9 @@ static noinline int print_circular_bug(struct lock_list *this,  	if (!debug_locks_off_graph_unlock() || debug_locks_silent)  		return 0; -	if (!save_trace(&this->trace)) +	if (cross_lock(check_tgt->instance)) +		this->trace = *trace; +	else if (!save_trace(&this->trace))  		return 0;  	depth = get_lock_depth(target); @@ -1309,6 +1350,19 @@ check_noncircular(struct lock_list *root, struct lock_class *target,  	return result;  } +static noinline int +check_redundant(struct lock_list *root, struct lock_class *target, +		struct lock_list **target_entry) +{ +	int result; + +	
debug_atomic_inc(nr_redundant_checks); + +	result = __bfs_forwards(root, target, class_equal, target_entry); + +	return result; +} +  #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)  /*   * Forwards and backwards subgraph searching, for the purposes of @@ -1784,6 +1838,9 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,  		if (nest)  			return 2; +		if (cross_lock(prev->instance)) +			continue; +  		return print_deadlock_bug(curr, prev, next);  	}  	return 1; @@ -1813,20 +1870,13 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,   */  static int  check_prev_add(struct task_struct *curr, struct held_lock *prev, -	       struct held_lock *next, int distance, int *stack_saved) +	       struct held_lock *next, int distance, struct stack_trace *trace, +	       int (*save)(struct stack_trace *trace))  {  	struct lock_list *entry;  	int ret;  	struct lock_list this;  	struct lock_list *uninitialized_var(target_entry); -	/* -	 * Static variable, serialized by the graph_lock(). -	 * -	 * We use this static variable to save the stack trace in case -	 * we call into this function multiple times due to encountering -	 * trylocks in the held lock stack. -	 */ -	static struct stack_trace trace;  	/*  	 * Prove that the new <prev> -> <next> dependency would not @@ -1841,7 +1891,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  	this.parent = NULL;  	ret = check_noncircular(&this, hlock_class(prev), &target_entry);  	if (unlikely(!ret)) -		return print_circular_bug(&this, target_entry, next, prev); +		return print_circular_bug(&this, target_entry, next, prev, trace);  	else if (unlikely(ret < 0))  		return print_bfs_bug(ret); @@ -1870,15 +1920,26 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  		if (entry->class == hlock_class(next)) {  			if (distance == 1)  				entry->distance = 1; -			return 2; +			return 1;  		}  	} -	if (!*stack_saved) { -		if (!save_trace(&trace)) -			return 0; -		*stack_saved = 1; +	/* +	 * Is the <prev> -> <next> link redundant? +	 */ +	this.class = hlock_class(prev); +	this.parent = NULL; +	ret = check_redundant(&this, hlock_class(next), &target_entry); +	if (!ret) { +		debug_atomic_inc(nr_redundant); +		return 2;  	} +	if (ret < 0) +		return print_bfs_bug(ret); + + +	if (save && !save(trace)) +		return 0;  	/*  	 * Ok, all validations passed, add the new lock @@ -1886,14 +1947,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  	 */  	ret = add_lock_to_list(hlock_class(next),  			       &hlock_class(prev)->locks_after, -			       next->acquire_ip, distance, &trace); +			       next->acquire_ip, distance, trace);  	if (!ret)  		return 0;  	ret = add_lock_to_list(hlock_class(prev),  			       &hlock_class(next)->locks_before, -			       next->acquire_ip, distance, &trace); +			       next->acquire_ip, distance, trace);  	if (!ret)  		return 0; @@ -1901,8 +1962,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  	 * Debugging printouts:  	 */  	if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { -		/* We drop graph lock, so another thread can overwrite trace. 
*/ -		*stack_saved = 0;  		graph_unlock();  		printk("\n new dependency: ");  		print_lock_name(hlock_class(prev)); @@ -1910,9 +1969,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  		print_lock_name(hlock_class(next));  		printk(KERN_CONT "\n");  		dump_stack(); -		return graph_lock(); +		if (!graph_lock()) +			return 0;  	} -	return 1; +	return 2;  }  /* @@ -1925,8 +1985,9 @@ static int  check_prevs_add(struct task_struct *curr, struct held_lock *next)  {  	int depth = curr->lockdep_depth; -	int stack_saved = 0;  	struct held_lock *hlock; +	struct stack_trace trace; +	int (*save)(struct stack_trace *trace) = save_trace;  	/*  	 * Debugging checks. @@ -1947,21 +2008,36 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)  		int distance = curr->lockdep_depth - depth + 1;  		hlock = curr->held_locks + depth - 1;  		/* -		 * Only non-recursive-read entries get new dependencies -		 * added: +		 * Only non-crosslock entries get new dependencies added. +		 * Crosslock entries will be added by commit later:  		 */ -		if (hlock->read != 2 && hlock->check) { -			if (!check_prev_add(curr, hlock, next, -						distance, &stack_saved)) -				return 0; +		if (!cross_lock(hlock->instance)) {  			/* -			 * Stop after the first non-trylock entry, -			 * as non-trylock entries have added their -			 * own direct dependencies already, so this -			 * lock is connected to them indirectly: +			 * Only non-recursive-read entries get new dependencies +			 * added:  			 */ -			if (!hlock->trylock) -				break; +			if (hlock->read != 2 && hlock->check) { +				int ret = check_prev_add(curr, hlock, next, +							 distance, &trace, save); +				if (!ret) +					return 0; + +				/* +				 * Stop saving stack_trace if save_trace() was +				 * called at least once: +				 */ +				if (save && ret == 2) +					save = NULL; + +				/* +				 * Stop after the first non-trylock entry, +				 * as non-trylock entries have added their +				 * own direct dependencies already, so this +				 * lock is connected to them indirectly: +				 */ +				if (!hlock->trylock) +					break; +			}  		}  		depth--;  		/* @@ -2126,19 +2202,26 @@ static int check_no_collision(struct task_struct *curr,  }  /* - * Look up a dependency chain. If the key is not present yet then - * add it and return 1 - in this case the new dependency chain is - * validated. If the key is already hashed, return 0. - * (On return with 1 graph_lock is held.) + * This is for building a chain between just two different classes, + * instead of adding a new hlock upon current, which is done by + * add_chain_cache(). + * + * This can be called in any context with two classes, while + * add_chain_cache() must be done within the lock owener's context + * since it uses hlock which might be racy in another context.   
*/ -static inline int lookup_chain_cache(struct task_struct *curr, -				     struct held_lock *hlock, -				     u64 chain_key) +static inline int add_chain_cache_classes(unsigned int prev, +					  unsigned int next, +					  unsigned int irq_context, +					  u64 chain_key)  { -	struct lock_class *class = hlock_class(hlock);  	struct hlist_head *hash_head = chainhashentry(chain_key);  	struct lock_chain *chain; -	int i, j; + +	/* +	 * Allocate a new chain entry from the static array, and add +	 * it to the hash: +	 */  	/*  	 * We might need to take the graph lock, ensure we've got IRQs @@ -2147,43 +2230,76 @@ static inline int lookup_chain_cache(struct task_struct *curr,  	 */  	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))  		return 0; + +	if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { +		if (!debug_locks_off_graph_unlock()) +			return 0; + +		print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); +		dump_stack(); +		return 0; +	} + +	chain = lock_chains + nr_lock_chains++; +	chain->chain_key = chain_key; +	chain->irq_context = irq_context; +	chain->depth = 2; +	if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { +		chain->base = nr_chain_hlocks; +		nr_chain_hlocks += chain->depth; +		chain_hlocks[chain->base] = prev - 1; +		chain_hlocks[chain->base + 1] = next -1; +	} +#ifdef CONFIG_DEBUG_LOCKDEP  	/* -	 * We can walk it lock-free, because entries only get added -	 * to the hash: +	 * Important for check_no_collision().  	 */ -	hlist_for_each_entry_rcu(chain, hash_head, entry) { -		if (chain->chain_key == chain_key) { -cache_hit: -			debug_atomic_inc(chain_lookup_hits); -			if (!check_no_collision(curr, hlock, chain)) -				return 0; - -			if (very_verbose(class)) -				printk("\nhash chain already cached, key: " -					"%016Lx tail class: [%p] %s\n", -					(unsigned long long)chain_key, -					class->key, class->name); +	else { +		if (!debug_locks_off_graph_unlock())  			return 0; -		} + +		print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); +		dump_stack(); +		return 0;  	} -	if (very_verbose(class)) -		printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", -			(unsigned long long)chain_key, class->key, class->name); +#endif + +	hlist_add_head_rcu(&chain->entry, hash_head); +	debug_atomic_inc(chain_lookup_misses); +	inc_chains(); + +	return 1; +} + +/* + * Adds a dependency chain into chain hashtable. And must be called with + * graph_lock held. + * + * Return 0 if fail, and graph_lock is released. + * Return 1 if succeed, with graph_lock held. + */ +static inline int add_chain_cache(struct task_struct *curr, +				  struct held_lock *hlock, +				  u64 chain_key) +{ +	struct lock_class *class = hlock_class(hlock); +	struct hlist_head *hash_head = chainhashentry(chain_key); +	struct lock_chain *chain; +	int i, j; +  	/*  	 * Allocate a new chain entry from the static array, and add  	 * it to the hash:  	 */ -	if (!graph_lock()) -		return 0; +  	/* -	 * We have to walk the chain again locked - to avoid duplicates: +	 * We might need to take the graph lock, ensure we've got IRQs +	 * disabled to make this an IRQ-safe lock.. for recursion reasons +	 * lockdep won't complain about its own locking errors.  	 
*/ -	hlist_for_each_entry(chain, hash_head, entry) { -		if (chain->chain_key == chain_key) { -			graph_unlock(); -			goto cache_hit; -		} -	} +	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) +		return 0; +  	if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {  		if (!debug_locks_off_graph_unlock())  			return 0; @@ -2235,6 +2351,78 @@ cache_hit:  	return 1;  } +/* + * Look up a dependency chain. + */ +static inline struct lock_chain *lookup_chain_cache(u64 chain_key) +{ +	struct hlist_head *hash_head = chainhashentry(chain_key); +	struct lock_chain *chain; + +	/* +	 * We can walk it lock-free, because entries only get added +	 * to the hash: +	 */ +	hlist_for_each_entry_rcu(chain, hash_head, entry) { +		if (chain->chain_key == chain_key) { +			debug_atomic_inc(chain_lookup_hits); +			return chain; +		} +	} +	return NULL; +} + +/* + * If the key is not present yet in dependency chain cache then + * add it and return 1 - in this case the new dependency chain is + * validated. If the key is already hashed, return 0. + * (On return with 1 graph_lock is held.) + */ +static inline int lookup_chain_cache_add(struct task_struct *curr, +					 struct held_lock *hlock, +					 u64 chain_key) +{ +	struct lock_class *class = hlock_class(hlock); +	struct lock_chain *chain = lookup_chain_cache(chain_key); + +	if (chain) { +cache_hit: +		if (!check_no_collision(curr, hlock, chain)) +			return 0; + +		if (very_verbose(class)) { +			printk("\nhash chain already cached, key: " +					"%016Lx tail class: [%p] %s\n", +					(unsigned long long)chain_key, +					class->key, class->name); +		} + +		return 0; +	} + +	if (very_verbose(class)) { +		printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", +			(unsigned long long)chain_key, class->key, class->name); +	} + +	if (!graph_lock()) +		return 0; + +	/* +	 * We have to walk the chain again locked - to avoid duplicates: +	 */ +	chain = lookup_chain_cache(chain_key); +	if (chain) { +		graph_unlock(); +		goto cache_hit; +	} + +	if (!add_chain_cache(curr, hlock, chain_key)) +		return 0; + +	return 1; +} +  static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,  		struct held_lock *hlock, int chain_head, u64 chain_key)  { @@ -2245,11 +2433,11 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,  	 *  	 * We look up the chain_key and do the O(N^2) check and update of  	 * the dependencies only if this is a new dependency chain. 
-	 * (If lookup_chain_cache() returns with 1 it acquires +	 * (If lookup_chain_cache_add() return with 1 it acquires  	 * graph_lock for us)  	 */  	if (!hlock->trylock && hlock->check && -	    lookup_chain_cache(curr, hlock, chain_key)) { +	    lookup_chain_cache_add(curr, hlock, chain_key)) {  		/*  		 * Check whether last held lock:  		 * @@ -2277,14 +2465,17 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,  		 * Add dependency only if this lock is not the head  		 * of the chain, and if it's not a secondary read-lock:  		 */ -		if (!chain_head && ret != 2) +		if (!chain_head && ret != 2) {  			if (!check_prevs_add(curr, hlock))  				return 0; +		} +  		graph_unlock(); -	} else -		/* after lookup_chain_cache(): */ +	} else { +		/* after lookup_chain_cache_add(): */  		if (unlikely(!debug_locks))  			return 0; +	}  	return 1;  } @@ -2567,14 +2758,6 @@ static int SOFTIRQ_verbose(struct lock_class *class)  	return 0;  } -static int RECLAIM_FS_verbose(struct lock_class *class) -{ -#if RECLAIM_VERBOSE -	return class_filter(class); -#endif -	return 0; -} -  #define STRICT_READ_CHECKS	1  static int (*state_verbose_f[])(struct lock_class *class) = { @@ -2870,57 +3053,6 @@ void trace_softirqs_off(unsigned long ip)  		debug_atomic_inc(redundant_softirqs_off);  } -static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) -{ -	struct task_struct *curr = current; - -	if (unlikely(!debug_locks)) -		return; - -	gfp_mask = current_gfp_context(gfp_mask); - -	/* no reclaim without waiting on it */ -	if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) -		return; - -	/* this guy won't enter reclaim */ -	if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) -		return; - -	/* We're only interested __GFP_FS allocations for now */ -	if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS)) -		return; - -	/* -	 * Oi! Can't be having __GFP_FS allocations with IRQs disabled. -	 */ -	if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) -		return; - -	/* Disable lockdep if explicitly requested */ -	if (gfp_mask & __GFP_NOLOCKDEP) -		return; - -	mark_held_locks(curr, RECLAIM_FS); -} - -static void check_flags(unsigned long flags); - -void lockdep_trace_alloc(gfp_t gfp_mask) -{ -	unsigned long flags; - -	if (unlikely(current->lockdep_recursion)) -		return; - -	raw_local_irq_save(flags); -	check_flags(flags); -	current->lockdep_recursion = 1; -	__lockdep_trace_alloc(gfp_mask, flags); -	current->lockdep_recursion = 0; -	raw_local_irq_restore(flags); -} -  static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)  {  	/* @@ -2966,22 +3098,6 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)  		}  	} -	/* -	 * We reuse the irq context infrastructure more broadly as a general -	 * context checking code. This tests GFP_FS recursion (a lock taken -	 * during reclaim for a GFP_FS allocation is held over a GFP_FS -	 * allocation). 
-	 */ -	if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) { -		if (hlock->read) { -			if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ)) -					return 0; -		} else { -			if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS)) -					return 0; -		} -	} -  	return 1;  } @@ -3040,10 +3156,6 @@ static inline int separate_irq_context(struct task_struct *curr,  	return 0;  } -void lockdep_trace_alloc(gfp_t gfp_mask) -{ -} -  #endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */  /* @@ -3116,7 +3228,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,  /*   * Initialize a lock instance's lock-class mapping info:   */ -void lockdep_init_map(struct lockdep_map *lock, const char *name, +static void __lockdep_init_map(struct lockdep_map *lock, const char *name,  		      struct lock_class_key *key, int subclass)  {  	int i; @@ -3174,8 +3286,25 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,  		raw_local_irq_restore(flags);  	}  } + +void lockdep_init_map(struct lockdep_map *lock, const char *name, +		      struct lock_class_key *key, int subclass) +{ +	cross_init(lock, 0); +	__lockdep_init_map(lock, name, key, subclass); +}  EXPORT_SYMBOL_GPL(lockdep_init_map); +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name, +		      struct lock_class_key *key, int subclass) +{ +	cross_init(lock, 1); +	__lockdep_init_map(lock, name, key, subclass); +} +EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock); +#endif +  struct lock_class_key __lockdep_no_validate__;  EXPORT_SYMBOL_GPL(__lockdep_no_validate__); @@ -3231,6 +3360,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,  	int chain_head = 0;  	int class_idx;  	u64 chain_key; +	int ret;  	if (unlikely(!debug_locks))  		return 0; @@ -3279,7 +3409,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,  	class_idx = class - lock_classes + 1; -	if (depth) { +	/* TODO: nest_lock is not implemented for crosslock yet. */ +	if (depth && !cross_lock(lock)) {  		hlock = curr->held_locks + depth - 1;  		if (hlock->class_idx == class_idx && nest_lock) {  			if (hlock->references) { @@ -3367,6 +3498,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,  	if (!validate_chain(curr, lock, hlock, chain_head, chain_key))  		return 0; +	ret = lock_acquire_crosslock(hlock); +	/* +	 * 2 means normal acquire operations are needed. Otherwise, it's +	 * ok just to return with '0:fail, 1:success'. +	 */ +	if (ret != 2) +		return ret; +  	curr->curr_chain_key = chain_key;  	curr->lockdep_depth++;  	check_chain_key(curr); @@ -3604,11 +3743,19 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)  	struct task_struct *curr = current;  	struct held_lock *hlock;  	unsigned int depth; -	int i; +	int ret, i;  	if (unlikely(!debug_locks))  		return 0; +	ret = lock_release_crosslock(lock); +	/* +	 * 2 means normal release operations are needed. Otherwise, it's +	 * ok just to return with '0:fail, 1:success'. +	 */ +	if (ret != 2) +		return ret; +  	depth = curr->lockdep_depth;  	/*  	 * So we're all set to release this lock.. wait what lock? 
We don't @@ -3952,18 +4099,6 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)  }  EXPORT_SYMBOL_GPL(lock_unpin_lock); -void lockdep_set_current_reclaim_state(gfp_t gfp_mask) -{ -	current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask); -} -EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state); - -void lockdep_clear_current_reclaim_state(void) -{ -	current->lockdep_reclaim_gfp = 0; -} -EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state); -  #ifdef CONFIG_LOCK_STAT  static int  print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, @@ -4484,6 +4619,12 @@ asmlinkage __visible void lockdep_sys_exit(void)  				curr->comm, curr->pid);  		lockdep_print_held_locks(curr);  	} + +	/* +	 * The lock history for each syscall should be independent. So wipe the +	 * slate clean on return to userspace. +	 */ +	lockdep_invariant_state(false);  }  void lockdep_rcu_suspicious(const char *file, const int line, const char *s) @@ -4532,3 +4673,488 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)  	dump_stack();  }  EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); + +#ifdef CONFIG_LOCKDEP_CROSSRELEASE + +/* + * Crossrelease works by recording a lock history for each thread and + * connecting those historic locks that were taken after the + * wait_for_completion() in the complete() context. + * + * Task-A				Task-B + * + *					mutex_lock(&A); + *					mutex_unlock(&A); + * + * wait_for_completion(&C); + *   lock_acquire_crosslock(); + *     atomic_inc_return(&cross_gen_id); + *                                | + *				  |	mutex_lock(&B); + *				  |	mutex_unlock(&B); + *                                | + *				  |	complete(&C); + *				  `--	  lock_commit_crosslock(); + * + * Which will then add a dependency between B and C. + */ + +#define xhlock(i)         (current->xhlocks[(i) % MAX_XHLOCKS_NR]) + +/* + * Whenever a crosslock is held, cross_gen_id will be increased. + */ +static atomic_t cross_gen_id; /* Can be wrapped */ + +/* + * Make an entry of the ring buffer invalid. + */ +static inline void invalidate_xhlock(struct hist_lock *xhlock) +{ +	/* +	 * Normally, xhlock->hlock.instance must be !NULL. +	 */ +	xhlock->hlock.instance = NULL; +} + +/* + * Lock history stacks; we have 2 nested lock history stacks: + * + *   HARD(IRQ) + *   SOFT(IRQ) + * + * The thing is that once we complete a HARD/SOFT IRQ the future task locks + * should not depend on any of the locks observed while running the IRQ.  So + * what we do is rewind the history buffer and erase all our knowledge of that + * temporal event. + */ + +void crossrelease_hist_start(enum xhlock_context_t c) +{ +	struct task_struct *cur = current; + +	if (!cur->xhlocks) +		return; + +	cur->xhlock_idx_hist[c] = cur->xhlock_idx; +	cur->hist_id_save[c]    = cur->hist_id; +} + +void crossrelease_hist_end(enum xhlock_context_t c) +{ +	struct task_struct *cur = current; + +	if (cur->xhlocks) { +		unsigned int idx = cur->xhlock_idx_hist[c]; +		struct hist_lock *h = &xhlock(idx); + +		cur->xhlock_idx = idx; + +		/* Check if the ring was overwritten. */ +		if (h->hist_id != cur->hist_id_save[c]) +			invalidate_xhlock(h); +	} +} + +/* + * lockdep_invariant_state() is used to annotate independence inside a task, to + * make one task look like multiple independent 'tasks'. + * + * Take for instance workqueues; each work is independent of the last. The + * completion of a future work does not depend on the completion of a past work + * (in general). 
Therefore we must not carry that (lock) dependency across + * works. + * + * This is true for many things; pretty much all kthreads fall into this + * pattern, where they have an invariant state and future completions do not + * depend on past completions. Its just that since they all have the 'same' + * form -- the kthread does the same over and over -- it doesn't typically + * matter. + * + * The same is true for system-calls, once a system call is completed (we've + * returned to userspace) the next system call does not depend on the lock + * history of the previous system call. + * + * They key property for independence, this invariant state, is that it must be + * a point where we hold no locks and have no history. Because if we were to + * hold locks, the restore at _end() would not necessarily recover it's history + * entry. Similarly, independence per-definition means it does not depend on + * prior state. + */ +void lockdep_invariant_state(bool force) +{ +	/* +	 * We call this at an invariant point, no current state, no history. +	 * Verify the former, enforce the latter. +	 */ +	WARN_ON_ONCE(!force && current->lockdep_depth); +	invalidate_xhlock(&xhlock(current->xhlock_idx)); +} + +static int cross_lock(struct lockdep_map *lock) +{ +	return lock ? lock->cross : 0; +} + +/* + * This is needed to decide the relationship between wrapable variables. + */ +static inline int before(unsigned int a, unsigned int b) +{ +	return (int)(a - b) < 0; +} + +static inline struct lock_class *xhlock_class(struct hist_lock *xhlock) +{ +	return hlock_class(&xhlock->hlock); +} + +static inline struct lock_class *xlock_class(struct cross_lock *xlock) +{ +	return hlock_class(&xlock->hlock); +} + +/* + * Should we check a dependency with previous one? + */ +static inline int depend_before(struct held_lock *hlock) +{ +	return hlock->read != 2 && hlock->check && !hlock->trylock; +} + +/* + * Should we check a dependency with next one? + */ +static inline int depend_after(struct held_lock *hlock) +{ +	return hlock->read != 2 && hlock->check; +} + +/* + * Check if the xhlock is valid, which would be false if, + * + *    1. Has not used after initializaion yet. + *    2. Got invalidated. + * + * Remind hist_lock is implemented as a ring buffer. + */ +static inline int xhlock_valid(struct hist_lock *xhlock) +{ +	/* +	 * xhlock->hlock.instance must be !NULL. +	 */ +	return !!xhlock->hlock.instance; +} + +/* + * Record a hist_lock entry. + * + * Irq disable is only required. + */ +static void add_xhlock(struct held_lock *hlock) +{ +	unsigned int idx = ++current->xhlock_idx; +	struct hist_lock *xhlock = &xhlock(idx); + +#ifdef CONFIG_DEBUG_LOCKDEP +	/* +	 * This can be done locklessly because they are all task-local +	 * state, we must however ensure IRQs are disabled. +	 */ +	WARN_ON_ONCE(!irqs_disabled()); +#endif + +	/* Initialize hist_lock's members */ +	xhlock->hlock = *hlock; +	xhlock->hist_id = ++current->hist_id; + +	xhlock->trace.nr_entries = 0; +	xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; +	xhlock->trace.entries = xhlock->trace_entries; +	xhlock->trace.skip = 3; +	save_stack_trace(&xhlock->trace); +} + +static inline int same_context_xhlock(struct hist_lock *xhlock) +{ +	return xhlock->hlock.irq_context == task_irq_context(current); +} + +/* + * This should be lockless as far as possible because this would be + * called very frequently. 
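A concrete, hypothetical shape of the bug this history tracking exists to report: a completion waited on under a mutex, where the completer needs that same mutex before it can signal. Classic lockdep never connects &done to &m because a completion is not released by the context that acquired it; crossrelease records the locks taken before complete() and commits them against the earlier wait_for_completion(). The two thread functions below are invented purely for illustration.

#include <linux/mutex.h>
#include <linux/completion.h>

static DEFINE_MUTEX(m);
static DECLARE_COMPLETION(done);

static int waiter_thread(void *unused)
{
	mutex_lock(&m);
	wait_for_completion(&done);	/* sleeps while holding m */
	mutex_unlock(&m);
	return 0;
}

static int completer_thread(void *unused)
{
	mutex_lock(&m);			/* blocks forever: the waiter holds m */
	mutex_unlock(&m);
	complete(&done);		/* never reached, so neither side makes progress */
	return 0;
}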
+ */ +static void check_add_xhlock(struct held_lock *hlock) +{ +	/* +	 * Record a hist_lock, only in case that acquisitions ahead +	 * could depend on the held_lock. For example, if the held_lock +	 * is trylock then acquisitions ahead never depends on that. +	 * In that case, we don't need to record it. Just return. +	 */ +	if (!current->xhlocks || !depend_before(hlock)) +		return; + +	add_xhlock(hlock); +} + +/* + * For crosslock. + */ +static int add_xlock(struct held_lock *hlock) +{ +	struct cross_lock *xlock; +	unsigned int gen_id; + +	if (!graph_lock()) +		return 0; + +	xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock; + +	/* +	 * When acquisitions for a crosslock are overlapped, we use +	 * nr_acquire to perform commit for them, based on cross_gen_id +	 * of the first acquisition, which allows to add additional +	 * dependencies. +	 * +	 * Moreover, when no acquisition of a crosslock is in progress, +	 * we should not perform commit because the lock might not exist +	 * any more, which might cause incorrect memory access. So we +	 * have to track the number of acquisitions of a crosslock. +	 * +	 * depend_after() is necessary to initialize only the first +	 * valid xlock so that the xlock can be used on its commit. +	 */ +	if (xlock->nr_acquire++ && depend_after(&xlock->hlock)) +		goto unlock; + +	gen_id = (unsigned int)atomic_inc_return(&cross_gen_id); +	xlock->hlock = *hlock; +	xlock->hlock.gen_id = gen_id; +unlock: +	graph_unlock(); +	return 1; +} + +/* + * Called for both normal and crosslock acquires. Normal locks will be + * pushed on the hist_lock queue. Cross locks will record state and + * stop regular lock_acquire() to avoid being placed on the held_lock + * stack. + * + * Return: 0 - failure; + *         1 - crosslock, done; + *         2 - normal lock, continue to held_lock[] ops. + */ +static int lock_acquire_crosslock(struct held_lock *hlock) +{ +	/* +	 *	CONTEXT 1		CONTEXT 2 +	 *	---------		--------- +	 *	lock A (cross) +	 *	X = atomic_inc_return(&cross_gen_id) +	 *	~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +	 *				Y = atomic_read_acquire(&cross_gen_id) +	 *				lock B +	 * +	 * atomic_read_acquire() is for ordering between A and B, +	 * IOW, A happens before B, when CONTEXT 2 see Y >= X. +	 * +	 * Pairs with atomic_inc_return() in add_xlock(). 
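The pairing described in the comment above is the generic pattern of a fully ordered read-modify-write on the publishing side and an acquire load on the observing side. A stand-alone sketch with invented names:

#include <linux/atomic.h>
#include <linux/bug.h>

static atomic_t gen;
static int payload;

static void publisher(void)
{
	WRITE_ONCE(payload, 42);	/* made visible by ... */
	(void)atomic_inc_return(&gen);	/* ... this fully ordered RMW */
}

static void observer(void)
{
	int g = atomic_read_acquire(&gen);

	/* Seeing the new generation guarantees seeing the payload store. */
	if (g >= 1)
		WARN_ON(READ_ONCE(payload) != 42);
}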
+	 */ +	hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id); + +	if (cross_lock(hlock->instance)) +		return add_xlock(hlock); + +	check_add_xhlock(hlock); +	return 2; +} + +static int copy_trace(struct stack_trace *trace) +{ +	unsigned long *buf = stack_trace + nr_stack_trace_entries; +	unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; +	unsigned int nr = min(max_nr, trace->nr_entries); + +	trace->nr_entries = nr; +	memcpy(buf, trace->entries, nr * sizeof(trace->entries[0])); +	trace->entries = buf; +	nr_stack_trace_entries += nr; + +	if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { +		if (!debug_locks_off_graph_unlock()) +			return 0; + +		print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); +		dump_stack(); + +		return 0; +	} + +	return 1; +} + +static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock) +{ +	unsigned int xid, pid; +	u64 chain_key; + +	xid = xlock_class(xlock) - lock_classes; +	chain_key = iterate_chain_key((u64)0, xid); +	pid = xhlock_class(xhlock) - lock_classes; +	chain_key = iterate_chain_key(chain_key, pid); + +	if (lookup_chain_cache(chain_key)) +		return 1; + +	if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context, +				chain_key)) +		return 0; + +	if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1, +			    &xhlock->trace, copy_trace)) +		return 0; + +	return 1; +} + +static void commit_xhlocks(struct cross_lock *xlock) +{ +	unsigned int cur = current->xhlock_idx; +	unsigned int prev_hist_id = xhlock(cur).hist_id; +	unsigned int i; + +	if (!graph_lock()) +		return; + +	if (xlock->nr_acquire) { +		for (i = 0; i < MAX_XHLOCKS_NR; i++) { +			struct hist_lock *xhlock = &xhlock(cur - i); + +			if (!xhlock_valid(xhlock)) +				break; + +			if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) +				break; + +			if (!same_context_xhlock(xhlock)) +				break; + +			/* +			 * Filter out the cases where the ring buffer was +			 * overwritten and the current entry has a bigger +			 * hist_id than the previous one, which is impossible +			 * otherwise: +			 */ +			if (unlikely(before(prev_hist_id, xhlock->hist_id))) +				break; + +			prev_hist_id = xhlock->hist_id; + +			/* +			 * commit_xhlock() returns 0 with graph_lock already +			 * released if fail. +			 */ +			if (!commit_xhlock(xlock, xhlock)) +				return; +		} +	} + +	graph_unlock(); +} + +void lock_commit_crosslock(struct lockdep_map *lock) +{ +	struct cross_lock *xlock; +	unsigned long flags; + +	if (unlikely(!debug_locks || current->lockdep_recursion)) +		return; + +	if (!current->xhlocks) +		return; + +	/* +	 * Do commit hist_locks with the cross_lock, only in case that +	 * the cross_lock could depend on acquisitions after that. +	 * +	 * For example, if the cross_lock does not have the 'check' flag +	 * then we don't need to check dependencies and commit for that. +	 * Just skip it. In that case, of course, the cross_lock does +	 * not depend on acquisitions ahead, either. +	 * +	 * WARNING: Don't do that in add_xlock() in advance. When an +	 * acquisition context is different from the commit context, +	 * invalid(skipped) cross_lock might be accessed. 
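Pairing with the earlier my_waitobj init sketch: the signalling side of such a hypothetical wrapper would commit the recorded history right before waking the waiter, which is the job of lock_commit_crosslock().

static inline void my_waitobj_signal(struct my_waitobj *w)
{
#ifdef CONFIG_LOCKDEP_CROSSRELEASE
	/*
	 * Connect every lock this context recorded since the waiter's
	 * crosslock acquire to w's class, creating the dependencies a
	 * conventional release never would.
	 */
	lock_commit_crosslock((struct lockdep_map *)&w->map);
#endif
	/* ... actually wake the waiting context ... */
}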
+	 */ +	if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock)) +		return; + +	raw_local_irq_save(flags); +	check_flags(flags); +	current->lockdep_recursion = 1; +	xlock = &((struct lockdep_map_cross *)lock)->xlock; +	commit_xhlocks(xlock); +	current->lockdep_recursion = 0; +	raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_commit_crosslock); + +/* + * Return: 0 - failure; + *         1 - crosslock, done; + *         2 - normal lock, continue to held_lock[] ops. + */ +static int lock_release_crosslock(struct lockdep_map *lock) +{ +	if (cross_lock(lock)) { +		if (!graph_lock()) +			return 0; +		((struct lockdep_map_cross *)lock)->xlock.nr_acquire--; +		graph_unlock(); +		return 1; +	} +	return 2; +} + +static void cross_init(struct lockdep_map *lock, int cross) +{ +	if (cross) +		((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0; + +	lock->cross = cross; + +	/* +	 * Crossrelease assumes that the ring buffer size of xhlocks +	 * is aligned with power of 2. So force it on build. +	 */ +	BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1)); +} + +void lockdep_init_task(struct task_struct *task) +{ +	int i; + +	task->xhlock_idx = UINT_MAX; +	task->hist_id = 0; + +	for (i = 0; i < XHLOCK_CTX_NR; i++) { +		task->xhlock_idx_hist[i] = UINT_MAX; +		task->hist_id_save[i] = 0; +	} + +	task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR, +				GFP_KERNEL); +} + +void lockdep_free_task(struct task_struct *task) +{ +	if (task->xhlocks) { +		void *tmp = task->xhlocks; +		/* Diable crossrelease for current */ +		task->xhlocks = NULL; +		kfree(tmp); +	} +} +#endif diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index c08fbd2f5ba9..1da4669d57a7 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -143,6 +143,8 @@ struct lockdep_stats {  	int	redundant_softirqs_on;  	int	redundant_softirqs_off;  	int	nr_unused_locks; +	int	nr_redundant_checks; +	int	nr_redundant;  	int	nr_cyclic_checks;  	int	nr_cyclic_check_recursions;  	int	nr_find_usage_forwards_checks; diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 6d1fcc786081..68d9e267ccd4 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -201,6 +201,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)  		debug_atomic_read(chain_lookup_hits));  	seq_printf(m, " cyclic checks:                 %11llu\n",  		debug_atomic_read(nr_cyclic_checks)); +	seq_printf(m, " redundant checks:              %11llu\n", +		debug_atomic_read(nr_redundant_checks)); +	seq_printf(m, " redundant links:               %11llu\n", +		debug_atomic_read(nr_redundant));  	seq_printf(m, " find-mask forwards checks:     %11llu\n",  		debug_atomic_read(nr_find_usage_forwards_checks));  	seq_printf(m, " find-mask backwards checks:    %11llu\n", diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h index 995b0cc2b84c..35ca09f2ed0b 100644 --- a/kernel/locking/lockdep_states.h +++ b/kernel/locking/lockdep_states.h @@ -6,4 +6,3 @@   */  LOCKDEP_STATE(HARDIRQ)  LOCKDEP_STATE(SOFTIRQ) -LOCKDEP_STATE(RECLAIM_FS) diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index a3167941093b..a74ee6abd039 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -109,6 +109,19 @@ bool osq_lock(struct optimistic_spin_queue *lock)  	prev = decode_cpu(old);  	node->prev = prev; + +	/* +	 * osq_lock()			unqueue +	 * +	 * node->prev = prev		osq_wait_next() +	 * WMB				MB +	 * 
prev->next = node		next->prev = prev // unqueue-C +	 * +	 * Here 'node->prev' and 'next->prev' are the same variable and we need +	 * to ensure these stores happen in-order to avoid corrupting the list. +	 */ +	smp_wmb(); +  	WRITE_ONCE(prev->next, node);  	/* diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index fd24153e8a48..294294c71ba4 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -268,123 +268,6 @@ static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,  #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath  #endif -/* - * Various notes on spin_is_locked() and spin_unlock_wait(), which are - * 'interesting' functions: - * - * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE - * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64, - * PPC). Also qspinlock has a similar issue per construction, the setting of - * the locked byte can be unordered acquiring the lock proper. - * - * This gets to be 'interesting' in the following cases, where the /should/s - * end up false because of this issue. - * - * - * CASE 1: - * - * So the spin_is_locked() correctness issue comes from something like: - * - *   CPU0				CPU1 - * - *   global_lock();			local_lock(i) - *     spin_lock(&G)			  spin_lock(&L[i]) - *     for (i)				  if (!spin_is_locked(&G)) { - *       spin_unlock_wait(&L[i]);	    smp_acquire__after_ctrl_dep(); - *					    return; - *					  } - *					  // deal with fail - * - * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such - * that there is exclusion between the two critical sections. - * - * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from - * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i]) - * /should/ be constrained by the ACQUIRE from spin_lock(&G). - * - * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB. - * - * - * CASE 2: - * - * For spin_unlock_wait() there is a second correctness issue, namely: - * - *   CPU0				CPU1 - * - *   flag = set; - *   smp_mb();				spin_lock(&l) - *   spin_unlock_wait(&l);		if (!flag) - *					  // add to lockless list - *					spin_unlock(&l); - *   // iterate lockless list - * - * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0 - * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE - * semantics etc..) - * - * Where flag /should/ be ordered against the locked store of l. - */ - -/* - * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before - * issuing an _unordered_ store to set _Q_LOCKED_VAL. - * - * This means that the store can be delayed, but no later than the - * store-release from the unlock. This means that simply observing - * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired. - * - * There are two paths that can issue the unordered store: - * - *  (1) clear_pending_set_locked():	*,1,0 -> *,0,1 - * - *  (2) set_locked():			t,0,0 -> t,0,1 ; t != 0 - *      atomic_cmpxchg_relaxed():	t,0,0 -> 0,0,1 - * - * However, in both cases we have other !0 state we've set before to queue - * ourseves: - * - * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our - * load is constrained by that ACQUIRE to not pass before that, and thus must - * observe the store. - * - * For (2) we have a more intersting scenario. We enqueue ourselves using - * xchg_tail(), which ends up being a RELEASE. 
This in itself is not - * sufficient, however that is followed by an smp_cond_acquire() on the same - * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and - * guarantees we must observe that store. - * - * Therefore both cases have other !0 state that is observable before the - * unordered locked byte store comes through. This means we can use that to - * wait for the lock store, and then wait for an unlock. - */ -#ifndef queued_spin_unlock_wait -void queued_spin_unlock_wait(struct qspinlock *lock) -{ -	u32 val; - -	for (;;) { -		val = atomic_read(&lock->val); - -		if (!val) /* not locked, we're done */ -			goto done; - -		if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */ -			break; - -		/* not locked, but pending, wait until we observe the lock */ -		cpu_relax(); -	} - -	/* any unlock is good */ -	while (atomic_read(&lock->val) & _Q_LOCKED_MASK) -		cpu_relax(); - -done: -	smp_acquire__after_ctrl_dep(); -} -EXPORT_SYMBOL(queued_spin_unlock_wait); -#endif -  #endif /* _GEN_PV_LOCK_SLOWPATH */  /** diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 4ccfcaae5b89..43555681c40b 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -72,7 +72,7 @@ static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)  	struct __qspinlock *l = (void *)lock;  	if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && -	    (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { +	    (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {  		qstat_inc(qstat_pv_lock_stealing, true);  		return true;  	} @@ -101,16 +101,16 @@ static __always_inline void clear_pending(struct qspinlock *lock)  /*   * The pending bit check in pv_queued_spin_steal_lock() isn't a memory - * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock - * just to be sure that it will get it. + * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the + * lock just to be sure that it will get it.   */  static __always_inline int trylock_clear_pending(struct qspinlock *lock)  {  	struct __qspinlock *l = (void *)lock;  	return !READ_ONCE(l->locked) && -	       (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) -			== _Q_PENDING_VAL); +	       (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL, +				_Q_LOCKED_VAL) == _Q_PENDING_VAL);  }  #else /* _Q_PENDING_BITS == 8 */  static __always_inline void set_pending(struct qspinlock *lock) @@ -138,7 +138,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)  		 */  		old = val;  		new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; -		val = atomic_cmpxchg(&lock->val, old, new); +		val = atomic_cmpxchg_acquire(&lock->val, old, new);  		if (val == old)  			return 1; @@ -362,8 +362,18 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)  	 * observe its next->locked value and advance itself.  	 *  	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node() +	 * +	 * The write to next->locked in arch_mcs_spin_unlock_contended() +	 * must be ordered before the read of pn->state in the cmpxchg() +	 * below for the code to work correctly. To guarantee full ordering +	 * irrespective of the success or failure of the cmpxchg(), +	 * a relaxed version with explicit barrier is used. The control +	 * dependency will order the reading of pn->state before any +	 * subsequent writes.  	 
*/ -	if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted) +	smp_mb__before_atomic(); +	if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed) +	    != vcpu_halted)  		return;  	/* diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 72ad45a9a794..8d039b928d61 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -40,6 +40,9 @@ struct rt_mutex_waiter {  /*   * Various helpers to access the waiters-tree:   */ + +#ifdef CONFIG_RT_MUTEXES +  static inline int rt_mutex_has_waiters(struct rt_mutex *lock)  {  	return !RB_EMPTY_ROOT(&lock->waiters); @@ -69,6 +72,32 @@ task_top_pi_waiter(struct task_struct *p)  			pi_tree_entry);  } +#else + +static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +{ +	return false; +} + +static inline struct rt_mutex_waiter * +rt_mutex_top_waiter(struct rt_mutex *lock) +{ +	return NULL; +} + +static inline int task_has_pi_waiters(struct task_struct *p) +{ +	return false; +} + +static inline struct rt_mutex_waiter * +task_top_pi_waiter(struct task_struct *p) +{ +	return NULL; +} + +#endif +  /*   * lock->owner state tracking:   */ diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 20819df98125..0848634c5512 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -126,7 +126,7 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem)  /*   * get a read lock on the semaphore   */ -void __sched __down_read(struct rw_semaphore *sem) +int __sched __down_read_common(struct rw_semaphore *sem, int state)  {  	struct rwsem_waiter waiter;  	unsigned long flags; @@ -140,8 +140,6 @@ void __sched __down_read(struct rw_semaphore *sem)  		goto out;  	} -	set_current_state(TASK_UNINTERRUPTIBLE); -  	/* set up my own style of waitqueue */  	waiter.task = current;  	waiter.type = RWSEM_WAITING_FOR_READ; @@ -149,20 +147,41 @@ void __sched __down_read(struct rw_semaphore *sem)  	list_add_tail(&waiter.list, &sem->wait_list); -	/* we don't need to touch the semaphore struct anymore */ -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -  	/* wait to be given the lock */  	for (;;) {  		if (!waiter.task)  			break; +		if (signal_pending_state(state, current)) +			goto out_nolock; +		set_current_state(state); +		raw_spin_unlock_irqrestore(&sem->wait_lock, flags);  		schedule(); -		set_current_state(TASK_UNINTERRUPTIBLE); +		raw_spin_lock_irqsave(&sem->wait_lock, flags);  	} -	__set_current_state(TASK_RUNNING); +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);   out: -	; +	return 0; + +out_nolock: +	/* +	 * We didn't take the lock, so that there is a writer, which +	 * is owner or the first waiter of the sem. If it's a waiter, +	 * it will be woken by current owner. Not need to wake anybody. 
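The rwsem-spinlock conversion above follows the usual killable-wait shape, and the same shape appears again in rwsem-xadd.c just below. As a stand-alone sketch (the ready() and unqueue() callbacks are invented placeholders), the loop re-checks for a fatal signal before every sleep and unwinds its queueing if one arrived:

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>

static int killable_wait_sketch(int state, bool (*ready)(void),
				void (*unqueue)(void))
{
	for (;;) {
		set_current_state(state);	/* TASK_UNINTERRUPTIBLE or TASK_KILLABLE */
		if (ready())
			break;
		if (signal_pending_state(state, current)) {
			unqueue();		/* undo whatever was queued */
			__set_current_state(TASK_RUNNING);
			return -EINTR;
		}
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}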
+	 */ +	list_del(&waiter.list); +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +	return -EINTR; +} + +void __sched __down_read(struct rw_semaphore *sem) +{ +	__down_read_common(sem, TASK_UNINTERRUPTIBLE); +} + +int __sched __down_read_killable(struct rw_semaphore *sem) +{ +	return __down_read_common(sem, TASK_KILLABLE);  }  /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 34e727f18e49..02f660666ab8 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -221,8 +221,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,  /*   * Wait for the read lock to be granted   */ -__visible -struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) +static inline struct rw_semaphore __sched * +__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)  {  	long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;  	struct rwsem_waiter waiter; @@ -255,17 +255,44 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)  	/* wait to be given the lock */  	while (true) { -		set_current_state(TASK_UNINTERRUPTIBLE); +		set_current_state(state);  		if (!waiter.task)  			break; +		if (signal_pending_state(state, current)) { +			raw_spin_lock_irq(&sem->wait_lock); +			if (waiter.task) +				goto out_nolock; +			raw_spin_unlock_irq(&sem->wait_lock); +			break; +		}  		schedule();  	}  	__set_current_state(TASK_RUNNING);  	return sem; +out_nolock: +	list_del(&waiter.list); +	if (list_empty(&sem->wait_list)) +		atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); +	raw_spin_unlock_irq(&sem->wait_lock); +	__set_current_state(TASK_RUNNING); +	return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed(struct rw_semaphore *sem) +{ +	return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);  }  EXPORT_SYMBOL(rwsem_down_read_failed); +__visible struct rw_semaphore * __sched +rwsem_down_read_failed_killable(struct rw_semaphore *sem) +{ +	return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed_killable); +  /*   * This function must be called with the sem->wait_lock held to prevent   * race conditions between checking the rwsem wait list and setting the diff --git a/kernel/membarrier.c b/kernel/membarrier.c deleted file mode 100644 index 9f9284f37f8d..000000000000 --- a/kernel/membarrier.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> - * - * membarrier system call - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - */ - -#include <linux/syscalls.h> -#include <linux/membarrier.h> -#include <linux/tick.h> - -/* - * Bitmask made from a "or" of all commands within enum membarrier_cmd, - * except MEMBARRIER_CMD_QUERY. - */ -#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED) - -/** - * sys_membarrier - issue memory barriers on a set of threads - * @cmd:   Takes command values defined in enum membarrier_cmd. - * @flags: Currently needs to be 0. For future extensions. 
- * - * If this system call is not implemented, -ENOSYS is returned. If the - * command specified does not exist, or if the command argument is invalid, - * this system call returns -EINVAL. For a given command, with flags argument - * set to 0, this system call is guaranteed to always return the same value - * until reboot. - * - * All memory accesses performed in program order from each targeted thread - * is guaranteed to be ordered with respect to sys_membarrier(). If we use - * the semantic "barrier()" to represent a compiler barrier forcing memory - * accesses to be performed in program order across the barrier, and - * smp_mb() to represent explicit memory barriers forcing full memory - * ordering across the barrier, we have the following ordering table for - * each pair of barrier(), sys_membarrier() and smp_mb(): - * - * The pair ordering is detailed as (O: ordered, X: not ordered): - * - *                        barrier()   smp_mb() sys_membarrier() - *        barrier()          X           X            O - *        smp_mb()           X           O            O - *        sys_membarrier()   O           O            O - */ -SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) -{ -	/* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ -	if (tick_nohz_full_enabled()) -		return -ENOSYS; -	if (unlikely(flags)) -		return -EINVAL; -	switch (cmd) { -	case MEMBARRIER_CMD_QUERY: -		return MEMBARRIER_CMD_BITMASK; -	case MEMBARRIER_CMD_SHARED: -		if (num_online_cpus() > 1) -			synchronize_sched(); -		return 0; -	default: -		return -EINVAL; -	} -} diff --git a/kernel/memremap.c b/kernel/memremap.c index 124bed776532..9afdc434fb49 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -34,13 +34,24 @@ static void *arch_memremap_wb(resource_size_t offset, unsigned long size)  }  #endif -static void *try_ram_remap(resource_size_t offset, size_t size) +#ifndef arch_memremap_can_ram_remap +static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, +					unsigned long flags) +{ +	return true; +} +#endif + +static void *try_ram_remap(resource_size_t offset, size_t size, +			   unsigned long flags)  {  	unsigned long pfn = PHYS_PFN(offset);  	/* In the simple case just return the existing linear address */ -	if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) +	if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) && +	    arch_memremap_can_ram_remap(offset, size, flags))  		return __va(offset); +  	return NULL; /* fallback to arch_memremap_wb */  } @@ -48,7 +59,8 @@ static void *try_ram_remap(resource_size_t offset, size_t size)   * memremap() - remap an iomem_resource as cacheable memory   * @offset: iomem resource start address   * @size: size of remap - * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC + * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC, + *		  MEMREMAP_ENC, MEMREMAP_DEC   *   * memremap() is "ioremap" for cases where it is known that the resource   * being mapped does not have i/o side effects and the __iomem @@ -95,7 +107,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)  		 * the requested range is potentially in System RAM.  		 
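The new arch_memremap_can_ram_remap() hook above defaults to allowing the linear-map shortcut. A hypothetical override might refuse it when the caller requested an explicit encryption state that the linear mapping cannot provide; the policy below is an assumption for illustration, and only the hook name and the MEMREMAP_ENC/MEMREMAP_DEC flags come from the patch.

/* In some architecture header (sketch); MEMREMAP_* come from <linux/io.h>. */
#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap
static inline bool arch_memremap_can_ram_remap(resource_size_t offset,
					       size_t size,
					       unsigned long flags)
{
	/*
	 * The kernel linear mapping has one fixed encryption state, so an
	 * explicit MEMREMAP_ENC or MEMREMAP_DEC request may need a fresh
	 * mapping instead of __va(offset).
	 */
	if (flags & (MEMREMAP_ENC | MEMREMAP_DEC))
		return false;
	return true;
}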
*/  		if (is_ram == REGION_INTERSECTS) -			addr = try_ram_remap(offset, size); +			addr = try_ram_remap(offset, size, flags);  		if (!addr)  			addr = arch_memremap_wb(offset, size);  	} diff --git a/kernel/panic.c b/kernel/panic.c index a58932b41700..bdd18afa19a4 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -26,6 +26,7 @@  #include <linux/nmi.h>  #include <linux/console.h>  #include <linux/bug.h> +#include <linux/ratelimit.h>  #define PANIC_TIMER_STEP 100  #define PANIC_BLINK_SPD 18 @@ -601,6 +602,17 @@ EXPORT_SYMBOL(__stack_chk_fail);  #endif +#ifdef CONFIG_ARCH_HAS_REFCOUNT +void refcount_error_report(struct pt_regs *regs, const char *err) +{ +	WARN_RATELIMIT(1, "refcount_t %s at %pB in %s[%d], uid/euid: %u/%u\n", +		err, (void *)instruction_pointer(regs), +		current->comm, task_pid_nr(current), +		from_kuid_munged(&init_user_ns, current_uid()), +		from_kuid_munged(&init_user_ns, current_euid())); +} +#endif +  core_param(panic, panic_timeout, int, 0644);  core_param(pause_on_oops, pause_on_oops, int, 0644);  core_param(panic_on_warn, panic_on_warn, int, 0644); diff --git a/kernel/pid.c b/kernel/pid.c index c69c30d827e5..020dedbdf066 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -527,8 +527,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,  	if (!ns)  		ns = task_active_pid_ns(current);  	if (likely(pid_alive(task))) { -		if (type != PIDTYPE_PID) +		if (type != PIDTYPE_PID) { +			if (type == __PIDTYPE_TGID) +				type = PIDTYPE_PID;  			task = task->group_leader; +		}  		nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);  	}  	rcu_read_unlock(); @@ -537,12 +540,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,  }  EXPORT_SYMBOL(__task_pid_nr_ns); -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ -	return pid_nr_ns(task_tgid(tsk), ns); -} -EXPORT_SYMBOL(task_tgid_nr_ns); -  struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)  {  	return ns_of_pid(task_pid(tsk)); diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index be90c945063f..9210379c0353 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -69,8 +69,7 @@ config TREE_SRCU  	  This option selects the full-fledged version of SRCU.  config TASKS_RCU -	bool -	default n +	def_bool PREEMPT  	select SRCU  	help  	  This option enables a task-based RCU implementation that uses diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 808b8c85f626..e4b43fef89f5 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -356,22 +356,10 @@ do {									\  #ifdef CONFIG_TINY_RCU  /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ -static inline bool rcu_gp_is_normal(void)  /* Internal RCU use. */ -{ -	return true; -} -static inline bool rcu_gp_is_expedited(void)  /* Internal RCU use. */ -{ -	return false; -} - -static inline void rcu_expedite_gp(void) -{ -} - -static inline void rcu_unexpedite_gp(void) -{ -} +static inline bool rcu_gp_is_normal(void) { return true; } +static inline bool rcu_gp_is_expedited(void) { return false; } +static inline void rcu_expedite_gp(void) { } +static inline void rcu_unexpedite_gp(void) { }  #else /* #ifdef CONFIG_TINY_RCU */  bool rcu_gp_is_normal(void);     /* Internal RCU use. */  bool rcu_gp_is_expedited(void);  /* Internal RCU use. 
*/ @@ -419,12 +407,8 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,  	*gpnum = 0;  	*completed = 0;  } -static inline void rcutorture_record_test_transition(void) -{ -} -static inline void rcutorture_record_progress(unsigned long vernum) -{ -} +static inline void rcutorture_record_test_transition(void) { } +static inline void rcutorture_record_progress(unsigned long vernum) { }  #ifdef CONFIG_RCU_TRACE  void do_trace_rcu_torture_read(const char *rcutorturename,  			       struct rcu_head *rhp, @@ -460,92 +444,20 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,  #endif  #ifdef CONFIG_TINY_RCU - -/* - * Return the number of grace periods started. - */ -static inline unsigned long rcu_batches_started(void) -{ -	return 0; -} - -/* - * Return the number of bottom-half grace periods started. - */ -static inline unsigned long rcu_batches_started_bh(void) -{ -	return 0; -} - -/* - * Return the number of sched grace periods started. - */ -static inline unsigned long rcu_batches_started_sched(void) -{ -	return 0; -} - -/* - * Return the number of grace periods completed. - */ -static inline unsigned long rcu_batches_completed(void) -{ -	return 0; -} - -/* - * Return the number of bottom-half grace periods completed. - */ -static inline unsigned long rcu_batches_completed_bh(void) -{ -	return 0; -} - -/* - * Return the number of sched grace periods completed. - */ -static inline unsigned long rcu_batches_completed_sched(void) -{ -	return 0; -} - -/* - * Return the number of expedited grace periods completed. - */ -static inline unsigned long rcu_exp_batches_completed(void) -{ -	return 0; -} - -/* - * Return the number of expedited sched grace periods completed. - */ -static inline unsigned long rcu_exp_batches_completed_sched(void) -{ -	return 0; -} - -static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) -{ -	return 0; -} - -static inline void rcu_force_quiescent_state(void) -{ -} - -static inline void rcu_bh_force_quiescent_state(void) -{ -} - -static inline void rcu_sched_force_quiescent_state(void) -{ -} - -static inline void show_rcu_gp_kthreads(void) -{ -} - +static inline unsigned long rcu_batches_started(void) { return 0; } +static inline unsigned long rcu_batches_started_bh(void) { return 0; } +static inline unsigned long rcu_batches_started_sched(void) { return 0; } +static inline unsigned long rcu_batches_completed(void) { return 0; } +static inline unsigned long rcu_batches_completed_bh(void) { return 0; } +static inline unsigned long rcu_batches_completed_sched(void) { return 0; } +static inline unsigned long rcu_exp_batches_completed(void) { return 0; } +static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; } +static inline unsigned long +srcu_batches_completed(struct srcu_struct *sp) { return 0; } +static inline void rcu_force_quiescent_state(void) { } +static inline void rcu_bh_force_quiescent_state(void) { } +static inline void rcu_sched_force_quiescent_state(void) { } +static inline void show_rcu_gp_kthreads(void) { }  #else /* #ifdef CONFIG_TINY_RCU */  extern unsigned long rcutorture_testseq;  extern unsigned long rcutorture_vernum; diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 2b62a38b080f..7649fcd2c4c7 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -36,24 +36,6 @@ void rcu_cblist_init(struct rcu_cblist *rclp)  }  /* - * Debug function to actually count the number of callbacks. - * If the number exceeds the limit specified, return -1. 
- */ -long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) -{ -	int cnt = 0; -	struct rcu_head **rhpp = &rclp->head; - -	for (;;) { -		if (!*rhpp) -			return cnt; -		if (++cnt > lim) -			return -1; -		rhpp = &(*rhpp)->next; -	} -} - -/*   * Dequeue the oldest rcu_head structure from the specified callback   * list.  This function assumes that the callback is non-lazy, but   * the caller can later invoke rcu_cblist_dequeued_lazy() if it @@ -103,17 +85,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)  }  /* - * Is the specified segment of the specified rcu_segcblist structure - * empty of callbacks? - */ -bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) -{ -	if (seg == RCU_DONE_TAIL) -		return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; -	return rsclp->tails[seg - 1] == rsclp->tails[seg]; -} - -/*   * Does the specified rcu_segcblist structure contain callbacks that   * are ready to be invoked?   */ @@ -134,50 +105,6 @@ bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp)  }  /* - * Dequeue and return the first ready-to-invoke callback.  If there - * are no ready-to-invoke callbacks, return NULL.  Disables interrupts - * to avoid interference.  Does not protect from interference from other - * CPUs or tasks. - */ -struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) -{ -	unsigned long flags; -	int i; -	struct rcu_head *rhp; - -	local_irq_save(flags); -	if (!rcu_segcblist_ready_cbs(rsclp)) { -		local_irq_restore(flags); -		return NULL; -	} -	rhp = rsclp->head; -	BUG_ON(!rhp); -	rsclp->head = rhp->next; -	for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { -		if (rsclp->tails[i] != &rhp->next) -			break; -		rsclp->tails[i] = &rsclp->head; -	} -	smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ -	WRITE_ONCE(rsclp->len, rsclp->len - 1); -	local_irq_restore(flags); -	return rhp; -} - -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) -{ -	unsigned long flags; - -	local_irq_save(flags); -	rsclp->len_lazy--; -	local_irq_restore(flags); -} - -/*   * Return a pointer to the first callback in the specified rcu_segcblist   * structure.  This is useful for diagnostics.   */ @@ -203,17 +130,6 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)  }  /* - * Does the specified rcu_segcblist structure contain callbacks that - * have not yet been processed beyond having been posted, that is, - * does it contain callbacks in its last segment? - */ -bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) -{ -	return rcu_segcblist_is_enabled(rsclp) && -	       !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); -} - -/*   * Enqueue the specified callback onto the specified rcu_segcblist   * structure, updating accounting as needed.  Note that the ->len   * field may be accessed locklessly, hence the WRITE_ONCE(). @@ -503,3 +419,27 @@ bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,  			return true;  	return false;  } + +/* + * Merge the source rcu_segcblist structure into the destination + * rcu_segcblist structure, then initialize the source.  Any pending + * callbacks from the source get to start over.  It is best to + * advance and accelerate both the destination and the source + * before merging. 
+ */ +void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, +			 struct rcu_segcblist *src_rsclp) +{ +	struct rcu_cblist donecbs; +	struct rcu_cblist pendcbs; + +	rcu_cblist_init(&donecbs); +	rcu_cblist_init(&pendcbs); +	rcu_segcblist_extract_count(src_rsclp, &donecbs); +	rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs); +	rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs); +	rcu_segcblist_insert_count(dst_rsclp, &donecbs); +	rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs); +	rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs); +	rcu_segcblist_init(src_rsclp); +} diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 6e36e36478cd..581c12b63544 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -31,29 +31,7 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)  	rclp->len_lazy--;  } -/* - * Interim function to return rcu_cblist head pointer.  Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) -{ -	return rclp->head; -} - -/* - * Interim function to return rcu_cblist head pointer.  Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) -{ -	WARN_ON_ONCE(!rclp->head); -	return rclp->tail; -} -  void rcu_cblist_init(struct rcu_cblist *rclp); -long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim);  struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);  /* @@ -134,14 +112,10 @@ static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)  void rcu_segcblist_init(struct rcu_segcblist *rsclp);  void rcu_segcblist_disable(struct rcu_segcblist *rsclp); -bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg);  bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);  bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); -struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp); -void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp);  struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);  struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); -bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp);  void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,  			   struct rcu_head *rhp, bool lazy);  bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, @@ -162,3 +136,5 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);  bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);  bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,  				    unsigned long seq); +void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, +			 struct rcu_segcblist *src_rsclp); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 3cc18110b612..1f87a02c3399 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -317,8 +317,6 @@ static struct rcu_perf_ops sched_ops = {  	.name		= "sched"  }; -#ifdef CONFIG_TASKS_RCU -  /*   * Definitions for RCU-tasks perf testing.   
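Returning to rcu_segcblist_merge() just above: its natural caller is CPU-hotplug callback migration, which (as the removal of the ->orphan_* machinery later in this series suggests) can move a dead CPU's callbacks straight into a surviving CPU's list. A hedged sketch of such a caller, with invented names and all locking omitted:

/* Assumes the internal kernel/rcu/rcu_segcblist.h declarations. */
static void migrate_dead_cpu_cbs_sketch(struct rcu_segcblist *mine,
					struct rcu_segcblist *theirs,
					unsigned long gp_seq)
{
	/* Advance both lists first, as the comment above recommends. */
	rcu_segcblist_advance(mine, gp_seq);
	rcu_segcblist_advance(theirs, gp_seq);
	rcu_segcblist_merge(mine, theirs);	/* 'theirs' is reinitialized */
}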
*/ @@ -346,24 +344,11 @@ static struct rcu_perf_ops tasks_ops = {  	.name		= "tasks"  }; -#define RCUPERF_TASKS_OPS &tasks_ops, -  static bool __maybe_unused torturing_tasks(void)  {  	return cur_ops == &tasks_ops;  } -#else /* #ifdef CONFIG_TASKS_RCU */ - -#define RCUPERF_TASKS_OPS - -static bool __maybe_unused torturing_tasks(void) -{ -	return false; -} - -#endif /* #else #ifdef CONFIG_TASKS_RCU */ -  /*   * If performance tests complete, wait for shutdown to commence.   */ @@ -658,7 +643,7 @@ rcu_perf_init(void)  	int firsterr = 0;  	static struct rcu_perf_ops *perf_ops[] = {  		&rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops, -		RCUPERF_TASKS_OPS +		&tasks_ops,  	};  	if (!torture_init_begin(perf_type, verbose, &perf_runnable)) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b8f7f8ce8575..45f2ffbc1e78 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -199,7 +199,8 @@ MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");  static u64 notrace rcu_trace_clock_local(void)  {  	u64 ts = trace_clock_local(); -	unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); + +	(void)do_div(ts, NSEC_PER_USEC);  	return ts;  }  #else /* #ifdef CONFIG_RCU_TRACE */ @@ -496,7 +497,7 @@ static struct rcu_torture_ops rcu_busted_ops = {  	.fqs		= NULL,  	.stats		= NULL,  	.irq_capable	= 1, -	.name		= "rcu_busted" +	.name		= "busted"  };  /* @@ -522,7 +523,7 @@ static void srcu_read_delay(struct torture_random_state *rrsp)  	delay = torture_random(rrsp) %  		(nrealreaders * 2 * longdelay * uspertick); -	if (!delay) +	if (!delay && in_task())  		schedule_timeout_interruptible(longdelay);  	else  		rcu_read_delay(rrsp); @@ -561,44 +562,7 @@ static void srcu_torture_barrier(void)  static void srcu_torture_stats(void)  { -	int __maybe_unused cpu; -	int idx; - -#ifdef CONFIG_TREE_SRCU -	idx = srcu_ctlp->srcu_idx & 0x1; -	pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", -		 torture_type, TORTURE_FLAG, idx); -	for_each_possible_cpu(cpu) { -		unsigned long l0, l1; -		unsigned long u0, u1; -		long c0, c1; -		struct srcu_data *counts; - -		counts = per_cpu_ptr(srcu_ctlp->sda, cpu); -		u0 = counts->srcu_unlock_count[!idx]; -		u1 = counts->srcu_unlock_count[idx]; - -		/* -		 * Make sure that a lock is always counted if the corresponding -		 * unlock is counted. -		 */ -		smp_rmb(); - -		l0 = counts->srcu_lock_count[!idx]; -		l1 = counts->srcu_lock_count[idx]; - -		c0 = l0 - u0; -		c1 = l1 - u1; -		pr_cont(" %d(%ld,%ld)", cpu, c0, c1); -	} -	pr_cont("\n"); -#elif defined(CONFIG_TINY_SRCU) -	idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; -	pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", -		 torture_type, TORTURE_FLAG, idx, -		 READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), -		 READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); -#endif +	srcu_torture_stats_print(srcu_ctlp, torture_type, TORTURE_FLAG);  }  static void srcu_torture_synchronize_expedited(void) @@ -620,6 +584,7 @@ static struct rcu_torture_ops srcu_ops = {  	.call		= srcu_torture_call,  	.cb_barrier	= srcu_torture_barrier,  	.stats		= srcu_torture_stats, +	.irq_capable	= 1,  	.name		= "srcu"  }; @@ -652,6 +617,7 @@ static struct rcu_torture_ops srcud_ops = {  	.call		= srcu_torture_call,  	.cb_barrier	= srcu_torture_barrier,  	.stats		= srcu_torture_stats, +	.irq_capable	= 1,  	.name		= "srcud"  }; @@ -696,8 +662,6 @@ static struct rcu_torture_ops sched_ops = {  	.name		= "sched"  }; -#ifdef CONFIG_TASKS_RCU -  /*   * Definitions for RCU-tasks torture testing.   
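On the rcu_trace_clock_local() change near the top of this hunk: do_div() divides its 64-bit first argument in place and returns the 32-bit remainder, which is why the remainder can now simply be cast to void instead of being parked in a __maybe_unused local. A minimal reminder of the semantics, with illustrative values:

#include <linux/types.h>
#include <asm/div64.h>

static u64 ns_to_us_sketch(u64 ns)
{
	u32 rem = do_div(ns, 1000);	/* ns = 1234567 -> ns == 1234, rem == 567 */

	(void)rem;			/* remainder is often not needed */
	return ns;			/* quotient is left in place */
}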
*/ @@ -735,24 +699,11 @@ static struct rcu_torture_ops tasks_ops = {  	.name		= "tasks"  }; -#define RCUTORTURE_TASKS_OPS &tasks_ops, -  static bool __maybe_unused torturing_tasks(void)  {  	return cur_ops == &tasks_ops;  } -#else /* #ifdef CONFIG_TASKS_RCU */ - -#define RCUTORTURE_TASKS_OPS - -static bool __maybe_unused torturing_tasks(void) -{ -	return false; -} - -#endif /* #else #ifdef CONFIG_TASKS_RCU */ -  /*   * RCU torture priority-boost testing.  Runs one real-time thread per   * CPU for moderate bursts, repeatedly registering RCU callbacks and @@ -1114,6 +1065,11 @@ rcu_torture_fakewriter(void *arg)  	return 0;  } +static void rcu_torture_timer_cb(struct rcu_head *rhp) +{ +	kfree(rhp); +} +  /*   * RCU torture reader from timer handler.  Dereferences rcu_torture_current,   * incrementing the corresponding element of the pipeline array.  The @@ -1176,6 +1132,14 @@ static void rcu_torture_timer(unsigned long unused)  	__this_cpu_inc(rcu_torture_batch[completed]);  	preempt_enable();  	cur_ops->readunlock(idx); + +	/* Test call_rcu() invocation from interrupt handler. */ +	if (cur_ops->call) { +		struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_NOWAIT); + +		if (rhp) +			cur_ops->call(rhp, rcu_torture_timer_cb); +	}  }  /* @@ -1354,11 +1318,12 @@ rcu_torture_stats_print(void)  		srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,  					&flags, &gpnum, &completed);  		wtp = READ_ONCE(writer_task); -		pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", +		pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n",  			 rcu_torture_writer_state_getname(),  			 rcu_torture_writer_state,  			 gpnum, completed, flags, -			 wtp == NULL ? ~0UL : wtp->state); +			 wtp == NULL ? ~0UL : wtp->state, +			 wtp == NULL ? -1 : (int)task_cpu(wtp));  		show_rcu_gp_kthreads();  		rcu_ftrace_dump(DUMP_ALL);  	} @@ -1749,7 +1714,7 @@ rcu_torture_init(void)  	int firsterr = 0;  	static struct rcu_torture_ops *torture_ops[] = {  		&rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, -		&sched_ops, RCUTORTURE_TASKS_OPS +		&sched_ops, &tasks_ops,  	};  	if (!torture_init_begin(torture_type, verbose, &torture_runnable)) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 1a1c1047d2ed..76ac5f50b2c7 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -33,6 +33,8 @@  #include "rcu_segcblist.h"  #include "rcu.h" +int rcu_scheduler_active __read_mostly; +  static int init_srcu_struct_fields(struct srcu_struct *sp)  {  	sp->srcu_lock_nesting[0] = 0; @@ -193,3 +195,9 @@ void synchronize_srcu(struct srcu_struct *sp)  	destroy_rcu_head_on_stack(&rs.head);  }  EXPORT_SYMBOL_GPL(synchronize_srcu); + +/* Lockdep diagnostics.  */ +void __init rcu_scheduler_starting(void) +{ +	rcu_scheduler_active = RCU_SCHEDULER_RUNNING; +} diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d0ca524bf042..729a8706751d 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -51,6 +51,7 @@ module_param(counter_wrap_check, ulong, 0444);  static void srcu_invoke_callbacks(struct work_struct *work);  static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); +static void process_srcu(struct work_struct *work);  /*   * Initialize SRCU combining tree.  
Note that statically allocated @@ -896,6 +897,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)  	__call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm);  	wait_for_completion(&rcu.completion);  	destroy_rcu_head_on_stack(&rcu.head); + +	/* +	 * Make sure that later code is ordered after the SRCU grace +	 * period.  This pairs with the raw_spin_lock_irq_rcu_node() +	 * in srcu_invoke_callbacks().  Unlike Tree RCU, this is needed +	 * because the current CPU might have been totally uninvolved with +	 * (and thus unordered against) that grace period. +	 */ +	smp_mb();  }  /** @@ -1194,7 +1204,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)  /*   * This is the work-queue function that handles SRCU grace periods.   */ -void process_srcu(struct work_struct *work) +static void process_srcu(struct work_struct *work)  {  	struct srcu_struct *sp; @@ -1203,7 +1213,6 @@ void process_srcu(struct work_struct *work)  	srcu_advance_state(sp);  	srcu_reschedule(sp, srcu_get_delay(sp));  } -EXPORT_SYMBOL_GPL(process_srcu);  void srcutorture_get_gp_data(enum rcutorture_type test_type,  			     struct srcu_struct *sp, int *flags, @@ -1217,6 +1226,43 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,  }  EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); +void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) +{ +	int cpu; +	int idx; +	unsigned long s0 = 0, s1 = 0; + +	idx = sp->srcu_idx & 0x1; +	pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx); +	for_each_possible_cpu(cpu) { +		unsigned long l0, l1; +		unsigned long u0, u1; +		long c0, c1; +		struct srcu_data *counts; + +		counts = per_cpu_ptr(sp->sda, cpu); +		u0 = counts->srcu_unlock_count[!idx]; +		u1 = counts->srcu_unlock_count[idx]; + +		/* +		 * Make sure that a lock is always counted if the corresponding +		 * unlock is counted. +		 */ +		smp_rmb(); + +		l0 = counts->srcu_lock_count[!idx]; +		l1 = counts->srcu_lock_count[idx]; + +		c0 = l0 - u0; +		c1 = l1 - u1; +		pr_cont(" %d(%ld,%ld)", cpu, c0, c1); +		s0 += c0; +		s1 += c1; +	} +	pr_cont(" T(%ld,%ld)\n", s0, s1); +} +EXPORT_SYMBOL_GPL(srcu_torture_stats_print); +  static int __init srcu_bootup_announce(void)  {  	pr_info("Hierarchical SRCU implementation.\n"); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index f8488965250f..a64eee0db39e 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -56,8 +56,6 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {  	.curtail	= &rcu_bh_ctrlblk.rcucblist,  }; -#include "tiny_plugin.h" -  void rcu_barrier_bh(void)  {  	wait_rcu_gp(call_rcu_bh); diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h deleted file mode 100644 index f0a01b2a3062..000000000000 --- a/kernel/rcu/tiny_plugin.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition - * Internal non-public definitions that provide either classic - * or preemptible semantics. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright (c) 2010 Linaro - * - * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> - */ - -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) -#include <linux/kernel_stat.h> - -int rcu_scheduler_active __read_mostly; -EXPORT_SYMBOL_GPL(rcu_scheduler_active); - -/* - * During boot, we forgive RCU lockdep issues.  After this function is - * invoked, we start taking RCU lockdep issues seriously.  Note that unlike - * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE - * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. - * The reason for this is that Tiny RCU does not need kthreads, so does - * not have to care about the fact that the scheduler is half-initialized - * at a certain phase of the boot process.  Unless SRCU is in the mix. - */ -void __init rcu_scheduler_starting(void) -{ -	WARN_ON(nr_context_switches() > 0); -	rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU) -		? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING; -} - -#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51d4c3acf32d..84fe96641b2e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -97,9 +97,6 @@ struct rcu_state sname##_state = { \  	.gp_state = RCU_GP_IDLE, \  	.gpnum = 0UL - 300UL, \  	.completed = 0UL - 300UL, \ -	.orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ -	.orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \ -	.orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \  	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \  	.name = RCU_STATE_NAME(sname), \  	.abbr = sabbr, \ @@ -843,13 +840,9 @@ static void rcu_eqs_enter(bool user)   */  void rcu_idle_enter(void)  { -	unsigned long flags; - -	local_irq_save(flags); +	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!");  	rcu_eqs_enter(false); -	local_irq_restore(flags);  } -EXPORT_SYMBOL_GPL(rcu_idle_enter);  #ifdef CONFIG_NO_HZ_FULL  /** @@ -862,7 +855,8 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);   */  void rcu_user_enter(void)  { -	rcu_eqs_enter(1); +	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!"); +	rcu_eqs_enter(true);  }  #endif /* CONFIG_NO_HZ_FULL */ @@ -955,8 +949,10 @@ static void rcu_eqs_exit(bool user)  	if (oldval & DYNTICK_TASK_NEST_MASK) {  		rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;  	} else { +		__this_cpu_inc(disable_rcu_irq_enter);  		rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;  		rcu_eqs_exit_common(oldval, user); +		__this_cpu_dec(disable_rcu_irq_enter);  	}  } @@ -979,7 +975,6 @@ void rcu_idle_exit(void)  	rcu_eqs_exit(false);  	local_irq_restore(flags);  } -EXPORT_SYMBOL_GPL(rcu_idle_exit);  #ifdef CONFIG_NO_HZ_FULL  /** @@ -1358,12 +1353,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)  	j = jiffies;  	gpa = READ_ONCE(rsp->gp_activity);  	if (j - gpa > 2 * HZ) { -		pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", +		pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n",  		       rsp->name, j - gpa,  		       rsp->gpnum, rsp->completed,  		       rsp->gp_flags,  		       gp_state_getname(rsp->gp_state), rsp->gp_state, -		       rsp->gp_kthread ? 
rsp->gp_kthread->state : ~0); +		       rsp->gp_kthread ? rsp->gp_kthread->state : ~0, +		       rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1);  		if (rsp->gp_kthread) {  			sched_show_task(rsp->gp_kthread);  			wake_up_process(rsp->gp_kthread); @@ -2067,8 +2063,8 @@ static bool rcu_gp_init(struct rcu_state *rsp)  }  /* - * Helper function for wait_event_interruptible_timeout() wakeup - * at force-quiescent-state time. + * Helper function for swait_event_idle() wakeup at force-quiescent-state + * time.   */  static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)  { @@ -2206,9 +2202,8 @@ static int __noreturn rcu_gp_kthread(void *arg)  					       READ_ONCE(rsp->gpnum),  					       TPS("reqwait"));  			rsp->gp_state = RCU_GP_WAIT_GPS; -			swait_event_interruptible(rsp->gp_wq, -						 READ_ONCE(rsp->gp_flags) & -						 RCU_GP_FLAG_INIT); +			swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & +						     RCU_GP_FLAG_INIT);  			rsp->gp_state = RCU_GP_DONE_GPS;  			/* Locking provides needed memory barrier. */  			if (rcu_gp_init(rsp)) @@ -2239,7 +2234,7 @@ static int __noreturn rcu_gp_kthread(void *arg)  					       READ_ONCE(rsp->gpnum),  					       TPS("fqswait"));  			rsp->gp_state = RCU_GP_WAIT_FQS; -			ret = swait_event_interruptible_timeout(rsp->gp_wq, +			ret = swait_event_idle_timeout(rsp->gp_wq,  					rcu_gp_fqs_check_wake(rsp, &gf), j);  			rsp->gp_state = RCU_GP_DOING_FQS;  			/* Locking provides needed memory barriers. */ @@ -2409,6 +2404,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,  			return;  		}  		WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ +		WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 && +			     rcu_preempt_blocked_readers_cgp(rnp));  		rnp->qsmask &= ~mask;  		trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,  						 mask, rnp->qsmask, rnp->level, @@ -2563,85 +2560,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)  }  /* - * Send the specified CPU's RCU callbacks to the orphanage.  The - * specified CPU must be offline, and the caller must hold the - * ->orphan_lock. - */ -static void -rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, -			  struct rcu_node *rnp, struct rcu_data *rdp) -{ -	lockdep_assert_held(&rsp->orphan_lock); - -	/* No-CBs CPUs do not have orphanable callbacks. */ -	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) -		return; - -	/* -	 * Orphan the callbacks.  First adjust the counts.  This is safe -	 * because _rcu_barrier() excludes CPU-hotplug operations, so it -	 * cannot be running now.  Thus no memory barrier is required. -	 */ -	rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); -	rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); - -	/* -	 * Next, move those callbacks still needing a grace period to -	 * the orphanage, where some other CPU will pick them up. -	 * Some of the callbacks might have gone partway through a grace -	 * period, but that is too bad.  They get to start over because we -	 * cannot assume that grace periods are synchronized across CPUs. -	 */ -	rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); - -	/* -	 * Then move the ready-to-invoke callbacks to the orphanage, -	 * where some other CPU will pick them up.  These will not be -	 * required to pass though another grace period: They are done. -	 */ -	rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); - -	/* Finally, disallow further callbacks on this CPU.  
*/ -	rcu_segcblist_disable(&rdp->cblist); -} - -/* - * Adopt the RCU callbacks from the specified rcu_state structure's - * orphanage.  The caller must hold the ->orphan_lock. - */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) -{ -	struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - -	lockdep_assert_held(&rsp->orphan_lock); - -	/* No-CBs CPUs are handled specially. */ -	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || -	    rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) -		return; - -	/* Do the accounting first. */ -	rdp->n_cbs_adopted += rsp->orphan_done.len; -	if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) -		rcu_idle_count_callbacks_posted(); -	rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); - -	/* -	 * We do not need a memory barrier here because the only way we -	 * can get here if there is an rcu_barrier() in flight is if -	 * we are the task doing the rcu_barrier(). -	 */ - -	/* First adopt the ready-to-invoke callbacks, then the done ones. */ -	rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); -	WARN_ON_ONCE(rsp->orphan_done.head); -	rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); -	WARN_ON_ONCE(rsp->orphan_pend.head); -	WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != -		     !rcu_segcblist_n_cbs(&rdp->cblist)); -} - -/*   * Trace the fact that this CPU is going offline.   */  static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) @@ -2704,14 +2622,12 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)  /*   * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context.  Do the remainder of the cleanup, - * including orphaning the outgoing CPU's RCU callbacks, and also - * adopting them.  There can only be one CPU hotplug operation at a time, - * so no other CPU can be attempting to update rcu_cpu_kthread_task. + * this fact from process context.  Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no need for + * explicit locking.   */  static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)  { -	unsigned long flags;  	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);  	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */ @@ -2720,18 +2636,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)  	/* Adjust any no-longer-needed kthreads. */  	rcu_boost_kthread_setaffinity(rnp, -1); - -	/* Orphan the dead CPU's callbacks, and adopt them if appropriate. 
*/ -	raw_spin_lock_irqsave(&rsp->orphan_lock, flags); -	rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); -	rcu_adopt_orphan_cbs(rsp, flags); -	raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); - -	WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || -		  !rcu_segcblist_empty(&rdp->cblist), -		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", -		  cpu, rcu_segcblist_n_cbs(&rdp->cblist), -		  rcu_segcblist_first_cb(&rdp->cblist));  }  /* @@ -3569,10 +3473,11 @@ static void rcu_barrier_callback(struct rcu_head *rhp)  	struct rcu_state *rsp = rdp->rsp;  	if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { -		_rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence); +		_rcu_barrier_trace(rsp, TPS("LastCB"), -1, +				   rsp->barrier_sequence);  		complete(&rsp->barrier_completion);  	} else { -		_rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence); +		_rcu_barrier_trace(rsp, TPS("CB"), -1, rsp->barrier_sequence);  	}  } @@ -3584,14 +3489,15 @@ static void rcu_barrier_func(void *type)  	struct rcu_state *rsp = type;  	struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); -	_rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); +	_rcu_barrier_trace(rsp, TPS("IRQ"), -1, rsp->barrier_sequence);  	rdp->barrier_head.func = rcu_barrier_callback;  	debug_rcu_head_queue(&rdp->barrier_head);  	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {  		atomic_inc(&rsp->barrier_cpu_count);  	} else {  		debug_rcu_head_unqueue(&rdp->barrier_head); -		_rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence); +		_rcu_barrier_trace(rsp, TPS("IRQNQ"), -1, +				   rsp->barrier_sequence);  	}  } @@ -3605,14 +3511,15 @@ static void _rcu_barrier(struct rcu_state *rsp)  	struct rcu_data *rdp;  	unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); -	_rcu_barrier_trace(rsp, "Begin", -1, s); +	_rcu_barrier_trace(rsp, TPS("Begin"), -1, s);  	/* Take mutex to serialize concurrent rcu_barrier() requests. */  	mutex_lock(&rsp->barrier_mutex);  	/* Did someone else do our work for us? */  	if (rcu_seq_done(&rsp->barrier_sequence, s)) { -		_rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence); +		_rcu_barrier_trace(rsp, TPS("EarlyExit"), -1, +				   rsp->barrier_sequence);  		smp_mb(); /* caller's subsequent code after above check. */  		mutex_unlock(&rsp->barrier_mutex);  		return; @@ -3620,7 +3527,7 @@ static void _rcu_barrier(struct rcu_state *rsp)  	/* Mark the start of the barrier operation. 
*/  	rcu_seq_start(&rsp->barrier_sequence); -	_rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence); +	_rcu_barrier_trace(rsp, TPS("Inc1"), -1, rsp->barrier_sequence);  	/*  	 * Initialize the count to one rather than to zero in order to @@ -3643,10 +3550,10 @@ static void _rcu_barrier(struct rcu_state *rsp)  		rdp = per_cpu_ptr(rsp->rda, cpu);  		if (rcu_is_nocb_cpu(cpu)) {  			if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { -				_rcu_barrier_trace(rsp, "OfflineNoCB", cpu, +				_rcu_barrier_trace(rsp, TPS("OfflineNoCB"), cpu,  						   rsp->barrier_sequence);  			} else { -				_rcu_barrier_trace(rsp, "OnlineNoCB", cpu, +				_rcu_barrier_trace(rsp, TPS("OnlineNoCB"), cpu,  						   rsp->barrier_sequence);  				smp_mb__before_atomic();  				atomic_inc(&rsp->barrier_cpu_count); @@ -3654,11 +3561,11 @@ static void _rcu_barrier(struct rcu_state *rsp)  					   rcu_barrier_callback, rsp, cpu, 0);  			}  		} else if (rcu_segcblist_n_cbs(&rdp->cblist)) { -			_rcu_barrier_trace(rsp, "OnlineQ", cpu, +			_rcu_barrier_trace(rsp, TPS("OnlineQ"), cpu,  					   rsp->barrier_sequence);  			smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);  		} else { -			_rcu_barrier_trace(rsp, "OnlineNQ", cpu, +			_rcu_barrier_trace(rsp, TPS("OnlineNQ"), cpu,  					   rsp->barrier_sequence);  		}  	} @@ -3675,7 +3582,7 @@ static void _rcu_barrier(struct rcu_state *rsp)  	wait_for_completion(&rsp->barrier_completion);  	/* Mark the end of the barrier operation. */ -	_rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence); +	_rcu_barrier_trace(rsp, TPS("Inc2"), -1, rsp->barrier_sequence);  	rcu_seq_end(&rsp->barrier_sequence);  	/* Other rcu_barrier() invocations can now safely proceed. */ @@ -3777,8 +3684,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)  	 */  	rnp = rdp->mynode;  	raw_spin_lock_rcu_node(rnp);		/* irqs already disabled. */ -	if (!rdp->beenonline) -		WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);  	rdp->beenonline = true;	 /* We have now been online. */  	rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */  	rdp->completed = rnp->completed; @@ -3882,6 +3787,8 @@ void rcu_cpu_starting(unsigned int cpu)  {  	unsigned long flags;  	unsigned long mask; +	int nbits; +	unsigned long oldmask;  	struct rcu_data *rdp;  	struct rcu_node *rnp;  	struct rcu_state *rsp; @@ -3892,9 +3799,15 @@ void rcu_cpu_starting(unsigned int cpu)  		mask = rdp->grpmask;  		raw_spin_lock_irqsave_rcu_node(rnp, flags);  		rnp->qsmaskinitnext |= mask; +		oldmask = rnp->expmaskinitnext;  		rnp->expmaskinitnext |= mask; +		oldmask ^= rnp->expmaskinitnext; +		nbits = bitmap_weight(&oldmask, BITS_PER_LONG); +		/* Allow lockless access for expedited grace periods. */ +		smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */  		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);  	} +	smp_mb(); /* Ensure RCU read-side usage follows above initialization. */  }  #ifdef CONFIG_HOTPLUG_CPU @@ -3937,6 +3850,50 @@ void rcu_report_dead(unsigned int cpu)  	for_each_rcu_flavor(rsp)  		rcu_cleanup_dying_idle_cpu(cpu, rsp);  } + +/* Migrate the dead CPU's callbacks to the current CPU. */ +static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) +{ +	unsigned long flags; +	struct rcu_data *my_rdp; +	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); +	struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + +	if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) +		return;  /* No callbacks to migrate. 
*/ + +	local_irq_save(flags); +	my_rdp = this_cpu_ptr(rsp->rda); +	if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) { +		local_irq_restore(flags); +		return; +	} +	raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ +	rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ +	rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */ +	rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); +	WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != +		     !rcu_segcblist_n_cbs(&my_rdp->cblist)); +	raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); +	WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || +		  !rcu_segcblist_empty(&rdp->cblist), +		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", +		  cpu, rcu_segcblist_n_cbs(&rdp->cblist), +		  rcu_segcblist_first_cb(&rdp->cblist)); +} + +/* + * The outgoing CPU has just passed through the dying-idle state, + * and we are being invoked from the CPU that was IPIed to continue the + * offline operation.  We need to migrate the outgoing CPU's callbacks. + */ +void rcutree_migrate_callbacks(int cpu) +{ +	struct rcu_state *rsp; + +	for_each_rcu_flavor(rsp) +		rcu_migrate_callbacks(cpu, rsp); +}  #endif  /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9af0f31d6847..8e1f285f0a70 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -219,8 +219,6 @@ struct rcu_data {  					/* qlen at last check for QS forcing */  	unsigned long	n_cbs_invoked;	/* count of RCU cbs invoked. */  	unsigned long	n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ -	unsigned long   n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ -	unsigned long   n_cbs_adopted;  /* RCU cbs adopted from dying CPU */  	unsigned long	n_force_qs_snap;  					/* did other CPU force QS recently? */  	long		blimit;		/* Upper limit on a processed batch */ @@ -268,7 +266,9 @@ struct rcu_data {  	struct rcu_head **nocb_follower_tail;  	struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */  	struct task_struct *nocb_kthread; +	raw_spinlock_t nocb_lock;	/* Guard following pair of fields. */  	int nocb_defer_wakeup;		/* Defer wakeup of nocb_kthread. */ +	struct timer_list nocb_timer;	/* Enforce finite deferral. */  	/* The following fields are used by the leader, hence own cacheline. */  	struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; @@ -350,15 +350,6 @@ struct rcu_state {  	/* End of fields guarded by root rcu_node's lock. */ -	raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; -						/* Protect following fields. */ -	struct rcu_cblist orphan_pend;		/* Orphaned callbacks that */ -						/*  need a grace period. */ -	struct rcu_cblist orphan_done;		/* Orphaned callbacks that */ -						/*  are ready to invoke. */ -						/* (Contains counts.) */ -	/* End of fields guarded by orphan_lock. */ -  	struct mutex barrier_mutex;		/* Guards barrier fields. */  	atomic_t barrier_cpu_count;		/* # CPUs waiting on. */  	struct completion barrier_completion;	/* Wake at barrier end. 
*/ @@ -495,7 +486,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);  static void rcu_init_one_nocb(struct rcu_node *rnp);  static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,  			    bool lazy, unsigned long flags); -static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,  				      struct rcu_data *rdp,  				      unsigned long flags);  static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index dd21ca47e4b4..46d61b597731 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -73,7 +73,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)  	unsigned long flags;  	unsigned long mask;  	unsigned long oldmask; -	int ncpus = READ_ONCE(rsp->ncpus); +	int ncpus = smp_load_acquire(&rsp->ncpus); /* Order against locking. */  	struct rcu_node *rnp;  	struct rcu_node *rnp_up; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 908b309d60d7..55bde94b9572 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -180,6 +180,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)  	struct task_struct *t = current;  	lockdep_assert_held(&rnp->lock); +	WARN_ON_ONCE(rdp->mynode != rnp); +	WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);  	/*  	 * Decide where to queue the newly blocked task.  In theory, @@ -261,6 +263,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)  		rnp->gp_tasks = &t->rcu_node_entry;  	if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))  		rnp->exp_tasks = &t->rcu_node_entry; +	WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != +		     !(rnp->qsmask & rdp->grpmask)); +	WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != +		     !(rnp->expmask & rdp->grpmask));  	raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */  	/* @@ -482,6 +488,7 @@ void rcu_read_unlock_special(struct task_struct *t)  		rnp = t->rcu_blocked_node;  		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */  		WARN_ON_ONCE(rnp != t->rcu_blocked_node); +		WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);  		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);  		empty_exp = sync_rcu_preempt_exp_done(rnp);  		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -495,10 +502,10 @@ void rcu_read_unlock_special(struct task_struct *t)  		if (&t->rcu_node_entry == rnp->exp_tasks)  			rnp->exp_tasks = np;  		if (IS_ENABLED(CONFIG_RCU_BOOST)) { -			if (&t->rcu_node_entry == rnp->boost_tasks) -				rnp->boost_tasks = np;  			/* Snapshot ->boost_mtx ownership w/rnp->lock held. 
*/  			drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; +			if (&t->rcu_node_entry == rnp->boost_tasks) +				rnp->boost_tasks = np;  		}  		/* @@ -636,10 +643,17 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)   */  static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)  { +	struct task_struct *t; +  	RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");  	WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); -	if (rcu_preempt_has_tasks(rnp)) +	if (rcu_preempt_has_tasks(rnp)) {  		rnp->gp_tasks = rnp->blkd_tasks.next; +		t = container_of(rnp->gp_tasks, struct task_struct, +				 rcu_node_entry); +		trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"), +						rnp->gpnum, t->pid); +	}  	WARN_ON_ONCE(rnp->qsmask);  } @@ -1788,23 +1802,62 @@ bool rcu_is_nocb_cpu(int cpu)  }  /* - * Kick the leader kthread for this NOCB group. + * Kick the leader kthread for this NOCB group.  Caller holds ->nocb_lock + * and this function releases it.   */ -static void wake_nocb_leader(struct rcu_data *rdp, bool force) +static void __wake_nocb_leader(struct rcu_data *rdp, bool force, +			       unsigned long flags) +	__releases(rdp->nocb_lock)  {  	struct rcu_data *rdp_leader = rdp->nocb_leader; -	if (!READ_ONCE(rdp_leader->nocb_kthread)) +	lockdep_assert_held(&rdp->nocb_lock); +	if (!READ_ONCE(rdp_leader->nocb_kthread)) { +		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);  		return; -	if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { +	} +	if (rdp_leader->nocb_leader_sleep || force) {  		/* Prior smp_mb__after_atomic() orders against prior enqueue. */  		WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); +		del_timer(&rdp->nocb_timer); +		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);  		smp_mb(); /* ->nocb_leader_sleep before swake_up(). */  		swake_up(&rdp_leader->nocb_wq); +	} else { +		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);  	}  }  /* + * Kick the leader kthread for this NOCB group, but caller has not + * acquired locks. + */ +static void wake_nocb_leader(struct rcu_data *rdp, bool force) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&rdp->nocb_lock, flags); +	__wake_nocb_leader(rdp, force, flags); +} + +/* + * Arrange to wake the leader kthread for this NOCB group at some + * future time when it is safe to do so. + */ +static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, +				   const char *reason) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&rdp->nocb_lock, flags); +	if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) +		mod_timer(&rdp->nocb_timer, jiffies + 1); +	WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); +	trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason); +	raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); +} + +/*   * Does the specified CPU need an RCU callback for the specified flavor   * of rcu_barrier()?   */ @@ -1891,11 +1944,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,  			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,  					    TPS("WakeEmpty"));  		} else { -			WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE); -			/* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. 
*/ -			smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); -			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, -					    TPS("WakeEmptyIsDeferred")); +			wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, +					       TPS("WakeEmptyIsDeferred"));  		}  		rdp->qlen_last_fqs_check = 0;  	} else if (len > rdp->qlen_last_fqs_check + qhimark) { @@ -1905,11 +1955,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,  			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,  					    TPS("WakeOvf"));  		} else { -			WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE); -			/* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ -			smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); -			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, -					    TPS("WakeOvfIsDeferred")); +			wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, +					       TPS("WakeOvfIsDeferred"));  		}  		rdp->qlen_last_fqs_check = LONG_MAX / 2;  	} else { @@ -1961,30 +2008,19 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,   * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is   * not a no-CBs CPU.   */ -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,  						     struct rcu_data *rdp,  						     unsigned long flags)  { -	long ql = rsp->orphan_done.len; -	long qll = rsp->orphan_done.len_lazy; - -	/* If this is not a no-CBs CPU, tell the caller to do it the old way. */ +	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!");  	if (!rcu_is_nocb_cpu(smp_processor_id())) -		return false; - -	/* First, enqueue the donelist, if any.  This preserves CB ordering. */ -	if (rsp->orphan_done.head) { -		__call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done), -					rcu_cblist_tail(&rsp->orphan_done), -					ql, qll, flags); -	} -	if (rsp->orphan_pend.head) { -		__call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend), -					rcu_cblist_tail(&rsp->orphan_pend), -					ql, qll, flags); -	} -	rcu_cblist_init(&rsp->orphan_done); -	rcu_cblist_init(&rsp->orphan_pend); +		return false; /* Not NOCBs CPU, caller must migrate CBs. */ +	__call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), +				rcu_segcblist_tail(&rdp->cblist), +				rcu_segcblist_n_cbs(&rdp->cblist), +				rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags); +	rcu_segcblist_init(&rdp->cblist); +	rcu_segcblist_disable(&rdp->cblist);  	return true;  } @@ -2031,6 +2067,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)  static void nocb_leader_wait(struct rcu_data *my_rdp)  {  	bool firsttime = true; +	unsigned long flags;  	bool gotcbs;  	struct rcu_data *rdp;  	struct rcu_head **tail; @@ -2039,13 +2076,17 @@ wait_again:  	/* Wait for callbacks to appear. */  	if (!rcu_nocb_poll) { -		trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); +		trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep"));  		swait_event_interruptible(my_rdp->nocb_wq,  				!READ_ONCE(my_rdp->nocb_leader_sleep)); -		/* Memory barrier handled by smp_mb() calls below and repoll. */ +		raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); +		my_rdp->nocb_leader_sleep = true; +		WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); +		del_timer(&my_rdp->nocb_timer); +		raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);  	} else if (firsttime) {  		firsttime = false; /* Don't drown trace log with "Poll"! 
*/ -		trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); +		trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll"));  	}  	/* @@ -2054,7 +2095,7 @@ wait_again:  	 * nocb_gp_head, where they await a grace period.  	 */  	gotcbs = false; -	smp_mb(); /* wakeup before ->nocb_head reads. */ +	smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */  	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {  		rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);  		if (!rdp->nocb_gp_head) @@ -2066,56 +2107,41 @@ wait_again:  		gotcbs = true;  	} -	/* -	 * If there were no callbacks, sleep a bit, rescan after a -	 * memory barrier, and go retry. -	 */ +	/* No callbacks?  Sleep a bit if polling, and go retry.  */  	if (unlikely(!gotcbs)) { -		if (!rcu_nocb_poll) -			trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, -					    "WokeEmpty");  		WARN_ON(signal_pending(current)); -		schedule_timeout_interruptible(1); - -		/* Rescan in case we were a victim of memory ordering. */ -		my_rdp->nocb_leader_sleep = true; -		smp_mb();  /* Ensure _sleep true before scan. */ -		for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) -			if (READ_ONCE(rdp->nocb_head)) { -				/* Found CB, so short-circuit next wait. */ -				my_rdp->nocb_leader_sleep = false; -				break; -			} +		if (rcu_nocb_poll) { +			schedule_timeout_interruptible(1); +		} else { +			trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, +					    TPS("WokeEmpty")); +		}  		goto wait_again;  	}  	/* Wait for one grace period. */  	rcu_nocb_wait_gp(my_rdp); -	/* -	 * We left ->nocb_leader_sleep unset to reduce cache thrashing. -	 * We set it now, but recheck for new callbacks while -	 * traversing our follower list. -	 */ -	my_rdp->nocb_leader_sleep = true; -	smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */ -  	/* Each pass through the following loop wakes a follower, if needed. */  	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { -		if (READ_ONCE(rdp->nocb_head)) +		if (!rcu_nocb_poll && +		    READ_ONCE(rdp->nocb_head) && +		    READ_ONCE(my_rdp->nocb_leader_sleep)) { +			raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);  			my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ +			raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); +		}  		if (!rdp->nocb_gp_head)  			continue; /* No CBs, so no need to wake follower. */  		/* Append callbacks to follower's "done" list. */ -		tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); +		raw_spin_lock_irqsave(&rdp->nocb_lock, flags); +		tail = rdp->nocb_follower_tail; +		rdp->nocb_follower_tail = rdp->nocb_gp_tail;  		*tail = rdp->nocb_gp_head; -		smp_mb__after_atomic(); /* Store *tail before wakeup. */ +		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);  		if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { -			/* -			 * List was empty, wake up the follower. -			 * Memory barriers supplied by atomic_long_add(). -			 */ +			/* List was empty, so wake up the follower.  */  			swake_up(&rdp->nocb_wq);  		}  	} @@ -2131,28 +2157,16 @@ wait_again:   */  static void nocb_follower_wait(struct rcu_data *rdp)  { -	bool firsttime = true; -  	for (;;) { -		if (!rcu_nocb_poll) { -			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, -					    "FollowerSleep"); -			swait_event_interruptible(rdp->nocb_wq, -						 READ_ONCE(rdp->nocb_follower_head)); -		} else if (firsttime) { -			/* Don't drown trace log with "Poll"! 
*/ -			firsttime = false; -			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll"); -		} +		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep")); +		swait_event_interruptible(rdp->nocb_wq, +					 READ_ONCE(rdp->nocb_follower_head));  		if (smp_load_acquire(&rdp->nocb_follower_head)) {  			/* ^^^ Ensure CB invocation follows _head test. */  			return;  		} -		if (!rcu_nocb_poll) -			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, -					    "WokeEmpty");  		WARN_ON(signal_pending(current)); -		schedule_timeout_interruptible(1); +		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty"));  	}  } @@ -2165,6 +2179,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)  static int rcu_nocb_kthread(void *arg)  {  	int c, cl; +	unsigned long flags;  	struct rcu_head *list;  	struct rcu_head *next;  	struct rcu_head **tail; @@ -2179,11 +2194,14 @@ static int rcu_nocb_kthread(void *arg)  			nocb_follower_wait(rdp);  		/* Pull the ready-to-invoke callbacks onto local list. */ -		list = READ_ONCE(rdp->nocb_follower_head); +		raw_spin_lock_irqsave(&rdp->nocb_lock, flags); +		list = rdp->nocb_follower_head; +		rdp->nocb_follower_head = NULL; +		tail = rdp->nocb_follower_tail; +		rdp->nocb_follower_tail = &rdp->nocb_follower_head; +		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);  		BUG_ON(!list); -		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); -		WRITE_ONCE(rdp->nocb_follower_head, NULL); -		tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); +		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty"));  		/* Each pass through the following loop invokes a callback. */  		trace_rcu_batch_start(rdp->rsp->name, @@ -2226,18 +2244,39 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)  }  /* Do a deferred wakeup of rcu_nocb_kthread(). */ -static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)  { +	unsigned long flags;  	int ndw; -	if (!rcu_nocb_need_deferred_wakeup(rdp)) +	raw_spin_lock_irqsave(&rdp->nocb_lock, flags); +	if (!rcu_nocb_need_deferred_wakeup(rdp)) { +		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);  		return; +	}  	ndw = READ_ONCE(rdp->nocb_defer_wakeup);  	WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); -	wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE); +	__wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);  	trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));  } +/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ +static void do_nocb_deferred_wakeup_timer(unsigned long x) +{ +	do_nocb_deferred_wakeup_common((struct rcu_data *)x); +} + +/* + * Do a deferred wakeup of rcu_nocb_kthread() from fastpath. + * This means we do an inexact common-case check.  Note that if + * we miss, ->nocb_timer will eventually clean things up. 
+ */ +static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ +	if (rcu_nocb_need_deferred_wakeup(rdp)) +		do_nocb_deferred_wakeup_common(rdp); +} +  void __init rcu_init_nohz(void)  {  	int cpu; @@ -2287,6 +2326,9 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)  	rdp->nocb_tail = &rdp->nocb_head;  	init_swait_queue_head(&rdp->nocb_wq);  	rdp->nocb_follower_tail = &rdp->nocb_follower_head; +	raw_spin_lock_init(&rdp->nocb_lock); +	setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, +		    (unsigned long)rdp);  }  /* @@ -2459,7 +2501,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,  	return false;  } -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,  						     struct rcu_data *rdp,  						     unsigned long flags)  { diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 00e77c470017..5033b66d2753 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -568,7 +568,7 @@ static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);  static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);  /* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_SRCU(tasks_rcu_exit_srcu); +DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);  /* Control stall timeouts.  Disable with <= 0, otherwise jiffies till stall. */  #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) @@ -875,6 +875,22 @@ static void rcu_spawn_tasks_kthread(void)  	mutex_unlock(&rcu_tasks_kthread_mutex);  } +/* Do the srcu_read_lock() for the above synchronize_srcu().  */ +void exit_tasks_rcu_start(void) +{ +	preempt_disable(); +	current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); +	preempt_enable(); +} + +/* Do the srcu_read_unlock() for the above synchronize_srcu().  */ +void exit_tasks_rcu_finish(void) +{ +	preempt_disable(); +	__srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); +	preempt_enable(); +} +  #endif /* #ifdef CONFIG_TASKS_RCU */  #ifndef CONFIG_TINY_RCU diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 53f0164ed362..78f54932ea1d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o  obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o  obj-$(CONFIG_CPU_FREQ) += cpufreq.o  obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o +obj-$(CONFIG_MEMBARRIER) += membarrier.o diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index da39489d2d80..de6d7f4dfcb5 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -71,7 +71,6 @@ static inline struct autogroup *autogroup_create(void)  		goto out_fail;  	tg = sched_create_group(&root_task_group); -  	if (IS_ERR(tg))  		goto out_free; @@ -101,7 +100,7 @@ out_free:  out_fail:  	if (printk_ratelimit()) {  		printk(KERN_WARNING "autogroup_create: %s failure.\n", -			ag ? "sched_create_group()" : "kmalloc()"); +			ag ? "sched_create_group()" : "kzalloc()");  	}  	return autogroup_kref_get(&autogroup_default); diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 13fc5ae9bf2f..cc873075c3bd 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -32,6 +32,12 @@ void complete(struct completion *x)  	unsigned long flags;  	spin_lock_irqsave(&x->wait.lock, flags); + +	/* +	 * Perform commit of crossrelease here. 
+	 */ +	complete_release_commit(x); +  	if (x->done != UINT_MAX)  		x->done++;  	__wake_up_locked(&x->wait, TASK_NORMAL, 1); @@ -47,6 +53,13 @@ EXPORT_SYMBOL(complete);   *   * It may be assumed that this function implies a write memory barrier before   * changing the task state if and only if any tasks are woken up. + * + * Since complete_all() sets the completion of @x permanently to done + * to allow multiple waiters to finish, a call to reinit_completion() + * must be used on @x if @x is to be used again. The code must make + * sure that all waiters have woken and finished before reinitializing + * @x. Also note that the function completion_done() can not be used + * to know if there are still waiters after complete_all() has been called.   */  void complete_all(struct completion *x)  { @@ -92,9 +105,14 @@ __wait_for_common(struct completion *x,  {  	might_sleep(); +	complete_acquire(x); +  	spin_lock_irq(&x->wait.lock);  	timeout = do_wait_for_common(x, action, timeout, state);  	spin_unlock_irq(&x->wait.lock); + +	complete_release(x); +  	return timeout;  } @@ -297,9 +315,12 @@ EXPORT_SYMBOL(try_wait_for_completion);   *	Return: 0 if there are waiters (wait_for_completion() in progress)   *		 1 if there are no waiters.   * + *	Note, this will always return true if complete_all() was called on @X.   */  bool completion_done(struct completion *x)  { +	unsigned long flags; +  	if (!READ_ONCE(x->done))  		return false; @@ -307,14 +328,9 @@ bool completion_done(struct completion *x)  	 * If ->done, we need to wait for complete() to release ->wait.lock  	 * otherwise we can end up freeing the completion before complete()  	 * is done referencing it. -	 * -	 * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders -	 * the loads of ->done and ->wait.lock such that we cannot observe -	 * the lock before complete() acquires it while observing the ->done -	 * after it's acquired the lock.  	 */ -	smp_rmb(); -	spin_unlock_wait(&x->wait.lock); +	spin_lock_irqsave(&x->wait.lock, flags); +	spin_unlock_irqrestore(&x->wait.lock, flags);  	return true;  }  EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0869b20fba81..6d2c7ff9ba98 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -951,8 +951,13 @@ struct migration_arg {  static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,  				 struct task_struct *p, int dest_cpu)  { -	if (unlikely(!cpu_active(dest_cpu))) -		return rq; +	if (p->flags & PF_KTHREAD) { +		if (unlikely(!cpu_online(dest_cpu))) +			return rq; +	} else { +		if (unlikely(!cpu_active(dest_cpu))) +			return rq; +	}  	/* Affinity changed (again). */  	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) @@ -1967,8 +1972,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)  	 * reordered with p->state check below. This pairs with mb() in  	 * set_current_state() the waiting thread does.  	 */ -	smp_mb__before_spinlock();  	raw_spin_lock_irqsave(&p->pi_lock, flags); +	smp_mb__after_spinlock();  	if (!(p->state & state))  		goto out; @@ -2635,6 +2640,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)  	prev_state = prev->state;  	vtime_task_switch(prev);  	perf_event_task_sched_in(prev, current); +	/* +	 * The membarrier system call requires a full memory barrier +	 * after storing to rq->curr, before going back to user-space. 
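The barrier comments being introduced around the rq->curr update are what make the membarrier() system call cheap for its callers: the expensive ordering lives in the scheduler's context-switch path, and user space only needs compiler barriers on its fast path plus an occasional expedited call. A minimal user-space sketch of that usage, assuming a kernel built with CONFIG_MEMBARRIER and using only the long-standing MEMBARRIER_CMD_QUERY / MEMBARRIER_CMD_SHARED commands (error handling kept deliberately simple):

#include <linux/membarrier.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static long membarrier_call(int cmd, int flags)
{
	/* Older libcs have no wrapper, so invoke the syscall directly. */
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	long supported = membarrier_call(MEMBARRIER_CMD_QUERY, 0);

	if (supported < 0 || !(supported & MEMBARRIER_CMD_SHARED)) {
		fprintf(stderr, "membarrier() not available\n");
		return 1;
	}

	/*
	 * Slow path: force a full memory barrier on every running thread
	 * in the system.  Fast-path readers in other threads only need a
	 * compiler barrier, e.g. asm volatile("" ::: "memory").
	 */
	if (membarrier_call(MEMBARRIER_CMD_SHARED, 0))
		perror("membarrier");
	else
		printf("system-wide ordering done\n");
	return 0;
}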
+	 * +	 * TODO: This smp_mb__after_unlock_lock can go away if PPC end +	 * up adding a full barrier to switch_mm(), or we should figure +	 * out if a smp_mb__after_unlock_lock is really the proper API +	 * to use. +	 */ +	smp_mb__after_unlock_lock();  	finish_lock_switch(rq, prev);  	finish_arch_post_lock_switch(); @@ -3281,8 +3296,8 @@ static void __sched notrace __schedule(bool preempt)  	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)  	 * done by the caller to avoid the race with signal_wake_up().  	 */ -	smp_mb__before_spinlock();  	rq_lock(rq, &rf); +	smp_mb__after_spinlock();  	/* Promote REQ to ACT */  	rq->clock_update_flags <<= 1; @@ -3324,6 +3339,21 @@ static void __sched notrace __schedule(bool preempt)  	if (likely(prev != next)) {  		rq->nr_switches++;  		rq->curr = next; +		/* +		 * The membarrier system call requires each architecture +		 * to have a full memory barrier after updating +		 * rq->curr, before returning to user-space. For TSO +		 * (e.g. x86), the architecture must provide its own +		 * barrier in switch_mm(). For weakly ordered machines +		 * for which spin_unlock() acts as a full memory +		 * barrier, finish_lock_switch() in common code takes +		 * care of this barrier. For weakly ordered machines for +		 * which spin_unlock() acts as a RELEASE barrier (only +		 * arm64 and PowerPC), arm64 has a full barrier in +		 * switch_to(), and PowerPC has +		 * smp_mb__after_unlock_lock() before +		 * finish_lock_switch(). +		 */  		++*switch_count;  		trace_sched_switch(preempt, prev, next); @@ -3352,8 +3382,8 @@ void __noreturn do_task_dead(void)  	 * To avoid it, we have to wait for releasing tsk->pi_lock which  	 * is held by try_to_wake_up()  	 */ -	smp_mb(); -	raw_spin_unlock_wait(&current->pi_lock); +	raw_spin_lock_irq(&current->pi_lock); +	raw_spin_unlock_irq(&current->pi_lock);  	/* Causes final put_task_struct in finish_task_switch(): */  	__set_current_state(TASK_DEAD); @@ -5103,24 +5133,17 @@ out_unlock:  	return retval;  } -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; -  void sched_show_task(struct task_struct *p)  {  	unsigned long free = 0;  	int ppid; -	unsigned long state = p->state; - -	/* Make sure the string lines up properly with the number of task states: */ -	BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);  	if (!try_get_task_stack(p))  		return; -	if (state) -		state = __ffs(state) + 1; -	printk(KERN_INFO "%-15.15s %c", p->comm, -		state < sizeof(stat_nam) - 1 ?
stat_nam[state] : '?'); -	if (state == TASK_RUNNING) + +	printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); + +	if (p->state == TASK_RUNNING)  		printk(KERN_CONT "  running task    ");  #ifdef CONFIG_DEBUG_STACK_USAGE  	free = stack_not_used(p); @@ -5177,11 +5200,6 @@ void show_state_filter(unsigned long state_filter)  		debug_show_all_locks();  } -void init_idle_bootup_task(struct task_struct *idle) -{ -	idle->sched_class = &idle_sched_class; -} -  /**   * init_idle - set up an idle thread for a given CPU   * @idle: task in question @@ -5438,7 +5456,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)  		 */  		next = pick_next_task(rq, &fake_task, rf);  		BUG_ON(!next); -		next->sched_class->put_prev_task(rq, next); +		put_prev_task(rq, next);  		/*  		 * Rules for changing task_struct::cpus_allowed are holding diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index fba235c7d026..8d9562d890d3 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -119,29 +119,29 @@ static inline int cpudl_maximum(struct cpudl *cp)   * @p: the task   * @later_mask: a mask to fill in with the selected CPUs (or NULL)   * - * Returns: int - best CPU (heap maximum if suitable) + * Returns: int - CPUs were found   */  int cpudl_find(struct cpudl *cp, struct task_struct *p,  	       struct cpumask *later_mask)  { -	int best_cpu = -1;  	const struct sched_dl_entity *dl_se = &p->dl;  	if (later_mask &&  	    cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { -		best_cpu = cpumask_any(later_mask); -		goto out; -	} else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && -			dl_time_before(dl_se->deadline, cp->elements[0].dl)) { -		best_cpu = cpudl_maximum(cp); -		if (later_mask) -			cpumask_set_cpu(best_cpu, later_mask); -	} +		return 1; +	} else { +		int best_cpu = cpudl_maximum(cp); +		WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); -out: -	WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); +		if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && +		    dl_time_before(dl_se->deadline, cp->elements[0].dl)) { +			if (later_mask) +				cpumask_set_cpu(best_cpu, later_mask); -	return best_cpu; +			return 1; +		} +	} +	return 0;  }  /* @@ -246,7 +246,6 @@ int cpudl_init(struct cpudl *cp)  {  	int i; -	memset(cp, 0, sizeof(*cp));  	raw_spin_lock_init(&cp->lock);  	cp->size = 0; diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 981fcd7dc394..2511aba36b89 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -209,8 +209,6 @@ int cpupri_init(struct cpupri *cp)  {  	int i; -	memset(cp, 0, sizeof(*cp)); -  	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {  		struct cpupri_vec *vec = &cp->pri_to_cpu[i]; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 755bd3f1a1a9..d05bd9457a40 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1594,7 +1594,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)  	 * let's hope p can move out.  	 */  	if (rq->curr->nr_cpus_allowed == 1 || -	    cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) +	    !cpudl_find(&rq->rd->cpudl, rq->curr, NULL))  		return;  	/* @@ -1602,7 +1602,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)  	 * see if it is pushed or pulled somewhere else.  	 
*/  	if (p->nr_cpus_allowed != 1 && -	    cpudl_find(&rq->rd->cpudl, p, NULL) != -1) +	    cpudl_find(&rq->rd->cpudl, p, NULL))  		return;  	resched_curr(rq); @@ -1655,7 +1655,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,  	return rb_entry(left, struct sched_dl_entity, rb_node);  } -struct task_struct * +static struct task_struct *  pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)  {  	struct sched_dl_entity *dl_se; @@ -1798,7 +1798,7 @@ static int find_later_rq(struct task_struct *task)  	struct sched_domain *sd;  	struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);  	int this_cpu = smp_processor_id(); -	int best_cpu, cpu = task_cpu(task); +	int cpu = task_cpu(task);  	/* Make sure the mask is initialized first */  	if (unlikely(!later_mask)) @@ -1811,17 +1811,14 @@ static int find_later_rq(struct task_struct *task)  	 * We have to consider system topology and task affinity  	 * first, then we can look for a suitable cpu.  	 */ -	best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, -			task, later_mask); -	if (best_cpu == -1) +	if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))  		return -1;  	/* -	 * If we are here, some target has been found, -	 * the most suitable of which is cached in best_cpu. -	 * This is, among the runqueues where the current tasks -	 * have later deadlines than the task's one, the rq -	 * with the latest possible one. +	 * If we are here, some targets have been found, including +	 * the most suitable which is, among the runqueues where the +	 * current tasks have later deadlines than the task's one, the +	 * rq with the latest possible one.  	 *  	 * Now we check how well this matches with task's  	 * affinity and system topology. @@ -1841,6 +1838,7 @@ static int find_later_rq(struct task_struct *task)  	rcu_read_lock();  	for_each_domain(cpu, sd) {  		if (sd->flags & SD_WAKE_AFFINE) { +			int best_cpu;  			/*  			 * If possible, preempting this_cpu is @@ -1852,12 +1850,15 @@ static int find_later_rq(struct task_struct *task)  				return this_cpu;  			} +			best_cpu = cpumask_first_and(later_mask, +							sched_domain_span(sd));  			/* -			 * Last chance: if best_cpu is valid and is -			 * in the mask, that becomes our choice. +			 * Last chance: if a cpu being in both later_mask +			 * and current sd span is valid, that becomes our +			 * choice. Of course, the latest possible cpu is +			 * already under consideration through later_mask.  			 
*/ -			if (best_cpu < nr_cpu_ids && -			    cpumask_test_cpu(best_cpu, sched_domain_span(sd))) { +			if (best_cpu < nr_cpu_ids) {  				rcu_read_unlock();  				return best_cpu;  			} diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4fa66de52bd6..4a23bbc3111b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -327,38 +327,78 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)  	return table;  } +static cpumask_var_t sd_sysctl_cpus;  static struct ctl_table_header *sd_sysctl_header; +  void register_sched_domain_sysctl(void)  { -	int i, cpu_num = num_possible_cpus(); -	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); +	static struct ctl_table *cpu_entries; +	static struct ctl_table **cpu_idx;  	char buf[32]; +	int i; -	WARN_ON(sd_ctl_dir[0].child); -	sd_ctl_dir[0].child = entry; +	if (!cpu_entries) { +		cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); +		if (!cpu_entries) +			return; -	if (entry == NULL) -		return; +		WARN_ON(sd_ctl_dir[0].child); +		sd_ctl_dir[0].child = cpu_entries; +	} -	for_each_possible_cpu(i) { -		snprintf(buf, 32, "cpu%d", i); -		entry->procname = kstrdup(buf, GFP_KERNEL); -		entry->mode = 0555; -		entry->child = sd_alloc_ctl_cpu_table(i); -		entry++; +	if (!cpu_idx) { +		struct ctl_table *e = cpu_entries; + +		cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); +		if (!cpu_idx) +			return; + +		/* deal with sparse possible map */ +		for_each_possible_cpu(i) { +			cpu_idx[i] = e; +			e++; +		} +	} + +	if (!cpumask_available(sd_sysctl_cpus)) { +		if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) +			return; + +		/* init to possible to not have holes in @cpu_entries */ +		cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); +	} + +	for_each_cpu(i, sd_sysctl_cpus) { +		struct ctl_table *e = cpu_idx[i]; + +		if (e->child) +			sd_free_ctl_entry(&e->child); + +		if (!e->procname) { +			snprintf(buf, 32, "cpu%d", i); +			e->procname = kstrdup(buf, GFP_KERNEL); +		} +		e->mode = 0555; +		e->child = sd_alloc_ctl_cpu_table(i); + +		__cpumask_clear_cpu(i, sd_sysctl_cpus);  	}  	WARN_ON(sd_sysctl_header);  	sd_sysctl_header = register_sysctl_table(sd_ctl_root);  } +void dirty_sched_domain_sysctl(int cpu) +{ +	if (cpumask_available(sd_sysctl_cpus)) +		__cpumask_set_cpu(cpu, sd_sysctl_cpus); +} +  /* may be called multiple times per register */  void unregister_sched_domain_sysctl(void)  {  	unregister_sysctl_table(sd_sysctl_header);  	sd_sysctl_header = NULL; -	if (sd_ctl_dir[0].child) -		sd_free_ctl_entry(&sd_ctl_dir[0].child);  }  #endif /* CONFIG_SYSCTL */  #endif /* CONFIG_SMP */ @@ -421,13 +461,15 @@ static char *task_group_path(struct task_group *tg)  }  #endif +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; +  static void  print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)  {  	if (rq->curr == p) -		SEQ_printf(m, "R"); +		SEQ_printf(m, ">R");  	else -		SEQ_printf(m, " "); +		SEQ_printf(m, " %c", task_state_to_char(p));  	SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",  		p->comm, task_pid_nr(p), @@ -456,9 +498,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)  	SEQ_printf(m,  	"\nrunnable tasks:\n" -	"            task   PID         tree-key  switches  prio" +	" S           task   PID         tree-key  switches  prio"  	"     wait-time             sum-exec        sum-sleep\n" -	"------------------------------------------------------" +	"-------------------------------------------------------"  	"----------------------------------------------------\n");  	
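To make the cpudl_find() change above concrete: callers such as check_preempt_equal_dl() and find_later_rq() now treat the return value as "were any suitable CPUs found?" and read the candidates out of later_mask, instead of comparing a returned best CPU against -1. The stand-alone sketch below mimics only that calling convention; fake_cpudl, fake_task, and the bit-mask CPU sets are invented stand-ins, not the kernel's heap-based cpudl implementation.

#include <stdio.h>

/* Invented stand-ins; the kernel's cpudl is a max-heap keyed by deadline. */
struct fake_cpudl { unsigned long free_cpus; };
struct fake_task  { unsigned long cpus_allowed; };

/*
 * New-style contract: return 1 if at least one suitable CPU exists and
 * (optionally) fill *later_mask with the candidates; return 0 otherwise.
 */
static int fake_cpudl_find(const struct fake_cpudl *cp,
			   const struct fake_task *p,
			   unsigned long *later_mask)
{
	unsigned long hit = cp->free_cpus & p->cpus_allowed;

	if (!hit)
		return 0;
	if (later_mask)
		*later_mask = hit;
	return 1;
}

int main(void)
{
	struct fake_cpudl cp = { .free_cpus = 0x6 };	/* CPUs 1 and 2 free */
	struct fake_task p   = { .cpus_allowed = 0x4 };	/* may run on CPU 2 */
	unsigned long mask = 0;

	/* Callers now test the truth value instead of "!= -1". */
	if (fake_cpudl_find(&cp, &p, &mask))
		printf("candidate mask = 0x%lx\n", mask);
	else
		printf("no suitable CPU\n");
	return 0;
}

Returning a found/not-found result plus an output mask is what lets find_later_rq() intersect the mask with each sched-domain span itself, which is why the cached best_cpu value could be dropped from the common path.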
rcu_read_lock(); @@ -872,11 +914,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)  #endif  } -void proc_sched_show_task(struct task_struct *p, struct seq_file *m) +void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, +						  struct seq_file *m)  {  	unsigned long nr_switches; -	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), +	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),  						get_nr_threads(p));  	SEQ_printf(m,  		"---------------------------------------------------------" diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c95880e216f6..8d5868771cb3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -806,7 +806,7 @@ void post_init_entity_util_avg(struct sched_entity *se)  			/*  			 * For !fair tasks do:  			 * -			update_cfs_rq_load_avg(now, cfs_rq, false); +			update_cfs_rq_load_avg(now, cfs_rq);  			attach_entity_load_avg(cfs_rq, se);  			switched_from_fair(rq, p);  			 * @@ -1071,6 +1071,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256;  /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */  unsigned int sysctl_numa_balancing_scan_delay = 1000; +struct numa_group { +	atomic_t refcount; + +	spinlock_t lock; /* nr_tasks, tasks */ +	int nr_tasks; +	pid_t gid; +	int active_nodes; + +	struct rcu_head rcu; +	unsigned long total_faults; +	unsigned long max_faults_cpu; +	/* +	 * Faults_cpu is used to decide whether memory should move +	 * towards the CPU. As a consequence, these stats are weighted +	 * more by CPU use than by memory faults. +	 */ +	unsigned long *faults_cpu; +	unsigned long faults[0]; +}; + +static inline unsigned long group_faults_priv(struct numa_group *ng); +static inline unsigned long group_faults_shared(struct numa_group *ng); +  static unsigned int task_nr_scan_windows(struct task_struct *p)  {  	unsigned long rss = 0; @@ -1107,13 +1130,47 @@ static unsigned int task_scan_min(struct task_struct *p)  	return max_t(unsigned int, floor, scan);  } +static unsigned int task_scan_start(struct task_struct *p) +{ +	unsigned long smin = task_scan_min(p); +	unsigned long period = smin; + +	/* Scale the maximum scan period with the amount of shared memory. */ +	if (p->numa_group) { +		struct numa_group *ng = p->numa_group; +		unsigned long shared = group_faults_shared(ng); +		unsigned long private = group_faults_priv(ng); + +		period *= atomic_read(&ng->refcount); +		period *= shared + 1; +		period /= private + shared + 1; +	} + +	return max(smin, period); +} +  static unsigned int task_scan_max(struct task_struct *p)  { -	unsigned int smin = task_scan_min(p); -	unsigned int smax; +	unsigned long smin = task_scan_min(p); +	unsigned long smax;  	/* Watch for min being lower than max due to floor calculations */  	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); + +	/* Scale the maximum scan period with the amount of shared memory. 
*/ +	if (p->numa_group) { +		struct numa_group *ng = p->numa_group; +		unsigned long shared = group_faults_shared(ng); +		unsigned long private = group_faults_priv(ng); +		unsigned long period = smax; + +		period *= atomic_read(&ng->refcount); +		period *= shared + 1; +		period /= private + shared + 1; + +		smax = max(smax, period); +	} +  	return max(smin, smax);  } @@ -1129,26 +1186,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p)  	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));  } -struct numa_group { -	atomic_t refcount; - -	spinlock_t lock; /* nr_tasks, tasks */ -	int nr_tasks; -	pid_t gid; -	int active_nodes; - -	struct rcu_head rcu; -	unsigned long total_faults; -	unsigned long max_faults_cpu; -	/* -	 * Faults_cpu is used to decide whether memory should move -	 * towards the CPU. As a consequence, these stats are weighted -	 * more by CPU use than by memory faults. -	 */ -	unsigned long *faults_cpu; -	unsigned long faults[0]; -}; -  /* Shared or private faults. */  #define NR_NUMA_HINT_FAULT_TYPES 2 @@ -1198,6 +1235,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)  		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];  } +static inline unsigned long group_faults_priv(struct numa_group *ng) +{ +	unsigned long faults = 0; +	int node; + +	for_each_online_node(node) { +		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; +	} + +	return faults; +} + +static inline unsigned long group_faults_shared(struct numa_group *ng) +{ +	unsigned long faults = 0; +	int node; + +	for_each_online_node(node) { +		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)]; +	} + +	return faults; +} +  /*   * A node triggering more than 1/3 as many NUMA faults as the maximum is   * considered part of a numa group's pseudo-interleaving set. Migrations @@ -1378,7 +1439,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,  	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;  } -static unsigned long weighted_cpuload(const int cpu); +static unsigned long weighted_cpuload(struct rq *rq);  static unsigned long source_load(int cpu, int type);  static unsigned long target_load(int cpu, int type);  static unsigned long capacity_of(int cpu); @@ -1409,7 +1470,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)  		struct rq *rq = cpu_rq(cpu);  		ns->nr_running += rq->nr_running; -		ns->load += weighted_cpuload(cpu); +		ns->load += weighted_cpuload(rq);  		ns->compute_capacity += capacity_of(cpu);  		cpus++; @@ -1808,7 +1869,7 @@ static int task_numa_migrate(struct task_struct *p)  	 * Reset the scan period if the task is being rescheduled on an  	 * alternative node to recheck if the tasks is now properly placed.  	 
*/ -	p->numa_scan_period = task_scan_min(p); +	p->numa_scan_period = task_scan_start(p);  	if (env.best_task == NULL) {  		ret = migrate_task_to(p, env.best_cpu); @@ -1892,7 +1953,7 @@ static void update_task_scan_period(struct task_struct *p,  			unsigned long shared, unsigned long private)  {  	unsigned int period_slot; -	int ratio; +	int lr_ratio, ps_ratio;  	int diff;  	unsigned long remote = p->numa_faults_locality[0]; @@ -1922,25 +1983,36 @@ static void update_task_scan_period(struct task_struct *p,  	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)  	 */  	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); -	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); -	if (ratio >= NUMA_PERIOD_THRESHOLD) { -		int slot = ratio - NUMA_PERIOD_THRESHOLD; +	lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); +	ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared); + +	if (ps_ratio >= NUMA_PERIOD_THRESHOLD) { +		/* +		 * Most memory accesses are local. There is no need to +		 * do fast NUMA scanning, since memory is already local. +		 */ +		int slot = ps_ratio - NUMA_PERIOD_THRESHOLD; +		if (!slot) +			slot = 1; +		diff = slot * period_slot; +	} else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) { +		/* +		 * Most memory accesses are shared with other tasks. +		 * There is no point in continuing fast NUMA scanning, +		 * since other tasks may just move the memory elsewhere. +		 */ +		int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;  		if (!slot)  			slot = 1;  		diff = slot * period_slot;  	} else { -		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; -  		/* -		 * Scale scan rate increases based on sharing. There is an -		 * inverse relationship between the degree of sharing and -		 * the adjustment made to the scanning period. Broadly -		 * speaking the intent is that there is little point -		 * scanning faster if shared accesses dominate as it may -		 * simply bounce migrations uselessly +		 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS, +		 * yet they are not on the local NUMA node. Speed up +		 * NUMA scanning to get the memory moved over.  		 */ -		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); -		diff = (diff * ratio) / NUMA_PERIOD_SLOTS; +		int ratio = max(lr_ratio, ps_ratio); +		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;  	}  	p->numa_scan_period = clamp(p->numa_scan_period + diff, @@ -2448,7 +2520,7 @@ void task_numa_work(struct callback_head *work)  	if (p->numa_scan_period == 0) {  		p->numa_scan_period_max = task_scan_max(p); -		p->numa_scan_period = task_scan_min(p); +		p->numa_scan_period = task_scan_start(p);  	}  	next_scan = now + msecs_to_jiffies(p->numa_scan_period); @@ -2576,7 +2648,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)  	if (now > curr->node_stamp + period) {  		if (!curr->node_stamp) -			curr->numa_scan_period = task_scan_min(curr); +			curr->numa_scan_period = task_scan_start(curr);  		curr->node_stamp += period;  		if (!time_before(jiffies, curr->mm->numa_next_scan)) { @@ -2586,59 +2658,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)  	}  } -/* - * Can a task be moved from prev_cpu to this_cpu without causing a load - * imbalance that would trigger the load balancer? 
- */ -static inline bool numa_wake_affine(struct sched_domain *sd, -				    struct task_struct *p, int this_cpu, -				    int prev_cpu, int sync) -{ -	struct numa_stats prev_load, this_load; -	s64 this_eff_load, prev_eff_load; - -	update_numa_stats(&prev_load, cpu_to_node(prev_cpu)); -	update_numa_stats(&this_load, cpu_to_node(this_cpu)); - -	/* -	 * If sync wakeup then subtract the (maximum possible) -	 * effect of the currently running task from the load -	 * of the current CPU: -	 */ -	if (sync) { -		unsigned long current_load = task_h_load(current); - -		if (this_load.load > current_load) -			this_load.load -= current_load; -		else -			this_load.load = 0; -	} - -	/* -	 * In low-load situations, where this_cpu's node is idle due to the -	 * sync cause above having dropped this_load.load to 0, move the task. -	 * Moving to an idle socket will not create a bad imbalance. -	 * -	 * Otherwise check if the nodes are near enough in load to allow this -	 * task to be woken on this_cpu's node. -	 */ -	if (this_load.load > 0) { -		unsigned long task_load = task_h_load(p); - -		this_eff_load = 100; -		this_eff_load *= prev_load.compute_capacity; - -		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; -		prev_eff_load *= this_load.compute_capacity; - -		this_eff_load *= this_load.load + task_load; -		prev_eff_load *= prev_load.load - task_load; - -		return this_eff_load <= prev_eff_load; -	} - -	return true; -}  #else  static void task_tick_numa(struct rq *rq, struct task_struct *curr)  { @@ -2652,14 +2671,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)  {  } -#ifdef CONFIG_SMP -static inline bool numa_wake_affine(struct sched_domain *sd, -				    struct task_struct *p, int this_cpu, -				    int prev_cpu, int sync) -{ -	return true; -} -#endif /* !SMP */  #endif /* CONFIG_NUMA_BALANCING */  static void @@ -2790,6 +2801,29 @@ static inline void update_cfs_shares(struct sched_entity *se)  }  #endif /* CONFIG_FAIR_GROUP_SCHED */ +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +{ +	if (&this_rq()->cfs == cfs_rq) { +		/* +		 * There are a few boundary cases this might miss but it should +		 * get called often enough that that should (hopefully) not be +		 * a real problem -- added to that it only calls on the local +		 * CPU, so if we enqueue remotely we'll miss an update, but +		 * the next tick/schedule should update. +		 * +		 * It will not get called when we go idle, because the idle +		 * thread is a different class (!fair), nor will the utilization +		 * number include things like RT tasks. +		 * +		 * As is, the util number is not freq-invariant (we'd have to +		 * implement arch_scale_freq_capacity() for that). +		 * +		 * See cpu_util(). +		 */ +		cpufreq_update_util(rq_of(cfs_rq), 0); +	} +} +  #ifdef CONFIG_SMP  /*   * Approximate: @@ -2968,6 +3002,18 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,  	sa->last_update_time += delta << 10;  	/* +	 * running is a subset of runnable (weight) so running can't be set if +	 * runnable is clear. But there are some corner cases where the current +	 * se has been already dequeued but cfs_rq->curr still points to it. +	 * This means that weight will be 0 but not running for a sched_entity +	 * but also for a cfs_rq if the latter becomes idle. As an example, +	 * this happens during idle_balance() which calls +	 * update_blocked_averages() +	 */ +	if (!weight) +		running = 0; + +	/*  	 * Now we know we crossed measurement unit boundaries. 
The *_avg  	 * accrues by two steps:  	 * @@ -3276,29 +3322,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}  #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) -{ -	if (&this_rq()->cfs == cfs_rq) { -		/* -		 * There are a few boundary cases this might miss but it should -		 * get called often enough that that should (hopefully) not be -		 * a real problem -- added to that it only calls on the local -		 * CPU, so if we enqueue remotely we'll miss an update, but -		 * the next tick/schedule should update. -		 * -		 * It will not get called when we go idle, because the idle -		 * thread is a different class (!fair), nor will the utilization -		 * number include things like RT tasks. -		 * -		 * As is, the util number is not freq-invariant (we'd have to -		 * implement arch_scale_freq_capacity() for that). -		 * -		 * See cpu_util(). -		 */ -		cpufreq_update_util(rq_of(cfs_rq), 0); -	} -} -  /*   * Unsigned subtract and clamp on underflow.   * @@ -3320,7 +3343,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)   * update_cfs_rq_load_avg - update the cfs_rq's load/util averages   * @now: current time, as per cfs_rq_clock_task()   * @cfs_rq: cfs_rq to update - * @update_freq: should we call cfs_rq_util_change() or will the call do so   *   * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)   * avg. The immediate corollary is that all (fair) tasks must be attached, see @@ -3334,7 +3356,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)   * call update_tg_load_avg() when this function returns true.   */  static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)  {  	struct sched_avg *sa = &cfs_rq->avg;  	int decayed, removed_load = 0, removed_util = 0; @@ -3362,7 +3384,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)  	cfs_rq->load_last_update_time_copy = sa->last_update_time;  #endif -	if (update_freq && (decayed || removed_util)) +	if (decayed || removed_util)  		cfs_rq_util_change(cfs_rq);  	return decayed || removed_load; @@ -3390,7 +3412,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags)  	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))  		__update_load_avg_se(now, cpu, cfs_rq, se); -	decayed  = update_cfs_rq_load_avg(now, cfs_rq, true); +	decayed  = update_cfs_rq_load_avg(now, cfs_rq);  	decayed |= propagate_entity_load_avg(se);  	if (decayed && (flags & UPDATE_TG)) @@ -3534,7 +3556,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf);  #else /* CONFIG_SMP */  static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)  {  	return 0;  } @@ -3544,7 +3566,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)  static inline void update_load_avg(struct sched_entity *se, int not_used1)  { -	cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); +	cfs_rq_util_change(cfs_rq_of(se));  }  static inline void @@ -5125,9 +5147,9 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,  }  /* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) +static unsigned long weighted_cpuload(struct rq *rq)  { -	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs); +	return cfs_rq_runnable_load_avg(&rq->cfs);  }  #ifdef CONFIG_NO_HZ_COMMON @@ 
-5172,7 +5194,7 @@ static void cpu_load_update_idle(struct rq *this_rq)  	/*  	 * bail if there's load or we're actually up-to-date.  	 */ -	if (weighted_cpuload(cpu_of(this_rq))) +	if (weighted_cpuload(this_rq))  		return;  	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); @@ -5193,7 +5215,7 @@ void cpu_load_update_nohz_start(void)  	 * concurrently we'll exit nohz. And cpu_load write can race with  	 * cpu_load_update_idle() but both updater would be writing the same.  	 */ -	this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq)); +	this_rq->cpu_load[0] = weighted_cpuload(this_rq);  }  /* @@ -5209,7 +5231,7 @@ void cpu_load_update_nohz_stop(void)  	if (curr_jiffies == this_rq->last_load_update_tick)  		return; -	load = weighted_cpuload(cpu_of(this_rq)); +	load = weighted_cpuload(this_rq);  	rq_lock(this_rq, &rf);  	update_rq_clock(this_rq);  	cpu_load_update_nohz(this_rq, curr_jiffies, load); @@ -5235,7 +5257,7 @@ static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)   */  void cpu_load_update_active(struct rq *this_rq)  { -	unsigned long load = weighted_cpuload(cpu_of(this_rq)); +	unsigned long load = weighted_cpuload(this_rq);  	if (tick_nohz_tick_stopped())  		cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); @@ -5253,7 +5275,7 @@ void cpu_load_update_active(struct rq *this_rq)  static unsigned long source_load(int cpu, int type)  {  	struct rq *rq = cpu_rq(cpu); -	unsigned long total = weighted_cpuload(cpu); +	unsigned long total = weighted_cpuload(rq);  	if (type == 0 || !sched_feat(LB_BIAS))  		return total; @@ -5268,7 +5290,7 @@ static unsigned long source_load(int cpu, int type)  static unsigned long target_load(int cpu, int type)  {  	struct rq *rq = cpu_rq(cpu); -	unsigned long total = weighted_cpuload(cpu); +	unsigned long total = weighted_cpuload(rq);  	if (type == 0 || !sched_feat(LB_BIAS))  		return total; @@ -5290,7 +5312,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)  {  	struct rq *rq = cpu_rq(cpu);  	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); -	unsigned long load_avg = weighted_cpuload(cpu); +	unsigned long load_avg = weighted_cpuload(rq);  	if (nr_running)  		return load_avg / nr_running; @@ -5345,20 +5367,115 @@ static int wake_wide(struct task_struct *p)  	return 1;  } +struct llc_stats { +	unsigned long	nr_running; +	unsigned long	load; +	unsigned long	capacity; +	int		has_capacity; +}; + +static bool get_llc_stats(struct llc_stats *stats, int cpu) +{ +	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + +	if (!sds) +		return false; + +	stats->nr_running	= READ_ONCE(sds->nr_running); +	stats->load		= READ_ONCE(sds->load); +	stats->capacity		= READ_ONCE(sds->capacity); +	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu); + +	return true; +} + +/* + * Can a task be moved from prev_cpu to this_cpu without causing a load + * imbalance that would trigger the load balancer? + * + * Since we're running on 'stale' values, we might in fact create an imbalance + * but recomputing these values is expensive, as that'd mean iteration 2 cache + * domains worth of CPUs. 
+ */ +static bool +wake_affine_llc(struct sched_domain *sd, struct task_struct *p, +		int this_cpu, int prev_cpu, int sync) +{ +	struct llc_stats prev_stats, this_stats; +	s64 this_eff_load, prev_eff_load; +	unsigned long task_load; + +	if (!get_llc_stats(&prev_stats, prev_cpu) || +	    !get_llc_stats(&this_stats, this_cpu)) +		return false; + +	/* +	 * If sync wakeup then subtract the (maximum possible) +	 * effect of the currently running task from the load +	 * of the current LLC. +	 */ +	if (sync) { +		unsigned long current_load = task_h_load(current); + +		/* in this case load hits 0 and this LLC is considered 'idle' */ +		if (current_load > this_stats.load) +			return true; + +		this_stats.load -= current_load; +	} + +	/* +	 * The has_capacity stuff is not SMT aware, but by trying to balance +	 * the nr_running on both ends we try and fill the domain at equal +	 * rates, thereby first consuming cores before siblings. +	 */ + +	/* if the old cache has capacity, stay there */ +	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1) +		return false; + +	/* if this cache has capacity, come here */ +	if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1) +		return true; + +	/* +	 * Check to see if we can move the load without causing too much +	 * imbalance. +	 */ +	task_load = task_h_load(p); + +	this_eff_load = 100; +	this_eff_load *= prev_stats.capacity; + +	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; +	prev_eff_load *= this_stats.capacity; + +	this_eff_load *= this_stats.load + task_load; +	prev_eff_load *= prev_stats.load - task_load; + +	return this_eff_load <= prev_eff_load; +} +  static int wake_affine(struct sched_domain *sd, struct task_struct *p,  		       int prev_cpu, int sync)  {  	int this_cpu = smp_processor_id(); -	bool affine = false; +	bool affine;  	/* -	 * Common case: CPUs are in the same socket, and select_idle_sibling() -	 * will do its thing regardless of what we return: +	 * Default to no affine wakeups; wake_affine() should not effect a task +	 * placement the load-balancer feels inclined to undo. The conservative +	 * option is therefore to not move tasks when they wake up.  	 */ -	if (cpus_share_cache(prev_cpu, this_cpu)) -		affine = true; -	else -		affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync); +	affine = false; + +	/* +	 * If the wakeup is across cache domains, try to evaluate if movement +	 * makes sense, otherwise rely on select_idle_siblings() to do +	 * placement inside the cache domain. +	 */ +	if (!cpus_share_cache(prev_cpu, this_cpu)) +		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);  	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);  	if (affine) { @@ -5550,7 +5667,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)  				shallowest_idle_cpu = i;  			}  		} else if (shallowest_idle_cpu == -1) { -			load = weighted_cpuload(i); +			load = weighted_cpuload(cpu_rq(i));  			if (load < min_load || (load == min_load && i == this_cpu)) {  				min_load = load;  				least_loaded_cpu = i; @@ -6187,10 +6304,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf  	int new_tasks;  again: -#ifdef CONFIG_FAIR_GROUP_SCHED  	if (!cfs_rq->nr_running)  		goto idle; +#ifdef CONFIG_FAIR_GROUP_SCHED  	if (prev->sched_class != &fair_sched_class)  		goto simple; @@ -6220,11 +6337,17 @@ again:  			/*  			 * This call to check_cfs_rq_runtime() will do the  			 * throttle and dequeue its entity in the parent(s). 
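Written out, the imbalance check at the end of wake_affine_llc() above accepts the wakeup on this_cpu's LLC when, with imb = sd->imbalance_pct:

/*
 *   100 * prev_capacity * (this_load + task_load)
 *       <= (100 + (imb - 100) / 2) * this_capacity * (prev_load - task_load)
 *
 * i.e. the destination LLC, with the task added, must not be more loaded
 * relative to its capacity than the source LLC, with the task removed, is
 * relative to its own capacity, the source side being given half of the
 * domain's imbalance_pct as slack.
 */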
-			 * Therefore the 'simple' nr_running test will indeed +			 * Therefore the nr_running test will indeed  			 * be correct.  			 */ -			if (unlikely(check_cfs_rq_runtime(cfs_rq))) +			if (unlikely(check_cfs_rq_runtime(cfs_rq))) { +				cfs_rq = &rq->cfs; + +				if (!cfs_rq->nr_running) +					goto idle; +  				goto simple; +			}  		}  		se = pick_next_entity(cfs_rq, curr); @@ -6264,12 +6387,8 @@ again:  	return p;  simple: -	cfs_rq = &rq->cfs;  #endif -	if (!cfs_rq->nr_running) -		goto idle; -  	put_prev_task(rq, prev);  	do { @@ -6917,7 +7036,7 @@ static void update_blocked_averages(int cpu)  		if (throttled_hierarchy(cfs_rq))  			continue; -		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) +		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))  			update_tg_load_avg(cfs_rq, 0);  		/* Propagate pending load changes to the parent, if any: */ @@ -6990,7 +7109,7 @@ static inline void update_blocked_averages(int cpu)  	rq_lock_irqsave(rq, &rf);  	update_rq_clock(rq); -	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); +	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);  	rq_unlock_irqrestore(rq, &rf);  } @@ -7036,6 +7155,7 @@ struct sg_lb_stats {  struct sd_lb_stats {  	struct sched_group *busiest;	/* Busiest group in this sd */  	struct sched_group *local;	/* Local group in this sd */ +	unsigned long total_running;  	unsigned long total_load;	/* Total load of all groups in sd */  	unsigned long total_capacity;	/* Total capacity of all groups in sd */  	unsigned long avg_load;	/* Average load across all groups in sd */ @@ -7055,6 +7175,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)  	*sds = (struct sd_lb_stats){  		.busiest = NULL,  		.local = NULL, +		.total_running = 0UL,  		.total_load = 0UL,  		.total_capacity = 0UL,  		.busiest_stat = { @@ -7363,7 +7484,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,  		sgs->nr_numa_running += rq->nr_numa_running;  		sgs->nr_preferred_running += rq->nr_preferred_running;  #endif -		sgs->sum_weighted_load += weighted_cpuload(i); +		sgs->sum_weighted_load += weighted_cpuload(rq);  		/*  		 * No need to call idle_cpu() if nr_running is not 0  		 */ @@ -7490,6 +7611,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)   */  static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)  { +	struct sched_domain_shared *shared = env->sd->shared;  	struct sched_domain *child = env->sd->child;  	struct sched_group *sg = env->sd->groups;  	struct sg_lb_stats *local = &sds->local_stat; @@ -7546,6 +7668,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd  next_group:  		/* Now, start updating sd_lb_stats */ +		sds->total_running += sgs->sum_nr_running;  		sds->total_load += sgs->group_load;  		sds->total_capacity += sgs->group_capacity; @@ -7561,6 +7684,21 @@ next_group:  			env->dst_rq->rd->overload = overload;  	} +	if (!shared) +		return; + +	/* +	 * Since these are sums over groups they can contain some CPUs +	 * multiple times for the NUMA domains. +	 * +	 * Currently only wake_affine_llc() and find_busiest_group() +	 * uses these numbers, only the last is affected by this problem. +	 * +	 * XXX fix that. 
+	 */ +	WRITE_ONCE(shared->nr_running,	sds->total_running); +	WRITE_ONCE(shared->load,	sds->total_load); +	WRITE_ONCE(shared->capacity,	sds->total_capacity);  }  /** @@ -7790,6 +7928,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)  	if (!sds.busiest || busiest->sum_nr_running == 0)  		goto out_balanced; +	/* XXX broken for overlapping NUMA groups */  	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)  						/ sds.total_capacity; @@ -7892,7 +8031,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,  		capacity = capacity_of(i); -		wl = weighted_cpuload(i); +		wl = weighted_cpuload(rq);  		/*  		 * When comparing with imbalance, use weighted_cpuload() diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c new file mode 100644 index 000000000000..a92fddc22747 --- /dev/null +++ b/kernel/sched/membarrier.c @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> + * + * membarrier system call + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + */ + +#include <linux/syscalls.h> +#include <linux/membarrier.h> +#include <linux/tick.h> +#include <linux/cpumask.h> + +#include "sched.h"	/* for cpu_rq(). */ + +/* + * Bitmask made from a "or" of all commands within enum membarrier_cmd, + * except MEMBARRIER_CMD_QUERY. + */ +#define MEMBARRIER_CMD_BITMASK	\ +	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED) + +static void ipi_mb(void *info) +{ +	smp_mb();	/* IPIs should be serializing but paranoid. */ +} + +static void membarrier_private_expedited(void) +{ +	int cpu; +	bool fallback = false; +	cpumask_var_t tmpmask; + +	if (num_online_cpus() == 1) +		return; + +	/* +	 * Matches memory barriers around rq->curr modification in +	 * scheduler. +	 */ +	smp_mb();	/* system call entry is not a mb. */ + +	/* +	 * Expedited membarrier commands guarantee that they won't +	 * block, hence the GFP_NOWAIT allocation flag and fallback +	 * implementation. +	 */ +	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { +		/* Fallback for OOM. */ +		fallback = true; +	} + +	cpus_read_lock(); +	for_each_online_cpu(cpu) { +		struct task_struct *p; + +		/* +		 * Skipping the current CPU is OK even through we can be +		 * migrated at any point. The current CPU, at the point +		 * where we read raw_smp_processor_id(), is ensured to +		 * be in program order with respect to the caller +		 * thread. Therefore, we can skip this CPU from the +		 * iteration. +		 */ +		if (cpu == raw_smp_processor_id()) +			continue; +		rcu_read_lock(); +		p = task_rcu_dereference(&cpu_rq(cpu)->curr); +		if (p && p->mm == current->mm) { +			if (!fallback) +				__cpumask_set_cpu(cpu, tmpmask); +			else +				smp_call_function_single(cpu, ipi_mb, NULL, 1); +		} +		rcu_read_unlock(); +	} +	if (!fallback) { +		smp_call_function_many(tmpmask, ipi_mb, NULL, 1); +		free_cpumask_var(tmpmask); +	} +	cpus_read_unlock(); + +	/* +	 * Memory barrier on the caller thread _after_ we finished +	 * waiting for the last IPI. Matches memory barriers around +	 * rq->curr modification in scheduler. 
+	 */ +	smp_mb();	/* exit from system call is not a mb */ +} + +/** + * sys_membarrier - issue memory barriers on a set of threads + * @cmd:   Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0. For future extensions. + * + * If this system call is not implemented, -ENOSYS is returned. If the + * command specified does not exist, not available on the running + * kernel, or if the command argument is invalid, this system call + * returns -EINVAL. For a given command, with flags argument set to 0, + * this system call is guaranteed to always return the same value until + * reboot. + * + * All memory accesses performed in program order from each targeted thread + * is guaranteed to be ordered with respect to sys_membarrier(). If we use + * the semantic "barrier()" to represent a compiler barrier forcing memory + * accesses to be performed in program order across the barrier, and + * smp_mb() to represent explicit memory barriers forcing full memory + * ordering across the barrier, we have the following ordering table for + * each pair of barrier(), sys_membarrier() and smp_mb(): + * + * The pair ordering is detailed as (O: ordered, X: not ordered): + * + *                        barrier()   smp_mb() sys_membarrier() + *        barrier()          X           X            O + *        smp_mb()           X           O            O + *        sys_membarrier()   O           O            O + */ +SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +{ +	if (unlikely(flags)) +		return -EINVAL; +	switch (cmd) { +	case MEMBARRIER_CMD_QUERY: +	{ +		int cmd_mask = MEMBARRIER_CMD_BITMASK; + +		if (tick_nohz_full_enabled()) +			cmd_mask &= ~MEMBARRIER_CMD_SHARED; +		return cmd_mask; +	} +	case MEMBARRIER_CMD_SHARED: +		/* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. 
*/ +		if (tick_nohz_full_enabled()) +			return -EINVAL; +		if (num_online_cpus() > 1) +			synchronize_sched(); +		return 0; +	case MEMBARRIER_CMD_PRIVATE_EXPEDITED: +		membarrier_private_expedited(); +		return 0; +	default: +		return -EINVAL; +	} +} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eeef1a3086d1..ab1c7f5409a0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -769,7 +769,7 @@ struct rq {  #ifdef CONFIG_SCHED_HRTICK  #ifdef CONFIG_SMP  	int hrtick_csd_pending; -	struct call_single_data hrtick_csd; +	call_single_data_t hrtick_csd;  #endif  	struct hrtimer hrtick_timer;  #endif @@ -1120,11 +1120,15 @@ extern int group_balance_cpu(struct sched_group *sg);  #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)  void register_sched_domain_sysctl(void); +void dirty_sched_domain_sysctl(int cpu);  void unregister_sched_domain_sysctl(void);  #else  static inline void register_sched_domain_sysctl(void)  {  } +static inline void dirty_sched_domain_sysctl(int cpu) +{ +}  static inline void unregister_sched_domain_sysctl(void)  {  } diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 3d5610dcce11..2227e183e202 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -33,9 +33,6 @@ void swake_up(struct swait_queue_head *q)  {  	unsigned long flags; -	if (!swait_active(q)) -		return; -  	raw_spin_lock_irqsave(&q->lock, flags);  	swake_up_locked(q);  	raw_spin_unlock_irqrestore(&q->lock, flags); @@ -51,9 +48,6 @@ void swake_up_all(struct swait_queue_head *q)  	struct swait_queue *curr;  	LIST_HEAD(tmp); -	if (!swait_active(q)) -		return; -  	raw_spin_lock_irq(&q->lock);  	list_splice_init(&q->task_list, &tmp);  	while (!list_empty(&tmp)) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 79895aec281e..6f7b43982f73 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -261,8 +261,6 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)  static int init_rootdomain(struct root_domain *rd)  { -	memset(rd, 0, sizeof(*rd)); -  	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))  		goto out;  	if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) @@ -311,7 +309,7 @@ static struct root_domain *alloc_rootdomain(void)  {  	struct root_domain *rd; -	rd = kmalloc(sizeof(*rd), GFP_KERNEL); +	rd = kzalloc(sizeof(*rd), GFP_KERNEL);  	if (!rd)  		return NULL; @@ -337,7 +335,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)  		if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))  			kfree(sg->sgc); -		kfree(sg); +		if (atomic_dec_and_test(&sg->ref)) +			kfree(sg);  		sg = tmp;  	} while (sg != first);  } @@ -345,15 +344,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)  static void destroy_sched_domain(struct sched_domain *sd)  {  	/* -	 * If its an overlapping domain it has private groups, iterate and -	 * nuke them all. +	 * A normal sched domain may have multiple group references, an +	 * overlapping domain, having private groups, only one.  Iterate, +	 * dropping group/capacity references, freeing where none remain.  	 
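For reference, a minimal userspace sketch of how the expedited private command added above might be invoked; apart from the UAPI names (MEMBARRIER_CMD_QUERY, MEMBARRIER_CMD_PRIVATE_EXPEDITED, __NR_membarrier) everything here is illustrative:

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

static int membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

	if (mask < 0 || !(mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED)) {
		fprintf(stderr, "private expedited membarrier not supported\n");
		return 1;
	}

	/*
	 * After this returns, every other thread of this process has
	 * executed a full memory barrier (see the IPI path above).
	 */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
		return 1;

	return 0;
}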
*/ -	if (sd->flags & SD_OVERLAP) { -		free_sched_groups(sd->groups, 1); -	} else if (atomic_dec_and_test(&sd->groups->ref)) { -		kfree(sd->groups->sgc); -		kfree(sd->groups); -	} +	free_sched_groups(sd->groups, 1); +  	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))  		kfree(sd->shared);  	kfree(sd); @@ -463,6 +459,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)  	rq_attach_root(rq, rd);  	tmp = rq->sd;  	rcu_assign_pointer(rq->sd, sd); +	dirty_sched_domain_sysctl(cpu);  	destroy_sched_domains(tmp);  	update_top_cache_domain(cpu); @@ -670,6 +667,7 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)  	else  		cpumask_copy(sg_span, sched_domain_span(sd)); +	atomic_inc(&sg->ref);  	return sg;  } @@ -1595,7 +1593,7 @@ static void __sdt_free(const struct cpumask *cpu_map)  	}  } -struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, +static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,  		const struct cpumask *cpu_map, struct sched_domain_attr *attr,  		struct sched_domain *child, int cpu)  { @@ -1854,7 +1852,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],  	/* Let the architecture update CPU core mappings: */  	new_topology = arch_update_cpu_topology(); -	n = doms_new ? ndoms_new : 0; +	if (!doms_new) { +		WARN_ON_ONCE(dattr_new); +		n = 0; +		doms_new = alloc_sched_domains(1); +		if (doms_new) { +			n = 1; +			cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); +		} +	} else { +		n = ndoms_new; +	}  	/* Destroy deleted domains: */  	for (i = 0; i < ndoms_cur; i++) { @@ -1870,11 +1878,10 @@ match1:  	}  	n = ndoms_cur; -	if (doms_new == NULL) { +	if (!doms_new) {  		n = 0;  		doms_new = &fallback_doms;  		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); -		WARN_ON_ONCE(dattr_new);  	}  	/* Build new domains: */ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 17f11c6b0a9f..d6afed6d0752 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -70,9 +70,10 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,  	list_for_each_entry_safe(curr, next, &wq_head->head, entry) {  		unsigned flags = curr->flags; - -		if (curr->func(curr, mode, wake_flags, key) && -				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) +		int ret = curr->func(curr, mode, wake_flags, key); +		if (ret < 0) +			break; +		if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)  			break;  	}  } diff --git a/kernel/signal.c b/kernel/signal.c index 7e33f8c583e6..ed804a470dcd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)  			recalc_sigpending_and_wake(t);  		}  	} -	if (action->sa.sa_handler == SIG_DFL) +	/* +	 * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect +	 * debugging to leave init killable. 
+	 */ +	if (action->sa.sa_handler == SIG_DFL && !t->ptrace)  		t->signal->flags &= ~SIGNAL_UNKILLABLE;  	ret = specific_send_sig_info(sig, info, t);  	spin_unlock_irqrestore(&t->sighand->siglock, flags); diff --git a/kernel/smp.c b/kernel/smp.c index 3061483cb3ad..81cfca9b4cc3 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -28,7 +28,7 @@ enum {  };  struct call_function_data { -	struct call_single_data	__percpu *csd; +	call_single_data_t	__percpu *csd;  	cpumask_var_t		cpumask;  	cpumask_var_t		cpumask_ipi;  }; @@ -51,7 +51,7 @@ int smpcfd_prepare_cpu(unsigned int cpu)  		free_cpumask_var(cfd->cpumask);  		return -ENOMEM;  	} -	cfd->csd = alloc_percpu(struct call_single_data); +	cfd->csd = alloc_percpu(call_single_data_t);  	if (!cfd->csd) {  		free_cpumask_var(cfd->cpumask);  		free_cpumask_var(cfd->cpumask_ipi); @@ -103,12 +103,12 @@ void __init call_function_init(void)   * previous function call. For multi-cpu calls its even more interesting   * as we'll have to ensure no other cpu is observing our csd.   */ -static __always_inline void csd_lock_wait(struct call_single_data *csd) +static __always_inline void csd_lock_wait(call_single_data_t *csd)  {  	smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));  } -static __always_inline void csd_lock(struct call_single_data *csd) +static __always_inline void csd_lock(call_single_data_t *csd)  {  	csd_lock_wait(csd);  	csd->flags |= CSD_FLAG_LOCK; @@ -116,12 +116,12 @@ static __always_inline void csd_lock(struct call_single_data *csd)  	/*  	 * prevent CPU from reordering the above assignment  	 * to ->flags with any subsequent assignments to other -	 * fields of the specified call_single_data structure: +	 * fields of the specified call_single_data_t structure:  	 */  	smp_wmb();  } -static __always_inline void csd_unlock(struct call_single_data *csd) +static __always_inline void csd_unlock(call_single_data_t *csd)  {  	WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); @@ -131,14 +131,14 @@ static __always_inline void csd_unlock(struct call_single_data *csd)  	smp_store_release(&csd->flags, 0);  } -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); +static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);  /* - * Insert a previously allocated call_single_data element + * Insert a previously allocated call_single_data_t element   * for execution on the given CPU. data must already have   * ->func, ->info, and ->flags set.   
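The struct call_single_data to call_single_data_t conversion above depends on a typedef that lives in include/linux/smp.h and is not part of this diff; as assumed here it is, roughly, an alignment-forcing alias so a csd does not straddle a cache-line boundary:

/* Shape of the typedef assumed by the conversion above; not in this diff. */
typedef struct __call_single_data call_single_data_t
	__aligned(sizeof(struct __call_single_data));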
*/ -static int generic_exec_single(int cpu, struct call_single_data *csd, +static int generic_exec_single(int cpu, call_single_data_t *csd,  			       smp_call_func_t func, void *info)  {  	if (cpu == smp_processor_id()) { @@ -210,7 +210,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)  {  	struct llist_head *head;  	struct llist_node *entry; -	struct call_single_data *csd, *csd_next; +	call_single_data_t *csd, *csd_next;  	static bool warned;  	WARN_ON(!irqs_disabled()); @@ -268,8 +268,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)  int smp_call_function_single(int cpu, smp_call_func_t func, void *info,  			     int wait)  { -	struct call_single_data *csd; -	struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS }; +	call_single_data_t *csd; +	call_single_data_t csd_stack = { +		.flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS, +	};  	int this_cpu;  	int err; @@ -321,7 +323,7 @@ EXPORT_SYMBOL(smp_call_function_single);   * NOTE: Be careful, there is unfortunately no current debugging facility to   * validate the correctness of this serialization.   */ -int smp_call_function_single_async(int cpu, struct call_single_data *csd) +int smp_call_function_single_async(int cpu, call_single_data_t *csd)  {  	int err = 0; @@ -444,7 +446,7 @@ void smp_call_function_many(const struct cpumask *mask,  	cpumask_clear(cfd->cpumask_ipi);  	for_each_cpu(cpu, cfd->cpumask) { -		struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); +		call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);  		csd_lock(csd);  		if (wait) @@ -460,7 +462,7 @@ void smp_call_function_many(const struct cpumask *mask,  	if (wait) {  		for_each_cpu(cpu, cfd->cpumask) { -			struct call_single_data *csd; +			call_single_data_t *csd;  			csd = per_cpu_ptr(cfd->csd, cpu);  			csd_lock_wait(csd); diff --git a/kernel/task_work.c b/kernel/task_work.c index d513051fcca2..836a72a66fba 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -96,20 +96,16 @@ void task_work_run(void)  		 * work->func() can do task_work_add(), do not set  		 * work_exited unless the list is empty.  		 */ +		raw_spin_lock_irq(&task->pi_lock);  		do {  			work = READ_ONCE(task->task_works);  			head = !work && (task->flags & PF_EXITING) ?  				&work_exited : NULL;  		} while (cmpxchg(&task->task_works, work, head) != work); +		raw_spin_unlock_irq(&task->pi_lock);  		if (!work)  			break; -		/* -		 * Synchronize with task_work_cancel(). It can't remove -		 * the first entry == work, cmpxchg(task_works) should -		 * fail, but it can play with *work and other entries. 
-		 */ -		raw_spin_unlock_wait(&task->pi_lock);  		do {  			next = work->next; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8f5866981883..8ea4fb315719 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -637,9 +637,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)  	tk->ktime_sec = seconds;  	/* Update the monotonic raw base */ -	seconds = tk->raw_sec; -	nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift); -	tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); +	tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);  }  /* must hold timekeeper_lock */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8f5d1bf18854..f2674a056c26 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -203,6 +203,7 @@ struct timer_base {  	bool			migration_enabled;  	bool			nohz_active;  	bool			is_idle; +	bool			must_forward_clk;  	DECLARE_BITMAP(pending_map, WHEEL_SIZE);  	struct hlist_head	vectors[WHEEL_SIZE];  } ____cacheline_aligned; @@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags)  static inline void forward_timer_base(struct timer_base *base)  { -	unsigned long jnow = READ_ONCE(jiffies); +	unsigned long jnow;  	/* -	 * We only forward the base when it's idle and we have a delta between -	 * base clock and jiffies. +	 * We only forward the base when we are idle or have just come out of +	 * idle (must_forward_clk logic), and have a delta between base clock +	 * and jiffies. In the common case, run_timers will take care of it.  	 */ -	if (!base->is_idle || (long) (jnow - base->clk) < 2) +	if (likely(!base->must_forward_clk)) +		return; + +	jnow = READ_ONCE(jiffies); +	base->must_forward_clk = base->is_idle; +	if ((long)(jnow - base->clk) < 2)  		return;  	/* @@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)  	 * same array bucket then just return:  	 */  	if (timer_pending(timer)) { +		/* +		 * The downside of this optimization is that it can result in +		 * larger granularity than you would get from adding a new +		 * timer with this expiry. +		 */  		if (timer->expires == expires)  			return 1; @@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)  		 * dequeue/enqueue dance.  		 
*/  		base = lock_timer_base(timer, &flags); +		forward_timer_base(base);  		clk = base->clk;  		idx = calc_wheel_index(expires, clk); @@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)  		}  	} else {  		base = lock_timer_base(timer, &flags); +		forward_timer_base(base);  	}  	ret = detach_if_pending(timer, base, false); @@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)  			raw_spin_lock(&base->lock);  			WRITE_ONCE(timer->flags,  				   (timer->flags & ~TIMER_BASEMASK) | base->cpu); +			forward_timer_base(base);  		}  	} -	/* Try to forward a stale timer base clock */ -	forward_timer_base(base); -  	timer->expires = expires;  	/*  	 * If 'idx' was calculated above and the base time did not advance @@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu)  		WRITE_ONCE(timer->flags,  			   (timer->flags & ~TIMER_BASEMASK) | cpu);  	} +	forward_timer_base(base);  	debug_activate(timer, timer->expires);  	internal_add_timer(base, timer); @@ -1497,10 +1510,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)  		if (!is_max_delta)  			expires = basem + (u64)(nextevt - basej) * TICK_NSEC;  		/* -		 * If we expect to sleep more than a tick, mark the base idle: +		 * If we expect to sleep more than a tick, mark the base idle. +		 * Also the tick is stopped so any added timer must forward +		 * the base clk itself to keep granularity small. This idle +		 * logic is only maintained for the BASE_STD base, deferrable +		 * timers may still see large granularity skew (by design).  		 */ -		if ((expires - basem) > TICK_NSEC) +		if ((expires - basem) > TICK_NSEC) { +			base->must_forward_clk = true;  			base->is_idle = true; +		}  	}  	raw_spin_unlock(&base->lock); @@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)  {  	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); +	/* +	 * must_forward_clk must be cleared before running timers so that any +	 * timer functions that call mod_timer will not try to forward the +	 * base. idle trcking / clock forwarding logic is only used with +	 * BASE_STD timers. +	 * +	 * The deferrable base does not do idle tracking at all, so we do +	 * not forward it. This can result in very large variations in +	 * granularity for deferrable timers, but they can be deferred for +	 * long periods due to idle. +	 */ +	base->must_forward_clk = false; +  	__run_timers(base);  	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)  		__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); diff --git a/kernel/torture.c b/kernel/torture.c index 55de96529287..637e172835d8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -117,7 +117,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,  				 torture_type, cpu);  		(*n_offl_successes)++;  		delta = jiffies - starttime; -		sum_offl += delta; +		*sum_offl += delta;  		if (*min_offl < 0) {  			*min_offl = delta;  			*max_offl = delta; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 37385193a608..dc498b605d5d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -204,10 +204,36 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,  		fmt_cnt++;  	} -	return __trace_printk(1/* fake ip will not be printed */, fmt, -			      mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, -			      mod[1] == 2 ? arg2 : mod[1] == 1 ? 
(long) arg2 : (u32) arg2, -			      mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3); +/* Horrid workaround for getting va_list handling working with different + * argument type combinations generically for 32 and 64 bit archs. + */ +#define __BPF_TP_EMIT()	__BPF_ARG3_TP() +#define __BPF_TP(...)							\ +	__trace_printk(1 /* Fake ip will not be printed. */,		\ +		       fmt, ##__VA_ARGS__) + +#define __BPF_ARG1_TP(...)						\ +	((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64))	\ +	  ? __BPF_TP(arg1, ##__VA_ARGS__)				\ +	  : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32))	\ +	      ? __BPF_TP((long)arg1, ##__VA_ARGS__)			\ +	      : __BPF_TP((u32)arg1, ##__VA_ARGS__))) + +#define __BPF_ARG2_TP(...)						\ +	((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64))	\ +	  ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__)				\ +	  : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32))	\ +	      ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__)		\ +	      : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__))) + +#define __BPF_ARG3_TP(...)						\ +	((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64))	\ +	  ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__)				\ +	  : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32))	\ +	      ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__)		\ +	      : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__))) + +	return __BPF_TP_EMIT();  }  static const struct bpf_func_proto bpf_trace_printk_proto = { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 02004ae91860..96cea88fa00f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -889,6 +889,10 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)  	function_profile_call(trace->func, 0, NULL, NULL); +	/* If function graph is shutting down, ret_stack can be NULL */ +	if (!current->ret_stack) +		return 0; +  	if (index >= 0 && index < FTRACE_RETFUNC_DEPTH)  		current->ret_stack[index].subtime = 0; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 529cc50d7243..81279c6602ff 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4386,15 +4386,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);   * the page that was allocated, with the read page of the buffer.   *   * Returns: - *  The page allocated, or NULL on error. 
+ *  The page allocated, or ERR_PTR   */  void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)  { -	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; +	struct ring_buffer_per_cpu *cpu_buffer;  	struct buffer_data_page *bpage = NULL;  	unsigned long flags;  	struct page *page; +	if (!cpumask_test_cpu(cpu, buffer->cpumask)) +		return ERR_PTR(-ENODEV); + +	cpu_buffer = buffer->buffers[cpu];  	local_irq_save(flags);  	arch_spin_lock(&cpu_buffer->lock); @@ -4412,7 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)  	page = alloc_pages_node(cpu_to_node(cpu),  				GFP_KERNEL | __GFP_NORETRY, 0);  	if (!page) -		return NULL; +		return ERR_PTR(-ENOMEM);  	bpage = page_address(page); @@ -4467,8 +4471,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);   *   * for example:   *	rpage = ring_buffer_alloc_read_page(buffer, cpu); - *	if (!rpage) - *		return error; + *	if (IS_ERR(rpage)) + *		return PTR_ERR(rpage);   *	ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);   *	if (ret >= 0)   *		process_page(rpage, ret); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 9fbcaf567886..68ee79afe31c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -113,7 +113,7 @@ static enum event_status read_page(int cpu)  	int i;  	bpage = ring_buffer_alloc_read_page(buffer, cpu); -	if (!bpage) +	if (IS_ERR(bpage))  		return EVENT_DROPPED;  	ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42b9355033d4..44004d8aa3b3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6598,7 +6598,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,  {  	struct ftrace_buffer_info *info = filp->private_data;  	struct trace_iterator *iter = &info->iter; -	ssize_t ret; +	ssize_t ret = 0;  	ssize_t size;  	if (!count) @@ -6612,10 +6612,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,  	if (!info->spare) {  		info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,  							  iter->cpu_file); -		info->spare_cpu = iter->cpu_file; +		if (IS_ERR(info->spare)) { +			ret = PTR_ERR(info->spare); +			info->spare = NULL; +		} else { +			info->spare_cpu = iter->cpu_file; +		}  	}  	if (!info->spare) -		return -ENOMEM; +		return ret;  	/* Do we have previous read data to read? 
*/  	if (info->read < PAGE_SIZE) @@ -6790,8 +6795,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  		ref->ref = 1;  		ref->buffer = iter->trace_buffer->buffer;  		ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); -		if (!ref->page) { -			ret = -ENOMEM; +		if (IS_ERR(ref->page)) { +			ret = PTR_ERR(ref->page); +			ref->page = NULL;  			kfree(ref);  			break;  		} @@ -8293,6 +8299,7 @@ __init static int tracer_alloc_buffers(void)  	if (ret < 0)  		goto out_free_cpumask;  	/* Used for event triggers */ +	ret = -ENOMEM;  	temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);  	if (!temp_buffer)  		goto out_rm_hp_state; @@ -8407,4 +8414,4 @@ __init static int clear_boot_tracer(void)  }  fs_initcall(tracer_init_tracefs); -late_initcall(clear_boot_tracer); +late_initcall_sync(clear_boot_tracer); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 562fa69df5d3..13ba2d3f6a91 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -306,6 +306,7 @@ static void  perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,  			  struct ftrace_ops *ops, struct pt_regs *pt_regs)  { +	struct perf_event *event;  	struct ftrace_entry *entry;  	struct hlist_head *head;  	struct pt_regs regs; @@ -329,8 +330,9 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,  	entry->ip = ip;  	entry->parent_ip = parent_ip; +	event = container_of(ops, struct perf_event, ftrace_ops);  	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, -			      1, ®s, head, NULL); +			      1, ®s, head, NULL, event);  #undef ENTRY_SIZE  } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 59a411ff60c7..181e139a8057 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1959,6 +1959,10 @@ static int create_filter(struct trace_event_call *call,  		if (err && set_str)  			append_filter_err(ps, filter);  	} +	if (err && !set_str) { +		free_event_filter(filter); +		filter = NULL; +	}  	create_filter_finish(ps);  	*filterp = filter; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c9b5aa10fbf9..8a907e12b6b9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1200,7 +1200,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)  	memset(&entry[1], 0, dsize);  	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);  	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, -			      head, NULL); +			      head, NULL, NULL);  }  NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1236,7 +1236,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,  	entry->ret_ip = (unsigned long)ri->ret_addr;  	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);  	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, -			      head, NULL); +			      head, NULL, NULL);  }  NOKPROBE_SYMBOL(kretprobe_perf_func);  #endif	/* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5e10395da88e..74d9a86eccc0 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -596,7 +596,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)  			       (unsigned long *)&rec->args);  	perf_trace_buf_submit(rec, size, rctx,  			      sys_data->enter_event->event.type, 1, regs, -			      head, NULL); +			      head, NULL, NULL);  }  
static int perf_sysenter_enable(struct trace_event_call *call) @@ -667,7 +667,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)  	rec->nr = syscall_nr;  	rec->ret = syscall_get_return_value(current, regs);  	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, -			      1, regs, head, NULL); +			      1, regs, head, NULL, NULL);  }  static int perf_sysexit_enable(struct trace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index a7581fec9681..4525e0271a53 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1156,7 +1156,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,  	}  	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, -			      head, NULL); +			      head, NULL, NULL);   out:  	preempt_enable();  } diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 0a689bbb78ef..305039b122fa 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -221,16 +221,19 @@ void tracing_map_array_free(struct tracing_map_array *a)  	if (!a)  		return; -	if (!a->pages) { -		kfree(a); -		return; -	} +	if (!a->pages) +		goto free;  	for (i = 0; i < a->n_pages; i++) {  		if (!a->pages[i])  			break;  		free_page((unsigned long)a->pages[i]);  	} + +	kfree(a->pages); + + free: +	kfree(a);  }  struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts, diff --git a/kernel/up.c b/kernel/up.c index ee81ac9af4ca..42c46bf3e0a5 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -23,7 +23,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,  }  EXPORT_SYMBOL(smp_call_function_single); -int smp_call_function_single_async(int cpu, struct call_single_data *csd) +int smp_call_function_single_async(int cpu, call_single_data_t *csd)  {  	unsigned long flags; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 06d3389bca0d..f5d52024f6b7 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -240,6 +240,7 @@ static void set_sample_period(void)  	 * hardlockup detector generates a warning  	 */  	sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); +	watchdog_update_hrtimer_threshold(sample_period);  }  /* Commands for resetting the watchdog */ diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 295a0d84934c..3a09ea1b1d3d 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -37,6 +37,62 @@ void arch_touch_nmi_watchdog(void)  }  EXPORT_SYMBOL(arch_touch_nmi_watchdog); +#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP +static DEFINE_PER_CPU(ktime_t, last_timestamp); +static DEFINE_PER_CPU(unsigned int, nmi_rearmed); +static ktime_t watchdog_hrtimer_sample_threshold __read_mostly; + +void watchdog_update_hrtimer_threshold(u64 period) +{ +	/* +	 * The hrtimer runs with a period of (watchdog_threshold * 2) / 5 +	 * +	 * So it runs effectively with 2.5 times the rate of the NMI +	 * watchdog. That means the hrtimer should fire 2-3 times before +	 * the NMI watchdog expires. The NMI watchdog on x86 is based on +	 * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles +	 * might run way faster than expected and the NMI fires in a +	 * smaller period than the one deduced from the nominal CPU +	 * frequency. Depending on the Turbo-Mode factor this might be fast +	 * enough to get the NMI period smaller than the hrtimer watchdog +	 * period and trigger false positives. 
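Plugging the default watchdog_thresh of 10 seconds into set_sample_period() above and into the new sample threshold:

/*
 *   hrtimer sample_period             = (2 * 10s) / 5     = 4s
 *   NMI watchdog period               = watchdog_thresh   = 10s
 *   watchdog_hrtimer_sample_threshold = sample_period * 2 = 8s  (4/5 of 10s)
 *
 * An NMI arriving less than 8s after the last accepted sample is treated
 * as premature and the hardlockup check is skipped, until the per-CPU
 * nmi_rearmed counter reaches 10.
 */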
+	 * +	 * The sample threshold is used to check in the NMI handler whether +	 * the minimum time between two NMI samples has elapsed. That +	 * prevents false positives. +	 * +	 * Set this to 4/5 of the actual watchdog threshold period so the +	 * hrtimer is guaranteed to fire at least once within the real +	 * watchdog threshold. +	 */ +	watchdog_hrtimer_sample_threshold = period * 2; +} + +static bool watchdog_check_timestamp(void) +{ +	ktime_t delta, now = ktime_get_mono_fast_ns(); + +	delta = now - __this_cpu_read(last_timestamp); +	if (delta < watchdog_hrtimer_sample_threshold) { +		/* +		 * If ktime is jiffies based, a stalled timer would prevent +		 * jiffies from being incremented and the filter would look +		 * at a stale timestamp and never trigger. +		 */ +		if (__this_cpu_inc_return(nmi_rearmed) < 10) +			return false; +	} +	__this_cpu_write(nmi_rearmed, 0); +	__this_cpu_write(last_timestamp, now); +	return true; +} +#else +static inline bool watchdog_check_timestamp(void) +{ +	return true; +} +#endif +  static struct perf_event_attr wd_hw_attr = {  	.type		= PERF_TYPE_HARDWARE,  	.config		= PERF_COUNT_HW_CPU_CYCLES, @@ -61,6 +117,9 @@ static void watchdog_overflow_callback(struct perf_event *event,  		return;  	} +	if (!watchdog_check_timestamp()) +		return; +  	/* check for a hardlockup  	 * This is done by making sure our timer interrupt  	 * is incrementing.  The timer interrupt should have diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ca937b0c3a96..ab3c0dc8c7ed 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2091,8 +2091,30 @@ __acquires(&pool->lock)  	spin_unlock_irq(&pool->lock); -	lock_map_acquire_read(&pwq->wq->lockdep_map); +	lock_map_acquire(&pwq->wq->lockdep_map);  	lock_map_acquire(&lockdep_map); +	/* +	 * Strictly speaking we should mark the invariant state without holding +	 * any locks, that is, before these two lock_map_acquire()'s. +	 * +	 * However, that would result in: +	 * +	 *   A(W1) +	 *   WFC(C) +	 *		A(W1) +	 *		C(C) +	 * +	 * Which would create W1->C->W1 dependencies, even though there is no +	 * actual deadlock possible. There are two solutions, using a +	 * read-recursive acquire on the work(queue) 'locks', but this will then +	 * hit the lockdep limitation on recursive locks, or simply discard +	 * these locks. +	 * +	 * AFAICT there is no possible deadlock scenario between the +	 * flush_work() and complete() primitives (except for single-threaded +	 * workqueues), so hiding them isn't a problem. +	 */ +	lockdep_invariant_state(true);  	trace_workqueue_execute_start(work);  	worker->current_func(work);  	/* @@ -2474,7 +2496,16 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,  	 */  	INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);  	__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); -	init_completion(&barr->done); + +	/* +	 * Explicitly init the crosslock for wq_barrier::done, make its lock +	 * key a subkey of the corresponding work. As a result we won't +	 * build a dependency between wq_barrier::done and unrelated work. +	 */ +	lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map, +				   "(complete)wq_barr::done", +				   target->lockdep_map.key, 1); +	__init_completion(&barr->done);  	barr->task = current;  	/* @@ -2815,16 +2846,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)  	spin_unlock_irq(&pool->lock);  	/* -	 * If @max_active is 1 or rescuer is in use, flushing another work -	 * item on the same workqueue may lead to deadlock.  
Make sure the -	 * flusher is not running on the same workqueue by verifying write -	 * access. +	 * Force a lock recursion deadlock when using flush_work() inside a +	 * single-threaded or rescuer equipped workqueue. +	 * +	 * For single threaded workqueues the deadlock happens when the work +	 * is after the work issuing the flush_work(). For rescuer equipped +	 * workqueues the deadlock happens when the rescuer stalls, blocking +	 * forward progress.  	 */ -	if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) +	if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) {  		lock_map_acquire(&pwq->wq->lockdep_map); -	else -		lock_map_acquire_read(&pwq->wq->lockdep_map); -	lock_map_release(&pwq->wq->lockdep_map); +		lock_map_release(&pwq->wq->lockdep_map); +	}  	return true;  already_gone: | 
