Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/hashtab.c                 |   3
-rw-r--r--  kernel/bpf/syscall.c                 |   4
-rw-r--r--  kernel/bpf/verifier.c                |  80
-rw-r--r--  kernel/cpu.c                         |   2
-rw-r--r--  kernel/events/core.c                 |  35
-rw-r--r--  kernel/events/uprobes.c              |   6
-rw-r--r--  kernel/exit.c                        |   1
-rw-r--r--  kernel/fork.c                        |   4
-rw-r--r--  kernel/irq/manage.c                  |   5
-rw-r--r--  kernel/kcov.c                        |  10
-rw-r--r--  kernel/locking/lockdep.c             | 111
-rw-r--r--  kernel/locking/lockdep_internals.h   |  20
-rw-r--r--  kernel/locking/rtmutex.c             |  68
-rw-r--r--  kernel/locking/rtmutex_common.h      |   5
-rw-r--r--  kernel/module.c                      |   5
-rw-r--r--  kernel/power/suspend.c               |   4
-rw-r--r--  kernel/power/suspend_test.c          |   4
-rw-r--r--  kernel/printk/printk.c               |  28
-rw-r--r--  kernel/ptrace.c                      |  16
-rw-r--r--  kernel/sched/auto_group.c            |  40
-rw-r--r--  kernel/sched/core.c                  |  28
-rw-r--r--  kernel/sched/fair.c                  |  23
-rw-r--r--  kernel/sched/wait.c                  |  10
-rw-r--r--  kernel/softirq.c                     |   2
-rw-r--r--  kernel/taskstats.c                   |   6
-rw-r--r--  kernel/time/alarmtimer.c             |   2
-rw-r--r--  kernel/time/timer.c                  |  74
-rw-r--r--  kernel/trace/ftrace.c                |  24
28 files changed, 415 insertions, 205 deletions
| diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 570eeca7bdfa..ad1bc67aff1b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -687,7 +687,8 @@ static void delete_all_elements(struct bpf_htab *htab)  		hlist_for_each_entry_safe(l, n, head, hash_node) {  			hlist_del_rcu(&l->hash_node); -			htab_elem_free(htab, l); +			if (l->state != HTAB_EXTRA_ELEM_USED) +				htab_elem_free(htab, l);  		}  	}  } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 228f962447a5..237f3d6a7ddc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -194,7 +194,7 @@ static int map_create(union bpf_attr *attr)  	err = bpf_map_charge_memlock(map);  	if (err) -		goto free_map; +		goto free_map_nouncharge;  	err = bpf_map_new_fd(map);  	if (err < 0) @@ -204,6 +204,8 @@ static int map_create(union bpf_attr *attr)  	return err;  free_map: +	bpf_map_uncharge_memlock(map); +free_map_nouncharge:  	map->ops->map_free(map);  	return err;  } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 99a7e5b388f2..8199821f54cf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -216,8 +216,8 @@ static void print_verifier_state(struct bpf_verifier_state *state)  				reg->map_ptr->key_size,  				reg->map_ptr->value_size);  		if (reg->min_value != BPF_REGISTER_MIN_RANGE) -			verbose(",min_value=%llu", -				(unsigned long long)reg->min_value); +			verbose(",min_value=%lld", +				(long long)reg->min_value);  		if (reg->max_value != BPF_REGISTER_MAX_RANGE)  			verbose(",max_value=%llu",  				(unsigned long long)reg->max_value); @@ -758,7 +758,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,  			 * index'es we need to make sure that whatever we use  			 * will have a set floor within our range.  			 */ -			if ((s64)reg->min_value < 0) { +			if (reg->min_value < 0) {  				verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",  					regno);  				return -EACCES; @@ -1468,7 +1468,8 @@ static void check_reg_overflow(struct bpf_reg_state *reg)  {  	if (reg->max_value > BPF_REGISTER_MAX_RANGE)  		reg->max_value = BPF_REGISTER_MAX_RANGE; -	if ((s64)reg->min_value < BPF_REGISTER_MIN_RANGE) +	if (reg->min_value < BPF_REGISTER_MIN_RANGE || +	    reg->min_value > BPF_REGISTER_MAX_RANGE)  		reg->min_value = BPF_REGISTER_MIN_RANGE;  } @@ -1476,7 +1477,8 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,  				    struct bpf_insn *insn)  {  	struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; -	u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE; +	s64 min_val = BPF_REGISTER_MIN_RANGE; +	u64 max_val = BPF_REGISTER_MAX_RANGE;  	bool min_set = false, max_set = false;  	u8 opcode = BPF_OP(insn->code); @@ -1512,22 +1514,43 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,  		return;  	} +	/* If one of our values was at the end of our ranges then we can't just +	 * do our normal operations to the register, we need to set the values +	 * to the min/max since they are undefined. 
+	 */ +	if (min_val == BPF_REGISTER_MIN_RANGE) +		dst_reg->min_value = BPF_REGISTER_MIN_RANGE; +	if (max_val == BPF_REGISTER_MAX_RANGE) +		dst_reg->max_value = BPF_REGISTER_MAX_RANGE; +  	switch (opcode) {  	case BPF_ADD: -		dst_reg->min_value += min_val; -		dst_reg->max_value += max_val; +		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) +			dst_reg->min_value += min_val; +		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) +			dst_reg->max_value += max_val;  		break;  	case BPF_SUB: -		dst_reg->min_value -= min_val; -		dst_reg->max_value -= max_val; +		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) +			dst_reg->min_value -= min_val; +		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) +			dst_reg->max_value -= max_val;  		break;  	case BPF_MUL: -		dst_reg->min_value *= min_val; -		dst_reg->max_value *= max_val; +		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) +			dst_reg->min_value *= min_val; +		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) +			dst_reg->max_value *= max_val;  		break;  	case BPF_AND: -		/* & is special since it could end up with 0 bits set. */ -		dst_reg->min_value &= min_val; +		/* Disallow AND'ing of negative numbers, ain't nobody got time +		 * for that.  Otherwise the minimum is 0 and the max is the max +		 * value we could AND against. +		 */ +		if (min_val < 0) +			dst_reg->min_value = BPF_REGISTER_MIN_RANGE; +		else +			dst_reg->min_value = 0;  		dst_reg->max_value = max_val;  		break;  	case BPF_LSH: @@ -1537,24 +1560,25 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,  		 */  		if (min_val > ilog2(BPF_REGISTER_MAX_RANGE))  			dst_reg->min_value = BPF_REGISTER_MIN_RANGE; -		else +		else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)  			dst_reg->min_value <<= min_val;  		if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))  			dst_reg->max_value = BPF_REGISTER_MAX_RANGE; -		else +		else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)  			dst_reg->max_value <<= max_val;  		break;  	case BPF_RSH: -		dst_reg->min_value >>= min_val; -		dst_reg->max_value >>= max_val; -		break; -	case BPF_MOD: -		/* % is special since it is an unsigned modulus, so the floor -		 * will always be 0. +		/* RSH by a negative number is undefined, and the BPF_RSH is an +		 * unsigned shift, so make the appropriate casts.  		 */ -		dst_reg->min_value = 0; -		dst_reg->max_value = max_val - 1; +		if (min_val < 0 || dst_reg->min_value < 0) +			dst_reg->min_value = BPF_REGISTER_MIN_RANGE; +		else +			dst_reg->min_value = +				(u64)(dst_reg->min_value) >> min_val; +		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) +			dst_reg->max_value >>= max_val;  		break;  	default:  		reset_reg_range_values(regs, insn->dst_reg); @@ -2430,6 +2454,7 @@ static bool states_equal(struct bpf_verifier_env *env,  			 struct bpf_verifier_state *old,  			 struct bpf_verifier_state *cur)  { +	bool varlen_map_access = env->varlen_map_value_access;  	struct bpf_reg_state *rold, *rcur;  	int i; @@ -2443,12 +2468,17 @@ static bool states_equal(struct bpf_verifier_env *env,  		/* If the ranges were not the same, but everything else was and  		 * we didn't do a variable access into a map then we are a-ok.  		 */ -		if (!env->varlen_map_value_access && +		if (!varlen_map_access &&  		    rold->type == rcur->type && rold->imm == rcur->imm)  			continue; +		/* If we didn't map access then again we don't care about the +		 * mismatched range values and it's ok if our old type was +		 * UNKNOWN and we didn't go to a NOT_INIT'ed reg. 
+		 */  		if (rold->type == NOT_INIT || -		    (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) +		    (!varlen_map_access && rold->type == UNKNOWN_VALUE && +		     rcur->type != NOT_INIT))  			continue;  		if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && diff --git a/kernel/cpu.c b/kernel/cpu.c index 5df20d6d1520..29de1a9352c0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -228,7 +228,7 @@ static struct {  	.wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),  	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),  #ifdef CONFIG_DEBUG_LOCK_ALLOC -	.dep_map = {.name = "cpu_hotplug.lock" }, +	.dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map),  #endif  }; diff --git a/kernel/events/core.c b/kernel/events/core.c index c6e47e97b33f..02c8421f8c01 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -902,7 +902,15 @@ list_update_cgroup_event(struct perf_event *event,  	 * this will always be called from the right CPU.  	 */  	cpuctx = __get_cpu_context(ctx); -	cpuctx->cgrp = add ? event->cgrp : NULL; + +	/* +	 * cpuctx->cgrp is NULL until a cgroup event is sched in or +	 * ctx->nr_cgroup == 0 . +	 */ +	if (add && perf_cgroup_from_task(current, ctx) == event->cgrp) +		cpuctx->cgrp = event->cgrp; +	else if (!add) +		cpuctx->cgrp = NULL;  }  #else /* !CONFIG_CGROUP_PERF */ @@ -1960,6 +1968,12 @@ void perf_event_disable(struct perf_event *event)  }  EXPORT_SYMBOL_GPL(perf_event_disable); +void perf_event_disable_inatomic(struct perf_event *event) +{ +	event->pending_disable = 1; +	irq_work_queue(&event->pending); +} +  static void perf_set_shadow_time(struct perf_event *event,  				 struct perf_event_context *ctx,  				 u64 tstamp) @@ -7075,8 +7089,8 @@ static int __perf_event_overflow(struct perf_event *event,  	if (events && atomic_dec_and_test(&event->event_limit)) {  		ret = 1;  		event->pending_kill = POLL_HUP; -		event->pending_disable = 1; -		irq_work_queue(&event->pending); + +		perf_event_disable_inatomic(event);  	}  	READ_ONCE(event->overflow_handler)(event, data, regs); @@ -8012,6 +8026,7 @@ restart:   * if <size> is not specified, the range is treated as a single address.   
*/  enum { +	IF_ACT_NONE = -1,  	IF_ACT_FILTER,  	IF_ACT_START,  	IF_ACT_STOP, @@ -8035,6 +8050,7 @@ static const match_table_t if_tokens = {  	{ IF_SRC_KERNEL,	"%u/%u" },  	{ IF_SRC_FILEADDR,	"%u@%s" },  	{ IF_SRC_KERNELADDR,	"%u" }, +	{ IF_ACT_NONE,		NULL },  };  /* @@ -8855,7 +8871,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register);  void perf_pmu_unregister(struct pmu *pmu)  { +	int remove_device; +  	mutex_lock(&pmus_lock); +	remove_device = pmu_bus_running;  	list_del_rcu(&pmu->entry);  	mutex_unlock(&pmus_lock); @@ -8869,10 +8888,12 @@ void perf_pmu_unregister(struct pmu *pmu)  	free_percpu(pmu->pmu_disable_count);  	if (pmu->type >= PERF_TYPE_MAX)  		idr_remove(&pmu_idr, pmu->type); -	if (pmu->nr_addr_filters) -		device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); -	device_del(pmu->dev); -	put_device(pmu->dev); +	if (remove_device) { +		if (pmu->nr_addr_filters) +			device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); +		device_del(pmu->dev); +		put_device(pmu->dev); +	}  	free_pmu_context(pmu);  }  EXPORT_SYMBOL_GPL(perf_pmu_unregister); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d4129bb05e5d..f9ec9add2164 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,  retry:  	/* Read the page with vaddr into memory */ -	ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); +	ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, +			&vma);  	if (ret <= 0)  		return ret; @@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)  	 * but we treat this as a 'remote' access since it is  	 * essentially a kernel access to the memory.  	 */ -	result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); +	result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, +			NULL);  	if (result < 0)  		return result; diff --git a/kernel/exit.c b/kernel/exit.c index 9d68c45ebbe3..3076f3089919 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -836,6 +836,7 @@ void __noreturn do_exit(long code)  	 */  	perf_event_exit_task(tsk); +	sched_autogroup_exit_task(tsk);  	cgroup_exit(tsk);  	/* diff --git a/kernel/fork.c b/kernel/fork.c index 623259fc794d..997ac1d584f7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -315,6 +315,9 @@ static void account_kernel_stack(struct task_struct *tsk, int account)  static void release_task_stack(struct task_struct *tsk)  { +	if (WARN_ON(tsk->state != TASK_DEAD)) +		return;  /* Better to leak the stack than to free prematurely */ +  	account_kernel_stack(tsk, -1);  	arch_release_thread_stack(tsk->stack);  	free_thread_stack(tsk); @@ -1862,6 +1865,7 @@ bad_fork_cleanup_count:  	atomic_dec(&p->cred->user->processes);  	exit_creds(p);  bad_fork_free: +	p->state = TASK_DEAD;  	put_task_stack(p);  	free_task(p);  fork_out: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0c5f1a5db654..6b669593e7eb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -721,6 +721,7 @@ int irq_set_parent(int irq, int parent_irq)  	irq_put_desc_unlock(desc, flags);  	return 0;  } +EXPORT_SYMBOL_GPL(irq_set_parent);  #endif  /* @@ -1340,12 +1341,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  	} else if (new->flags & IRQF_TRIGGER_MASK) {  		unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; -		unsigned int omsk = irq_settings_get_trigger_mask(desc); +		unsigned int omsk = irqd_get_trigger_type(&desc->irq_data);  		if (nmsk != omsk)  
			/* hope the handler works with current  trigger mode */  			pr_warn("irq %d uses trigger mode %u; requested %u\n", -				irq, nmsk, omsk); +				irq, omsk, nmsk);  	}  	*old_ptr = new; diff --git a/kernel/kcov.c b/kernel/kcov.c index 8d44b3fea9d0..3cbb0c879705 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -7,6 +7,7 @@  #include <linux/fs.h>  #include <linux/mm.h>  #include <linux/printk.h> +#include <linux/sched.h>  #include <linux/slab.h>  #include <linux/spinlock.h>  #include <linux/vmalloc.h> @@ -53,8 +54,15 @@ void notrace __sanitizer_cov_trace_pc(void)  	/*  	 * We are interested in code coverage as a function of a syscall inputs,  	 * so we ignore code executed in interrupts. +	 * The checks for whether we are in an interrupt are open-coded, because +	 * 1. We can't use in_interrupt() here, since it also returns true +	 *    when we are inside local_bh_disable() section. +	 * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()), +	 *    since that leads to slower generated code (three separate tests, +	 *    one for each of the flags).  	 */ -	if (!t || in_interrupt()) +	if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET +							| NMI_MASK)))  		return;  	mode = READ_ONCE(t->kcov_mode);  	if (mode == KCOV_MODE_TRACE) { diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 589d763a49b3..4d7ffc0a0d00 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -506,13 +506,13 @@ static void __print_lock_name(struct lock_class *class)  	name = class->name;  	if (!name) {  		name = __get_key_name(class->key, str); -		printk("%s", name); +		printk(KERN_CONT "%s", name);  	} else { -		printk("%s", name); +		printk(KERN_CONT "%s", name);  		if (class->name_version > 1) -			printk("#%d", class->name_version); +			printk(KERN_CONT "#%d", class->name_version);  		if (class->subclass) -			printk("/%d", class->subclass); +			printk(KERN_CONT "/%d", class->subclass);  	}  } @@ -522,9 +522,9 @@ static void print_lock_name(struct lock_class *class)  	get_usage_chars(class, usage); -	printk(" ("); +	printk(KERN_CONT " (");  	__print_lock_name(class); -	printk("){%s}", usage); +	printk(KERN_CONT "){%s}", usage);  }  static void print_lockdep_cache(struct lockdep_map *lock) @@ -536,7 +536,7 @@ static void print_lockdep_cache(struct lockdep_map *lock)  	if (!name)  		name = __get_key_name(lock->key->subkeys, str); -	printk("%s", name); +	printk(KERN_CONT "%s", name);  }  static void print_lock(struct held_lock *hlock) @@ -551,13 +551,13 @@ static void print_lock(struct held_lock *hlock)  	barrier();  	if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) { -		printk("<RELEASED>\n"); +		printk(KERN_CONT "<RELEASED>\n");  		return;  	}  	print_lock_name(lock_classes + class_idx - 1); -	printk(", at: "); -	print_ip_sym(hlock->acquire_ip); +	printk(KERN_CONT ", at: [<%p>] %pS\n", +		(void *)hlock->acquire_ip, (void *)hlock->acquire_ip);  }  static void lockdep_print_held_locks(struct task_struct *curr) @@ -792,8 +792,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)  		printk("\nnew class %p: %s", class->key, class->name);  		if (class->name_version > 1) -			printk("#%d", class->name_version); -		printk("\n"); +			printk(KERN_CONT "#%d", class->name_version); +		printk(KERN_CONT "\n");  		dump_stack();  		if (!graph_lock()) { @@ -1071,7 +1071,7 @@ print_circular_bug_entry(struct lock_list *target, int depth)  		return 0;  	printk("\n-> #%u", depth);  	print_lock_name(target->class); -	printk(":\n"); +	
printk(KERN_CONT ":\n");  	print_stack_trace(&target->trace, 6);  	return 0; @@ -1102,11 +1102,11 @@ print_circular_lock_scenario(struct held_lock *src,  	if (parent != source) {  		printk("Chain exists of:\n  ");  		__print_lock_name(source); -		printk(" --> "); +		printk(KERN_CONT " --> ");  		__print_lock_name(parent); -		printk(" --> "); +		printk(KERN_CONT " --> ");  		__print_lock_name(target); -		printk("\n\n"); +		printk(KERN_CONT "\n\n");  	}  	printk(" Possible unsafe locking scenario:\n\n"); @@ -1114,16 +1114,16 @@ print_circular_lock_scenario(struct held_lock *src,  	printk("       ----                    ----\n");  	printk("  lock(");  	__print_lock_name(target); -	printk(");\n"); +	printk(KERN_CONT ");\n");  	printk("                               lock(");  	__print_lock_name(parent); -	printk(");\n"); +	printk(KERN_CONT ");\n");  	printk("                               lock(");  	__print_lock_name(target); -	printk(");\n"); +	printk(KERN_CONT ");\n");  	printk("  lock(");  	__print_lock_name(source); -	printk(");\n"); +	printk(KERN_CONT ");\n");  	printk("\n *** DEADLOCK ***\n\n");  } @@ -1359,22 +1359,22 @@ static void print_lock_class_header(struct lock_class *class, int depth)  	printk("%*s->", depth, "");  	print_lock_name(class); -	printk(" ops: %lu", class->ops); -	printk(" {\n"); +	printk(KERN_CONT " ops: %lu", class->ops); +	printk(KERN_CONT " {\n");  	for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {  		if (class->usage_mask & (1 << bit)) {  			int len = depth;  			len += printk("%*s   %s", depth, "", usage_str[bit]); -			len += printk(" at:\n"); +			len += printk(KERN_CONT " at:\n");  			print_stack_trace(class->usage_traces + bit, len);  		}  	}  	printk("%*s }\n", depth, ""); -	printk("%*s ... key      at: ",depth,""); -	print_ip_sym((unsigned long)class->key); +	printk("%*s ... 
key      at: [<%p>] %pS\n", +		depth, "", class->key, class->key);  }  /* @@ -1437,11 +1437,11 @@ print_irq_lock_scenario(struct lock_list *safe_entry,  	if (middle_class != unsafe_class) {  		printk("Chain exists of:\n  ");  		__print_lock_name(safe_class); -		printk(" --> "); +		printk(KERN_CONT " --> ");  		__print_lock_name(middle_class); -		printk(" --> "); +		printk(KERN_CONT " --> ");  		__print_lock_name(unsafe_class); -		printk("\n\n"); +		printk(KERN_CONT "\n\n");  	}  	printk(" Possible interrupt unsafe locking scenario:\n\n"); @@ -1449,18 +1449,18 @@ print_irq_lock_scenario(struct lock_list *safe_entry,  	printk("       ----                    ----\n");  	printk("  lock(");  	__print_lock_name(unsafe_class); -	printk(");\n"); +	printk(KERN_CONT ");\n");  	printk("                               local_irq_disable();\n");  	printk("                               lock(");  	__print_lock_name(safe_class); -	printk(");\n"); +	printk(KERN_CONT ");\n");  	printk("                               lock(");  	__print_lock_name(middle_class); -	printk(");\n"); +	printk(KERN_CONT ");\n");  	printk("  <Interrupt>\n");  	printk("    lock(");  	__print_lock_name(safe_class); -	printk(");\n"); +	printk(KERN_CONT ");\n");  	printk("\n *** DEADLOCK ***\n\n");  } @@ -1497,9 +1497,9 @@ print_bad_irq_dependency(struct task_struct *curr,  	print_lock(prev);  	printk("which would create a new lock dependency:\n");  	print_lock_name(hlock_class(prev)); -	printk(" ->"); +	printk(KERN_CONT " ->");  	print_lock_name(hlock_class(next)); -	printk("\n"); +	printk(KERN_CONT "\n");  	printk("\nbut this new dependency connects a %s-irq-safe lock:\n",  		irqclass); @@ -1521,8 +1521,7 @@ print_bad_irq_dependency(struct task_struct *curr,  	lockdep_print_held_locks(curr); -	printk("\nthe dependencies between %s-irq-safe lock", irqclass); -	printk(" and the holding lock:\n"); +	printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);  	if (!save_trace(&prev_root->trace))  		return 0;  	print_shortest_lock_dependencies(backwards_entry, prev_root); @@ -1694,10 +1693,10 @@ print_deadlock_scenario(struct held_lock *nxt,  	printk("       ----\n");  	printk("  lock(");  	__print_lock_name(prev); -	printk(");\n"); +	printk(KERN_CONT ");\n");  	printk("  lock(");  	__print_lock_name(next); -	printk(");\n"); +	printk(KERN_CONT ");\n");  	printk("\n *** DEADLOCK ***\n\n");  	printk(" May be due to missing lock nesting notation\n\n");  } @@ -1891,9 +1890,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  		graph_unlock();  		printk("\n new dependency: ");  		print_lock_name(hlock_class(prev)); -		printk(" => "); +		printk(KERN_CONT " => ");  		print_lock_name(hlock_class(next)); -		printk("\n"); +		printk(KERN_CONT "\n");  		dump_stack();  		return graph_lock();  	} @@ -2343,11 +2342,11 @@ print_usage_bug_scenario(struct held_lock *lock)  	printk("       ----\n");  	printk("  lock(");  	__print_lock_name(class); -	printk(");\n"); +	printk(KERN_CONT ");\n");  	printk("  <Interrupt>\n");  	printk("    lock(");  	__print_lock_name(class); -	printk(");\n"); +	printk(KERN_CONT ");\n");  	printk("\n *** DEADLOCK ***\n\n");  } @@ -2522,14 +2521,18 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,  void print_irqtrace_events(struct task_struct *curr)  {  	printk("irq event stamp: %u\n", curr->irq_events); -	printk("hardirqs last  enabled at (%u): ", curr->hardirq_enable_event); -	print_ip_sym(curr->hardirq_enable_ip); -	printk("hardirqs last disabled at (%u): 
", curr->hardirq_disable_event); -	print_ip_sym(curr->hardirq_disable_ip); -	printk("softirqs last  enabled at (%u): ", curr->softirq_enable_event); -	print_ip_sym(curr->softirq_enable_ip); -	printk("softirqs last disabled at (%u): ", curr->softirq_disable_event); -	print_ip_sym(curr->softirq_disable_ip); +	printk("hardirqs last  enabled at (%u): [<%p>] %pS\n", +		curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, +		(void *)curr->hardirq_enable_ip); +	printk("hardirqs last disabled at (%u): [<%p>] %pS\n", +		curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, +		(void *)curr->hardirq_disable_ip); +	printk("softirqs last  enabled at (%u): [<%p>] %pS\n", +		curr->softirq_enable_event, (void *)curr->softirq_enable_ip, +		(void *)curr->softirq_enable_ip); +	printk("softirqs last disabled at (%u): [<%p>] %pS\n", +		curr->softirq_disable_event, (void *)curr->softirq_disable_ip, +		(void *)curr->softirq_disable_ip);  }  static int HARDIRQ_verbose(struct lock_class *class) @@ -3235,8 +3238,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,  	if (very_verbose(class)) {  		printk("\nacquire class [%p] %s", class->key, class->name);  		if (class->name_version > 1) -			printk("#%d", class->name_version); -		printk("\n"); +			printk(KERN_CONT "#%d", class->name_version); +		printk(KERN_CONT "\n");  		dump_stack();  	} @@ -3378,7 +3381,7 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,  	printk("%s/%d is trying to release lock (",  		curr->comm, task_pid_nr(curr));  	print_lockdep_cache(lock); -	printk(") at:\n"); +	printk(KERN_CONT ") at:\n");  	print_ip_sym(ip);  	printk("but there are no more locks to release!\n");  	printk("\nother info that might help us debug this:\n"); @@ -3871,7 +3874,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,  	printk("%s/%d is trying to contend lock (",  		curr->comm, task_pid_nr(curr));  	print_lockdep_cache(lock); -	printk(") at:\n"); +	printk(KERN_CONT ") at:\n");  	print_ip_sym(ip);  	printk("but there are no locks held!\n");  	printk("\nother info that might help us debug this:\n"); diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 51c4b24b6328..c2b88490d857 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -46,6 +46,14 @@ enum {  		(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)  /* + * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text, + * .data and .bss to fit in required 32MB limit for the kernel. With + * PROVE_LOCKING we could go over this limit and cause system boot-up problems. + * So, reduce the static allocations for lockdeps related structures so that + * everything fits in current required size limit. + */ +#ifdef CONFIG_PROVE_LOCKING_SMALL +/*   * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies   * we track.   * @@ -54,18 +62,24 @@ enum {   * table (if it's not there yet), and we check it for lock order   * conflicts and deadlocks.   */ +#define MAX_LOCKDEP_ENTRIES	16384UL +#define MAX_LOCKDEP_CHAINS_BITS	15 +#define MAX_STACK_TRACE_ENTRIES	262144UL +#else  #define MAX_LOCKDEP_ENTRIES	32768UL  #define MAX_LOCKDEP_CHAINS_BITS	16 -#define MAX_LOCKDEP_CHAINS	(1UL << MAX_LOCKDEP_CHAINS_BITS) - -#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)  /*   * Stack-trace: tightly packed array of stack backtrace   * addresses. Protected by the hash_lock.   
*/  #define MAX_STACK_TRACE_ENTRIES	524288UL +#endif + +#define MAX_LOCKDEP_CHAINS	(1UL << MAX_LOCKDEP_CHAINS_BITS) + +#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)  extern struct list_head all_lock_classes;  extern struct lock_chain lock_chains[]; diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 1ec0f48962b3..2c49d76f96c3 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -65,8 +65,72 @@ static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)  static void fixup_rt_mutex_waiters(struct rt_mutex *lock)  { -	if (!rt_mutex_has_waiters(lock)) -		clear_rt_mutex_waiters(lock); +	unsigned long owner, *p = (unsigned long *) &lock->owner; + +	if (rt_mutex_has_waiters(lock)) +		return; + +	/* +	 * The rbtree has no waiters enqueued, now make sure that the +	 * lock->owner still has the waiters bit set, otherwise the +	 * following can happen: +	 * +	 * CPU 0	CPU 1		CPU2 +	 * l->owner=T1 +	 *		rt_mutex_lock(l) +	 *		lock(l->lock) +	 *		l->owner = T1 | HAS_WAITERS; +	 *		enqueue(T2) +	 *		boost() +	 *		  unlock(l->lock) +	 *		block() +	 * +	 *				rt_mutex_lock(l) +	 *				lock(l->lock) +	 *				l->owner = T1 | HAS_WAITERS; +	 *				enqueue(T3) +	 *				boost() +	 *				  unlock(l->lock) +	 *				block() +	 *		signal(->T2)	signal(->T3) +	 *		lock(l->lock) +	 *		dequeue(T2) +	 *		deboost() +	 *		  unlock(l->lock) +	 *				lock(l->lock) +	 *				dequeue(T3) +	 *				 ==> wait list is empty +	 *				deboost() +	 *				 unlock(l->lock) +	 *		lock(l->lock) +	 *		fixup_rt_mutex_waiters() +	 *		  if (wait_list_empty(l) { +	 *		    l->owner = owner +	 *		    owner = l->owner & ~HAS_WAITERS; +	 *		      ==> l->owner = T1 +	 *		  } +	 *				lock(l->lock) +	 * rt_mutex_unlock(l)		fixup_rt_mutex_waiters() +	 *				  if (wait_list_empty(l) { +	 *				    owner = l->owner & ~HAS_WAITERS; +	 * cmpxchg(l->owner, T1, NULL) +	 *  ===> Success (l->owner = NULL) +	 * +	 *				    l->owner = owner +	 *				      ==> l->owner = T1 +	 *				  } +	 * +	 * With the check for the waiter bit in place T3 on CPU2 will not +	 * overwrite. All tasks fiddling with the waiters bit are +	 * serialized by l->lock, so nothing else can modify the waiters +	 * bit. If the bit is set then nothing can change l->owner either +	 * so the simple RMW is safe. The cmpxchg() will simply fail if it +	 * happens in the middle of the RMW because the waiters bit is +	 * still set. +	 */ +	owner = READ_ONCE(*p); +	if (owner & RT_MUTEX_HAS_WAITERS) +		WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);  }  /* diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 4f5f83c7d2d3..e317e1cbb3eb 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -75,8 +75,9 @@ task_top_pi_waiter(struct task_struct *p)  static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)  { -	return (struct task_struct *) -		((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); +	unsigned long owner = (unsigned long) READ_ONCE(lock->owner); + +	return (struct task_struct *) (owner & ~RT_MUTEX_OWNER_MASKALL);  }  /* diff --git a/kernel/module.c b/kernel/module.c index f57dd63186e6..0e54d5bf0097 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1301,8 +1301,9 @@ static int check_version(Elf_Shdr *sechdrs,  		goto bad_version;  	} -	pr_warn("%s: no symbol version for %s\n", mod->name, symname); -	return 0; +	/* Broken toolchain. Warn once, then let it go.. 
*/ +	pr_warn_once("%s: no symbol version for %s\n", mod->name, symname); +	return 1;  bad_version:  	pr_warn("%s: disagrees about version of symbol %s\n", diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 1e7f5da648d9..6ccb08f57fcb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -498,9 +498,9 @@ static int enter_state(suspend_state_t state)  #ifndef CONFIG_SUSPEND_SKIP_SYNC  	trace_suspend_resume(TPS("sync_filesystems"), 0, true); -	printk(KERN_INFO "PM: Syncing filesystems ... "); +	pr_info("PM: Syncing filesystems ... ");  	sys_sync(); -	printk("done.\n"); +	pr_cont("done.\n");  	trace_suspend_resume(TPS("sync_filesystems"), 0, false);  #endif diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 084452e34a12..bdff5ed57f10 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -203,8 +203,10 @@ static int __init test_suspend(void)  	/* RTCs have initialized by now too ... can we use one? */  	dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); -	if (dev) +	if (dev) {  		rtc = rtc_class_open(dev_name(dev)); +		put_device(dev); +	}  	if (!rtc) {  		printk(warn_no_rtc);  		return 0; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d5e397315473..f7a55e9ff2f7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -253,17 +253,6 @@ static int preferred_console = -1;  int console_set_on_cmdline;  EXPORT_SYMBOL(console_set_on_cmdline); -#ifdef CONFIG_OF -static bool of_specified_console; - -void console_set_by_of(void) -{ -	of_specified_console = true; -} -#else -# define of_specified_console false -#endif -  /* Flag: console code may call schedule() */  static int console_may_schedule; @@ -794,8 +783,6 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)  	return ret;  } -static void cont_flush(void); -  static ssize_t devkmsg_read(struct file *file, char __user *buf,  			    size_t count, loff_t *ppos)  { @@ -811,7 +798,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,  	if (ret)  		return ret;  	raw_spin_lock_irq(&logbuf_lock); -	cont_flush();  	while (user->seq == log_next_seq) {  		if (file->f_flags & O_NONBLOCK) {  			ret = -EAGAIN; @@ -874,7 +860,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)  		return -ESPIPE;  	raw_spin_lock_irq(&logbuf_lock); -	cont_flush();  	switch (whence) {  	case SEEK_SET:  		/* the first record */ @@ -913,7 +898,6 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)  	poll_wait(file, &log_wait, wait);  	raw_spin_lock_irq(&logbuf_lock); -	cont_flush();  	if (user->seq < log_next_seq) {  		/* return error when data has vanished underneath us */  		if (user->seq < log_first_seq) @@ -1300,7 +1284,6 @@ static int syslog_print(char __user *buf, int size)  		size_t skip;  		raw_spin_lock_irq(&logbuf_lock); -		cont_flush();  		if (syslog_seq < log_first_seq) {  			/* messages are gone, move to first one */  			syslog_seq = log_first_seq; @@ -1360,7 +1343,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)  		return -ENOMEM;  	raw_spin_lock_irq(&logbuf_lock); -	cont_flush();  	if (buf) {  		u64 next_seq;  		u64 seq; @@ -1522,7 +1504,6 @@ int do_syslog(int type, char __user *buf, int len, int source)  	/* Number of chars in the log buffer */  	case SYSLOG_ACTION_SIZE_UNREAD:  		raw_spin_lock_irq(&logbuf_lock); -		cont_flush();  		if (syslog_seq < log_first_seq) {  			/* messages are gone, move to first one */  			syslog_seq = 
log_first_seq; @@ -1769,6 +1750,10 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c  		cont_flush();  	} +	/* Skip empty continuation lines that couldn't be added - they just flush */ +	if (!text_len && (lflags & LOG_CONT)) +		return 0; +  	/* If it doesn't end in a newline, try to buffer the current line */  	if (!(lflags & LOG_NEWLINE)) {  		if (cont_add(facility, level, lflags, text, text_len)) @@ -2653,7 +2638,7 @@ void register_console(struct console *newcon)  	 *	didn't select a console we take the first one  	 *	that registers here.  	 */ -	if (preferred_console < 0 && !of_specified_console) { +	if (preferred_console < 0) {  		if (newcon->index < 0)  			newcon->index = 0;  		if (newcon->setup == NULL || @@ -3035,7 +3020,6 @@ void kmsg_dump(enum kmsg_dump_reason reason)  		dumper->active = true;  		raw_spin_lock_irqsave(&logbuf_lock, flags); -		cont_flush();  		dumper->cur_seq = clear_seq;  		dumper->cur_idx = clear_idx;  		dumper->next_seq = log_next_seq; @@ -3126,7 +3110,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,  	bool ret;  	raw_spin_lock_irqsave(&logbuf_lock, flags); -	cont_flush();  	ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);  	raw_spin_unlock_irqrestore(&logbuf_lock, flags); @@ -3169,7 +3152,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,  		goto out;  	raw_spin_lock_irqsave(&logbuf_lock, flags); -	cont_flush();  	if (dumper->cur_seq < log_first_seq) {  		/* messages are gone, move to first available one */  		dumper->cur_seq = log_first_seq; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2a99027312a6..e6474f7272ec 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -537,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst  		int this_len, retval;  		this_len = (len > sizeof(buf)) ? sizeof(buf) : len; -		retval = access_process_vm(tsk, src, buf, this_len, 0); +		retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE);  		if (!retval) {  			if (copied)  				break; @@ -564,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds  		this_len = (len > sizeof(buf)) ? sizeof(buf) : len;  		if (copy_from_user(buf, src, this_len))  			return -EFAULT; -		retval = access_process_vm(tsk, dst, buf, this_len, 1); +		retval = access_process_vm(tsk, dst, buf, this_len, +				FOLL_FORCE | FOLL_WRITE);  		if (!retval) {  			if (copied)  				break; @@ -1127,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,  	unsigned long tmp;  	int copied; -	copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); +	copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);  	if (copied != sizeof(tmp))  		return -EIO;  	return put_user(tmp, (unsigned long __user *)data); @@ -1138,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,  {  	int copied; -	copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); +	copied = access_process_vm(tsk, addr, &data, sizeof(data), +			FOLL_FORCE | FOLL_WRITE);  	return (copied == sizeof(data)) ? 
0 : -EIO;  } @@ -1155,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,  	switch (request) {  	case PTRACE_PEEKTEXT:  	case PTRACE_PEEKDATA: -		ret = access_process_vm(child, addr, &word, sizeof(word), 0); +		ret = access_process_vm(child, addr, &word, sizeof(word), +				FOLL_FORCE);  		if (ret != sizeof(word))  			ret = -EIO;  		else @@ -1164,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,  	case PTRACE_POKETEXT:  	case PTRACE_POKEDATA: -		ret = access_process_vm(child, addr, &data, sizeof(data), 1); +		ret = access_process_vm(child, addr, &data, sizeof(data), +				FOLL_FORCE | FOLL_WRITE);  		ret = (ret != sizeof(data) ? -EIO : 0);  		break; diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index a5d966cb8891..da39489d2d80 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -111,10 +111,13 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)  {  	if (tg != &root_task_group)  		return false; -  	/* -	 * We can only assume the task group can't go away on us if -	 * autogroup_move_group() can see us on ->thread_group list. +	 * If we race with autogroup_move_group() the caller can use the old +	 * value of signal->autogroup but in this case sched_move_task() will +	 * be called again before autogroup_kref_put(). +	 * +	 * However, there is no way sched_autogroup_exit_task() could tell us +	 * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case.  	 */  	if (p->flags & PF_EXITING)  		return false; @@ -122,6 +125,16 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)  	return true;  } +void sched_autogroup_exit_task(struct task_struct *p) +{ +	/* +	 * We are going to call exit_notify() and autogroup_move_group() can't +	 * see this thread after that: we can no longer use signal->autogroup. +	 * See the PF_EXITING check in task_wants_autogroup(). +	 */ +	sched_move_task(p); +} +  static void  autogroup_move_group(struct task_struct *p, struct autogroup *ag)  { @@ -138,13 +151,20 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)  	}  	p->signal->autogroup = autogroup_kref_get(ag); - -	if (!READ_ONCE(sysctl_sched_autogroup_enabled)) -		goto out; - +	/* +	 * We can't avoid sched_move_task() after we changed signal->autogroup, +	 * this process can already run with task_group() == prev->tg or we can +	 * race with cgroup code which can read autogroup = prev under rq->lock. +	 * In the latter case for_each_thread() can not miss a migrating thread, +	 * cpu_cgroup_attach() must not be possible after cgroup_exit() and it +	 * can't be removed from thread list, we hold ->siglock. +	 * +	 * If an exiting thread was already removed from thread list we rely on +	 * sched_autogroup_exit_task(). 
+	 */  	for_each_thread(p, t)  		sched_move_task(t); -out: +  	unlock_task_sighand(p, &flags);  	autogroup_kref_put(prev);  } @@ -192,6 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)  {  	static unsigned long next = INITIAL_JIFFIES;  	struct autogroup *ag; +	unsigned long shares;  	int err;  	if (nice < MIN_NICE || nice > MAX_NICE) @@ -210,9 +231,10 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)  	next = HZ / 10 + jiffies;  	ag = autogroup_task_get(p); +	shares = scale_load(sched_prio_to_weight[nice + 20]);  	down_write(&ag->lock); -	err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]); +	err = sched_group_set_shares(ag->tg, shares);  	if (!err)  		ag->nice = nice;  	up_write(&ag->lock); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..154fd689fe02 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5192,21 +5192,14 @@ void sched_show_task(struct task_struct *p)  	int ppid;  	unsigned long state = p->state; +	if (!try_get_task_stack(p)) +		return;  	if (state)  		state = __ffs(state) + 1;  	printk(KERN_INFO "%-15.15s %c", p->comm,  		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); -#if BITS_PER_LONG == 32 -	if (state == TASK_RUNNING) -		printk(KERN_CONT " running  "); -	else -		printk(KERN_CONT " %08lx ", thread_saved_pc(p)); -#else  	if (state == TASK_RUNNING)  		printk(KERN_CONT "  running task    "); -	else -		printk(KERN_CONT " %016lx ", thread_saved_pc(p)); -#endif  #ifdef CONFIG_DEBUG_STACK_USAGE  	free = stack_not_used(p);  #endif @@ -5221,6 +5214,7 @@ void sched_show_task(struct task_struct *p)  	print_worker_info(KERN_INFO, p);  	show_stack(p, NULL); +	put_task_stack(p);  }  void show_state_filter(unsigned long state_filter) @@ -7515,11 +7509,27 @@ static struct kmem_cache *task_group_cache __read_mostly;  DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);  DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ +	const int shift = BITS_PER_LONG == 32 ? 5 : 6; +	unsigned long val = (unsigned long)word << shift | bit; + +	return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(bit_waitqueue); +  void __init sched_init(void)  {  	int i, j;  	unsigned long alloc_size = 0, ptr; +	for (i = 0; i < WAIT_TABLE_SIZE; i++) +		init_waitqueue_head(bit_wait_table + i); +  #ifdef CONFIG_FAIR_GROUP_SCHED  	alloc_size += 2 * nr_cpu_ids * sizeof(void **);  #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2d4ad72f8f3c..c242944f5cbd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -690,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se)  	 * will definitely be update (after enqueue).  	 */  	sa->period_contrib = 1023; -	sa->load_avg = scale_load_down(se->load.weight); +	/* +	 * Tasks are intialized with full load to be seen as heavy tasks until +	 * they get a chance to stabilize to their real load level. +	 * Group entities are intialized with zero load to reflect the fact that +	 * nothing has been attached to the task group yet. 
+	 */ +	if (entity_is_task(se)) +		sa->load_avg = scale_load_down(se->load.weight);  	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;  	/*  	 * At this point, util_avg won't be used in select_task_rq_fair anyway @@ -5471,13 +5478,18 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd   */  static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)  { -	struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); -	u64 avg_idle = this_rq()->avg_idle; -	u64 avg_cost = this_sd->avg_scan_cost; +	struct sched_domain *this_sd; +	u64 avg_cost, avg_idle = this_rq()->avg_idle;  	u64 time, cost;  	s64 delta;  	int cpu, wrap; +	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); +	if (!this_sd) +		return -1; + +	avg_cost = this_sd->avg_scan_cost; +  	/*  	 * Due to large variance we need a large fuzz factor; hackbench in  	 * particularly is sensitive here. @@ -8827,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)  {  	struct sched_entity *se;  	struct cfs_rq *cfs_rq; -	struct rq *rq;  	int i;  	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8842,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)  	init_cfs_bandwidth(tg_cfs_bandwidth(tg));  	for_each_possible_cpu(i) { -		rq = cpu_rq(i); -  		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),  				      GFP_KERNEL, cpu_to_node(i));  		if (!cfs_rq) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 4f7053579fe3..9453efe9b25a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit)  }  EXPORT_SYMBOL(wake_up_bit); -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ -	const int shift = BITS_PER_LONG == 32 ? 5 : 6; -	const struct zone *zone = page_zone(virt_to_page(word)); -	unsigned long val = (unsigned long)word << shift | bit; - -	return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; -} -EXPORT_SYMBOL(bit_waitqueue); -  /*   * Manipulate the atomic_t address to produce a better bit waitqueue table hash   * index (we're keying off bit -1, but that would produce a horrible hash diff --git a/kernel/softirq.c b/kernel/softirq.c index 1bf81ef91375..744fa611cae0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -58,7 +58,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);  const char * const softirq_to_name[NR_SOFTIRQS] = { -	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", +	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",  	"TASKLET", "SCHED", "HRTIMER", "RCU"  }; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b3f05ee20d18..cbb387a265db 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -54,7 +54,11 @@ static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1  	[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },  	[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; -static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { +/* + * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family. + * Make sure they are always aligned. 
+ */ +static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {  	[CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },  }; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c3aad685bbc0..12dd190634ab 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -542,7 +542,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)  static int alarm_timer_create(struct k_itimer *new_timer)  {  	enum  alarmtimer_type type; -	struct alarm_base *base;  	if (!alarmtimer_get_rtcdev())  		return -ENOTSUPP; @@ -551,7 +550,6 @@ static int alarm_timer_create(struct k_itimer *new_timer)  		return -EPERM;  	type = clock2alarm(new_timer->it_clock); -	base = &alarm_bases[type];  	alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer);  	return 0;  } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2d47980a1bc4..c611c47de884 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -878,7 +878,7 @@ static inline struct timer_base *get_timer_base(u32 tflags)  #ifdef CONFIG_NO_HZ_COMMON  static inline struct timer_base * -__get_target_base(struct timer_base *base, unsigned tflags) +get_target_base(struct timer_base *base, unsigned tflags)  {  #ifdef CONFIG_SMP  	if ((tflags & TIMER_PINNED) || !base->migration_enabled) @@ -891,25 +891,27 @@ __get_target_base(struct timer_base *base, unsigned tflags)  static inline void forward_timer_base(struct timer_base *base)  { +	unsigned long jnow = READ_ONCE(jiffies); +  	/*  	 * We only forward the base when it's idle and we have a delta between  	 * base clock and jiffies.  	 */ -	if (!base->is_idle || (long) (jiffies - base->clk) < 2) +	if (!base->is_idle || (long) (jnow - base->clk) < 2)  		return;  	/*  	 * If the next expiry value is > jiffies, then we fast forward to  	 * jiffies otherwise we forward to the next expiry value.  	 */ -	if (time_after(base->next_expiry, jiffies)) -		base->clk = jiffies; +	if (time_after(base->next_expiry, jnow)) +		base->clk = jnow;  	else  		base->clk = base->next_expiry;  }  #else  static inline struct timer_base * -__get_target_base(struct timer_base *base, unsigned tflags) +get_target_base(struct timer_base *base, unsigned tflags)  {  	return get_timer_this_cpu_base(tflags);  } @@ -917,14 +919,6 @@ __get_target_base(struct timer_base *base, unsigned tflags)  static inline void forward_timer_base(struct timer_base *base) { }  #endif -static inline struct timer_base * -get_target_base(struct timer_base *base, unsigned tflags) -{ -	struct timer_base *target = __get_target_base(base, tflags); - -	forward_timer_base(target); -	return target; -}  /*   * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means @@ -943,7 +937,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,  {  	for (;;) {  		struct timer_base *base; -		u32 tf = timer->flags; +		u32 tf; + +		/* +		 * We need to use READ_ONCE() here, otherwise the compiler +		 * might re-read @tf between the check for TIMER_MIGRATING +		 * and spin_lock(). 
+		 */ +		tf = READ_ONCE(timer->flags);  		if (!(tf & TIMER_MIGRATING)) {  			base = get_timer_base(tf); @@ -964,6 +965,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)  	unsigned long clk = 0, flags;  	int ret = 0; +	BUG_ON(!timer->function); +  	/*  	 * This is a common optimization triggered by the networking code - if  	 * the timer is re-modified to have the same timeout or ends up in the @@ -972,13 +975,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)  	if (timer_pending(timer)) {  		if (timer->expires == expires)  			return 1; +  		/* -		 * Take the current timer_jiffies of base, but without holding -		 * the lock! +		 * We lock timer base and calculate the bucket index right +		 * here. If the timer ends up in the same bucket, then we +		 * just update the expiry time and avoid the whole +		 * dequeue/enqueue dance.  		 */ -		base = get_timer_base(timer->flags); -		clk = base->clk; +		base = lock_timer_base(timer, &flags); +		clk = base->clk;  		idx = calc_wheel_index(expires, clk);  		/* @@ -988,14 +994,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)  		 */  		if (idx == timer_get_idx(timer)) {  			timer->expires = expires; -			return 1; +			ret = 1; +			goto out_unlock;  		} +	} else { +		base = lock_timer_base(timer, &flags);  	}  	timer_stats_timer_set_start_info(timer); -	BUG_ON(!timer->function); - -	base = lock_timer_base(timer, &flags);  	ret = detach_if_pending(timer, base, false);  	if (!ret && pending_only) @@ -1025,12 +1031,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)  		}  	} +	/* Try to forward a stale timer base clock */ +	forward_timer_base(base); +  	timer->expires = expires;  	/*  	 * If 'idx' was calculated above and the base time did not advance -	 * between calculating 'idx' and taking the lock, only enqueue_timer() -	 * and trigger_dyntick_cpu() is required. Otherwise we need to -	 * (re)calculate the wheel index via internal_add_timer(). +	 * between calculating 'idx' and possibly switching the base, only +	 * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise +	 * we need to (re)calculate the wheel index via +	 * internal_add_timer().  	 */  	if (idx != UINT_MAX && clk == base->clk) {  		enqueue_timer(base, timer, idx); @@ -1510,12 +1520,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)  	is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);  	base->next_expiry = nextevt;  	/* -	 * We have a fresh next event. Check whether we can forward the base: +	 * We have a fresh next event. Check whether we can forward the +	 * base. We can only do that when @basej is past base->clk +	 * otherwise we might rewind base->clk.  	 
*/ -	if (time_after(nextevt, jiffies)) -		base->clk = jiffies; -	else if (time_after(nextevt, base->clk)) -		base->clk = nextevt; +	if (time_after(basej, base->clk)) { +		if (time_after(nextevt, basej)) +			base->clk = basej; +		else if (time_after(nextevt, base->clk)) +			base->clk = nextevt; +	}  	if (time_before_eq(nextevt, basej)) {  		expires = basem; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2050a7652a86..da87b3cba5b3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1862,6 +1862,10 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,  	/* Update rec->flags */  	do_for_each_ftrace_rec(pg, rec) { + +		if (rec->flags & FTRACE_FL_DISABLED) +			continue; +  		/* We need to update only differences of filter_hash */  		in_old = !!ftrace_lookup_ip(old_hash, rec->ip);  		in_new = !!ftrace_lookup_ip(new_hash, rec->ip); @@ -1884,6 +1888,10 @@ rollback:  	/* Roll back what we did above */  	do_for_each_ftrace_rec(pg, rec) { + +		if (rec->flags & FTRACE_FL_DISABLED) +			continue; +  		if (rec == end)  			goto err_out; @@ -2397,6 +2405,10 @@ void __weak ftrace_replace_code(int enable)  		return;  	do_for_each_ftrace_rec(pg, rec) { + +		if (rec->flags & FTRACE_FL_DISABLED) +			continue; +  		failed = __ftrace_replace_code(rec, enable);  		if (failed) {  			ftrace_bug(failed, rec); @@ -2763,7 +2775,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)  		struct dyn_ftrace *rec;  		do_for_each_ftrace_rec(pg, rec) { -			if (FTRACE_WARN_ON_ONCE(rec->flags)) +			if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED))  				pr_warn("  %pS flags:%lx\n",  					(void *)rec->ip, rec->flags);  		} while_for_each_ftrace_rec(); @@ -3598,6 +3610,10 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)  		goto out_unlock;  	do_for_each_ftrace_rec(pg, rec) { + +		if (rec->flags & FTRACE_FL_DISABLED) +			continue; +  		if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) {  			ret = enter_record(hash, rec, clear_filter);  			if (ret < 0) { @@ -3793,6 +3809,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  	do_for_each_ftrace_rec(pg, rec) { +		if (rec->flags & FTRACE_FL_DISABLED) +			continue; +  		if (!ftrace_match_record(rec, &func_g, NULL, 0))  			continue; @@ -4685,6 +4704,9 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)  	do_for_each_ftrace_rec(pg, rec) { +		if (rec->flags & FTRACE_FL_DISABLED) +			continue; +  		if (ftrace_match_record(rec, &func_g, NULL, 0)) {  			/* if it is in the array */  			exists = false; | 
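The kernel/bpf/verifier.c hunks above make adjust_reg_min_max_vals() skip arithmetic on a tracked bound that already sits at BPF_REGISTER_MIN_RANGE or BPF_REGISTER_MAX_RANGE, so an "unknown" bound stays unknown instead of being wrapped by further +/-. Below is a minimal userspace sketch of that rule only, not the verifier code itself; the struct, function names, and the INT64-based sentinel values are illustrative assumptions (the real kernel constants differ).

```c
#include <stdint.h>
#include <stdio.h>

#define REG_MIN_RANGE INT64_MIN		/* hypothetical sentinel, not the kernel value */
#define REG_MAX_RANGE INT64_MAX		/* hypothetical sentinel, not the kernel value */

struct range {
	int64_t min;
	int64_t max;
};

/* Mirrors the BPF_ADD case in the diff: a bound already at its sentinel is
 * treated as unknown and left alone rather than being shifted. */
static void range_add(struct range *dst, int64_t min_val, int64_t max_val)
{
	if (dst->min != REG_MIN_RANGE)
		dst->min += min_val;
	if (dst->max != REG_MAX_RANGE)
		dst->max += max_val;
}

int main(void)
{
	struct range r = { .min = REG_MIN_RANGE, .max = 100 };

	range_add(&r, 5, 5);
	printf("min %s, max %lld\n",
	       r.min == REG_MIN_RANGE ? "still unbounded" : "bounded",
	       (long long)r.max);
	return 0;
}
```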

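The diff also drops the zone-based bit_waitqueue() from kernel/sched/wait.c and re-adds it in kernel/sched/core.c on top of a static, cacheline-aligned table of 2^8 wait queue heads indexed by a hash of the word address and bit number. The sketch below shows only that lookup shape in plain C: the multiplicative hash and the int array are stand-ins for the kernel's hash_long() and wait_queue_head_t[], so treat it as an illustration of the bucketing, not as the kernel implementation.

```c
#include <stdio.h>
#include <stdint.h>

#define WAIT_TABLE_BITS 8
#define WAIT_TABLE_SIZE (1u << WAIT_TABLE_BITS)

static int bit_wait_table[WAIT_TABLE_SIZE];	/* stands in for wait_queue_head_t[] */

/* Crude 64-bit multiplicative hash standing in for the kernel's hash_long(). */
static unsigned int hash_long_sketch(unsigned long val, unsigned int bits)
{
	return (unsigned int)(((uint64_t)val * 0x9E3779B97F4A7C15ULL) >> (64 - bits));
}

static int *bit_waitqueue_sketch(void *word, int bit)
{
	const int shift = sizeof(long) == 4 ? 5 : 6;	/* log2(BITS_PER_LONG) */
	unsigned long val = (unsigned long)word << shift | bit;

	return bit_wait_table + hash_long_sketch(val, WAIT_TABLE_BITS);
}

int main(void)
{
	long word = 0;

	/* Same (word, bit) pair always maps to the same bucket. */
	printf("bucket index: %td\n",
	       bit_waitqueue_sketch(&word, 3) - bit_wait_table);
	return 0;
}
```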