Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/Makefile   |    3
-rw-r--r--  kernel/sched/clock.c    |    5
-rw-r--r--  kernel/sched/core.c     |  511
-rw-r--r--  kernel/sched/cpufreq.c  |   37
-rw-r--r--  kernel/sched/cputime.c  |   53
-rw-r--r--  kernel/sched/deadline.c |   66
-rw-r--r--  kernel/sched/debug.c    |  415
-rw-r--r--  kernel/sched/fair.c     |  323
-rw-r--r--  kernel/sched/idle.c     |    9
-rw-r--r--  kernel/sched/rt.c       |  116
-rw-r--r--  kernel/sched/sched.h    |  149
-rw-r--r--  kernel/sched/stats.h    |    8
-rw-r--r--  kernel/sched/swait.c    |  123
13 files changed, 1148 insertions, 670 deletions
| diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 67687973ce80..302d6ebd64f7 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -13,9 +13,10 @@ endif  obj-y += core.o loadavg.o clock.o cputime.o  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o completion.o idle.o +obj-y += wait.o swait.o completion.o idle.o  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o  obj-$(CONFIG_SCHEDSTATS) += stats.o  obj-$(CONFIG_SCHED_DEBUG) += debug.o  obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ) += cpufreq.o diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index bc54e84675da..fedb967a9841 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -61,6 +61,7 @@  #include <linux/static_key.h>  #include <linux/workqueue.h>  #include <linux/compiler.h> +#include <linux/tick.h>  /*   * Scheduler clock - returns current time in nanosec units. @@ -89,6 +90,8 @@ static void __set_sched_clock_stable(void)  {  	if (!sched_clock_stable())  		static_key_slow_inc(&__sched_clock_stable); + +	tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);  }  void set_sched_clock_stable(void) @@ -108,6 +111,8 @@ static void __clear_sched_clock_stable(struct work_struct *work)  	/* XXX worry about clock continuity */  	if (sched_clock_stable())  		static_key_slow_dec(&__sched_clock_stable); + +	tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);  }  static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9503d590e5ef..ea8f49ae0062 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@   *              Thomas Gleixner, Mike Kravetz   */ +#include <linux/kasan.h>  #include <linux/mm.h>  #include <linux/module.h>  #include <linux/nmi.h> @@ -66,12 +67,10 @@  #include <linux/pagemap.h>  #include <linux/hrtimer.h>  #include <linux/tick.h> -#include <linux/debugfs.h>  #include <linux/ctype.h>  #include <linux/ftrace.h>  #include <linux/slab.h>  #include <linux/init_task.h> -#include <linux/binfmts.h>  #include <linux/context_tracking.h>  #include <linux/compiler.h> @@ -124,138 +123,6 @@ const_debug unsigned int sysctl_sched_features =  #undef SCHED_FEAT -#ifdef CONFIG_SCHED_DEBUG -#define SCHED_FEAT(name, enabled)	\ -	#name , - -static const char * const sched_feat_names[] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static int sched_feat_show(struct seq_file *m, void *v) -{ -	int i; - -	for (i = 0; i < __SCHED_FEAT_NR; i++) { -		if (!(sysctl_sched_features & (1UL << i))) -			seq_puts(m, "NO_"); -		seq_printf(m, "%s ", sched_feat_names[i]); -	} -	seq_puts(m, "\n"); - -	return 0; -} - -#ifdef HAVE_JUMP_LABEL - -#define jump_label_key__true  STATIC_KEY_INIT_TRUE -#define jump_label_key__false STATIC_KEY_INIT_FALSE - -#define SCHED_FEAT(name, enabled)	\ -	jump_label_key__##enabled , - -struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static void sched_feat_disable(int i) -{ -	static_key_disable(&sched_feat_keys[i]); -} - -static void sched_feat_enable(int i) -{ -	static_key_enable(&sched_feat_keys[i]); -} -#else -static void sched_feat_disable(int i) { }; -static void sched_feat_enable(int i) { }; -#endif /* HAVE_JUMP_LABEL */ - -static int sched_feat_set(char *cmp) -{ -	int i; -	int neg = 0; - -	if (strncmp(cmp, "NO_", 3) == 0) { -		neg = 1; -		cmp += 3; -	} - -	for (i = 0; i < __SCHED_FEAT_NR; i++) { -		if (strcmp(cmp, sched_feat_names[i]) == 0) { -			if (neg) { -				
sysctl_sched_features &= ~(1UL << i); -				sched_feat_disable(i); -			} else { -				sysctl_sched_features |= (1UL << i); -				sched_feat_enable(i); -			} -			break; -		} -	} - -	return i; -} - -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, -		size_t cnt, loff_t *ppos) -{ -	char buf[64]; -	char *cmp; -	int i; -	struct inode *inode; - -	if (cnt > 63) -		cnt = 63; - -	if (copy_from_user(&buf, ubuf, cnt)) -		return -EFAULT; - -	buf[cnt] = 0; -	cmp = strstrip(buf); - -	/* Ensure the static_key remains in a consistent state */ -	inode = file_inode(filp); -	inode_lock(inode); -	i = sched_feat_set(cmp); -	inode_unlock(inode); -	if (i == __SCHED_FEAT_NR) -		return -EINVAL; - -	*ppos += cnt; - -	return cnt; -} - -static int sched_feat_open(struct inode *inode, struct file *filp) -{ -	return single_open(filp, sched_feat_show, NULL); -} - -static const struct file_operations sched_feat_fops = { -	.open		= sched_feat_open, -	.write		= sched_feat_write, -	.read		= seq_read, -	.llseek		= seq_lseek, -	.release	= single_release, -}; - -static __init int sched_init_debug(void) -{ -	debugfs_create_file("sched_features", 0644, NULL, NULL, -			&sched_feat_fops); - -	return 0; -} -late_initcall(sched_init_debug); -#endif /* CONFIG_SCHED_DEBUG */ -  /*   * Number of tasks to iterate in a single balance run.   * Limited because this is done with IRQs disabled. @@ -453,20 +320,6 @@ static inline void init_hrtick(void)  }  #endif	/* CONFIG_SCHED_HRTICK */ -/* - * cmpxchg based fetch_or, macro so it works for different integer types - */ -#define fetch_or(ptr, val)						\ -({	typeof(*(ptr)) __old, __val = *(ptr);				\ - 	for (;;) {							\ - 		__old = cmpxchg((ptr), __val, __val | (val));		\ - 		if (__old == __val)					\ - 			break;						\ - 		__val = __old;						\ - 	}								\ - 	__old;								\ -}) -  #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)  /*   * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, @@ -715,31 +568,36 @@ static inline bool got_nohz_idle_kick(void)  #endif /* CONFIG_NO_HZ_COMMON */  #ifdef CONFIG_NO_HZ_FULL -bool sched_can_stop_tick(void) +bool sched_can_stop_tick(struct rq *rq)  { +	int fifo_nr_running; + +	/* Deadline tasks, even if single, need the tick */ +	if (rq->dl.dl_nr_running) +		return false; +  	/* -	 * FIFO realtime policy runs the highest priority task. Other runnable -	 * tasks are of a lower priority. The scheduler tick does nothing. +	 * FIFO realtime policy runs the highest priority task (after DEADLINE). +	 * Other runnable tasks are of a lower priority. The scheduler tick +	 * isn't needed.  	 */ -	if (current->policy == SCHED_FIFO) +	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running; +	if (fifo_nr_running)  		return true;  	/*  	 * Round-robin realtime tasks time slice with other tasks at the same -	 * realtime priority. Is this task the only one at this priority? +	 * realtime priority.  	 */ -	if (current->policy == SCHED_RR) { -		struct sched_rt_entity *rt_se = ¤t->rt; - -		return list_is_singular(&rt_se->run_list); +	if (rq->rt.rr_nr_running) { +		if (rq->rt.rr_nr_running == 1) +			return true; +		else +			return false;  	} -	/* -	 * More than one running task need preemption. -	 * nr_running update is assumed to be visible -	 * after IPI is sent from wakers. 
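The hunk above deletes core.c's private fetch_or() macro, a cmpxchg()-based atomic fetch-then-OR (presumably now provided by a shared header instead of being local to core.c). A minimal userspace C11 sketch of the same retry loop, with illustrative names only:

#include <stdatomic.h>
#include <stdio.h>

/* Atomically OR `val` into `*p` and return the value that was there
 * before the OR, retrying the compare-and-swap until it wins the race;
 * this mirrors the cmpxchg() loop in the macro deleted above. */
static unsigned long fetch_or_ulong(_Atomic unsigned long *p, unsigned long val)
{
	unsigned long old = atomic_load(p);

	while (!atomic_compare_exchange_weak(p, &old, old | val))
		;	/* on failure, `old` is refreshed with the current value */

	return old;
}

int main(void)
{
	_Atomic unsigned long flags = 0x1;

	printf("previous: %#lx\n", fetch_or_ulong(&flags, 0x4));	/* 0x1 */
	printf("now:      %#lx\n", (unsigned long)atomic_load(&flags));	/* 0x5 */
	return 0;
}

C11 also offers atomic_fetch_or() directly; the explicit loop is only there to make the compare-and-swap retry visible.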
-	 */ -	if (this_rq()->nr_running > 1) +	/* Normal multitasking need periodic preemption checks */ +	if (rq->cfs.nr_running > 1)  		return false;  	return true; @@ -2093,7 +1951,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)  	ttwu_queue(p, cpu);  stat: -	ttwu_stat(p, cpu, wake_flags); +	if (schedstat_enabled()) +		ttwu_stat(p, cpu, wake_flags);  out:  	raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2141,7 +2000,8 @@ static void try_to_wake_up_local(struct task_struct *p)  		ttwu_activate(rq, p, ENQUEUE_WAKEUP);  	ttwu_do_wakeup(rq, p, 0); -	ttwu_stat(p, smp_processor_id(), 0); +	if (schedstat_enabled()) +		ttwu_stat(p, smp_processor_id(), 0);  out:  	raw_spin_unlock(&p->pi_lock);  } @@ -2183,7 +2043,6 @@ void __dl_clear_params(struct task_struct *p)  	dl_se->dl_bw = 0;  	dl_se->dl_throttled = 0; -	dl_se->dl_new = 1;  	dl_se->dl_yielded = 0;  } @@ -2210,6 +2069,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)  #endif  #ifdef CONFIG_SCHEDSTATS +	/* Even if schedstat is disabled, there should not be garbage */  	memset(&p->se.statistics, 0, sizeof(p->se.statistics));  #endif @@ -2218,6 +2078,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)  	__dl_clear_params(p);  	INIT_LIST_HEAD(&p->rt.run_list); +	p->rt.timeout		= 0; +	p->rt.time_slice	= sched_rr_timeslice; +	p->rt.on_rq		= 0; +	p->rt.on_list		= 0;  #ifdef CONFIG_PREEMPT_NOTIFIERS  	INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -2281,6 +2145,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,  #endif  #endif +DEFINE_STATIC_KEY_FALSE(sched_schedstats); + +#ifdef CONFIG_SCHEDSTATS +static void set_schedstats(bool enabled) +{ +	if (enabled) +		static_branch_enable(&sched_schedstats); +	else +		static_branch_disable(&sched_schedstats); +} + +void force_schedstat_enabled(void) +{ +	if (!schedstat_enabled()) { +		pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); +		static_branch_enable(&sched_schedstats); +	} +} + +static int __init setup_schedstats(char *str) +{ +	int ret = 0; +	if (!str) +		goto out; + +	if (!strcmp(str, "enable")) { +		set_schedstats(true); +		ret = 1; +	} else if (!strcmp(str, "disable")) { +		set_schedstats(false); +		ret = 1; +	} +out: +	if (!ret) +		pr_warn("Unable to parse schedstats=\n"); + +	return ret; +} +__setup("schedstats=", setup_schedstats); + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_schedstats(struct ctl_table *table, int write, +			 void __user *buffer, size_t *lenp, loff_t *ppos) +{ +	struct ctl_table t; +	int err; +	int state = static_branch_likely(&sched_schedstats); + +	if (write && !capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	t = *table; +	t.data = &state; +	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); +	if (err < 0) +		return err; +	if (write) +		set_schedstats(state); +	return err; +} +#endif +#endif +  /*   * fork()/clone()-time setup:   */ @@ -3010,16 +2937,6 @@ u64 scheduler_tick_max_deferment(void)  }  #endif -notrace unsigned long get_parent_ip(unsigned long addr) -{ -	if (in_lock_functions(addr)) { -		addr = CALLER_ADDR2; -		if (in_lock_functions(addr)) -			addr = CALLER_ADDR3; -	} -	return addr; -} -  #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \  				defined(CONFIG_PREEMPT_TRACER)) @@ -3041,7 +2958,7 @@ void preempt_count_add(int val)  				PREEMPT_MASK - 10);  #endif  	if (preempt_count() == val) { -		unsigned long ip = get_parent_ip(CALLER_ADDR1); +		unsigned long ip = get_lock_parent_ip();  #ifdef 
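The sched_schedstats key added in the hunk above relies on the jump-label API: while statistics are disabled, every schedstat_enabled() check is a patched-out no-op rather than a load plus branch. A minimal sketch of that pattern with illustrative names (the real schedstat_enabled() wrapper lives in kernel/sched/stats.h, which this hunk does not show, so treat the helper below as an assumption):

#include <linux/jump_label.h>
#include <linux/types.h>

struct example_stats {
	u64 total;
};

/* Illustrative feature flag that defaults to off. */
DEFINE_STATIC_KEY_FALSE(example_stats_key);

static inline bool example_stats_enabled(void)
{
	/* Compiles to a nop at each call site while the key is off;
	 * static_branch_enable() live-patches the jump back in. */
	return static_branch_unlikely(&example_stats_key);
}

static void example_account(struct example_stats *stats, u64 delta)
{
	if (!example_stats_enabled())
		return;
	stats->total += delta;
}

/* Toggling, e.g. from a sysctl handler as sysctl_schedstats() does above. */
static void example_set_enabled(bool on)
{
	if (on)
		static_branch_enable(&example_stats_key);
	else
		static_branch_disable(&example_stats_key);
}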
CONFIG_DEBUG_PREEMPT  		current->preempt_disable_ip = ip;  #endif @@ -3068,7 +2985,7 @@ void preempt_count_sub(int val)  #endif  	if (preempt_count() == val) -		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); +		trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());  	__preempt_count_sub(val);  }  EXPORT_SYMBOL(preempt_count_sub); @@ -3280,7 +3197,6 @@ static void __sched notrace __schedule(bool preempt)  		trace_sched_switch(preempt, prev, next);  		rq = context_switch(rq, prev, next); /* unlocks the rq */ -		cpu = cpu_of(rq);  	} else {  		lockdep_unpin_lock(&rq->lock);  		raw_spin_unlock_irq(&rq->lock); @@ -3466,7 +3382,7 @@ EXPORT_SYMBOL(default_wake_function);   */  void rt_mutex_setprio(struct task_struct *p, int prio)  { -	int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; +	int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;  	struct rq *rq;  	const struct sched_class *prev_class; @@ -3494,11 +3410,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  	trace_sched_pi_setprio(p, prio);  	oldprio = p->prio; + +	if (oldprio == prio) +		queue_flag &= ~DEQUEUE_MOVE; +  	prev_class = p->sched_class;  	queued = task_on_rq_queued(p);  	running = task_current(rq, p);  	if (queued) -		dequeue_task(rq, p, DEQUEUE_SAVE); +		dequeue_task(rq, p, queue_flag);  	if (running)  		put_prev_task(rq, p); @@ -3516,7 +3436,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  		if (!dl_prio(p->normal_prio) ||  		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {  			p->dl.dl_boosted = 1; -			enqueue_flag |= ENQUEUE_REPLENISH; +			queue_flag |= ENQUEUE_REPLENISH;  		} else  			p->dl.dl_boosted = 0;  		p->sched_class = &dl_sched_class; @@ -3524,7 +3444,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  		if (dl_prio(oldprio))  			p->dl.dl_boosted = 0;  		if (oldprio < prio) -			enqueue_flag |= ENQUEUE_HEAD; +			queue_flag |= ENQUEUE_HEAD;  		p->sched_class = &rt_sched_class;  	} else {  		if (dl_prio(oldprio)) @@ -3539,7 +3459,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  	if (running)  		p->sched_class->set_curr_task(rq);  	if (queued) -		enqueue_task(rq, p, enqueue_flag); +		enqueue_task(rq, p, queue_flag);  	check_class_changed(rq, p, prev_class, oldprio);  out_unlock: @@ -3895,6 +3815,7 @@ static int __sched_setscheduler(struct task_struct *p,  	const struct sched_class *prev_class;  	struct rq *rq;  	int reset_on_fork; +	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;  	/* may grab non-irq protected spin_locks */  	BUG_ON(in_interrupt()); @@ -4077,17 +3998,14 @@ change:  		 * itself.  		 */  		new_effective_prio = rt_mutex_get_effective_prio(p, newprio); -		if (new_effective_prio == oldprio) { -			__setscheduler_params(p, attr); -			task_rq_unlock(rq, p, &flags); -			return 0; -		} +		if (new_effective_prio == oldprio) +			queue_flags &= ~DEQUEUE_MOVE;  	}  	queued = task_on_rq_queued(p);  	running = task_current(rq, p);  	if (queued) -		dequeue_task(rq, p, DEQUEUE_SAVE); +		dequeue_task(rq, p, queue_flags);  	if (running)  		put_prev_task(rq, p); @@ -4097,15 +4015,14 @@ change:  	if (running)  		p->sched_class->set_curr_task(rq);  	if (queued) { -		int enqueue_flags = ENQUEUE_RESTORE;  		/*  		 * We enqueue to tail when the priority of a task is  		 * increased (user space view).  		 
*/ -		if (oldprio <= p->prio) -			enqueue_flags |= ENQUEUE_HEAD; +		if (oldprio < p->prio) +			queue_flags |= ENQUEUE_HEAD; -		enqueue_task(rq, p, enqueue_flags); +		enqueue_task(rq, p, queue_flags);  	}  	check_class_changed(rq, p, prev_class, oldprio); @@ -5096,6 +5013,8 @@ void init_idle(struct task_struct *idle, int cpu)  	idle->state = TASK_RUNNING;  	idle->se.exec_start = sched_clock(); +	kasan_unpoison_task_stack(idle); +  #ifdef CONFIG_SMP  	/*  	 * Its possible that init_idle() gets called multiple times on a task, @@ -5405,183 +5324,6 @@ static void migrate_tasks(struct rq *dead_rq)  }  #endif /* CONFIG_HOTPLUG_CPU */ -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) - -static struct ctl_table sd_ctl_dir[] = { -	{ -		.procname	= "sched_domain", -		.mode		= 0555, -	}, -	{} -}; - -static struct ctl_table sd_ctl_root[] = { -	{ -		.procname	= "kernel", -		.mode		= 0555, -		.child		= sd_ctl_dir, -	}, -	{} -}; - -static struct ctl_table *sd_alloc_ctl_entry(int n) -{ -	struct ctl_table *entry = -		kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - -	return entry; -} - -static void sd_free_ctl_entry(struct ctl_table **tablep) -{ -	struct ctl_table *entry; - -	/* -	 * In the intermediate directories, both the child directory and -	 * procname are dynamically allocated and could fail but the mode -	 * will always be set. In the lowest directory the names are -	 * static strings and all have proc handlers. -	 */ -	for (entry = *tablep; entry->mode; entry++) { -		if (entry->child) -			sd_free_ctl_entry(&entry->child); -		if (entry->proc_handler == NULL) -			kfree(entry->procname); -	} - -	kfree(*tablep); -	*tablep = NULL; -} - -static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX-1; - -static void -set_table_entry(struct ctl_table *entry, -		const char *procname, void *data, int maxlen, -		umode_t mode, proc_handler *proc_handler, -		bool load_idx) -{ -	entry->procname = procname; -	entry->data = data; -	entry->maxlen = maxlen; -	entry->mode = mode; -	entry->proc_handler = proc_handler; - -	if (load_idx) { -		entry->extra1 = &min_load_idx; -		entry->extra2 = &max_load_idx; -	} -} - -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ -	struct ctl_table *table = sd_alloc_ctl_entry(14); - -	if (table == NULL) -		return NULL; - -	set_table_entry(&table[0], "min_interval", &sd->min_interval, -		sizeof(long), 0644, proc_doulongvec_minmax, false); -	set_table_entry(&table[1], "max_interval", &sd->max_interval, -		sizeof(long), 0644, proc_doulongvec_minmax, false); -	set_table_entry(&table[2], "busy_idx", &sd->busy_idx, -		sizeof(int), 0644, proc_dointvec_minmax, true); -	set_table_entry(&table[3], "idle_idx", &sd->idle_idx, -		sizeof(int), 0644, proc_dointvec_minmax, true); -	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, -		sizeof(int), 0644, proc_dointvec_minmax, true); -	set_table_entry(&table[5], "wake_idx", &sd->wake_idx, -		sizeof(int), 0644, proc_dointvec_minmax, true); -	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, -		sizeof(int), 0644, proc_dointvec_minmax, true); -	set_table_entry(&table[7], "busy_factor", &sd->busy_factor, -		sizeof(int), 0644, proc_dointvec_minmax, false); -	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, -		sizeof(int), 0644, proc_dointvec_minmax, false); -	set_table_entry(&table[9], "cache_nice_tries", -		&sd->cache_nice_tries, -		sizeof(int), 0644, proc_dointvec_minmax, false); -	set_table_entry(&table[10], "flags", &sd->flags, -		sizeof(int), 0644, 
proc_dointvec_minmax, false); -	set_table_entry(&table[11], "max_newidle_lb_cost", -		&sd->max_newidle_lb_cost, -		sizeof(long), 0644, proc_doulongvec_minmax, false); -	set_table_entry(&table[12], "name", sd->name, -		CORENAME_MAX_SIZE, 0444, proc_dostring, false); -	/* &table[13] is terminator */ - -	return table; -} - -static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) -{ -	struct ctl_table *entry, *table; -	struct sched_domain *sd; -	int domain_num = 0, i; -	char buf[32]; - -	for_each_domain(cpu, sd) -		domain_num++; -	entry = table = sd_alloc_ctl_entry(domain_num + 1); -	if (table == NULL) -		return NULL; - -	i = 0; -	for_each_domain(cpu, sd) { -		snprintf(buf, 32, "domain%d", i); -		entry->procname = kstrdup(buf, GFP_KERNEL); -		entry->mode = 0555; -		entry->child = sd_alloc_ctl_domain_table(sd); -		entry++; -		i++; -	} -	return table; -} - -static struct ctl_table_header *sd_sysctl_header; -static void register_sched_domain_sysctl(void) -{ -	int i, cpu_num = num_possible_cpus(); -	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); -	char buf[32]; - -	WARN_ON(sd_ctl_dir[0].child); -	sd_ctl_dir[0].child = entry; - -	if (entry == NULL) -		return; - -	for_each_possible_cpu(i) { -		snprintf(buf, 32, "cpu%d", i); -		entry->procname = kstrdup(buf, GFP_KERNEL); -		entry->mode = 0555; -		entry->child = sd_alloc_ctl_cpu_table(i); -		entry++; -	} - -	WARN_ON(sd_sysctl_header); -	sd_sysctl_header = register_sysctl_table(sd_ctl_root); -} - -/* may be called multiple times per register */ -static void unregister_sched_domain_sysctl(void) -{ -	unregister_sysctl_table(sd_sysctl_header); -	sd_sysctl_header = NULL; -	if (sd_ctl_dir[0].child) -		sd_free_ctl_entry(&sd_ctl_dir[0].child); -} -#else -static void register_sched_domain_sysctl(void) -{ -} -static void unregister_sched_domain_sysctl(void) -{ -} -#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ -  static void set_rq_online(struct rq *rq)  {  	if (!rq->online) { @@ -5692,16 +5434,6 @@ static int sched_cpu_active(struct notifier_block *nfb,  		set_cpu_rq_start_time();  		return NOTIFY_OK; -	case CPU_ONLINE: -		/* -		 * At this point a starting CPU has marked itself as online via -		 * set_cpu_online(). But it might not yet have marked itself -		 * as active, which is essential from here on. 
-		 */ -		set_cpu_active(cpu, true); -		stop_machine_unpark(cpu); -		return NOTIFY_OK; -  	case CPU_DOWN_FAILED:  		set_cpu_active(cpu, true);  		return NOTIFY_OK; @@ -6173,11 +5905,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)  /* Setup the mask of cpus configured for isolated domains */  static int __init isolated_cpu_setup(char *str)  { +	int ret; +  	alloc_bootmem_cpumask_var(&cpu_isolated_map); -	cpulist_parse(str, cpu_isolated_map); +	ret = cpulist_parse(str, cpu_isolated_map); +	if (ret) { +		pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); +		return 0; +	}  	return 1;  } -  __setup("isolcpus=", isolated_cpu_setup);  struct s_data { @@ -7860,11 +7597,9 @@ void sched_destroy_group(struct task_group *tg)  void sched_offline_group(struct task_group *tg)  {  	unsigned long flags; -	int i;  	/* end participation in shares distribution */ -	for_each_possible_cpu(i) -		unregister_fair_sched_group(tg, i); +	unregister_fair_sched_group(tg);  	spin_lock_irqsave(&task_group_lock, flags);  	list_del_rcu(&tg->list); @@ -7890,7 +7625,7 @@ void sched_move_task(struct task_struct *tsk)  	queued = task_on_rq_queued(tsk);  	if (queued) -		dequeue_task(rq, tsk, DEQUEUE_SAVE); +		dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);  	if (unlikely(running))  		put_prev_task(rq, tsk); @@ -7914,7 +7649,7 @@ void sched_move_task(struct task_struct *tsk)  	if (unlikely(running))  		tsk->sched_class->set_curr_task(rq);  	if (queued) -		enqueue_task(rq, tsk, ENQUEUE_RESTORE); +		enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);  	task_rq_unlock(rq, tsk, &flags);  } diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c new file mode 100644 index 000000000000..928c4ba32f68 --- /dev/null +++ b/kernel/sched/cpufreq.c @@ -0,0 +1,37 @@ +/* + * Scheduler code and data structures related to cpufreq. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "sched.h" + +DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer. + * @cpu: The CPU to set the pointer for. + * @data: New pointer value. + * + * Set and publish the update_util_data pointer for the given CPU.  That pointer + * points to a struct update_util_data object containing a callback function + * to call from cpufreq_update_util().  That function will be called from an RCU + * read-side critical section, so it must not sleep. + * + * Callers must use RCU-sched callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_sched() + * right after this function to avoid use-after-free. 
+ */ +void cpufreq_set_update_util_data(int cpu, struct update_util_data *data) +{ +	if (WARN_ON(data && !data->func)) +		return; + +	rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); +} +EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b2ab2ffb1adc..75f98c5498d5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -262,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void)  #ifdef CONFIG_PARAVIRT  	if (static_key_false(¶virt_steal_enabled)) {  		u64 steal; -		cputime_t steal_ct; +		unsigned long steal_jiffies;  		steal = paravirt_steal_clock(smp_processor_id());  		steal -= this_rq()->prev_steal_time;  		/* -		 * cputime_t may be less precise than nsecs (eg: if it's -		 * based on jiffies). Lets cast the result to cputime +		 * steal is in nsecs but our caller is expecting steal +		 * time in jiffies. Lets cast the result to jiffies  		 * granularity and account the rest on the next rounds.  		 */ -		steal_ct = nsecs_to_cputime(steal); -		this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); +		steal_jiffies = nsecs_to_jiffies(steal); +		this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); -		account_steal_time(steal_ct); -		return steal_ct; +		account_steal_time(jiffies_to_cputime(steal_jiffies)); +		return steal_jiffies;  	}  #endif  	return false; @@ -668,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime  #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static unsigned long long vtime_delta(struct task_struct *tsk) +static cputime_t vtime_delta(struct task_struct *tsk)  { -	unsigned long long clock; +	unsigned long now = READ_ONCE(jiffies); -	clock = local_clock(); -	if (clock < tsk->vtime_snap) +	if (time_before(now, (unsigned long)tsk->vtime_snap))  		return 0; -	return clock - tsk->vtime_snap; +	return jiffies_to_cputime(now - tsk->vtime_snap);  }  static cputime_t get_vtime_delta(struct task_struct *tsk)  { -	unsigned long long delta = vtime_delta(tsk); +	unsigned long now = READ_ONCE(jiffies); +	unsigned long delta = now - tsk->vtime_snap;  	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); -	tsk->vtime_snap += delta; +	tsk->vtime_snap = now; -	/* CHECKME: always safe to convert nsecs to cputime? 
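Usage sketch for cpufreq_set_update_util_data() as introduced in the new kernel/sched/cpufreq.c above: a hypothetical governor publishes its callback at start and, on stop, clears the pointer and waits for an RCU-sched grace period before freeing, as the kernel-doc requires. The my_gov_* names, the embedded-struct layout and the callback signature are assumptions for illustration, not part of the patch:

#include <linux/slab.h>
#include "sched.h"	/* struct update_util_data, per the new file above */

struct my_gov_data {
	struct update_util_data update_util;	/* embeds the .func callback */
	/* ... governor-private state ... */
};

/* Assumed callback signature; runs in an RCU-sched read-side critical
 * section, so it must not sleep. */
static void my_gov_update(struct update_util_data *data, u64 time,
			  unsigned long util, unsigned long max)
{
	/* container_of(data, struct my_gov_data, update_util) and react
	 * to the new (util, max) sample for this CPU */
}

static void my_gov_start(struct my_gov_data *d, int cpu)
{
	d->update_util.func = my_gov_update;
	cpufreq_set_update_util_data(cpu, &d->update_util);
}

static void my_gov_stop(struct my_gov_data *d, int cpu)
{
	cpufreq_set_update_util_data(cpu, NULL);
	synchronize_sched();	/* no CPU can still be inside d->update_util.func */
	kfree(d);
}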
*/ -	return nsecs_to_cputime(delta); +	return jiffies_to_cputime(delta);  }  static void __vtime_account_system(struct task_struct *tsk) @@ -699,6 +698,9 @@ static void __vtime_account_system(struct task_struct *tsk)  void vtime_account_system(struct task_struct *tsk)  { +	if (!vtime_delta(tsk)) +		return; +  	write_seqcount_begin(&tsk->vtime_seqcount);  	__vtime_account_system(tsk);  	write_seqcount_end(&tsk->vtime_seqcount); @@ -707,7 +709,8 @@ void vtime_account_system(struct task_struct *tsk)  void vtime_gen_account_irq_exit(struct task_struct *tsk)  {  	write_seqcount_begin(&tsk->vtime_seqcount); -	__vtime_account_system(tsk); +	if (vtime_delta(tsk)) +		__vtime_account_system(tsk);  	if (context_tracking_in_user())  		tsk->vtime_snap_whence = VTIME_USER;  	write_seqcount_end(&tsk->vtime_seqcount); @@ -718,16 +721,19 @@ void vtime_account_user(struct task_struct *tsk)  	cputime_t delta_cpu;  	write_seqcount_begin(&tsk->vtime_seqcount); -	delta_cpu = get_vtime_delta(tsk);  	tsk->vtime_snap_whence = VTIME_SYS; -	account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); +	if (vtime_delta(tsk)) { +		delta_cpu = get_vtime_delta(tsk); +		account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); +	}  	write_seqcount_end(&tsk->vtime_seqcount);  }  void vtime_user_enter(struct task_struct *tsk)  {  	write_seqcount_begin(&tsk->vtime_seqcount); -	__vtime_account_system(tsk); +	if (vtime_delta(tsk)) +		__vtime_account_system(tsk);  	tsk->vtime_snap_whence = VTIME_USER;  	write_seqcount_end(&tsk->vtime_seqcount);  } @@ -742,7 +748,8 @@ void vtime_guest_enter(struct task_struct *tsk)  	 * that can thus safely catch up with a tickless delta.  	 */  	write_seqcount_begin(&tsk->vtime_seqcount); -	__vtime_account_system(tsk); +	if (vtime_delta(tsk)) +		__vtime_account_system(tsk);  	current->flags |= PF_VCPU;  	write_seqcount_end(&tsk->vtime_seqcount);  } @@ -772,7 +779,7 @@ void arch_vtime_task_switch(struct task_struct *prev)  	write_seqcount_begin(¤t->vtime_seqcount);  	current->vtime_snap_whence = VTIME_SYS; -	current->vtime_snap = sched_clock_cpu(smp_processor_id()); +	current->vtime_snap = jiffies;  	write_seqcount_end(¤t->vtime_seqcount);  } @@ -783,7 +790,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)  	local_irq_save(flags);  	write_seqcount_begin(&t->vtime_seqcount);  	t->vtime_snap_whence = VTIME_SYS; -	t->vtime_snap = sched_clock_cpu(cpu); +	t->vtime_snap = jiffies;  	write_seqcount_end(&t->vtime_seqcount);  	local_irq_restore(flags);  } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index cd64c979d0e1..affd97ec9f65 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -352,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,  	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);  	struct rq *rq = rq_of_dl_rq(dl_rq); -	WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); +	WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); + +	/* +	 * We are racing with the deadline timer. 
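In the cputime.c hunks above, steal time is now accounted at jiffies granularity, and prev_steal_time advances only by the nanosecond value of the jiffies actually accounted, so a sub-jiffy remainder is carried into the next round instead of being lost. A small standalone C sketch of that carry pattern (the 4 ms tick, i.e. HZ=250, is an assumption for the illustration):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_JIFFY 4000000ULL	/* assumed 4 ms tick (HZ=250) */

static uint64_t prev_accounted_ns;

/* Account whole jiffies of steal time; the remainder below one jiffy is
 * carried implicitly because prev_accounted_ns only advances by the
 * nanosecond value of the jiffies returned. */
static uint64_t account_steal(uint64_t steal_clock_ns)
{
	uint64_t delta = steal_clock_ns - prev_accounted_ns;
	uint64_t jiffies = delta / NSEC_PER_JIFFY;

	prev_accounted_ns += jiffies * NSEC_PER_JIFFY;
	return jiffies;
}

int main(void)
{
	/* 2.5 jiffies of steal arrive, then 2.5 more: the half jiffy left
	 * over from the first call is picked up by the second. */
	printf("%llu\n", (unsigned long long)account_steal(10000000ULL));	/* 2 */
	printf("%llu\n", (unsigned long long)account_steal(20000000ULL));	/* 3 */
	return 0;
}

The vtime changes in the same file go the same direction: vtime_snap is now raw jiffies, so deltas are whole ticks and are converted once with jiffies_to_cputime().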
So, do nothing because +	 * the deadline timer handler will take care of properly recharging +	 * the runtime and postponing the deadline +	 */ +	if (dl_se->dl_throttled) +		return;  	/*  	 * We use the regular wall clock time to set deadlines in the @@ -361,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,  	 */  	dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;  	dl_se->runtime = pi_se->dl_runtime; -	dl_se->dl_new = 0;  }  /* @@ -399,6 +406,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,  		dl_se->runtime = pi_se->dl_runtime;  	} +	if (dl_se->dl_yielded && dl_se->runtime > 0) +		dl_se->runtime = 0; +  	/*  	 * We keep moving the deadline away until we get some  	 * available runtime for the entity. This ensures correct @@ -420,7 +430,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,  	 * entity.  	 */  	if (dl_time_before(dl_se->deadline, rq_clock(rq))) { -		printk_deferred_once("sched: DL replenish lagged to much\n"); +		printk_deferred_once("sched: DL replenish lagged too much\n");  		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;  		dl_se->runtime = pi_se->dl_runtime;  	} @@ -500,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,  	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);  	struct rq *rq = rq_of_dl_rq(dl_rq); -	/* -	 * The arrival of a new instance needs special treatment, i.e., -	 * the actual scheduling parameters have to be "renewed". -	 */ -	if (dl_se->dl_new) { -		setup_new_dl_entity(dl_se, pi_se); -		return; -	} -  	if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||  	    dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {  		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; @@ -605,16 +606,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)  	}  	/* -	 * This is possible if switched_from_dl() raced against a running -	 * callback that took the above !dl_task() path and we've since then -	 * switched back into SCHED_DEADLINE. -	 * -	 * There's nothing to do except drop our task reference. -	 */ -	if (dl_se->dl_new) -		goto unlock; - -	/*  	 * The task might have been boosted by someone else and might be in the  	 * boosting/deboosting path, its not throttled.  	 */ @@ -726,6 +717,10 @@ static void update_curr_dl(struct rq *rq)  	if (!dl_task(curr) || !on_dl_rq(dl_se))  		return; +	/* Kick cpufreq (see the comment in linux/cpufreq.h). */ +	if (cpu_of(rq) == smp_processor_id()) +		cpufreq_trigger_update(rq_clock(rq)); +  	/*  	 * Consumed budget is computed considering the time as  	 * observed by schedulable tasks (excluding time spent @@ -735,8 +730,11 @@ static void update_curr_dl(struct rq *rq)  	 * approach need further study.  	 */  	delta_exec = rq_clock_task(rq) - curr->se.exec_start; -	if (unlikely((s64)delta_exec <= 0)) +	if (unlikely((s64)delta_exec <= 0)) { +		if (unlikely(dl_se->dl_yielded)) +			goto throttle;  		return; +	}  	schedstat_set(curr->se.statistics.exec_max,  		      max(curr->se.statistics.exec_max, delta_exec)); @@ -749,8 +747,10 @@ static void update_curr_dl(struct rq *rq)  	sched_rt_avg_update(rq, delta_exec); -	dl_se->runtime -= dl_se->dl_yielded ? 
0 : delta_exec; -	if (dl_runtime_exceeded(dl_se)) { +	dl_se->runtime -= delta_exec; + +throttle: +	if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {  		dl_se->dl_throttled = 1;  		__dequeue_task_dl(rq, curr, 0);  		if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) @@ -917,7 +917,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,  	 * parameters of the task might need updating. Otherwise,  	 * we want a replenishment of its runtime.  	 */ -	if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) +	if (flags & ENQUEUE_WAKEUP)  		update_dl_entity(dl_se, pi_se);  	else if (flags & ENQUEUE_REPLENISH)  		replenish_dl_entity(dl_se, pi_se); @@ -994,18 +994,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)   */  static void yield_task_dl(struct rq *rq)  { -	struct task_struct *p = rq->curr; -  	/*  	 * We make the task go to sleep until its current deadline by  	 * forcing its runtime to zero. This way, update_curr_dl() stops  	 * it and the bandwidth timer will wake it up and will give it  	 * new scheduling parameters (thanks to dl_yielded=1).  	 */ -	if (p->dl.runtime > 0) { -		rq->curr->dl.dl_yielded = 1; -		p->dl.runtime = 0; -	} +	rq->curr->dl.dl_yielded = 1; +  	update_rq_clock(rq);  	update_curr_dl(rq);  	/* @@ -1722,6 +1718,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)   */  static void switched_to_dl(struct rq *rq, struct task_struct *p)  { +	if (dl_time_before(p->dl.deadline, rq_clock(rq))) +		setup_new_dl_entity(&p->dl, &p->dl); +  	if (task_on_rq_queued(p) && rq->curr != p) {  #ifdef CONFIG_SMP  		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) @@ -1768,8 +1767,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,  		 */  		resched_curr(rq);  #endif /* CONFIG_SMP */ -	} else -		switched_to_dl(rq, p); +	}  }  const struct sched_class dl_sched_class = { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 641511771ae6..4fbc3bd5ff60 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -16,6 +16,7 @@  #include <linux/kallsyms.h>  #include <linux/utsname.h>  #include <linux/mempolicy.h> +#include <linux/debugfs.h>  #include "sched.h" @@ -58,6 +59,309 @@ static unsigned long nsec_low(unsigned long long nsec)  #define SPLIT_NS(x) nsec_high(x), nsec_low(x) +#define SCHED_FEAT(name, enabled)	\ +	#name , + +static const char * const sched_feat_names[] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static int sched_feat_show(struct seq_file *m, void *v) +{ +	int i; + +	for (i = 0; i < __SCHED_FEAT_NR; i++) { +		if (!(sysctl_sched_features & (1UL << i))) +			seq_puts(m, "NO_"); +		seq_printf(m, "%s ", sched_feat_names[i]); +	} +	seq_puts(m, "\n"); + +	return 0; +} + +#ifdef HAVE_JUMP_LABEL + +#define jump_label_key__true  STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE + +#define SCHED_FEAT(name, enabled)	\ +	jump_label_key__##enabled , + +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static void sched_feat_disable(int i) +{ +	static_key_disable(&sched_feat_keys[i]); +} + +static void sched_feat_enable(int i) +{ +	static_key_enable(&sched_feat_keys[i]); +} +#else +static void sched_feat_disable(int i) { }; +static void sched_feat_enable(int i) { }; +#endif /* HAVE_JUMP_LABEL */ + +static int sched_feat_set(char *cmp) +{ +	int i; +	int neg = 0; + +	if (strncmp(cmp, "NO_", 3) == 0) { +		neg = 1; +		cmp += 3; +	} + +	for (i = 0; i < __SCHED_FEAT_NR; i++) { +		if (strcmp(cmp, sched_feat_names[i]) == 0) { +			
if (neg) { +				sysctl_sched_features &= ~(1UL << i); +				sched_feat_disable(i); +			} else { +				sysctl_sched_features |= (1UL << i); +				sched_feat_enable(i); +			} +			break; +		} +	} + +	return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, +		size_t cnt, loff_t *ppos) +{ +	char buf[64]; +	char *cmp; +	int i; +	struct inode *inode; + +	if (cnt > 63) +		cnt = 63; + +	if (copy_from_user(&buf, ubuf, cnt)) +		return -EFAULT; + +	buf[cnt] = 0; +	cmp = strstrip(buf); + +	/* Ensure the static_key remains in a consistent state */ +	inode = file_inode(filp); +	inode_lock(inode); +	i = sched_feat_set(cmp); +	inode_unlock(inode); +	if (i == __SCHED_FEAT_NR) +		return -EINVAL; + +	*ppos += cnt; + +	return cnt; +} + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ +	return single_open(filp, sched_feat_show, NULL); +} + +static const struct file_operations sched_feat_fops = { +	.open		= sched_feat_open, +	.write		= sched_feat_write, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= single_release, +}; + +static __init int sched_init_debug(void) +{ +	debugfs_create_file("sched_features", 0644, NULL, NULL, +			&sched_feat_fops); + +	return 0; +} +late_initcall(sched_init_debug); + +#ifdef CONFIG_SMP + +#ifdef CONFIG_SYSCTL + +static struct ctl_table sd_ctl_dir[] = { +	{ +		.procname	= "sched_domain", +		.mode		= 0555, +	}, +	{} +}; + +static struct ctl_table sd_ctl_root[] = { +	{ +		.procname	= "kernel", +		.mode		= 0555, +		.child		= sd_ctl_dir, +	}, +	{} +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ +	struct ctl_table *entry = +		kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + +	return entry; +} + +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ +	struct ctl_table *entry; + +	/* +	 * In the intermediate directories, both the child directory and +	 * procname are dynamically allocated and could fail but the mode +	 * will always be set. In the lowest directory the names are +	 * static strings and all have proc handlers. 
+	 */ +	for (entry = *tablep; entry->mode; entry++) { +		if (entry->child) +			sd_free_ctl_entry(&entry->child); +		if (entry->proc_handler == NULL) +			kfree(entry->procname); +	} + +	kfree(*tablep); +	*tablep = NULL; +} + +static int min_load_idx = 0; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; + +static void +set_table_entry(struct ctl_table *entry, +		const char *procname, void *data, int maxlen, +		umode_t mode, proc_handler *proc_handler, +		bool load_idx) +{ +	entry->procname = procname; +	entry->data = data; +	entry->maxlen = maxlen; +	entry->mode = mode; +	entry->proc_handler = proc_handler; + +	if (load_idx) { +		entry->extra1 = &min_load_idx; +		entry->extra2 = &max_load_idx; +	} +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ +	struct ctl_table *table = sd_alloc_ctl_entry(14); + +	if (table == NULL) +		return NULL; + +	set_table_entry(&table[0], "min_interval", &sd->min_interval, +		sizeof(long), 0644, proc_doulongvec_minmax, false); +	set_table_entry(&table[1], "max_interval", &sd->max_interval, +		sizeof(long), 0644, proc_doulongvec_minmax, false); +	set_table_entry(&table[2], "busy_idx", &sd->busy_idx, +		sizeof(int), 0644, proc_dointvec_minmax, true); +	set_table_entry(&table[3], "idle_idx", &sd->idle_idx, +		sizeof(int), 0644, proc_dointvec_minmax, true); +	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, +		sizeof(int), 0644, proc_dointvec_minmax, true); +	set_table_entry(&table[5], "wake_idx", &sd->wake_idx, +		sizeof(int), 0644, proc_dointvec_minmax, true); +	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, +		sizeof(int), 0644, proc_dointvec_minmax, true); +	set_table_entry(&table[7], "busy_factor", &sd->busy_factor, +		sizeof(int), 0644, proc_dointvec_minmax, false); +	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, +		sizeof(int), 0644, proc_dointvec_minmax, false); +	set_table_entry(&table[9], "cache_nice_tries", +		&sd->cache_nice_tries, +		sizeof(int), 0644, proc_dointvec_minmax, false); +	set_table_entry(&table[10], "flags", &sd->flags, +		sizeof(int), 0644, proc_dointvec_minmax, false); +	set_table_entry(&table[11], "max_newidle_lb_cost", +		&sd->max_newidle_lb_cost, +		sizeof(long), 0644, proc_doulongvec_minmax, false); +	set_table_entry(&table[12], "name", sd->name, +		CORENAME_MAX_SIZE, 0444, proc_dostring, false); +	/* &table[13] is terminator */ + +	return table; +} + +static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ +	struct ctl_table *entry, *table; +	struct sched_domain *sd; +	int domain_num = 0, i; +	char buf[32]; + +	for_each_domain(cpu, sd) +		domain_num++; +	entry = table = sd_alloc_ctl_entry(domain_num + 1); +	if (table == NULL) +		return NULL; + +	i = 0; +	for_each_domain(cpu, sd) { +		snprintf(buf, 32, "domain%d", i); +		entry->procname = kstrdup(buf, GFP_KERNEL); +		entry->mode = 0555; +		entry->child = sd_alloc_ctl_domain_table(sd); +		entry++; +		i++; +	} +	return table; +} + +static struct ctl_table_header *sd_sysctl_header; +void register_sched_domain_sysctl(void) +{ +	int i, cpu_num = num_possible_cpus(); +	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); +	char buf[32]; + +	WARN_ON(sd_ctl_dir[0].child); +	sd_ctl_dir[0].child = entry; + +	if (entry == NULL) +		return; + +	for_each_possible_cpu(i) { +		snprintf(buf, 32, "cpu%d", i); +		entry->procname = kstrdup(buf, GFP_KERNEL); +		entry->mode = 0555; +		entry->child = sd_alloc_ctl_cpu_table(i); +		entry++; +	} + +	WARN_ON(sd_sysctl_header); +	sd_sysctl_header = 
register_sysctl_table(sd_ctl_root); +} + +/* may be called multiple times per register */ +void unregister_sched_domain_sysctl(void) +{ +	unregister_sysctl_table(sd_sysctl_header); +	sd_sysctl_header = NULL; +	if (sd_ctl_dir[0].child) +		sd_free_ctl_entry(&sd_ctl_dir[0].child); +} +#endif /* CONFIG_SYSCTL */ +#endif /* CONFIG_SMP */ +  #ifdef CONFIG_FAIR_GROUP_SCHED  static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)  { @@ -75,16 +379,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group  	PN(se->vruntime);  	PN(se->sum_exec_runtime);  #ifdef CONFIG_SCHEDSTATS -	PN(se->statistics.wait_start); -	PN(se->statistics.sleep_start); -	PN(se->statistics.block_start); -	PN(se->statistics.sleep_max); -	PN(se->statistics.block_max); -	PN(se->statistics.exec_max); -	PN(se->statistics.slice_max); -	PN(se->statistics.wait_max); -	PN(se->statistics.wait_sum); -	P(se->statistics.wait_count); +	if (schedstat_enabled()) { +		PN(se->statistics.wait_start); +		PN(se->statistics.sleep_start); +		PN(se->statistics.block_start); +		PN(se->statistics.sleep_max); +		PN(se->statistics.block_max); +		PN(se->statistics.exec_max); +		PN(se->statistics.slice_max); +		PN(se->statistics.wait_max); +		PN(se->statistics.wait_sum); +		P(se->statistics.wait_count); +	}  #endif  	P(se->load.weight);  #ifdef CONFIG_SMP @@ -122,10 +428,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)  		(long long)(p->nvcsw + p->nivcsw),  		p->prio);  #ifdef CONFIG_SCHEDSTATS -	SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", -		SPLIT_NS(p->se.statistics.wait_sum), -		SPLIT_NS(p->se.sum_exec_runtime), -		SPLIT_NS(p->se.statistics.sum_sleep_runtime)); +	if (schedstat_enabled()) { +		SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", +			SPLIT_NS(p->se.statistics.wait_sum), +			SPLIT_NS(p->se.sum_exec_runtime), +			SPLIT_NS(p->se.statistics.sum_sleep_runtime)); +	}  #else  	SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",  		0LL, 0L, @@ -258,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)  void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)  { +	struct dl_bw *dl_bw; +  	SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);  	SEQ_printf(m, "  .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); +#ifdef CONFIG_SMP +	dl_bw = &cpu_rq(cpu)->rd->dl_bw; +#else +	dl_bw = &dl_rq->dl_bw; +#endif +	SEQ_printf(m, "  .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); +	SEQ_printf(m, "  .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);  }  extern __read_mostly int sched_clock_running; @@ -313,17 +630,18 @@ do {									\  #define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);  #define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n); -	P(yld_count); - -	P(sched_count); -	P(sched_goidle);  #ifdef CONFIG_SMP  	P64(avg_idle);  	P64(max_idle_balance_cost);  #endif -	P(ttwu_count); -	P(ttwu_local); +	if (schedstat_enabled()) { +		P(yld_count); +		P(sched_count); +		P(sched_goidle); +		P(ttwu_count); +		P(ttwu_local); +	}  #undef P  #undef P64 @@ -569,38 +887,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)  	nr_switches = p->nvcsw + p->nivcsw;  #ifdef CONFIG_SCHEDSTATS -	PN(se.statistics.sum_sleep_runtime); -	PN(se.statistics.wait_start); -	PN(se.statistics.sleep_start); -	PN(se.statistics.block_start); -	PN(se.statistics.sleep_max); -	PN(se.statistics.block_max); -	PN(se.statistics.exec_max); -	PN(se.statistics.slice_max); -	PN(se.statistics.wait_max); -	PN(se.statistics.wait_sum); -	P(se.statistics.wait_count); 
-	PN(se.statistics.iowait_sum); -	P(se.statistics.iowait_count);  	P(se.nr_migrations); -	P(se.statistics.nr_migrations_cold); -	P(se.statistics.nr_failed_migrations_affine); -	P(se.statistics.nr_failed_migrations_running); -	P(se.statistics.nr_failed_migrations_hot); -	P(se.statistics.nr_forced_migrations); -	P(se.statistics.nr_wakeups); -	P(se.statistics.nr_wakeups_sync); -	P(se.statistics.nr_wakeups_migrate); -	P(se.statistics.nr_wakeups_local); -	P(se.statistics.nr_wakeups_remote); -	P(se.statistics.nr_wakeups_affine); -	P(se.statistics.nr_wakeups_affine_attempts); -	P(se.statistics.nr_wakeups_passive); -	P(se.statistics.nr_wakeups_idle); -	{ +	if (schedstat_enabled()) {  		u64 avg_atom, avg_per_cpu; +		PN(se.statistics.sum_sleep_runtime); +		PN(se.statistics.wait_start); +		PN(se.statistics.sleep_start); +		PN(se.statistics.block_start); +		PN(se.statistics.sleep_max); +		PN(se.statistics.block_max); +		PN(se.statistics.exec_max); +		PN(se.statistics.slice_max); +		PN(se.statistics.wait_max); +		PN(se.statistics.wait_sum); +		P(se.statistics.wait_count); +		PN(se.statistics.iowait_sum); +		P(se.statistics.iowait_count); +		P(se.statistics.nr_migrations_cold); +		P(se.statistics.nr_failed_migrations_affine); +		P(se.statistics.nr_failed_migrations_running); +		P(se.statistics.nr_failed_migrations_hot); +		P(se.statistics.nr_forced_migrations); +		P(se.statistics.nr_wakeups); +		P(se.statistics.nr_wakeups_sync); +		P(se.statistics.nr_wakeups_migrate); +		P(se.statistics.nr_wakeups_local); +		P(se.statistics.nr_wakeups_remote); +		P(se.statistics.nr_wakeups_affine); +		P(se.statistics.nr_wakeups_affine_attempts); +		P(se.statistics.nr_wakeups_passive); +		P(se.statistics.nr_wakeups_idle); +  		avg_atom = p->se.sum_exec_runtime;  		if (nr_switches)  			avg_atom = div64_ul(avg_atom, nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56b7d4b83947..46d64e4ccfde 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,8 +20,8 @@   *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra   */ -#include <linux/latencytop.h>  #include <linux/sched.h> +#include <linux/latencytop.h>  #include <linux/cpumask.h>  #include <linux/cpuidle.h>  #include <linux/slab.h> @@ -755,7 +755,9 @@ static void  update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)  {  	struct task_struct *p; -	u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; +	u64 delta; + +	delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;  	if (entity_is_task(se)) {  		p = task_of(se); @@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)  	se->statistics.wait_sum += delta;  	se->statistics.wait_start = 0;  } -#else -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} -#endif  /*   * Task is being enqueued - update stats:   */ -static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)  {  	/*  	 * Are we enqueueing a waiting task? 
(for current tasks @@ -802,7 +794,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)  }  static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  {  	/*  	 * Mark the end of the wait period if dequeueing a @@ -810,8 +802,41 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)  	 */  	if (se != cfs_rq->curr)  		update_stats_wait_end(cfs_rq, se); + +	if (flags & DEQUEUE_SLEEP) { +		if (entity_is_task(se)) { +			struct task_struct *tsk = task_of(se); + +			if (tsk->state & TASK_INTERRUPTIBLE) +				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); +			if (tsk->state & TASK_UNINTERRUPTIBLE) +				se->statistics.block_start = rq_clock(rq_of(cfs_rq)); +		} +	} + +} +#else +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{  } +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ +} +#endif +  /*   * We are picking a new current task - update its stats:   */ @@ -907,10 +932,11 @@ struct numa_group {  	spinlock_t lock; /* nr_tasks, tasks */  	int nr_tasks;  	pid_t gid; +	int active_nodes;  	struct rcu_head rcu; -	nodemask_t active_nodes;  	unsigned long total_faults; +	unsigned long max_faults_cpu;  	/*  	 * Faults_cpu is used to decide whether memory should move  	 * towards the CPU. As a consequence, these stats are weighted @@ -969,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)  		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];  } +/* + * A node triggering more than 1/3 as many NUMA faults as the maximum is + * considered part of a numa group's pseudo-interleaving set. Migrations + * between these nodes are slowed down, to allow things to settle down. + */ +#define ACTIVE_NODE_FRACTION 3 + +static bool numa_is_active_node(int nid, struct numa_group *ng) +{ +	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; +} +  /* Handle placement on systems where not all nodes are directly connected. */  static unsigned long score_nearby_nodes(struct task_struct *p, int nid,  					int maxdist, bool task) @@ -1118,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,  		return true;  	/* -	 * Do not migrate if the destination is not a node that -	 * is actively used by this numa group. -	 */ -	if (!node_isset(dst_nid, ng->active_nodes)) -		return false; - -	/* -	 * Source is a node that is not actively used by this -	 * numa group, while the destination is. Migrate. +	 * Destination node is much more heavily used than the source +	 * node? Allow migration.  	 */ -	if (!node_isset(src_nid, ng->active_nodes)) +	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) * +					ACTIVE_NODE_FRACTION)  		return true;  	/* -	 * Both source and destination are nodes in active -	 * use by this numa group. Maximize memory bandwidth -	 * by migrating from more heavily used groups, to less -	 * heavily used ones, spreading the load around. -	 * Use a 1/4 hysteresis to avoid spurious page movement. 
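The numa_is_active_node() helper introduced above replaces the old per-group active_nodes bitmask: a node now counts as active when it triggers more than 1/ACTIVE_NODE_FRACTION (one third) of the NUMA faults seen on the busiest node. A standalone sketch of that test, written as a multiply so no integer division or rounding loss is involved; the names are illustrative:

#include <stdbool.h>
#include <stdio.h>

#define ACTIVE_NODE_FRACTION 3

/* Active if node_faults > max_faults / ACTIVE_NODE_FRACTION, expressed
 * without division: node_faults * ACTIVE_NODE_FRACTION > max_faults. */
static bool node_is_active(unsigned long node_faults, unsigned long max_faults)
{
	return node_faults * ACTIVE_NODE_FRACTION > max_faults;
}

int main(void)
{
	unsigned long max = 900;

	printf("%d\n", node_is_active(301, max));	/* 903 > 900 -> 1 */
	printf("%d\n", node_is_active(300, max));	/* 900 > 900 -> 0 */
	return 0;
}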
+	 * Distribute memory according to CPU & memory use on each node, +	 * with 3/4 hysteresis to avoid unnecessary memory migrations: +	 * +	 * faults_cpu(dst)   3   faults_cpu(src) +	 * --------------- * - > --------------- +	 * faults_mem(dst)   4   faults_mem(src)  	 */ -	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); +	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 > +	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;  }  static unsigned long weighted_cpuload(const int cpu); @@ -1484,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p)  		.best_task = NULL,  		.best_imp = 0, -		.best_cpu = -1 +		.best_cpu = -1,  	};  	struct sched_domain *sd;  	unsigned long taskweight, groupweight; @@ -1536,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p)  	 *   multiple NUMA nodes; in order to better consolidate the group,  	 *   we need to check other locations.  	 */ -	if (env.best_cpu == -1 || (p->numa_group && -			nodes_weight(p->numa_group->active_nodes) > 1)) { +	if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {  		for_each_online_node(nid) {  			if (nid == env.src_nid || nid == p->numa_preferred_nid)  				continue; @@ -1572,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p)  	 * trying for a better one later. Do not set the preferred node here.  	 */  	if (p->numa_group) { +		struct numa_group *ng = p->numa_group; +  		if (env.best_cpu == -1)  			nid = env.src_nid;  		else  			nid = env.dst_nid; -		if (node_isset(nid, p->numa_group->active_nodes)) +		if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))  			sched_setnuma(p, env.dst_nid);  	} @@ -1627,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p)  }  /* - * Find the nodes on which the workload is actively running. We do this by + * Find out how many nodes on the workload is actively running on. Do this by   * tracking the nodes from which NUMA hinting faults are triggered. This can   * be different from the set of nodes where the workload's memory is currently   * located. - * - * The bitmask is used to make smarter decisions on when to do NUMA page - * migrations, To prevent flip-flopping, and excessive page migrations, nodes - * are added when they cause over 6/16 of the maximum number of faults, but - * only removed when they drop below 3/16.   
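The rewritten migration test above keeps the 3/4-hysteresis comparison in integer arithmetic by cross-multiplying the two fault ratios instead of dividing them, which also avoids dividing by a node with zero memory faults. A tiny standalone check of the rearranged inequality, with illustrative names:

#include <stdbool.h>
#include <stdio.h>

/*
 * faults_cpu(dst)   3   faults_cpu(src)
 * --------------- * - > ---------------
 * faults_mem(dst)   4   faults_mem(src)
 *
 * rearranged as: cpu_dst * mem_src * 3 > cpu_src * mem_dst * 4
 */
static bool prefer_dst(unsigned long cpu_dst, unsigned long mem_dst,
		       unsigned long cpu_src, unsigned long mem_src)
{
	return cpu_dst * mem_src * 3 > cpu_src * mem_dst * 4;
}

int main(void)
{
	/* dst ratio 40/10 = 4.0, scaled by 3/4 -> 3.0; src ratio 2.0 */
	printf("%d\n", prefer_dst(40, 10, 20, 10));	/* 1: migrate */
	/* dst ratio 24/10 = 2.4, scaled by 3/4 -> 1.8; src ratio 2.0 */
	printf("%d\n", prefer_dst(24, 10, 20, 10));	/* 0: stay */
	return 0;
}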
*/ -static void update_numa_active_node_mask(struct numa_group *numa_group) +static void numa_group_count_active_nodes(struct numa_group *numa_group)  {  	unsigned long faults, max_faults = 0; -	int nid; +	int nid, active_nodes = 0;  	for_each_online_node(nid) {  		faults = group_faults_cpu(numa_group, nid); @@ -1650,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)  	for_each_online_node(nid) {  		faults = group_faults_cpu(numa_group, nid); -		if (!node_isset(nid, numa_group->active_nodes)) { -			if (faults > max_faults * 6 / 16) -				node_set(nid, numa_group->active_nodes); -		} else if (faults < max_faults * 3 / 16) -			node_clear(nid, numa_group->active_nodes); +		if (faults * ACTIVE_NODE_FRACTION > max_faults) +			active_nodes++;  	} + +	numa_group->max_faults_cpu = max_faults; +	numa_group->active_nodes = active_nodes;  }  /* @@ -1946,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p)  	update_task_scan_period(p, fault_types[0], fault_types[1]);  	if (p->numa_group) { -		update_numa_active_node_mask(p->numa_group); +		numa_group_count_active_nodes(p->numa_group);  		spin_unlock_irq(group_lock);  		max_nid = preferred_group_nid(p, max_group_nid);  	} @@ -1990,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,  			return;  		atomic_set(&grp->refcount, 1); +		grp->active_nodes = 1; +		grp->max_faults_cpu = 0;  		spin_lock_init(&grp->lock);  		grp->gid = p->pid;  		/* Second half of the array tracks nids where faults happen */  		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *  						nr_node_ids; -		node_set(task_node(current), grp->active_nodes); -  		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)  			grp->faults[i] = p->numa_faults[i]; @@ -2111,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)  	bool migrated = flags & TNF_MIGRATED;  	int cpu_node = task_node(current);  	int local = !!(flags & TNF_FAULT_LOCAL); +	struct numa_group *ng;  	int priv;  	if (!static_branch_likely(&sched_numa_balancing)) @@ -2151,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)  	 * actively using should be counted as local. This allows the  	 * scan rate to slow down when a workload has settled down.  	 
*/ -	if (!priv && !local && p->numa_group && -			node_isset(cpu_node, p->numa_group->active_nodes) && -			node_isset(mem_node, p->numa_group->active_nodes)) +	ng = p->numa_group; +	if (!priv && !local && ng && ng->active_nodes > 1 && +				numa_is_active_node(cpu_node, ng) && +				numa_is_active_node(mem_node, ng))  		local = 1;  	task_numa_placement(p); @@ -2824,7 +2856,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)  {  	struct cfs_rq *cfs_rq = cfs_rq_of(se);  	u64 now = cfs_rq_clock_task(cfs_rq); -	int cpu = cpu_of(rq_of(cfs_rq)); +	struct rq *rq = rq_of(cfs_rq); +	int cpu = cpu_of(rq);  	/*  	 * Track task load average for carrying it to new CPU after migrated, and @@ -2836,6 +2869,29 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)  	if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)  		update_tg_load_avg(cfs_rq, 0); + +	if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { +		unsigned long max = rq->cpu_capacity_orig; + +		/* +		 * There are a few boundary cases this might miss but it should +		 * get called often enough that that should (hopefully) not be +		 * a real problem -- added to that it only calls on the local +		 * CPU, so if we enqueue remotely we'll miss an update, but +		 * the next tick/schedule should update. +		 * +		 * It will not get called when we go idle, because the idle +		 * thread is a different class (!fair), nor will the utilization +		 * number include things like RT tasks. +		 * +		 * As is, the util number is not freq-invariant (we'd have to +		 * implement arch_scale_freq_capacity() for that). +		 * +		 * See cpu_util(). +		 */ +		cpufreq_update_util(rq_clock(rq), +				    min(cfs_rq->avg.util_avg, max), max); +	}  }  static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -3102,6 +3158,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)  static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +static inline void check_schedstat_required(void) +{ +#ifdef CONFIG_SCHEDSTATS +	if (schedstat_enabled()) +		return; + +	/* Force schedstat enabled if a dependent tracepoint is active */ +	if (trace_sched_stat_wait_enabled()    || +			trace_sched_stat_sleep_enabled()   || +			trace_sched_stat_iowait_enabled()  || +			trace_sched_stat_blocked_enabled() || +			trace_sched_stat_runtime_enabled())  { +		pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " +			     "stat_blocked and stat_runtime require the " +			     "kernel parameter schedstats=enabled or " +			     "kernel.sched_schedstats=1\n"); +	} +#endif +} +  static void  enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  { @@ -3122,11 +3198,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  	if (flags & ENQUEUE_WAKEUP) {  		place_entity(cfs_rq, se, 0); -		enqueue_sleeper(cfs_rq, se); +		if (schedstat_enabled()) +			enqueue_sleeper(cfs_rq, se);  	} -	update_stats_enqueue(cfs_rq, se); -	check_spread(cfs_rq, se); +	check_schedstat_required(); +	if (schedstat_enabled()) { +		update_stats_enqueue(cfs_rq, se); +		check_spread(cfs_rq, se); +	}  	if (se != cfs_rq->curr)  		__enqueue_entity(cfs_rq, se);  	se->on_rq = 1; @@ -3193,19 +3273,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  	update_curr(cfs_rq);  	dequeue_entity_load_avg(cfs_rq, se); -	update_stats_dequeue(cfs_rq, se); -	if (flags & DEQUEUE_SLEEP) { -#ifdef CONFIG_SCHEDSTATS -		if (entity_is_task(se)) { -			struct task_struct *tsk = 
task_of(se); - -			if (tsk->state & TASK_INTERRUPTIBLE) -				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); -			if (tsk->state & TASK_UNINTERRUPTIBLE) -				se->statistics.block_start = rq_clock(rq_of(cfs_rq)); -		} -#endif -	} +	if (schedstat_enabled()) +		update_stats_dequeue(cfs_rq, se, flags);  	clear_buddies(cfs_rq, se); @@ -3279,7 +3348,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)  		 * a CPU. So account for the time it spent waiting on the  		 * runqueue.  		 */ -		update_stats_wait_end(cfs_rq, se); +		if (schedstat_enabled()) +			update_stats_wait_end(cfs_rq, se);  		__dequeue_entity(cfs_rq, se);  		update_load_avg(se, 1);  	} @@ -3292,7 +3362,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)  	 * least twice that of our own weight (i.e. dont track it  	 * when there are only lesser-weight tasks around):  	 */ -	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { +	if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {  		se->statistics.slice_max = max(se->statistics.slice_max,  			se->sum_exec_runtime - se->prev_sum_exec_runtime);  	} @@ -3375,9 +3445,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)  	/* throttle cfs_rqs exceeding runtime */  	check_cfs_rq_runtime(cfs_rq); -	check_spread(cfs_rq, prev); +	if (schedstat_enabled()) { +		check_spread(cfs_rq, prev); +		if (prev->on_rq) +			update_stats_wait_start(cfs_rq, prev); +	} +  	if (prev->on_rq) { -		update_stats_wait_start(cfs_rq, prev);  		/* Put 'current' back into the tree. */  		__enqueue_entity(cfs_rq, prev);  		/* in !on_rq case, update occurred at dequeue */ @@ -4459,9 +4533,17 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,  		/* scale is effectively 1 << i now, and >> i divides by scale */ -		old_load = this_rq->cpu_load[i] - tickless_load; +		old_load = this_rq->cpu_load[i];  		old_load = decay_load_missed(old_load, pending_updates - 1, i); -		old_load += tickless_load; +		if (tickless_load) { +			old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); +			/* +			 * old_load can never be a negative value because a +			 * decayed tickless_load cannot be greater than the +			 * original tickless_load. +			 */ +			old_load += tickless_load; +		}  		new_load = this_load;  		/*  		 * Round up the averaging division if load is increasing. This @@ -4484,6 +4566,25 @@ static unsigned long weighted_cpuload(const int cpu)  }  #ifdef CONFIG_NO_HZ_COMMON +static void __update_cpu_load_nohz(struct rq *this_rq, +				   unsigned long curr_jiffies, +				   unsigned long load, +				   int active) +{ +	unsigned long pending_updates; + +	pending_updates = curr_jiffies - this_rq->last_load_update_tick; +	if (pending_updates) { +		this_rq->last_load_update_tick = curr_jiffies; +		/* +		 * In the regular NOHZ case, we were idle, this means load 0. +		 * In the NOHZ_FULL case, we were non-idle, we should consider +		 * its weighted load. +		 */ +		__update_cpu_load(this_rq, load, pending_updates, active); +	} +} +  /*   * There is no sane way to deal with nohz on smp when using jiffies because the   * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading @@ -4501,22 +4602,15 @@ static unsigned long weighted_cpuload(const int cpu)   * Called from nohz_idle_balance() to update the load ratings before doing the   * idle balance.   
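[Worked example for the __update_cpu_load() rewrite above: the old code fed cpu_load[i] - tickless_load into decay_load_missed(), a difference that can wrap around since both values are unsigned. Because the decay is essentially a multiplication, decaying the two terms separately gives the same answer when tickless_load is small, and because a decayed value can never exceed the original, adding the full tickless_load back keeps the result in range. A standalone sketch with a made-up halve-per-missed-tick decay (the real factors come from a degrade table):]

#include <stdio.h>

/* Toy stand-in for decay_load_missed(): halve the load per missed tick. */
static unsigned long toy_decay(unsigned long load, unsigned long missed)
{
        while (missed--)
                load /= 2;
        return load;
}

static unsigned long old_scheme(unsigned long load, unsigned long tickless,
                                unsigned long missed)
{
        /* decay(load - tickless) + tickless: wraps when tickless > load */
        return toy_decay(load - tickless, missed) + tickless;
}

static unsigned long new_scheme(unsigned long load, unsigned long tickless,
                                unsigned long missed)
{
        /* decay(load) - decay(tickless) + tickless: final value stays sane */
        return toy_decay(load, missed) - toy_decay(tickless, missed) + tickless;
}

int main(void)
{
        /* tickless <= load: both schemes agree (175 here) */
        printf("old=%lu new=%lu\n", old_scheme(400, 100, 2), new_scheme(400, 100, 2));

        /* tickless > load: the old scheme underflows, the new one yields 250 */
        printf("old=%lu new=%lu\n", old_scheme(100, 300, 2), new_scheme(100, 300, 2));
        return 0;
}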
*/ -static void update_idle_cpu_load(struct rq *this_rq) +static void update_cpu_load_idle(struct rq *this_rq)  { -	unsigned long curr_jiffies = READ_ONCE(jiffies); -	unsigned long load = weighted_cpuload(cpu_of(this_rq)); -	unsigned long pending_updates; -  	/*  	 * bail if there's load or we're actually up-to-date.  	 */ -	if (load || curr_jiffies == this_rq->last_load_update_tick) +	if (weighted_cpuload(cpu_of(this_rq)))  		return; -	pending_updates = curr_jiffies - this_rq->last_load_update_tick; -	this_rq->last_load_update_tick = curr_jiffies; - -	__update_cpu_load(this_rq, load, pending_updates, 0); +	__update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0);  }  /* @@ -4527,22 +4621,12 @@ void update_cpu_load_nohz(int active)  	struct rq *this_rq = this_rq();  	unsigned long curr_jiffies = READ_ONCE(jiffies);  	unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; -	unsigned long pending_updates;  	if (curr_jiffies == this_rq->last_load_update_tick)  		return;  	raw_spin_lock(&this_rq->lock); -	pending_updates = curr_jiffies - this_rq->last_load_update_tick; -	if (pending_updates) { -		this_rq->last_load_update_tick = curr_jiffies; -		/* -		 * In the regular NOHZ case, we were idle, this means load 0. -		 * In the NOHZ_FULL case, we were non-idle, we should consider -		 * its weighted load. -		 */ -		__update_cpu_load(this_rq, load, pending_updates, active); -	} +	__update_cpu_load_nohz(this_rq, curr_jiffies, load, active);  	raw_spin_unlock(&this_rq->lock);  }  #endif /* CONFIG_NO_HZ */ @@ -4554,7 +4638,7 @@ void update_cpu_load_active(struct rq *this_rq)  {  	unsigned long load = weighted_cpuload(cpu_of(this_rq));  	/* -	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). +	 * See the mess around update_cpu_load_idle() / update_cpu_load_nohz().  	 */  	this_rq->last_load_update_tick = jiffies;  	__update_cpu_load(this_rq, load, 1, 1); @@ -7848,7 +7932,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)  		if (time_after_eq(jiffies, rq->next_balance)) {  			raw_spin_lock_irq(&rq->lock);  			update_rq_clock(rq); -			update_idle_cpu_load(rq); +			update_cpu_load_idle(rq);  			raw_spin_unlock_irq(&rq->lock);  			rebalance_domains(rq, CPU_IDLE);  		} @@ -8234,11 +8318,8 @@ void free_fair_sched_group(struct task_group *tg)  	for_each_possible_cpu(i) {  		if (tg->cfs_rq)  			kfree(tg->cfs_rq[i]); -		if (tg->se) { -			if (tg->se[i]) -				remove_entity_load_avg(tg->se[i]); +		if (tg->se)  			kfree(tg->se[i]); -		}  	}  	kfree(tg->cfs_rq); @@ -8286,21 +8367,29 @@ err:  	return 0;  } -void unregister_fair_sched_group(struct task_group *tg, int cpu) +void unregister_fair_sched_group(struct task_group *tg)  { -	struct rq *rq = cpu_rq(cpu);  	unsigned long flags; +	struct rq *rq; +	int cpu; -	/* -	* Only empty task groups can be destroyed; so we can speculatively -	* check on_list without danger of it being re-added. -	*/ -	if (!tg->cfs_rq[cpu]->on_list) -		return; +	for_each_possible_cpu(cpu) { +		if (tg->se[cpu]) +			remove_entity_load_avg(tg->se[cpu]); -	raw_spin_lock_irqsave(&rq->lock, flags); -	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); -	raw_spin_unlock_irqrestore(&rq->lock, flags); +		/* +		 * Only empty task groups can be destroyed; so we can speculatively +		 * check on_list without danger of it being re-added. 
+		 */ +		if (!tg->cfs_rq[cpu]->on_list) +			continue; + +		rq = cpu_rq(cpu); + +		raw_spin_lock_irqsave(&rq->lock, flags); +		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); +		raw_spin_unlock_irqrestore(&rq->lock, flags); +	}  }  void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, @@ -8382,7 +8471,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)  	return 1;  } -void unregister_fair_sched_group(struct task_group *tg, int cpu) { } +void unregister_fair_sched_group(struct task_group *tg) { }  #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 544a7133cbd1..bd12c6c714ec 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -4,6 +4,7 @@  #include <linux/sched.h>  #include <linux/cpu.h>  #include <linux/cpuidle.h> +#include <linux/cpuhotplug.h>  #include <linux/tick.h>  #include <linux/mm.h>  #include <linux/stackprotector.h> @@ -193,8 +194,6 @@ exit_idle:  	rcu_idle_exit();  } -DEFINE_PER_CPU(bool, cpu_dead_idle); -  /*   * Generic idle loop implementation   * @@ -221,10 +220,7 @@ static void cpu_idle_loop(void)  			rmb();  			if (cpu_is_offline(smp_processor_id())) { -				rcu_cpu_notify(NULL, CPU_DYING_IDLE, -					       (void *)(long)smp_processor_id()); -				smp_mb(); /* all activity before dead. */ -				this_cpu_write(cpu_dead_idle, true); +				cpuhp_report_idle_dead();  				arch_cpu_idle_dead();  			} @@ -291,5 +287,6 @@ void cpu_startup_entry(enum cpuhp_state state)  	boot_init_stack_canary();  #endif  	arch_cpu_idle_prepare(); +	cpuhp_online_idle(state);  	cpu_idle_loop();  } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86abe0ea1..c41ea7ac1764 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)  	raw_spin_lock(&rt_b->rt_runtime_lock);  	if (!rt_b->rt_period_active) {  		rt_b->rt_period_active = 1; -		hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); +		/* +		 * SCHED_DEADLINE updates the bandwidth, as a run away +		 * RT task with a DL task could hog a CPU. But DL does +		 * not reset the period. If a deadline task was running +		 * without an RT task running, it can cause RT tasks to +		 * throttle when they start up. Kick the timer right away +		 * to update the period. 
+		 */ +		hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));  		hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);  	}  	raw_spin_unlock(&rt_b->rt_runtime_lock); @@ -436,7 +444,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq);  static inline int on_rt_rq(struct sched_rt_entity *rt_se)  { -	return !list_empty(&rt_se->run_list); +	return rt_se->on_rq;  }  #ifdef CONFIG_RT_GROUP_SCHED @@ -482,8 +490,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)  	return rt_se->my_q;  } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); -static void dequeue_rt_entity(struct sched_rt_entity *rt_se); +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);  static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)  { @@ -499,7 +507,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)  		if (!rt_se)  			enqueue_top_rt_rq(rt_rq);  		else if (!on_rt_rq(rt_se)) -			enqueue_rt_entity(rt_se, false); +			enqueue_rt_entity(rt_se, 0);  		if (rt_rq->highest_prio.curr < curr->prio)  			resched_curr(rq); @@ -516,7 +524,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)  	if (!rt_se)  		dequeue_top_rt_rq(rt_rq);  	else if (on_rt_rq(rt_se)) -		dequeue_rt_entity(rt_se); +		dequeue_rt_entity(rt_se, 0);  }  static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -945,6 +953,10 @@ static void update_curr_rt(struct rq *rq)  	if (curr->sched_class != &rt_sched_class)  		return; +	/* Kick cpufreq (see the comment in linux/cpufreq.h). */ +	if (cpu_of(rq) == smp_processor_id()) +		cpufreq_trigger_update(rq_clock(rq)); +  	delta_exec = rq_clock_task(rq) - curr->se.exec_start;  	if (unlikely((s64)delta_exec <= 0))  		return; @@ -1142,12 +1154,27 @@ unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)  }  static inline +unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se) +{ +	struct rt_rq *group_rq = group_rt_rq(rt_se); +	struct task_struct *tsk; + +	if (group_rq) +		return group_rq->rr_nr_running; + +	tsk = rt_task_of(rt_se); + +	return (tsk->policy == SCHED_RR) ? 
1 : 0; +} + +static inline  void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)  {  	int prio = rt_se_prio(rt_se);  	WARN_ON(!rt_prio(prio));  	rt_rq->rt_nr_running += rt_se_nr_running(rt_se); +	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);  	inc_rt_prio(rt_rq, prio);  	inc_rt_migration(rt_se, rt_rq); @@ -1160,13 +1187,37 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)  	WARN_ON(!rt_prio(rt_se_prio(rt_se)));  	WARN_ON(!rt_rq->rt_nr_running);  	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); +	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);  	dec_rt_prio(rt_rq, rt_se_prio(rt_se));  	dec_rt_migration(rt_se, rt_rq);  	dec_rt_group(rt_se, rt_rq);  } -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +/* + * Change rt_se->run_list location unless SAVE && !MOVE + * + * assumes ENQUEUE/DEQUEUE flags match + */ +static inline bool move_entity(unsigned int flags) +{ +	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) +		return false; + +	return true; +} + +static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) +{ +	list_del_init(&rt_se->run_list); + +	if (list_empty(array->queue + rt_se_prio(rt_se))) +		__clear_bit(rt_se_prio(rt_se), array->bitmap); + +	rt_se->on_list = 0; +} + +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)  {  	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);  	struct rt_prio_array *array = &rt_rq->active; @@ -1179,26 +1230,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)  	 * get throttled and the current group doesn't have any other  	 * active members.  	 */ -	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) +	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { +		if (rt_se->on_list) +			__delist_rt_entity(rt_se, array);  		return; +	} -	if (head) -		list_add(&rt_se->run_list, queue); -	else -		list_add_tail(&rt_se->run_list, queue); -	__set_bit(rt_se_prio(rt_se), array->bitmap); +	if (move_entity(flags)) { +		WARN_ON_ONCE(rt_se->on_list); +		if (flags & ENQUEUE_HEAD) +			list_add(&rt_se->run_list, queue); +		else +			list_add_tail(&rt_se->run_list, queue); + +		__set_bit(rt_se_prio(rt_se), array->bitmap); +		rt_se->on_list = 1; +	} +	rt_se->on_rq = 1;  	inc_rt_tasks(rt_se, rt_rq);  } -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)  {  	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);  	struct rt_prio_array *array = &rt_rq->active; -	list_del_init(&rt_se->run_list); -	if (list_empty(array->queue + rt_se_prio(rt_se))) -		__clear_bit(rt_se_prio(rt_se), array->bitmap); +	if (move_entity(flags)) { +		WARN_ON_ONCE(!rt_se->on_list); +		__delist_rt_entity(rt_se, array); +	} +	rt_se->on_rq = 0;  	dec_rt_tasks(rt_se, rt_rq);  } @@ -1207,7 +1269,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)   * Because the prio of an upper entry depends on the lower   * entries, we must remove entries top - down.   
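[The move_entity() helper above is compact enough that its truth table is easy to miss: the only flag combination that leaves rt_se->run_list untouched is SAVE without MOVE, i.e. a spurious dequeue/enqueue pair that only wants the entity in a known state. A small standalone sketch, using the flag values added to sched.h later in this patch:]

#include <stdio.h>
#include <stdbool.h>

#define DEQUEUE_SLEEP   0x01
#define DEQUEUE_SAVE    0x02    /* matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE    0x04    /* matches ENQUEUE_MOVE */

/* Same test as move_entity() in rt.c: relink unless SAVE && !MOVE. */
static bool move_entity(unsigned int flags)
{
        return (flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE;
}

int main(void)
{
        unsigned int cases[] = {
                0,                              /* plain dequeue        -> relink */
                DEQUEUE_SLEEP,                  /* task going to sleep  -> relink */
                DEQUEUE_SAVE,                   /* save only            -> keep   */
                DEQUEUE_SAVE | DEQUEUE_MOVE,    /* save and move        -> relink */
        };

        for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
                printf("flags=%#x -> %s\n", cases[i],
                       move_entity(cases[i]) ? "relink run_list" : "leave in place");
        return 0;
}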
*/ -static void dequeue_rt_stack(struct sched_rt_entity *rt_se) +static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)  {  	struct sched_rt_entity *back = NULL; @@ -1220,31 +1282,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)  	for (rt_se = back; rt_se; rt_se = rt_se->back) {  		if (on_rt_rq(rt_se)) -			__dequeue_rt_entity(rt_se); +			__dequeue_rt_entity(rt_se, flags);  	}  } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)  {  	struct rq *rq = rq_of_rt_se(rt_se); -	dequeue_rt_stack(rt_se); +	dequeue_rt_stack(rt_se, flags);  	for_each_sched_rt_entity(rt_se) -		__enqueue_rt_entity(rt_se, head); +		__enqueue_rt_entity(rt_se, flags);  	enqueue_top_rt_rq(&rq->rt);  } -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)  {  	struct rq *rq = rq_of_rt_se(rt_se); -	dequeue_rt_stack(rt_se); +	dequeue_rt_stack(rt_se, flags);  	for_each_sched_rt_entity(rt_se) {  		struct rt_rq *rt_rq = group_rt_rq(rt_se);  		if (rt_rq && rt_rq->rt_nr_running) -			__enqueue_rt_entity(rt_se, false); +			__enqueue_rt_entity(rt_se, flags);  	}  	enqueue_top_rt_rq(&rq->rt);  } @@ -1260,7 +1322,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)  	if (flags & ENQUEUE_WAKEUP)  		rt_se->timeout = 0; -	enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); +	enqueue_rt_entity(rt_se, flags);  	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)  		enqueue_pushable_task(rq, p); @@ -1271,7 +1333,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)  	struct sched_rt_entity *rt_se = &p->rt;  	update_curr_rt(rq); -	dequeue_rt_entity(rt_se); +	dequeue_rt_entity(rt_se, flags);  	dequeue_pushable_task(rq, p);  } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 10f16374df7f..382848a24ed9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3,6 +3,7 @@  #include <linux/sched/sysctl.h>  #include <linux/sched/rt.h>  #include <linux/sched/deadline.h> +#include <linux/binfmts.h>  #include <linux/mutex.h>  #include <linux/spinlock.h>  #include <linux/stop_machine.h> @@ -313,12 +314,11 @@ extern int tg_nop(struct task_group *tg, void *data);  extern void free_fair_sched_group(struct task_group *tg);  extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); -extern void unregister_fair_sched_group(struct task_group *tg, int cpu); +extern void unregister_fair_sched_group(struct task_group *tg);  extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,  			struct sched_entity *se, int cpu,  			struct sched_entity *parent);  extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); -extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);  extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);  extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); @@ -450,6 +450,7 @@ static inline int rt_bandwidth_enabled(void)  struct rt_rq {  	struct rt_prio_array active;  	unsigned int rt_nr_running; +	unsigned int rr_nr_running;  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED  	struct {  		int curr; /* highest queued rt task prio */ @@ -909,6 +910,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group)  extern int group_balance_cpu(struct sched_group *sg); +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) +void 
register_sched_domain_sysctl(void); +void unregister_sched_domain_sysctl(void); +#else +static inline void register_sched_domain_sysctl(void) +{ +} +static inline void unregister_sched_domain_sysctl(void) +{ +} +#endif +  #else  static inline void sched_ttwu_pending(void) { } @@ -1022,6 +1035,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];  #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */  extern struct static_key_false sched_numa_balancing; +extern struct static_key_false sched_schedstats;  static inline u64 global_rt_period(void)  { @@ -1130,18 +1144,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)  extern const int sched_prio_to_weight[40];  extern const u32 sched_prio_to_wmult[40]; +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP  - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + *                are in a known state which allows modification. Such pairs + *                should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + *        in the runqueue. + * + * ENQUEUE_HEAD      - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_WAKING    - sched_class::task_waking was called + * + */ + +#define DEQUEUE_SLEEP		0x01 +#define DEQUEUE_SAVE		0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE		0x04 /* matches ENQUEUE_MOVE */ +  #define ENQUEUE_WAKEUP		0x01 -#define ENQUEUE_HEAD		0x02 +#define ENQUEUE_RESTORE		0x02 +#define ENQUEUE_MOVE		0x04 + +#define ENQUEUE_HEAD		0x08 +#define ENQUEUE_REPLENISH	0x10  #ifdef CONFIG_SMP -#define ENQUEUE_WAKING		0x04	/* sched_class::task_waking was called */ +#define ENQUEUE_WAKING		0x20  #else  #define ENQUEUE_WAKING		0x00  #endif -#define ENQUEUE_REPLENISH	0x08 -#define ENQUEUE_RESTORE	0x10 - -#define DEQUEUE_SLEEP		0x01 -#define DEQUEUE_SAVE		0x02  #define RETRY_TASK		((void *)-1UL) @@ -1278,6 +1314,35 @@ unsigned long to_ratio(u64 period, u64 runtime);  extern void init_entity_runnable_average(struct sched_entity *se); +#ifdef CONFIG_NO_HZ_FULL +extern bool sched_can_stop_tick(struct rq *rq); + +/* + * Tick may be needed by tasks in the runqueue depending on their policy and + * requirements. If tick is needed, lets send the target an IPI to kick it out of + * nohz mode if necessary. + */ +static inline void sched_update_tick_dependency(struct rq *rq) +{ +	int cpu; + +	if (!tick_nohz_full_enabled()) +		return; + +	cpu = cpu_of(rq); + +	if (!tick_nohz_full_cpu(cpu)) +		return; + +	if (sched_can_stop_tick(rq)) +		tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); +	else +		tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); +} +#else +static inline void sched_update_tick_dependency(struct rq *rq) { } +#endif +  static inline void add_nr_running(struct rq *rq, unsigned count)  {  	unsigned prev_nr = rq->nr_running; @@ -1289,26 +1354,16 @@ static inline void add_nr_running(struct rq *rq, unsigned count)  		if (!rq->rd->overload)  			rq->rd->overload = true;  #endif - -#ifdef CONFIG_NO_HZ_FULL -		if (tick_nohz_full_cpu(rq->cpu)) { -			/* -			 * Tick is needed if more than one task runs on a CPU. -			 * Send the target an IPI to kick it out of nohz mode. -			 * -			 * We assume that IPI implies full memory barrier and the -			 * new value of rq->nr_running is visible on reception -			 * from the target. 
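[One detail of the reworked flag block above that the "matches" comments only hint at: DEQUEUE_SAVE and DEQUEUE_MOVE are given the same bit values as ENQUEUE_RESTORE and ENQUEUE_MOVE, which is what lets helpers such as move_entity() in rt.c ("assumes ENQUEUE/DEQUEUE flags match") test DEQUEUE_* names against enqueue-side flags. A compile-time check of that invariant, with the values copied from the hunk:]

/* Values as introduced in kernel/sched/sched.h by this patch. */
#define DEQUEUE_SLEEP           0x01
#define DEQUEUE_SAVE            0x02
#define DEQUEUE_MOVE            0x04

#define ENQUEUE_WAKEUP          0x01
#define ENQUEUE_RESTORE         0x02
#define ENQUEUE_MOVE            0x04
#define ENQUEUE_HEAD            0x08
#define ENQUEUE_REPLENISH       0x10

/* The cross-class helpers rely on these bits lining up. */
_Static_assert(DEQUEUE_SAVE == ENQUEUE_RESTORE, "SAVE/RESTORE must share a bit");
_Static_assert(DEQUEUE_MOVE == ENQUEUE_MOVE, "MOVE must share a bit");

int main(void) { return 0; }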
-			 */ -			tick_nohz_full_kick_cpu(rq->cpu); -		} -#endif  	} + +	sched_update_tick_dependency(rq);  }  static inline void sub_nr_running(struct rq *rq, unsigned count)  {  	rq->nr_running -= count; +	/* Check if we still need preemption */ +	sched_update_tick_dependency(rq);  }  static inline void rq_last_tick_reset(struct rq *rq) @@ -1738,3 +1793,51 @@ static inline u64 irq_time_read(int cpu)  }  #endif /* CONFIG_64BIT */  #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @time: Current time. + * @util: Current utilization. + * @max: Utilization ceiling. + * + * This function is called by the scheduler on every invocation of + * update_load_avg() on the CPU whose utilization is being updated. + * + * It can only be called from RCU-sched read-side critical sections. + */ +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) +{ +       struct update_util_data *data; + +       data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); +       if (data) +               data->func(data, time, util, max); +} + +/** + * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. + * @time: Current time. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid.  Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. 
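[For readers wondering how the per-CPU cpufreq_update_util_data pointer above gets populated: a governor fills in a struct update_util_data whose func callback receives the (time, util, max) triple and installs it for each CPU it manages. The registration helper lives in the new kernel/sched/cpufreq.c, which is not reproduced in this excerpt, so the cpufreq_set_update_util_data() call below is an assumption about that file; only the data->func(data, time, util, max) shape is taken from cpufreq_update_util() above. A minimal governor-side sketch, ignoring CPU hotplug for brevity:]

#include <linux/sched.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(struct update_util_data, demo_update_util);

static void demo_util_hook(struct update_util_data *data, u64 time,
                           unsigned long util, unsigned long max)
{
        /*
         * Runs from scheduler context inside an RCU-sched read-side
         * section: must not sleep.  Record util/max and defer the actual
         * frequency change to a context that may block.
         */
}

static void demo_register(void)
{
        int cpu;

        for_each_online_cpu(cpu) {
                struct update_util_data *data = &per_cpu(demo_update_util, cpu);

                data->func = demo_util_hook;
                cpufreq_set_update_util_data(cpu, data);        /* assumed helper */
        }
}

static void demo_unregister(void)
{
        int cpu;

        for_each_online_cpu(cpu)
                cpufreq_set_update_util_data(cpu, NULL);
        synchronize_sched();    /* wait for in-flight callbacks to finish */
}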
+ */ +static inline void cpufreq_trigger_update(u64 time) +{ +	cpufreq_update_util(time, ULONG_MAX, 0); +} +#else +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} +static inline void cpufreq_trigger_update(u64 time) {} +#endif /* CONFIG_CPU_FREQ */ diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index b0fbc7632de5..70b3b6a20fb0 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)  	if (rq)  		rq->rq_sched_info.run_delay += delta;  } -# define schedstat_inc(rq, field)	do { (rq)->field++; } while (0) -# define schedstat_add(rq, field, amt)	do { (rq)->field += (amt); } while (0) -# define schedstat_set(var, val)	do { var = (val); } while (0) +# define schedstat_enabled()		static_branch_unlikely(&sched_schedstats) +# define schedstat_inc(rq, field)	do { if (schedstat_enabled()) { (rq)->field++; } } while (0) +# define schedstat_add(rq, field, amt)	do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) +# define schedstat_set(var, val)	do { if (schedstat_enabled()) { var = (val); } } while (0)  #else /* !CONFIG_SCHEDSTATS */  static inline void  rq_sched_info_arrive(struct rq *rq, unsigned long long delta) @@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)  static inline void  rq_sched_info_depart(struct rq *rq, unsigned long long delta)  {} +# define schedstat_enabled()		0  # define schedstat_inc(rq, field)	do { } while (0)  # define schedstat_add(rq, field, amt)	do { } while (0)  # define schedstat_set(var, val)	do { } while (0) diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c new file mode 100644 index 000000000000..82f0dff90030 --- /dev/null +++ b/kernel/sched/swait.c @@ -0,0 +1,123 @@ +#include <linux/sched.h> +#include <linux/swait.h> + +void __init_swait_queue_head(struct swait_queue_head *q, const char *name, +			     struct lock_class_key *key) +{ +	raw_spin_lock_init(&q->lock); +	lockdep_set_class_and_name(&q->lock, key, name); +	INIT_LIST_HEAD(&q->task_list); +} +EXPORT_SYMBOL(__init_swait_queue_head); + +/* + * The thing about the wake_up_state() return value; I think we can ignore it. + * + * If for some reason it would return 0, that means the previously waiting + * task is already running, so it will observe condition true (or has already). + */ +void swake_up_locked(struct swait_queue_head *q) +{ +	struct swait_queue *curr; + +	if (list_empty(&q->task_list)) +		return; + +	curr = list_first_entry(&q->task_list, typeof(*curr), task_list); +	wake_up_process(curr->task); +	list_del_init(&curr->task_list); +} +EXPORT_SYMBOL(swake_up_locked); + +void swake_up(struct swait_queue_head *q) +{ +	unsigned long flags; + +	if (!swait_active(q)) +		return; + +	raw_spin_lock_irqsave(&q->lock, flags); +	swake_up_locked(q); +	raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(swake_up); + +/* + * Does not allow usage from IRQ disabled, since we must be able to + * release IRQs to guarantee bounded hold time. 
+ */ +void swake_up_all(struct swait_queue_head *q) +{ +	struct swait_queue *curr; +	LIST_HEAD(tmp); + +	if (!swait_active(q)) +		return; + +	raw_spin_lock_irq(&q->lock); +	list_splice_init(&q->task_list, &tmp); +	while (!list_empty(&tmp)) { +		curr = list_first_entry(&tmp, typeof(*curr), task_list); + +		wake_up_state(curr->task, TASK_NORMAL); +		list_del_init(&curr->task_list); + +		if (list_empty(&tmp)) +			break; + +		raw_spin_unlock_irq(&q->lock); +		raw_spin_lock_irq(&q->lock); +	} +	raw_spin_unlock_irq(&q->lock); +} +EXPORT_SYMBOL(swake_up_all); + +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ +	wait->task = current; +	if (list_empty(&wait->task_list)) +		list_add(&wait->task_list, &q->task_list); +} + +void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&q->lock, flags); +	__prepare_to_swait(q, wait); +	set_current_state(state); +	raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_swait); + +long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ +	if (signal_pending_state(state, current)) +		return -ERESTARTSYS; + +	prepare_to_swait(q, wait, state); + +	return 0; +} +EXPORT_SYMBOL(prepare_to_swait_event); + +void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ +	__set_current_state(TASK_RUNNING); +	if (!list_empty(&wait->task_list)) +		list_del_init(&wait->task_list); +} + +void finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ +	unsigned long flags; + +	__set_current_state(TASK_RUNNING); + +	if (!list_empty_careful(&wait->task_list)) { +		raw_spin_lock_irqsave(&q->lock, flags); +		list_del_init(&wait->task_list); +		raw_spin_unlock_irqrestore(&q->lock, flags); +	} +} +EXPORT_SYMBOL(finish_swait); | 
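[To round off the new simple wait-queue file: a typical waiter brackets its condition check with prepare_to_swait()/finish_swait(), and the waker publishes the condition before calling swake_up(). The DECLARE_SWAIT_QUEUE_HEAD()/DECLARE_SWAITQUEUE() initializers are assumed to come from include/linux/swait.h, which is outside this diff; the function calls themselves are the ones defined above. A minimal usage sketch:]

#include <linux/swait.h>
#include <linux/sched.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
static bool demo_done;

/* Waiter: sleep until demo_done becomes true. */
static void demo_wait(void)
{
        DECLARE_SWAITQUEUE(wait);

        for (;;) {
                prepare_to_swait(&demo_wq, &wait, TASK_UNINTERRUPTIBLE);
                if (READ_ONCE(demo_done))
                        break;
                schedule();
        }
        finish_swait(&demo_wq, &wait);
}

/* Waker: publish the condition, then wake a single waiter. */
static void demo_wake(void)
{
        WRITE_ONCE(demo_done, true);
        swake_up(&demo_wq);     /* swake_up_all() would wake every waiter */
}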