Diffstat (limited to 'kernel/sched/fair.c')
 -rw-r--r--   kernel/sched/fair.c | 323
 1 file changed, 206 insertions, 117 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56b7d4b83947..46d64e4ccfde 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,8 +20,8 @@
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  */
 
-#include <linux/latencytop.h>
 #include <linux/sched.h>
+#include <linux/latencytop.h>
 #include <linux/cpumask.h>
 #include <linux/cpuidle.h>
 #include <linux/slab.h>
@@ -755,7 +755,9 @@ static void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	struct task_struct *p;
-	u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+	u64 delta;
+
+	delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
 
 	if (entity_is_task(se)) {
 		p = task_of(se);
@@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->statistics.wait_sum += delta;
 	se->statistics.wait_start = 0;
 }
-#else
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-#endif
 
 /*
  * Task is being enqueued - update stats:
  */
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	/*
 	 * Are we enqueueing a waiting task? (for current tasks
@@ -802,7 +794,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
 	/*
 	 * Mark the end of the wait period if dequeueing a
@@ -810,8 +802,41 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 */
 	if (se != cfs_rq->curr)
 		update_stats_wait_end(cfs_rq, se);
+
+	if (flags & DEQUEUE_SLEEP) {
+		if (entity_is_task(se)) {
+			struct task_struct *tsk = task_of(se);
+
+			if (tsk->state & TASK_INTERRUPTIBLE)
+				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
+			if (tsk->state & TASK_UNINTERRUPTIBLE)
+				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+		}
+	}
+
+}
+#else
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
 }
 
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+}
+#endif
+
 /*
  * We are picking a new current task - update its stats:
  */
@@ -907,10 +932,11 @@ struct numa_group {
 	spinlock_t lock; /* nr_tasks, tasks */
 	int nr_tasks;
 	pid_t gid;
+	int active_nodes;
 
 	struct rcu_head rcu;
-	nodemask_t active_nodes;
 	unsigned long total_faults;
+	unsigned long max_faults_cpu;
 	/*
 	 * Faults_cpu is used to decide whether memory should move
 	 * towards the CPU. As a consequence, these stats are weighted
@@ -969,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
 		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
 /* Handle placement on systems where not all nodes are directly connected. */
 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
 					int maxdist, bool task)
@@ -1118,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 		return true;
 
 	/*
-	 * Do not migrate if the destination is not a node that
-	 * is actively used by this numa group.
-	 */
-	if (!node_isset(dst_nid, ng->active_nodes))
-		return false;
-
-	/*
-	 * Source is a node that is not actively used by this
-	 * numa group, while the destination is. Migrate.
+	 * Destination node is much more heavily used than the source
+	 * node? Allow migration.
 	 */
-	if (!node_isset(src_nid, ng->active_nodes))
+	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+					ACTIVE_NODE_FRACTION)
 		return true;
 
 	/*
-	 * Both source and destination are nodes in active
-	 * use by this numa group. Maximize memory bandwidth
-	 * by migrating from more heavily used groups, to less
-	 * heavily used ones, spreading the load around.
-	 * Use a 1/4 hysteresis to avoid spurious page movement.
+	 * Distribute memory according to CPU & memory use on each node,
+	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
+	 *
+	 * faults_cpu(dst)   3   faults_cpu(src)
+	 * --------------- * - > ---------------
+	 * faults_mem(dst)   4   faults_mem(src)
 	 */
-	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
 static unsigned long weighted_cpuload(const int cpu);
@@ -1484,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p)
 		.best_task = NULL,
 		.best_imp = 0,
-		.best_cpu = -1
+		.best_cpu = -1,
 	};
 	struct sched_domain *sd;
 	unsigned long taskweight, groupweight;
@@ -1536,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p)
 	 *   multiple NUMA nodes; in order to better consolidate the group,
 	 *   we need to check other locations.
 	 */
-	if (env.best_cpu == -1 || (p->numa_group &&
-			nodes_weight(p->numa_group->active_nodes) > 1)) {
+	if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
 		for_each_online_node(nid) {
 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
 				continue;
@@ -1572,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p)
 	 * trying for a better one later. Do not set the preferred node here.
 	 */
 	if (p->numa_group) {
+		struct numa_group *ng = p->numa_group;
+
 		if (env.best_cpu == -1)
 			nid = env.src_nid;
 		else
 			nid = env.dst_nid;
 
-		if (node_isset(nid, p->numa_group->active_nodes))
+		if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
 			sched_setnuma(p, env.dst_nid);
 	}
@@ -1627,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p)
 }
 
 /*
- * Find the nodes on which the workload is actively running. We do this by
+ * Find out how many nodes on the workload is actively running on. Do this by
  * tracking the nodes from which NUMA hinting faults are triggered. This can
  * be different from the set of nodes where the workload's memory is currently
  * located.
- *
- * The bitmask is used to make smarter decisions on when to do NUMA page
- * migrations, To prevent flip-flopping, and excessive page migrations, nodes
- * are added when they cause over 6/16 of the maximum number of faults, but
- * only removed when they drop below 3/16.
  */
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
 {
 	unsigned long faults, max_faults = 0;
-	int nid;
+	int nid, active_nodes = 0;
 
 	for_each_online_node(nid) {
 		faults = group_faults_cpu(numa_group, nid);
@@ -1650,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
 
 	for_each_online_node(nid) {
 		faults = group_faults_cpu(numa_group, nid);
-		if (!node_isset(nid, numa_group->active_nodes)) {
-			if (faults > max_faults * 6 / 16)
-				node_set(nid, numa_group->active_nodes);
-		} else if (faults < max_faults * 3 / 16)
-			node_clear(nid, numa_group->active_nodes);
+		if (faults * ACTIVE_NODE_FRACTION > max_faults)
+			active_nodes++;
 	}
+
+	numa_group->max_faults_cpu = max_faults;
+	numa_group->active_nodes = active_nodes;
 }
 
 /*
@@ -1946,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p)
 	update_task_scan_period(p, fault_types[0], fault_types[1]);
 
 	if (p->numa_group) {
-		update_numa_active_node_mask(p->numa_group);
+		numa_group_count_active_nodes(p->numa_group);
 		spin_unlock_irq(group_lock);
 		max_nid = preferred_group_nid(p, max_group_nid);
 	}
@@ -1990,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 			return;
 
 		atomic_set(&grp->refcount, 1);
+		grp->active_nodes = 1;
+		grp->max_faults_cpu = 0;
 		spin_lock_init(&grp->lock);
 		grp->gid = p->pid;
 		/* Second half of the array tracks nids where faults happen */
 		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
 						nr_node_ids;
 
-		node_set(task_node(current), grp->active_nodes);
-
 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
 			grp->faults[i] = p->numa_faults[i];
@@ -2111,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	bool migrated = flags & TNF_MIGRATED;
 	int cpu_node = task_node(current);
 	int local = !!(flags & TNF_FAULT_LOCAL);
+	struct numa_group *ng;
 	int priv;
 
 	if (!static_branch_likely(&sched_numa_balancing))
@@ -2151,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	 * actively using should be counted as local. This allows the
 	 * scan rate to slow down when a workload has settled down.
 	 */
-	if (!priv && !local && p->numa_group &&
-			node_isset(cpu_node, p->numa_group->active_nodes) &&
-			node_isset(mem_node, p->numa_group->active_nodes))
+	ng = p->numa_group;
+	if (!priv && !local && ng && ng->active_nodes > 1 &&
+				numa_is_active_node(cpu_node, ng) &&
+				numa_is_active_node(mem_node, ng))
 		local = 1;
 
 	task_numa_placement(p);
@@ -2824,7 +2856,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	u64 now = cfs_rq_clock_task(cfs_rq);
-	int cpu = cpu_of(rq_of(cfs_rq));
+	struct rq *rq = rq_of(cfs_rq);
+	int cpu = cpu_of(rq);
 
 	/*
 	 * Track task load average for carrying it to new CPU after migrated, and
@@ -2836,6 +2869,29 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
 
 	if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
 		update_tg_load_avg(cfs_rq, 0);
+
+	if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
+		unsigned long max = rq->cpu_capacity_orig;
+
+		/*
+		 * There are a few boundary cases this might miss but it should
+		 * get called often enough that that should (hopefully) not be
+		 * a real problem -- added to that it only calls on the local
+		 * CPU, so if we enqueue remotely we'll miss an update, but
+		 * the next tick/schedule should update.
+		 *
+		 * It will not get called when we go idle, because the idle
+		 * thread is a different class (!fair), nor will the utilization
+		 * number include things like RT tasks.
+		 *
+		 * As is, the util number is not freq-invariant (we'd have to
+		 * implement arch_scale_freq_capacity() for that).
+		 *
+		 * See cpu_util().
+		 */
+		cpufreq_update_util(rq_clock(rq),
+				    min(cfs_rq->avg.util_avg, max), max);
+	}
 }
 
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -3102,6 +3158,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 
+static inline void check_schedstat_required(void)
+{
+#ifdef CONFIG_SCHEDSTATS
+	if (schedstat_enabled())
+		return;
+
+	/* Force schedstat enabled if a dependent tracepoint is active */
+	if (trace_sched_stat_wait_enabled()    ||
+			trace_sched_stat_sleep_enabled()   ||
+			trace_sched_stat_iowait_enabled()  ||
+			trace_sched_stat_blocked_enabled() ||
+			trace_sched_stat_runtime_enabled())  {
+		pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+			     "stat_blocked and stat_runtime require the "
+			     "kernel parameter schedstats=enabled or "
+			     "kernel.sched_schedstats=1\n");
+	}
+#endif
+}
+
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -3122,11 +3198,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (flags & ENQUEUE_WAKEUP) {
 		place_entity(cfs_rq, se, 0);
-		enqueue_sleeper(cfs_rq, se);
+		if (schedstat_enabled())
+			enqueue_sleeper(cfs_rq, se);
 	}
 
-	update_stats_enqueue(cfs_rq, se);
-	check_spread(cfs_rq, se);
+	check_schedstat_required();
+	if (schedstat_enabled()) {
+		update_stats_enqueue(cfs_rq, se);
+		check_spread(cfs_rq, se);
+	}
 	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
 	se->on_rq = 1;
@@ -3193,19 +3273,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	update_curr(cfs_rq);
 	dequeue_entity_load_avg(cfs_rq, se);
 
-	update_stats_dequeue(cfs_rq, se);
-	if (flags & DEQUEUE_SLEEP) {
-#ifdef CONFIG_SCHEDSTATS
-		if (entity_is_task(se)) {
-			struct task_struct *tsk = task_of(se);
-
-			if (tsk->state & TASK_INTERRUPTIBLE)
-				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
-			if (tsk->state & TASK_UNINTERRUPTIBLE)
-				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
-		}
-#endif
-	}
+	if (schedstat_enabled())
+		update_stats_dequeue(cfs_rq, se, flags);
 
 	clear_buddies(cfs_rq, se);
@@ -3279,7 +3348,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		 * a CPU. So account for the time it spent waiting on the
 		 * runqueue.
 		 */
-		update_stats_wait_end(cfs_rq, se);
+		if (schedstat_enabled())
+			update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
 		update_load_avg(se, 1);
 	}
@@ -3292,7 +3362,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 * least twice that of our own weight (i.e. dont track it
 	 * when there are only lesser-weight tasks around):
 	 */
-	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+	if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
 		se->statistics.slice_max = max(se->statistics.slice_max,
 			se->sum_exec_runtime - se->prev_sum_exec_runtime);
 	}
@@ -3375,9 +3445,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 	/* throttle cfs_rqs exceeding runtime */
 	check_cfs_rq_runtime(cfs_rq);
 
-	check_spread(cfs_rq, prev);
+	if (schedstat_enabled()) {
+		check_spread(cfs_rq, prev);
+		if (prev->on_rq)
+			update_stats_wait_start(cfs_rq, prev);
+	}
+
 	if (prev->on_rq) {
-		update_stats_wait_start(cfs_rq, prev);
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
 		/* in !on_rq case, update occurred at dequeue */
@@ -4459,9 +4533,17 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
 
 		/* scale is effectively 1 << i now, and >> i divides by scale */
 
-		old_load = this_rq->cpu_load[i] - tickless_load;
+		old_load = this_rq->cpu_load[i];
 		old_load = decay_load_missed(old_load, pending_updates - 1, i);
-		old_load += tickless_load;
+		if (tickless_load) {
+			old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
+			/*
+			 * old_load can never be a negative value because a
+			 * decayed tickless_load cannot be greater than the
+			 * original tickless_load.
+			 */
+			old_load += tickless_load;
+		}
 		new_load = this_load;
 		/*
 		 * Round up the averaging division if load is increasing. This
@@ -4484,6 +4566,25 @@ static unsigned long weighted_cpuload(const int cpu)
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
+static void __update_cpu_load_nohz(struct rq *this_rq,
+				   unsigned long curr_jiffies,
+				   unsigned long load,
+				   int active)
+{
+	unsigned long pending_updates;
+
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	if (pending_updates) {
+		this_rq->last_load_update_tick = curr_jiffies;
+		/*
+		 * In the regular NOHZ case, we were idle, this means load 0.
+		 * In the NOHZ_FULL case, we were non-idle, we should consider
+		 * its weighted load.
+		 */
+		__update_cpu_load(this_rq, load, pending_updates, active);
+	}
+}
+
 /*
  * There is no sane way to deal with nohz on smp when using jiffies because the
  * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -4501,22 +4602,15 @@ static unsigned long weighted_cpuload(const int cpu)
  * Called from nohz_idle_balance() to update the load ratings before doing the
  * idle balance.
  */
-static void update_idle_cpu_load(struct rq *this_rq)
+static void update_cpu_load_idle(struct rq *this_rq)
 {
-	unsigned long curr_jiffies = READ_ONCE(jiffies);
-	unsigned long load = weighted_cpuload(cpu_of(this_rq));
-	unsigned long pending_updates;
-
 	/*
 	 * bail if there's load or we're actually up-to-date.
 	 */
-	if (load || curr_jiffies == this_rq->last_load_update_tick)
+	if (weighted_cpuload(cpu_of(this_rq)))
 		return;
 
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	this_rq->last_load_update_tick = curr_jiffies;
-
-	__update_cpu_load(this_rq, load, pending_updates, 0);
+	__update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
 }
 
 /*
@@ -4527,22 +4621,12 @@ void update_cpu_load_nohz(int active)
 	struct rq *this_rq = this_rq();
 	unsigned long curr_jiffies = READ_ONCE(jiffies);
 	unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
-	unsigned long pending_updates;
 
 	if (curr_jiffies == this_rq->last_load_update_tick)
 		return;
 
 	raw_spin_lock(&this_rq->lock);
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	if (pending_updates) {
-		this_rq->last_load_update_tick = curr_jiffies;
-		/*
-		 * In the regular NOHZ case, we were idle, this means load 0.
-		 * In the NOHZ_FULL case, we were non-idle, we should consider
-		 * its weighted load.
-		 */
-		__update_cpu_load(this_rq, load, pending_updates, active);
-	}
+	__update_cpu_load_nohz(this_rq, curr_jiffies, load, active);
 	raw_spin_unlock(&this_rq->lock);
 }
 #endif /* CONFIG_NO_HZ */
@@ -4554,7 +4638,7 @@ void update_cpu_load_active(struct rq *this_rq)
 {
 	unsigned long load = weighted_cpuload(cpu_of(this_rq));
 	/*
-	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+	 * See the mess around update_cpu_load_idle() / update_cpu_load_nohz().
 	 */
 	this_rq->last_load_update_tick = jiffies;
 	__update_cpu_load(this_rq, load, 1, 1);
@@ -7848,7 +7932,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		if (time_after_eq(jiffies, rq->next_balance)) {
 			raw_spin_lock_irq(&rq->lock);
 			update_rq_clock(rq);
-			update_idle_cpu_load(rq);
+			update_cpu_load_idle(rq);
 			raw_spin_unlock_irq(&rq->lock);
 			rebalance_domains(rq, CPU_IDLE);
 		}
@@ -8234,11 +8318,8 @@ void free_fair_sched_group(struct task_group *tg)
 	for_each_possible_cpu(i) {
 		if (tg->cfs_rq)
 			kfree(tg->cfs_rq[i]);
-		if (tg->se) {
-			if (tg->se[i])
-				remove_entity_load_avg(tg->se[i]);
+		if (tg->se)
 			kfree(tg->se[i]);
-		}
 	}
 
 	kfree(tg->cfs_rq);
@@ -8286,21 +8367,29 @@ err:
 	return 0;
 }
 
-void unregister_fair_sched_group(struct task_group *tg, int cpu)
+void unregister_fair_sched_group(struct task_group *tg)
 {
-	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
+	struct rq *rq;
+	int cpu;
 
-	/*
-	* Only empty task groups can be destroyed; so we can speculatively
-	* check on_list without danger of it being re-added.
-	*/
-	if (!tg->cfs_rq[cpu]->on_list)
-		return;
+	for_each_possible_cpu(cpu) {
+		if (tg->se[cpu])
+			remove_entity_load_avg(tg->se[cpu]);
 
-	raw_spin_lock_irqsave(&rq->lock, flags);
-	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
+		/*
+		 * Only empty task groups can be destroyed; so we can speculatively
+		 * check on_list without danger of it being re-added.
+		 */
+		if (!tg->cfs_rq[cpu]->on_list)
+			continue;
+
+		rq = cpu_rq(cpu);
+
+		raw_spin_lock_irqsave(&rq->lock, flags);
+		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	}
 }
 
 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
@@ -8382,7 +8471,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	return 1;
 }
 
-void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
+void unregister_fair_sched_group(struct task_group *tg) { }
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
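
The heart of the NUMA balancing change above is two arithmetic rules: a node counts as active when it triggers more than 1/3 as many CPU faults as the busiest node in the group, and a page is migrated when the destination node's CPU-faults-to-memory-faults ratio exceeds the source's by more than the 3/4 hysteresis. Below is a minimal standalone sketch of just those two rules, using a simplified stand-in structure with plain per-node fault counters instead of the kernel's struct numa_group and its helpers; it compiles as an ordinary userspace C program and is meant only to illustrate the arithmetic, not as kernel code.

/*
 * Standalone sketch of the two decision rules from the patch above.
 * struct numa_group_sketch, its fault arrays, and should_migrate() are
 * simplified stand-ins for illustration, not kernel interfaces.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_NODES		4
#define ACTIVE_NODE_FRACTION	3

struct numa_group_sketch {
	unsigned long faults_cpu[NR_NODES];	/* NUMA hinting faults per CPU node */
	unsigned long faults_mem[NR_NODES];	/* NUMA hinting faults per memory node */
	unsigned long max_faults_cpu;		/* highest faults_cpu[] over all nodes */
};

/* A node is "active" if it triggers > 1/3 as many CPU faults as the busiest node. */
static bool numa_is_active_node(const struct numa_group_sketch *ng, int nid)
{
	return ng->faults_cpu[nid] * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
}

/*
 * Should a page move from src_nid to dst_nid?  Mirrors the cross-multiplied
 * 3/4-hysteresis comparison used by should_numa_migrate_memory():
 *
 *	faults_cpu(dst)   3   faults_cpu(src)
 *	--------------- * - > ---------------
 *	faults_mem(dst)   4   faults_mem(src)
 */
static bool should_migrate(const struct numa_group_sketch *ng, int src_nid, int dst_nid)
{
	/* Destination is much more heavily used than the source: migrate. */
	if (ng->faults_cpu[dst_nid] >
	    ng->faults_cpu[src_nid] * ACTIVE_NODE_FRACTION)
		return true;

	return ng->faults_cpu[dst_nid] * ng->faults_mem[src_nid] * 3 >
	       ng->faults_cpu[src_nid] * ng->faults_mem[dst_nid] * 4;
}

int main(void)
{
	struct numa_group_sketch ng = {
		.faults_cpu = { 900, 400, 250, 10 },
		.faults_mem = { 500, 700, 300, 50 },
		.max_faults_cpu = 900,
	};
	int nid;

	for (nid = 0; nid < NR_NODES; nid++)
		printf("node %d active: %d\n", nid, numa_is_active_node(&ng, nid));

	printf("migrate 1 -> 0: %d\n", should_migrate(&ng, 1, 0));
	printf("migrate 0 -> 3: %d\n", should_migrate(&ng, 0, 3));
	return 0;
}

With the sample numbers in main(), nodes 0 and 1 qualify as active, migration from node 1 to the heavily used node 0 is allowed, and migration from node 0 to the barely used node 3 is refused.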
