diff options
Diffstat (limited to 'kernel/sched/fair.c')
| -rw-r--r-- | kernel/sched/fair.c | 226 | 
1 files changed, 162 insertions, 64 deletions
| diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0fdb96de81a5..c9617b73bcc0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env,  	env->best_cpu = env->dst_cpu;  } +static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, +				long src_load, long dst_load, +				struct task_numa_env *env) +{ +	long imb, old_imb; + +	/* We care about the slope of the imbalance, not the direction. */ +	if (dst_load < src_load) +		swap(dst_load, src_load); + +	/* Is the difference below the threshold? */ +	imb = dst_load * 100 - src_load * env->imbalance_pct; +	if (imb <= 0) +		return false; + +	/* +	 * The imbalance is above the allowed threshold. +	 * Compare it with the old imbalance. +	 */ +	if (orig_dst_load < orig_src_load) +		swap(orig_dst_load, orig_src_load); + +	old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + +	/* Would this change make things worse? */ +	return (old_imb > imb); +} +  /*   * This checks if the overall compute and NUMA accesses of the system would   * be improved if the source tasks was migrated to the target dst_cpu taking @@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env,  	struct rq *src_rq = cpu_rq(env->src_cpu);  	struct rq *dst_rq = cpu_rq(env->dst_cpu);  	struct task_struct *cur; -	long dst_load, src_load; +	long orig_src_load, src_load; +	long orig_dst_load, dst_load;  	long load;  	long imp = (groupimp > 0) ? groupimp : taskimp; @@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env,  	 * In the overloaded case, try and keep the load balanced.  	 */  balance: -	dst_load = env->dst_stats.load; -	src_load = env->src_stats.load; +	orig_dst_load = env->dst_stats.load; +	orig_src_load = env->src_stats.load;  	/* XXX missing power terms */  	load = task_h_load(env->p); -	dst_load += load; -	src_load -= load; +	dst_load = orig_dst_load + load; +	src_load = orig_src_load - load;  	if (cur) {  		load = task_h_load(cur); @@ -1195,11 +1224,8 @@ balance:  		src_load += load;  	} -	/* make src_load the smaller */ -	if (dst_load < src_load) -		swap(dst_load, src_load); - -	if (src_load * env->imbalance_pct < dst_load * 100) +	if (load_too_imbalanced(orig_src_load, orig_dst_load, +				src_load, dst_load, env))  		goto unlock;  assign: @@ -1301,7 +1327,16 @@ static int task_numa_migrate(struct task_struct *p)  	if (env.best_cpu == -1)  		return -EAGAIN; -	sched_setnuma(p, env.dst_nid); +	/* +	 * If the task is part of a workload that spans multiple NUMA nodes, +	 * and is migrating into one of the workload's active nodes, remember +	 * this node as the task's preferred numa node, so the workload can +	 * settle down. +	 * A task that migrated to a second choice node will be better off +	 * trying for a better one later. Do not set the preferred node here. +	 */ +	if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) +		sched_setnuma(p, env.dst_nid);  	/*  	 * Reset the scan period if the task is being rescheduled on an @@ -1326,12 +1361,15 @@ static int task_numa_migrate(struct task_struct *p)  /* Attempt to migrate a task to a CPU on the preferred node. */  static void numa_migrate_preferred(struct task_struct *p)  { +	unsigned long interval = HZ; +  	/* This task has no NUMA fault statistics yet */  	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))  		return;  	/* Periodically retry migrating the task to the preferred node */ -	p->numa_migrate_retry = jiffies + HZ; +	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); +	p->numa_migrate_retry = jiffies + interval;  	/* Success if task is already running on preferred CPU */  	if (task_node(p) == p->numa_preferred_nid) @@ -1738,6 +1776,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)  	struct task_struct *p = current;  	bool migrated = flags & TNF_MIGRATED;  	int cpu_node = task_node(current); +	int local = !!(flags & TNF_FAULT_LOCAL);  	int priv;  	if (!numabalancing_enabled) @@ -1786,6 +1825,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)  			task_numa_group(p, last_cpupid, flags, &priv);  	} +	/* +	 * If a workload spans multiple NUMA nodes, a shared fault that +	 * occurs wholly within the set of nodes that the workload is +	 * actively using should be counted as local. This allows the +	 * scan rate to slow down when a workload has settled down. +	 */ +	if (!priv && !local && p->numa_group && +			node_isset(cpu_node, p->numa_group->active_nodes) && +			node_isset(mem_node, p->numa_group->active_nodes)) +		local = 1; +  	task_numa_placement(p);  	/* @@ -1800,7 +1850,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)  	p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;  	p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; -	p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; +	p->numa_faults_locality[local] += pages;  }  static void reset_ptenuma_scan(struct task_struct *p) @@ -3301,7 +3351,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)  	}  	if (!se) -		rq->nr_running -= task_delta; +		sub_nr_running(rq, task_delta);  	cfs_rq->throttled = 1;  	cfs_rq->throttled_clock = rq_clock(rq); @@ -3352,7 +3402,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)  	}  	if (!se) -		rq->nr_running += task_delta; +		add_nr_running(rq, task_delta);  	/* determine whether we need to wake up potentially idle cpu */  	if (rq->curr == rq->idle && rq->cfs.nr_running) @@ -3884,7 +3934,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  	if (!se) {  		update_rq_runnable_avg(rq, rq->nr_running); -		inc_nr_running(rq); +		add_nr_running(rq, 1);  	}  	hrtick_update(rq);  } @@ -3944,7 +3994,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)  	}  	if (!se) { -		dec_nr_running(rq); +		sub_nr_running(rq, 1);  		update_rq_runnable_avg(rq, 1);  	}  	hrtick_update(rq); @@ -4015,7 +4065,7 @@ static void record_wakee(struct task_struct *p)  	 * about the loss.  	 */  	if (jiffies > current->wakee_flip_decay_ts + HZ) { -		current->wakee_flips = 0; +		current->wakee_flips >>= 1;  		current->wakee_flip_decay_ts = jiffies;  	} @@ -4449,10 +4499,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f  			sd = tmp;  	} -	if (affine_sd) { -		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) -			prev_cpu = cpu; +	if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) +		prev_cpu = cpu; +	if (sd_flag & SD_BALANCE_WAKE) {  		new_cpu = select_idle_sibling(p, prev_cpu);  		goto unlock;  	} @@ -4520,6 +4570,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)  		atomic_long_add(se->avg.load_avg_contrib,  						&cfs_rq->removed_load);  	} + +	/* We have migrated, no longer consider this task hot */ +	se->exec_start = 0;  }  #endif /* CONFIG_SMP */ @@ -5070,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now)  /* Returns true if the destination node has incurred more faults */  static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)  { +	struct numa_group *numa_group = rcu_dereference(p->numa_group);  	int src_nid, dst_nid;  	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || @@ -5083,21 +5137,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)  	if (src_nid == dst_nid)  		return false; -	/* Always encourage migration to the preferred node. */ -	if (dst_nid == p->numa_preferred_nid) -		return true; +	if (numa_group) { +		/* Task is already in the group's interleave set. */ +		if (node_isset(src_nid, numa_group->active_nodes)) +			return false; + +		/* Task is moving into the group's interleave set. */ +		if (node_isset(dst_nid, numa_group->active_nodes)) +			return true; -	/* If both task and group weight improve, this move is a winner. */ -	if (task_weight(p, dst_nid) > task_weight(p, src_nid) && -	    group_weight(p, dst_nid) > group_weight(p, src_nid)) +		return group_faults(p, dst_nid) > group_faults(p, src_nid); +	} + +	/* Encourage migration to the preferred node. */ +	if (dst_nid == p->numa_preferred_nid)  		return true; -	return false; +	return task_faults(p, dst_nid) > task_faults(p, src_nid);  }  static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)  { +	struct numa_group *numa_group = rcu_dereference(p->numa_group);  	int src_nid, dst_nid;  	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) @@ -5112,16 +5174,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)  	if (src_nid == dst_nid)  		return false; +	if (numa_group) { +		/* Task is moving within/into the group's interleave set. */ +		if (node_isset(dst_nid, numa_group->active_nodes)) +			return false; + +		/* Task is moving out of the group's interleave set. */ +		if (node_isset(src_nid, numa_group->active_nodes)) +			return true; + +		return group_faults(p, dst_nid) < group_faults(p, src_nid); +	} +  	/* Migrating away from the preferred node is always bad. */  	if (src_nid == p->numa_preferred_nid)  		return true; -	/* If either task or group weight get worse, don't do it. */ -	if (task_weight(p, dst_nid) < task_weight(p, src_nid) || -	    group_weight(p, dst_nid) < group_weight(p, src_nid)) -		return true; - -	return false; +	return task_faults(p, dst_nid) < task_faults(p, src_nid);  }  #else @@ -5564,6 +5633,7 @@ static unsigned long scale_rt_power(int cpu)  {  	struct rq *rq = cpu_rq(cpu);  	u64 total, available, age_stamp, avg; +	s64 delta;  	/*  	 * Since we're reading these variables without serialization make sure @@ -5572,7 +5642,11 @@ static unsigned long scale_rt_power(int cpu)  	age_stamp = ACCESS_ONCE(rq->age_stamp);  	avg = ACCESS_ONCE(rq->rt_avg); -	total = sched_avg_period() + (rq_clock(rq) - age_stamp); +	delta = rq_clock(rq) - age_stamp; +	if (unlikely(delta < 0)) +		delta = 0; + +	total = sched_avg_period() + delta;  	if (unlikely(total < avg)) {  		/* Ensures that power won't end up being negative */ @@ -6640,17 +6714,44 @@ out:  	return ld_moved;  } +static inline unsigned long +get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) +{ +	unsigned long interval = sd->balance_interval; + +	if (cpu_busy) +		interval *= sd->busy_factor; + +	/* scale ms to jiffies */ +	interval = msecs_to_jiffies(interval); +	interval = clamp(interval, 1UL, max_load_balance_interval); + +	return interval; +} + +static inline void +update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +{ +	unsigned long interval, next; + +	interval = get_sd_balance_interval(sd, cpu_busy); +	next = sd->last_balance + interval; + +	if (time_after(*next_balance, next)) +		*next_balance = next; +} +  /*   * idle_balance is called by schedule() if this_cpu is about to become   * idle. Attempts to pull tasks from other CPUs.   */  static int idle_balance(struct rq *this_rq)  { +	unsigned long next_balance = jiffies + HZ; +	int this_cpu = this_rq->cpu;  	struct sched_domain *sd;  	int pulled_task = 0; -	unsigned long next_balance = jiffies + HZ;  	u64 curr_cost = 0; -	int this_cpu = this_rq->cpu;  	idle_enter_fair(this_rq); @@ -6660,8 +6761,15 @@ static int idle_balance(struct rq *this_rq)  	 */  	this_rq->idle_stamp = rq_clock(this_rq); -	if (this_rq->avg_idle < sysctl_sched_migration_cost) +	if (this_rq->avg_idle < sysctl_sched_migration_cost) { +		rcu_read_lock(); +		sd = rcu_dereference_check_sched_domain(this_rq->sd); +		if (sd) +			update_next_balance(sd, 0, &next_balance); +		rcu_read_unlock(); +  		goto out; +	}  	/*  	 * Drop the rq->lock, but keep IRQ/preempt disabled. @@ -6671,20 +6779,20 @@ static int idle_balance(struct rq *this_rq)  	update_blocked_averages(this_cpu);  	rcu_read_lock();  	for_each_domain(this_cpu, sd) { -		unsigned long interval;  		int continue_balancing = 1;  		u64 t0, domain_cost;  		if (!(sd->flags & SD_LOAD_BALANCE))  			continue; -		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) +		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { +			update_next_balance(sd, 0, &next_balance);  			break; +		}  		if (sd->flags & SD_BALANCE_NEWIDLE) {  			t0 = sched_clock_cpu(this_cpu); -			/* If we've pulled tasks over stop searching: */  			pulled_task = load_balance(this_cpu, this_rq,  						   sd, CPU_NEWLY_IDLE,  						   &continue_balancing); @@ -6696,10 +6804,13 @@ static int idle_balance(struct rq *this_rq)  			curr_cost += domain_cost;  		} -		interval = msecs_to_jiffies(sd->balance_interval); -		if (time_after(next_balance, sd->last_balance + interval)) -			next_balance = sd->last_balance + interval; -		if (pulled_task) +		update_next_balance(sd, 0, &next_balance); + +		/* +		 * Stop searching for tasks to pull if there are +		 * now runnable tasks on this rq. +		 */ +		if (pulled_task || this_rq->nr_running > 0)  			break;  	}  	rcu_read_unlock(); @@ -6717,20 +6828,13 @@ static int idle_balance(struct rq *this_rq)  	if (this_rq->cfs.h_nr_running && !pulled_task)  		pulled_task = 1; -	if (pulled_task || time_after(jiffies, this_rq->next_balance)) { -		/* -		 * We are going idle. next_balance may be set based on -		 * a busy processor. So reset next_balance. -		 */ +out: +	/* Move the next balance forward */ +	if (time_after(this_rq->next_balance, next_balance))  		this_rq->next_balance = next_balance; -	} -out:  	/* Is there a task of a high priority class? */ -	if (this_rq->nr_running != this_rq->cfs.h_nr_running && -	    ((this_rq->stop && this_rq->stop->on_rq) || -	     this_rq->dl.dl_nr_running || -	     (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) +	if (this_rq->nr_running != this_rq->cfs.h_nr_running)  		pulled_task = -1;  	if (pulled_task) { @@ -7011,16 +7115,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)  			break;  		} -		interval = sd->balance_interval; -		if (idle != CPU_IDLE) -			interval *= sd->busy_factor; - -		/* scale ms to jiffies */ -		interval = msecs_to_jiffies(interval); -		interval = clamp(interval, 1UL, max_load_balance_interval); +		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);  		need_serialize = sd->flags & SD_SERIALIZE; -  		if (need_serialize) {  			if (!spin_trylock(&balancing))  				goto out; @@ -7036,6 +7133,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)  				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;  			}  			sd->last_balance = jiffies; +			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);  		}  		if (need_serialize)  			spin_unlock(&balancing); | 
