Diffstat (limited to 'kernel')
-rw-r--r--  kernel/locking/locktorture.c |   2
-rw-r--r--  kernel/power/suspend.c       |   2
-rw-r--r--  kernel/sched/core.c          | 324
-rw-r--r--  kernel/sched/deadline.c      |   6
-rw-r--r--  kernel/sched/fair.c          | 226
-rw-r--r--  kernel/sched/idle.c          | 140
-rw-r--r--  kernel/sched/rt.c            | 119
-rw-r--r--  kernel/sched/sched.h         |  26
-rw-r--r--  kernel/sched/stop_task.c     |   4
-rw-r--r--  kernel/sys.c                 |   6
-rw-r--r--  kernel/workqueue.c           |   6
11 files changed, 522 insertions, 339 deletions
| diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index dbafeac18e4d..0955b885d0dc 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -216,7 +216,7 @@ static int lock_torture_writer(void *arg)  	static DEFINE_TORTURE_RANDOM(rand);  	VERBOSE_TOROUT_STRING("lock_torture_writer task started"); -	set_user_nice(current, 19); +	set_user_nice(current, MAX_NICE);  	do {  		if ((torture_random(&rand) & 0xfffff) == 0) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8233cd4047d7..155721f7f909 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -54,9 +54,11 @@ static void freeze_begin(void)  static void freeze_enter(void)  { +	cpuidle_use_deepest_state(true);  	cpuidle_resume();  	wait_event(suspend_freeze_wait_head, suspend_freeze_wake);  	cpuidle_pause(); +	cpuidle_use_deepest_state(false);  }  void freeze_wake(void) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a62a7dec3986..913c6d6cc2c1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -522,6 +522,39 @@ static inline void init_hrtick(void)  #endif	/* CONFIG_SCHED_HRTICK */  /* + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#define fetch_or(ptr, val)						\ +({	typeof(*(ptr)) __old, __val = *(ptr);				\ + 	for (;;) {							\ + 		__old = cmpxchg((ptr), __val, __val | (val));		\ + 		if (__old == __val)					\ + 			break;						\ + 		__val = __old;						\ + 	}								\ + 	__old;								\ +}) + +#ifdef TIF_POLLING_NRFLAG +/* + * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, + * this avoids any races wrt polling state changes and thereby avoids + * spurious IPIs. + */ +static bool set_nr_and_not_polling(struct task_struct *p) +{ +	struct thread_info *ti = task_thread_info(p); +	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); +} +#else +static bool set_nr_and_not_polling(struct task_struct *p) +{ +	set_tsk_need_resched(p); +	return true; +} +#endif + +/*   * resched_task - mark a task 'to be rescheduled now'.   *   * On UP this means the setting of the need_resched flag, on SMP it @@ -537,17 +570,15 @@ void resched_task(struct task_struct *p)  	if (test_tsk_need_resched(p))  		return; -	set_tsk_need_resched(p); -  	cpu = task_cpu(p); +  	if (cpu == smp_processor_id()) { +		set_tsk_need_resched(p);  		set_preempt_need_resched();  		return;  	} -	/* NEED_RESCHED must be visible before we test polling */ -	smp_mb(); -	if (!tsk_is_polling(p)) +	if (set_nr_and_not_polling(p))  		smp_send_reschedule(cpu);  } @@ -3018,7 +3049,7 @@ EXPORT_SYMBOL(set_user_nice);  int can_nice(const struct task_struct *p, const int nice)  {  	/* convert nice value [19,-20] to rlimit style value [1,40] */ -	int nice_rlim = 20 - nice; +	int nice_rlim = nice_to_rlimit(nice);  	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||  		capable(CAP_SYS_NICE)); @@ -3042,17 +3073,10 @@ SYSCALL_DEFINE1(nice, int, increment)  	 * We don't have to worry. Conceptually one call occurs first  	 * and we have a single winner.  	 
*/ -	if (increment < -40) -		increment = -40; -	if (increment > 40) -		increment = 40; - +	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);  	nice = task_nice(current) + increment; -	if (nice < MIN_NICE) -		nice = MIN_NICE; -	if (nice > MAX_NICE) -		nice = MAX_NICE; +	nice = clamp_val(nice, MIN_NICE, MAX_NICE);  	if (increment < 0 && !can_nice(current, nice))  		return -EPERM; @@ -3642,13 +3666,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,  	 */  	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); -out: -	return ret; +	return 0;  err_size:  	put_user(sizeof(*attr), &uattr->size); -	ret = -E2BIG; -	goto out; +	return -E2BIG;  }  /** @@ -3808,7 +3830,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,  		for (; addr < end; addr++) {  			if (*addr) -				goto err_size; +				return -EFBIG;  		}  		attr->size = usize; @@ -3818,12 +3840,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,  	if (ret)  		return -EFAULT; -out: -	return ret; - -err_size: -	ret = -E2BIG; -	goto out; +	return 0;  }  /** @@ -5093,10 +5110,20 @@ static struct notifier_block migration_notifier = {  	.priority = CPU_PRI_MIGRATION,  }; +static void __cpuinit set_cpu_rq_start_time(void) +{ +	int cpu = smp_processor_id(); +	struct rq *rq = cpu_rq(cpu); +	rq->age_stamp = sched_clock_cpu(cpu); +} +  static int sched_cpu_active(struct notifier_block *nfb,  				      unsigned long action, void *hcpu)  {  	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_STARTING: +		set_cpu_rq_start_time(); +		return NOTIFY_OK;  	case CPU_DOWN_FAILED:  		set_cpu_active((long)hcpu, true);  		return NOTIFY_OK; @@ -5305,7 +5332,8 @@ static int sd_degenerate(struct sched_domain *sd)  			 SD_BALANCE_FORK |  			 SD_BALANCE_EXEC |  			 SD_SHARE_CPUPOWER | -			 SD_SHARE_PKG_RESOURCES)) { +			 SD_SHARE_PKG_RESOURCES | +			 SD_SHARE_POWERDOMAIN)) {  		if (sd->groups != sd->groups->next)  			return 0;  	} @@ -5336,7 +5364,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)  				SD_BALANCE_EXEC |  				SD_SHARE_CPUPOWER |  				SD_SHARE_PKG_RESOURCES | -				SD_PREFER_SIBLING); +				SD_PREFER_SIBLING | +				SD_SHARE_POWERDOMAIN);  		if (nr_node_ids == 1)  			pflags &= ~SD_SERIALIZE;  	} @@ -5610,17 +5639,6 @@ static int __init isolated_cpu_setup(char *str)  __setup("isolcpus=", isolated_cpu_setup); -static const struct cpumask *cpu_cpu_mask(int cpu) -{ -	return cpumask_of_node(cpu_to_node(cpu)); -} - -struct sd_data { -	struct sched_domain **__percpu sd; -	struct sched_group **__percpu sg; -	struct sched_group_power **__percpu sgp; -}; -  struct s_data {  	struct sched_domain ** __percpu sd;  	struct root_domain	*rd; @@ -5633,21 +5651,6 @@ enum s_alloc {  	sa_none,  }; -struct sched_domain_topology_level; - -typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); - -#define SDTL_OVERLAP	0x01 - -struct sched_domain_topology_level { -	sched_domain_init_f init; -	sched_domain_mask_f mask; -	int		    flags; -	int		    numa_level; -	struct sd_data      data; -}; -  /*   * Build an iteration mask that can exclude certain CPUs from the upwards   * domain traversal. 
@@ -5815,8 +5818,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)  			continue;  		group = get_group(i, sdd, &sg); -		cpumask_clear(sched_group_cpus(sg)); -		sg->sgp->power = 0;  		cpumask_setall(sched_group_mask(sg));  		for_each_cpu(j, span) { @@ -5866,44 +5867,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)  	atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);  } -int __weak arch_sd_sibling_asym_packing(void) -{ -       return 0*SD_ASYM_PACKING; -} -  /*   * Initializers for schedule domains   * Non-inlined to reduce accumulated stack pressure in build_sched_domains()   */ -#ifdef CONFIG_SCHED_DEBUG -# define SD_INIT_NAME(sd, type)		sd->name = #type -#else -# define SD_INIT_NAME(sd, type)		do { } while (0) -#endif - -#define SD_INIT_FUNC(type)						\ -static noinline struct sched_domain *					\ -sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\ -{									\ -	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);	\ -	*sd = SD_##type##_INIT;						\ -	SD_INIT_NAME(sd, type);						\ -	sd->private = &tl->data;					\ -	return sd;							\ -} - -SD_INIT_FUNC(CPU) -#ifdef CONFIG_SCHED_SMT - SD_INIT_FUNC(SIBLING) -#endif -#ifdef CONFIG_SCHED_MC - SD_INIT_FUNC(MC) -#endif -#ifdef CONFIG_SCHED_BOOK - SD_INIT_FUNC(BOOK) -#endif -  static int default_relax_domain_level = -1;  int sched_domain_level_max; @@ -5991,99 +5959,154 @@ static void claim_allocations(int cpu, struct sched_domain *sd)  		*per_cpu_ptr(sdd->sgp, cpu) = NULL;  } -#ifdef CONFIG_SCHED_SMT -static const struct cpumask *cpu_smt_mask(int cpu) -{ -	return topology_thread_cpumask(cpu); -} -#endif - -/* - * Topology list, bottom-up. - */ -static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_SCHED_SMT -	{ sd_init_SIBLING, cpu_smt_mask, }, -#endif -#ifdef CONFIG_SCHED_MC -	{ sd_init_MC, cpu_coregroup_mask, }, -#endif -#ifdef CONFIG_SCHED_BOOK -	{ sd_init_BOOK, cpu_book_mask, }, -#endif -	{ sd_init_CPU, cpu_cpu_mask, }, -	{ NULL, }, -}; - -static struct sched_domain_topology_level *sched_domain_topology = default_topology; - -#define for_each_sd_topology(tl)			\ -	for (tl = sched_domain_topology; tl->init; tl++) -  #ifdef CONFIG_NUMA -  static int sched_domains_numa_levels;  static int *sched_domains_numa_distance;  static struct cpumask ***sched_domains_numa_masks;  static int sched_domains_curr_level; +#endif -static inline int sd_local_flags(int level) -{ -	if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) -		return 0; - -	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; -} +/* + * SD_flags allowed in topology descriptions. + * + * SD_SHARE_CPUPOWER      - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA                - describes NUMA topologies + * SD_SHARE_POWERDOMAIN   - describes shared power domain + * + * Odd one out: + * SD_ASYM_PACKING        - describes SMT quirks + */ +#define TOPOLOGY_SD_FLAGS		\ +	(SD_SHARE_CPUPOWER |		\ +	 SD_SHARE_PKG_RESOURCES |	\ +	 SD_NUMA |			\ +	 SD_ASYM_PACKING |		\ +	 SD_SHARE_POWERDOMAIN)  static struct sched_domain * -sd_numa_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, int cpu)  {  	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); -	int level = tl->numa_level; -	int sd_weight = cpumask_weight( -			sched_domains_numa_masks[level][cpu_to_node(cpu)]); +	int sd_weight, sd_flags = 0; + +#ifdef CONFIG_NUMA +	/* +	 * Ugly hack to pass state to sd_numa_mask()... 
+	 */ +	sched_domains_curr_level = tl->numa_level; +#endif + +	sd_weight = cpumask_weight(tl->mask(cpu)); + +	if (tl->sd_flags) +		sd_flags = (*tl->sd_flags)(); +	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, +			"wrong sd_flags in topology description\n")) +		sd_flags &= ~TOPOLOGY_SD_FLAGS;  	*sd = (struct sched_domain){  		.min_interval		= sd_weight,  		.max_interval		= 2*sd_weight,  		.busy_factor		= 32,  		.imbalance_pct		= 125, -		.cache_nice_tries	= 2, -		.busy_idx		= 3, -		.idle_idx		= 2, + +		.cache_nice_tries	= 0, +		.busy_idx		= 0, +		.idle_idx		= 0,  		.newidle_idx		= 0,  		.wake_idx		= 0,  		.forkexec_idx		= 0,  		.flags			= 1*SD_LOAD_BALANCE  					| 1*SD_BALANCE_NEWIDLE -					| 0*SD_BALANCE_EXEC -					| 0*SD_BALANCE_FORK +					| 1*SD_BALANCE_EXEC +					| 1*SD_BALANCE_FORK  					| 0*SD_BALANCE_WAKE -					| 0*SD_WAKE_AFFINE +					| 1*SD_WAKE_AFFINE  					| 0*SD_SHARE_CPUPOWER  					| 0*SD_SHARE_PKG_RESOURCES -					| 1*SD_SERIALIZE +					| 0*SD_SERIALIZE  					| 0*SD_PREFER_SIBLING -					| 1*SD_NUMA -					| sd_local_flags(level) +					| 0*SD_NUMA +					| sd_flags  					, +  		.last_balance		= jiffies,  		.balance_interval	= sd_weight, +		.smt_gain		= 0,  		.max_newidle_lb_cost	= 0,  		.next_decay_max_lb_cost	= jiffies, +#ifdef CONFIG_SCHED_DEBUG +		.name			= tl->name, +#endif  	}; -	SD_INIT_NAME(sd, NUMA); -	sd->private = &tl->data;  	/* -	 * Ugly hack to pass state to sd_numa_mask()... +	 * Convert topological properties into behaviour.  	 */ -	sched_domains_curr_level = tl->numa_level; + +	if (sd->flags & SD_SHARE_CPUPOWER) { +		sd->imbalance_pct = 110; +		sd->smt_gain = 1178; /* ~15% */ + +	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) { +		sd->imbalance_pct = 117; +		sd->cache_nice_tries = 1; +		sd->busy_idx = 2; + +#ifdef CONFIG_NUMA +	} else if (sd->flags & SD_NUMA) { +		sd->cache_nice_tries = 2; +		sd->busy_idx = 3; +		sd->idle_idx = 2; + +		sd->flags |= SD_SERIALIZE; +		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { +			sd->flags &= ~(SD_BALANCE_EXEC | +				       SD_BALANCE_FORK | +				       SD_WAKE_AFFINE); +		} + +#endif +	} else { +		sd->flags |= SD_PREFER_SIBLING; +		sd->cache_nice_tries = 1; +		sd->busy_idx = 2; +		sd->idle_idx = 1; +	} + +	sd->private = &tl->data;  	return sd;  } +/* + * Topology list, bottom-up. + */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT +	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC +	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif +	{ cpu_cpu_mask, SD_INIT_NAME(DIE) }, +	{ NULL, }, +}; + +struct sched_domain_topology_level *sched_domain_topology = default_topology; + +#define for_each_sd_topology(tl)			\ +	for (tl = sched_domain_topology; tl->mask; tl++) + +void set_sched_topology(struct sched_domain_topology_level *tl) +{ +	sched_domain_topology = tl; +} + +#ifdef CONFIG_NUMA +  static const struct cpumask *sd_numa_mask(int cpu)  {  	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; @@ -6227,7 +6250,10 @@ static void sched_init_numa(void)  		}  	} -	tl = kzalloc((ARRAY_SIZE(default_topology) + level) * +	/* Compute default topology size */ +	for (i = 0; sched_domain_topology[i].mask; i++); + +	tl = kzalloc((i + level + 1) *  			sizeof(struct sched_domain_topology_level), GFP_KERNEL);  	if (!tl)  		return; @@ -6235,18 +6261,19 @@ static void sched_init_numa(void)  	/*  	 * Copy the default topology bits..  	 
*/ -	for (i = 0; default_topology[i].init; i++) -		tl[i] = default_topology[i]; +	for (i = 0; sched_domain_topology[i].mask; i++) +		tl[i] = sched_domain_topology[i];  	/*  	 * .. and append 'j' levels of NUMA goodness.  	 */  	for (j = 0; j < level; i++, j++) {  		tl[i] = (struct sched_domain_topology_level){ -			.init = sd_numa_init,  			.mask = sd_numa_mask, +			.sd_flags = cpu_numa_flags,  			.flags = SDTL_OVERLAP,  			.numa_level = j, +			SD_INIT_NAME(NUMA)  		};  	} @@ -6404,7 +6431,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,  		const struct cpumask *cpu_map, struct sched_domain_attr *attr,  		struct sched_domain *child, int cpu)  { -	struct sched_domain *sd = tl->init(tl, cpu); +	struct sched_domain *sd = sd_init(tl, cpu);  	if (!sd)  		return child; @@ -6974,6 +7001,7 @@ void __init sched_init(void)  	if (cpu_isolated_map == NULL)  		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);  	idle_thread_set_boot_cpu(); +	set_cpu_rq_start_time();  #endif  	init_sched_fair_class(); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 800e99b99075..f9ca7d19781a 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -520,7 +520,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)  	 * We need to take care of a possible races here. In fact, the  	 * task might have changed its scheduling policy to something  	 * different from SCHED_DEADLINE or changed its reservation -	 * parameters (through sched_setscheduler()). +	 * parameters (through sched_setattr()).  	 */  	if (!dl_task(p) || dl_se->dl_new)  		goto unlock; @@ -741,7 +741,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)  	WARN_ON(!dl_prio(prio));  	dl_rq->dl_nr_running++; -	inc_nr_running(rq_of_dl_rq(dl_rq)); +	add_nr_running(rq_of_dl_rq(dl_rq), 1);  	inc_dl_deadline(dl_rq, deadline);  	inc_dl_migration(dl_se, dl_rq); @@ -755,7 +755,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)  	WARN_ON(!dl_prio(prio));  	WARN_ON(!dl_rq->dl_nr_running);  	dl_rq->dl_nr_running--; -	dec_nr_running(rq_of_dl_rq(dl_rq)); +	sub_nr_running(rq_of_dl_rq(dl_rq), 1);  	dec_dl_deadline(dl_rq, dl_se->deadline);  	dec_dl_migration(dl_se, dl_rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0fdb96de81a5..c9617b73bcc0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env,  	env->best_cpu = env->dst_cpu;  } +static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, +				long src_load, long dst_load, +				struct task_numa_env *env) +{ +	long imb, old_imb; + +	/* We care about the slope of the imbalance, not the direction. */ +	if (dst_load < src_load) +		swap(dst_load, src_load); + +	/* Is the difference below the threshold? */ +	imb = dst_load * 100 - src_load * env->imbalance_pct; +	if (imb <= 0) +		return false; + +	/* +	 * The imbalance is above the allowed threshold. +	 * Compare it with the old imbalance. +	 */ +	if (orig_dst_load < orig_src_load) +		swap(orig_dst_load, orig_src_load); + +	old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + +	/* Would this change make things worse? 
*/ +	return (old_imb > imb); +} +  /*   * This checks if the overall compute and NUMA accesses of the system would   * be improved if the source tasks was migrated to the target dst_cpu taking @@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env,  	struct rq *src_rq = cpu_rq(env->src_cpu);  	struct rq *dst_rq = cpu_rq(env->dst_cpu);  	struct task_struct *cur; -	long dst_load, src_load; +	long orig_src_load, src_load; +	long orig_dst_load, dst_load;  	long load;  	long imp = (groupimp > 0) ? groupimp : taskimp; @@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env,  	 * In the overloaded case, try and keep the load balanced.  	 */  balance: -	dst_load = env->dst_stats.load; -	src_load = env->src_stats.load; +	orig_dst_load = env->dst_stats.load; +	orig_src_load = env->src_stats.load;  	/* XXX missing power terms */  	load = task_h_load(env->p); -	dst_load += load; -	src_load -= load; +	dst_load = orig_dst_load + load; +	src_load = orig_src_load - load;  	if (cur) {  		load = task_h_load(cur); @@ -1195,11 +1224,8 @@ balance:  		src_load += load;  	} -	/* make src_load the smaller */ -	if (dst_load < src_load) -		swap(dst_load, src_load); - -	if (src_load * env->imbalance_pct < dst_load * 100) +	if (load_too_imbalanced(orig_src_load, orig_dst_load, +				src_load, dst_load, env))  		goto unlock;  assign: @@ -1301,7 +1327,16 @@ static int task_numa_migrate(struct task_struct *p)  	if (env.best_cpu == -1)  		return -EAGAIN; -	sched_setnuma(p, env.dst_nid); +	/* +	 * If the task is part of a workload that spans multiple NUMA nodes, +	 * and is migrating into one of the workload's active nodes, remember +	 * this node as the task's preferred numa node, so the workload can +	 * settle down. +	 * A task that migrated to a second choice node will be better off +	 * trying for a better one later. Do not set the preferred node here. +	 */ +	if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) +		sched_setnuma(p, env.dst_nid);  	/*  	 * Reset the scan period if the task is being rescheduled on an @@ -1326,12 +1361,15 @@ static int task_numa_migrate(struct task_struct *p)  /* Attempt to migrate a task to a CPU on the preferred node. */  static void numa_migrate_preferred(struct task_struct *p)  { +	unsigned long interval = HZ; +  	/* This task has no NUMA fault statistics yet */  	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))  		return;  	/* Periodically retry migrating the task to the preferred node */ -	p->numa_migrate_retry = jiffies + HZ; +	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); +	p->numa_migrate_retry = jiffies + interval;  	/* Success if task is already running on preferred CPU */  	if (task_node(p) == p->numa_preferred_nid) @@ -1738,6 +1776,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)  	struct task_struct *p = current;  	bool migrated = flags & TNF_MIGRATED;  	int cpu_node = task_node(current); +	int local = !!(flags & TNF_FAULT_LOCAL);  	int priv;  	if (!numabalancing_enabled) @@ -1786,6 +1825,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)  			task_numa_group(p, last_cpupid, flags, &priv);  	} +	/* +	 * If a workload spans multiple NUMA nodes, a shared fault that +	 * occurs wholly within the set of nodes that the workload is +	 * actively using should be counted as local. This allows the +	 * scan rate to slow down when a workload has settled down. 
+	 */ +	if (!priv && !local && p->numa_group && +			node_isset(cpu_node, p->numa_group->active_nodes) && +			node_isset(mem_node, p->numa_group->active_nodes)) +		local = 1; +  	task_numa_placement(p);  	/* @@ -1800,7 +1850,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)  	p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;  	p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; -	p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; +	p->numa_faults_locality[local] += pages;  }  static void reset_ptenuma_scan(struct task_struct *p) @@ -3301,7 +3351,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)  	}  	if (!se) -		rq->nr_running -= task_delta; +		sub_nr_running(rq, task_delta);  	cfs_rq->throttled = 1;  	cfs_rq->throttled_clock = rq_clock(rq); @@ -3352,7 +3402,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)  	}  	if (!se) -		rq->nr_running += task_delta; +		add_nr_running(rq, task_delta);  	/* determine whether we need to wake up potentially idle cpu */  	if (rq->curr == rq->idle && rq->cfs.nr_running) @@ -3884,7 +3934,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  	if (!se) {  		update_rq_runnable_avg(rq, rq->nr_running); -		inc_nr_running(rq); +		add_nr_running(rq, 1);  	}  	hrtick_update(rq);  } @@ -3944,7 +3994,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)  	}  	if (!se) { -		dec_nr_running(rq); +		sub_nr_running(rq, 1);  		update_rq_runnable_avg(rq, 1);  	}  	hrtick_update(rq); @@ -4015,7 +4065,7 @@ static void record_wakee(struct task_struct *p)  	 * about the loss.  	 */  	if (jiffies > current->wakee_flip_decay_ts + HZ) { -		current->wakee_flips = 0; +		current->wakee_flips >>= 1;  		current->wakee_flip_decay_ts = jiffies;  	} @@ -4449,10 +4499,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f  			sd = tmp;  	} -	if (affine_sd) { -		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) -			prev_cpu = cpu; +	if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) +		prev_cpu = cpu; +	if (sd_flag & SD_BALANCE_WAKE) {  		new_cpu = select_idle_sibling(p, prev_cpu);  		goto unlock;  	} @@ -4520,6 +4570,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)  		atomic_long_add(se->avg.load_avg_contrib,  						&cfs_rq->removed_load);  	} + +	/* We have migrated, no longer consider this task hot */ +	se->exec_start = 0;  }  #endif /* CONFIG_SMP */ @@ -5070,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now)  /* Returns true if the destination node has incurred more faults */  static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)  { +	struct numa_group *numa_group = rcu_dereference(p->numa_group);  	int src_nid, dst_nid;  	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || @@ -5083,21 +5137,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)  	if (src_nid == dst_nid)  		return false; -	/* Always encourage migration to the preferred node. */ -	if (dst_nid == p->numa_preferred_nid) -		return true; +	if (numa_group) { +		/* Task is already in the group's interleave set. */ +		if (node_isset(src_nid, numa_group->active_nodes)) +			return false; + +		/* Task is moving into the group's interleave set. */ +		if (node_isset(dst_nid, numa_group->active_nodes)) +			return true; -	/* If both task and group weight improve, this move is a winner. 
*/ -	if (task_weight(p, dst_nid) > task_weight(p, src_nid) && -	    group_weight(p, dst_nid) > group_weight(p, src_nid)) +		return group_faults(p, dst_nid) > group_faults(p, src_nid); +	} + +	/* Encourage migration to the preferred node. */ +	if (dst_nid == p->numa_preferred_nid)  		return true; -	return false; +	return task_faults(p, dst_nid) > task_faults(p, src_nid);  }  static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)  { +	struct numa_group *numa_group = rcu_dereference(p->numa_group);  	int src_nid, dst_nid;  	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) @@ -5112,16 +5174,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)  	if (src_nid == dst_nid)  		return false; +	if (numa_group) { +		/* Task is moving within/into the group's interleave set. */ +		if (node_isset(dst_nid, numa_group->active_nodes)) +			return false; + +		/* Task is moving out of the group's interleave set. */ +		if (node_isset(src_nid, numa_group->active_nodes)) +			return true; + +		return group_faults(p, dst_nid) < group_faults(p, src_nid); +	} +  	/* Migrating away from the preferred node is always bad. */  	if (src_nid == p->numa_preferred_nid)  		return true; -	/* If either task or group weight get worse, don't do it. */ -	if (task_weight(p, dst_nid) < task_weight(p, src_nid) || -	    group_weight(p, dst_nid) < group_weight(p, src_nid)) -		return true; - -	return false; +	return task_faults(p, dst_nid) < task_faults(p, src_nid);  }  #else @@ -5564,6 +5633,7 @@ static unsigned long scale_rt_power(int cpu)  {  	struct rq *rq = cpu_rq(cpu);  	u64 total, available, age_stamp, avg; +	s64 delta;  	/*  	 * Since we're reading these variables without serialization make sure @@ -5572,7 +5642,11 @@ static unsigned long scale_rt_power(int cpu)  	age_stamp = ACCESS_ONCE(rq->age_stamp);  	avg = ACCESS_ONCE(rq->rt_avg); -	total = sched_avg_period() + (rq_clock(rq) - age_stamp); +	delta = rq_clock(rq) - age_stamp; +	if (unlikely(delta < 0)) +		delta = 0; + +	total = sched_avg_period() + delta;  	if (unlikely(total < avg)) {  		/* Ensures that power won't end up being negative */ @@ -6640,17 +6714,44 @@ out:  	return ld_moved;  } +static inline unsigned long +get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) +{ +	unsigned long interval = sd->balance_interval; + +	if (cpu_busy) +		interval *= sd->busy_factor; + +	/* scale ms to jiffies */ +	interval = msecs_to_jiffies(interval); +	interval = clamp(interval, 1UL, max_load_balance_interval); + +	return interval; +} + +static inline void +update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +{ +	unsigned long interval, next; + +	interval = get_sd_balance_interval(sd, cpu_busy); +	next = sd->last_balance + interval; + +	if (time_after(*next_balance, next)) +		*next_balance = next; +} +  /*   * idle_balance is called by schedule() if this_cpu is about to become   * idle. Attempts to pull tasks from other CPUs.   
*/  static int idle_balance(struct rq *this_rq)  { +	unsigned long next_balance = jiffies + HZ; +	int this_cpu = this_rq->cpu;  	struct sched_domain *sd;  	int pulled_task = 0; -	unsigned long next_balance = jiffies + HZ;  	u64 curr_cost = 0; -	int this_cpu = this_rq->cpu;  	idle_enter_fair(this_rq); @@ -6660,8 +6761,15 @@ static int idle_balance(struct rq *this_rq)  	 */  	this_rq->idle_stamp = rq_clock(this_rq); -	if (this_rq->avg_idle < sysctl_sched_migration_cost) +	if (this_rq->avg_idle < sysctl_sched_migration_cost) { +		rcu_read_lock(); +		sd = rcu_dereference_check_sched_domain(this_rq->sd); +		if (sd) +			update_next_balance(sd, 0, &next_balance); +		rcu_read_unlock(); +  		goto out; +	}  	/*  	 * Drop the rq->lock, but keep IRQ/preempt disabled. @@ -6671,20 +6779,20 @@ static int idle_balance(struct rq *this_rq)  	update_blocked_averages(this_cpu);  	rcu_read_lock();  	for_each_domain(this_cpu, sd) { -		unsigned long interval;  		int continue_balancing = 1;  		u64 t0, domain_cost;  		if (!(sd->flags & SD_LOAD_BALANCE))  			continue; -		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) +		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { +			update_next_balance(sd, 0, &next_balance);  			break; +		}  		if (sd->flags & SD_BALANCE_NEWIDLE) {  			t0 = sched_clock_cpu(this_cpu); -			/* If we've pulled tasks over stop searching: */  			pulled_task = load_balance(this_cpu, this_rq,  						   sd, CPU_NEWLY_IDLE,  						   &continue_balancing); @@ -6696,10 +6804,13 @@ static int idle_balance(struct rq *this_rq)  			curr_cost += domain_cost;  		} -		interval = msecs_to_jiffies(sd->balance_interval); -		if (time_after(next_balance, sd->last_balance + interval)) -			next_balance = sd->last_balance + interval; -		if (pulled_task) +		update_next_balance(sd, 0, &next_balance); + +		/* +		 * Stop searching for tasks to pull if there are +		 * now runnable tasks on this rq. +		 */ +		if (pulled_task || this_rq->nr_running > 0)  			break;  	}  	rcu_read_unlock(); @@ -6717,20 +6828,13 @@ static int idle_balance(struct rq *this_rq)  	if (this_rq->cfs.h_nr_running && !pulled_task)  		pulled_task = 1; -	if (pulled_task || time_after(jiffies, this_rq->next_balance)) { -		/* -		 * We are going idle. next_balance may be set based on -		 * a busy processor. So reset next_balance. -		 */ +out: +	/* Move the next balance forward */ +	if (time_after(this_rq->next_balance, next_balance))  		this_rq->next_balance = next_balance; -	} -out:  	/* Is there a task of a high priority class? */ -	if (this_rq->nr_running != this_rq->cfs.h_nr_running && -	    ((this_rq->stop && this_rq->stop->on_rq) || -	     this_rq->dl.dl_nr_running || -	     (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) +	if (this_rq->nr_running != this_rq->cfs.h_nr_running)  		pulled_task = -1;  	if (pulled_task) { @@ -7011,16 +7115,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)  			break;  		} -		interval = sd->balance_interval; -		if (idle != CPU_IDLE) -			interval *= sd->busy_factor; - -		/* scale ms to jiffies */ -		interval = msecs_to_jiffies(interval); -		interval = clamp(interval, 1UL, max_load_balance_interval); +		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);  		need_serialize = sd->flags & SD_SERIALIZE; -  		if (need_serialize) {  			if (!spin_trylock(&balancing))  				goto out; @@ -7036,6 +7133,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)  				idle = idle_cpu(cpu) ? 
CPU_IDLE : CPU_NOT_IDLE;  			}  			sd->last_balance = jiffies; +			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);  		}  		if (need_serialize)  			spin_unlock(&balancing); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8f4390a079c7..25b9423abce9 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -67,24 +67,21 @@ void __weak arch_cpu_idle(void)   * cpuidle_idle_call - the main idle function   *   * NOTE: no locks or semaphores should be used here - * return non-zero on failure   */ -static int cpuidle_idle_call(void) +static void cpuidle_idle_call(void)  {  	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);  	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); -	int next_state, entered_state, ret; +	int next_state, entered_state;  	bool broadcast;  	/*  	 * Check if the idle task must be rescheduled. If it is the -	 * case, exit the function after re-enabling the local irq and -	 * set again the polling flag +	 * case, exit the function after re-enabling the local irq.  	 */ -	if (current_clr_polling_and_test()) { +	if (need_resched()) {  		local_irq_enable(); -		__current_set_polling(); -		return 0; +		return;  	}  	/* @@ -101,96 +98,79 @@ static int cpuidle_idle_call(void)  	rcu_idle_enter();  	/* -	 * Check if the cpuidle framework is ready, otherwise fallback -	 * to the default arch specific idle method +	 * Ask the cpuidle framework to choose a convenient idle state. +	 * Fall back to the default arch idle method on errors.  	 */ -	ret = cpuidle_enabled(drv, dev); - -	if (!ret) { +	next_state = cpuidle_select(drv, dev); +	if (next_state < 0) { +use_default:  		/* -		 * Ask the governor to choose an idle state it thinks -		 * it is convenient to go to. There is *always* a -		 * convenient idle state +		 * We can't use the cpuidle framework, let's use the default +		 * idle routine.  		 */ -		next_state = cpuidle_select(drv, dev); - -		/* -		 * The idle task must be scheduled, it is pointless to -		 * go to idle, just update no idle residency and get -		 * out of this function -		 */ -		if (current_clr_polling_and_test()) { -			dev->last_residency = 0; -			entered_state = next_state; +		if (current_clr_polling_and_test())  			local_irq_enable(); -		} else { -			broadcast = !!(drv->states[next_state].flags & -				       CPUIDLE_FLAG_TIMER_STOP); - -			if (broadcast) -				/* -				 * Tell the time framework to switch -				 * to a broadcast timer because our -				 * local timer will be shutdown. If a -				 * local timer is used from another -				 * cpu as a broadcast timer, this call -				 * may fail if it is not available -				 */ -				ret = clockevents_notify( -					CLOCK_EVT_NOTIFY_BROADCAST_ENTER, -					&dev->cpu); - -			if (!ret) { -				trace_cpu_idle_rcuidle(next_state, dev->cpu); - -				/* -				 * Enter the idle state previously -				 * returned by the governor -				 * decision. 
This function will block -				 * until an interrupt occurs and will -				 * take care of re-enabling the local -				 * interrupts -				 */ -				entered_state = cpuidle_enter(drv, dev, -							      next_state); - -				trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, -						       dev->cpu); - -				if (broadcast) -					clockevents_notify( -						CLOCK_EVT_NOTIFY_BROADCAST_EXIT, -						&dev->cpu); - -				/* -				 * Give the governor an opportunity to reflect on the -				 * outcome -				 */ -				cpuidle_reflect(dev, entered_state); -			} -		} +		else +			arch_cpu_idle(); + +		goto exit_idle;  	} +  	/* -	 * We can't use the cpuidle framework, let's use the default -	 * idle routine +	 * The idle task must be scheduled, it is pointless to +	 * go to idle, just update no idle residency and get +	 * out of this function  	 */ -	if (ret) -		arch_cpu_idle(); +	if (current_clr_polling_and_test()) { +		dev->last_residency = 0; +		entered_state = next_state; +		local_irq_enable(); +		goto exit_idle; +	} + +	broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); +	/* +	 * Tell the time framework to switch to a broadcast timer +	 * because our local timer will be shutdown. If a local timer +	 * is used from another cpu as a broadcast timer, this call may +	 * fail if it is not available +	 */ +	if (broadcast && +	    clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) +		goto use_default; + +	trace_cpu_idle_rcuidle(next_state, dev->cpu); + +	/* +	 * Enter the idle state previously returned by the governor decision. +	 * This function will block until an interrupt occurs and will take +	 * care of re-enabling the local interrupts +	 */ +	entered_state = cpuidle_enter(drv, dev, next_state); + +	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); + +	if (broadcast) +		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); + +	/* +	 * Give the governor an opportunity to reflect on the outcome +	 */ +	cpuidle_reflect(dev, entered_state); + +exit_idle:  	__current_set_polling();  	/* -	 * It is up to the idle functions to enable back the local -	 * interrupt +	 * It is up to the idle functions to reenable local interrupts  	 */  	if (WARN_ON_ONCE(irqs_disabled()))  		local_irq_enable();  	rcu_idle_exit();  	start_critical_timings(); - -	return 0;  }  /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index bd2267ad404f..0ebfd7a29472 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)  	rt_rq->overloaded = 0;  	plist_head_init(&rt_rq->pushable_tasks);  #endif +	/* We start is dequeued state, because no RT tasks are queued */ +	rt_rq->rt_queued = 0;  	rt_rq->rt_time = 0;  	rt_rq->rt_throttled = 0; @@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)  	return rt_se->rt_rq;  } +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) +{ +	struct rt_rq *rt_rq = rt_se->rt_rq; + +	return rt_rq->rq; +} +  void free_rt_sched_group(struct task_group *tg)  {  	int i; @@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)  	return container_of(rt_rq, struct rq, rt);  } -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)  {  	struct task_struct *p = rt_task_of(rt_se); -	struct rq *rq = task_rq(p); + +	return task_rq(p); +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ +	struct rq *rq = rq_of_rt_se(rt_se);  	return &rq->rt;  } 
@@ -391,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq)  }  #endif /* CONFIG_SMP */ +static void enqueue_top_rt_rq(struct rt_rq *rt_rq); +static void dequeue_top_rt_rq(struct rt_rq *rt_rq); +  static inline int on_rt_rq(struct sched_rt_entity *rt_se)  {  	return !list_empty(&rt_se->run_list); @@ -452,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)  	rt_se = rt_rq->tg->rt_se[cpu];  	if (rt_rq->rt_nr_running) { -		if (rt_se && !on_rt_rq(rt_se)) +		if (!rt_se) +			enqueue_top_rt_rq(rt_rq); +		else if (!on_rt_rq(rt_se))  			enqueue_rt_entity(rt_se, false); +  		if (rt_rq->highest_prio.curr < curr->prio)  			resched_task(curr);  	} @@ -466,10 +487,17 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)  	rt_se = rt_rq->tg->rt_se[cpu]; -	if (rt_se && on_rt_rq(rt_se)) +	if (!rt_se) +		dequeue_top_rt_rq(rt_rq); +	else if (on_rt_rq(rt_se))  		dequeue_rt_entity(rt_se);  } +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ +	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} +  static int rt_se_boosted(struct sched_rt_entity *rt_se)  {  	struct rt_rq *rt_rq = group_rt_rq(rt_se); @@ -532,12 +560,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)  static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)  { -	if (rt_rq->rt_nr_running) -		resched_task(rq_of_rt_rq(rt_rq)->curr); +	struct rq *rq = rq_of_rt_rq(rt_rq); + +	if (!rt_rq->rt_nr_running) +		return; + +	enqueue_top_rt_rq(rt_rq); +	resched_task(rq->curr);  }  static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)  { +	dequeue_top_rt_rq(rt_rq); +} + +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ +	return rt_rq->rt_throttled;  }  static inline const struct cpumask *sched_rt_period_mask(void) @@ -922,6 +961,38 @@ static void update_curr_rt(struct rq *rq)  	}  } +static void +dequeue_top_rt_rq(struct rt_rq *rt_rq) +{ +	struct rq *rq = rq_of_rt_rq(rt_rq); + +	BUG_ON(&rq->rt != rt_rq); + +	if (!rt_rq->rt_queued) +		return; + +	BUG_ON(!rq->nr_running); + +	sub_nr_running(rq, rt_rq->rt_nr_running); +	rt_rq->rt_queued = 0; +} + +static void +enqueue_top_rt_rq(struct rt_rq *rt_rq) +{ +	struct rq *rq = rq_of_rt_rq(rt_rq); + +	BUG_ON(&rq->rt != rt_rq); + +	if (rt_rq->rt_queued) +		return; +	if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) +		return; + +	add_nr_running(rq, rt_rq->rt_nr_running); +	rt_rq->rt_queued = 1; +} +  #if defined CONFIG_SMP  static void @@ -1045,12 +1116,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}  #endif /* CONFIG_RT_GROUP_SCHED */  static inline +unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) +{ +	struct rt_rq *group_rq = group_rt_rq(rt_se); + +	if (group_rq) +		return group_rq->rt_nr_running; +	else +		return 1; +} + +static inline  void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)  {  	int prio = rt_se_prio(rt_se);  	WARN_ON(!rt_prio(prio)); -	rt_rq->rt_nr_running++; +	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);  	inc_rt_prio(rt_rq, prio);  	inc_rt_migration(rt_se, rt_rq); @@ -1062,7 +1144,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)  {  	WARN_ON(!rt_prio(rt_se_prio(rt_se)));  	WARN_ON(!rt_rq->rt_nr_running); -	rt_rq->rt_nr_running--; +	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);  	dec_rt_prio(rt_rq, rt_se_prio(rt_se));  	dec_rt_migration(rt_se, rt_rq); @@ -1119,6 +1201,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)  		back = rt_se;  	} +	dequeue_top_rt_rq(rt_rq_of_se(back)); +  	for (rt_se = back; rt_se; rt_se 
= rt_se->back) {  		if (on_rt_rq(rt_se))  			__dequeue_rt_entity(rt_se); @@ -1127,13 +1211,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)  static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)  { +	struct rq *rq = rq_of_rt_se(rt_se); +  	dequeue_rt_stack(rt_se);  	for_each_sched_rt_entity(rt_se)  		__enqueue_rt_entity(rt_se, head); +	enqueue_top_rt_rq(&rq->rt);  }  static void dequeue_rt_entity(struct sched_rt_entity *rt_se)  { +	struct rq *rq = rq_of_rt_se(rt_se); +  	dequeue_rt_stack(rt_se);  	for_each_sched_rt_entity(rt_se) { @@ -1142,6 +1231,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)  		if (rt_rq && rt_rq->rt_nr_running)  			__enqueue_rt_entity(rt_se, false);  	} +	enqueue_top_rt_rq(&rq->rt);  }  /* @@ -1159,8 +1249,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)  	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)  		enqueue_pushable_task(rq, p); - -	inc_nr_running(rq);  }  static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) @@ -1171,8 +1259,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)  	dequeue_rt_entity(rt_se);  	dequeue_pushable_task(rq, p); - -	dec_nr_running(rq);  }  /* @@ -1377,10 +1463,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)  	if (prev->sched_class == &rt_sched_class)  		update_curr_rt(rq); -	if (!rt_rq->rt_nr_running) -		return NULL; - -	if (rt_rq_throttled(rt_rq)) +	if (!rt_rq->rt_queued)  		return NULL;  	put_prev_task(rq, prev); @@ -1892,9 +1975,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)  	 */  	if (p->on_rq && rq->curr != p) {  #ifdef CONFIG_SMP -		if (rq->rt.overloaded && push_rt_task(rq) && +		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&  		    /* Don't resched if we changed runqueues */ -		    rq != task_rq(p)) +		    push_rt_task(rq) && rq != task_rq(p))  			check_resched = 0;  #endif /* CONFIG_SMP */  		if (check_resched && p->prio < rq->curr->prio) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 456e492a3dca..600e2291a75c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -409,6 +409,8 @@ struct rt_rq {  	int overloaded;  	struct plist_head pushable_tasks;  #endif +	int rt_queued; +  	int rt_throttled;  	u64 rt_time;  	u64 rt_runtime; @@ -423,18 +425,6 @@ struct rt_rq {  #endif  }; -#ifdef CONFIG_RT_GROUP_SCHED -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ -	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; -} -#else -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ -	return rt_rq->rt_throttled; -} -#endif -  /* Deadline class' related fields in a runqueue */  struct dl_rq {  	/* runqueue is an rbtree, ordered by deadline */ @@ -1216,12 +1206,14 @@ extern void update_idle_cpu_load(struct rq *this_rq);  extern void init_task_runnable_average(struct task_struct *p); -static inline void inc_nr_running(struct rq *rq) +static inline void add_nr_running(struct rq *rq, unsigned count)  { -	rq->nr_running++; +	unsigned prev_nr = rq->nr_running; + +	rq->nr_running = prev_nr + count;  #ifdef CONFIG_NO_HZ_FULL -	if (rq->nr_running == 2) { +	if (prev_nr < 2 && rq->nr_running >= 2) {  		if (tick_nohz_full_cpu(rq->cpu)) {  			/* Order rq->nr_running write against the IPI */  			smp_wmb(); @@ -1231,9 +1223,9 @@ static inline void inc_nr_running(struct rq *rq)  #endif  } -static inline void dec_nr_running(struct rq *rq) +static inline void sub_nr_running(struct rq *rq, unsigned count)  { -	rq->nr_running--; +	rq->nr_running -= 
count;  }  static inline void rq_last_tick_reset(struct rq *rq) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index d6ce65dde541..bfe0edadbfbb 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)  static void  enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)  { -	inc_nr_running(rq); +	add_nr_running(rq, 1);  }  static void  dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)  { -	dec_nr_running(rq); +	sub_nr_running(rq, 1);  }  static void yield_task_stop(struct rq *rq) diff --git a/kernel/sys.c b/kernel/sys.c index fba0f29401ea..66a751ebf9d9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)  			else  				p = current;  			if (p) { -				niceval = 20 - task_nice(p); +				niceval = nice_to_rlimit(task_nice(p));  				if (niceval > retval)  					retval = niceval;  			} @@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)  			else  				pgrp = task_pgrp(current);  			do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { -				niceval = 20 - task_nice(p); +				niceval = nice_to_rlimit(task_nice(p));  				if (niceval > retval)  					retval = niceval;  			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p); @@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)  			do_each_thread(g, p) {  				if (uid_eq(task_uid(p), uid)) { -					niceval = 20 - task_nice(p); +					niceval = nice_to_rlimit(task_nice(p));  					if (niceval > retval)  						retval = niceval;  				} diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8edc87185427..a4bab46cd38e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -100,10 +100,10 @@ enum {  	/*  	 * Rescue workers are used only on emergencies and shared by -	 * all cpus.  Give -20. +	 * all cpus.  Give MIN_NICE.  	 */ -	RESCUER_NICE_LEVEL	= -20, -	HIGHPRI_NICE_LEVEL	= -20, +	RESCUER_NICE_LEVEL	= MIN_NICE, +	HIGHPRI_NICE_LEVEL	= MIN_NICE,  	WQ_NAME_LEN		= 24,  }; | 
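For reference, a minimal userspace C11 sketch of the cmpxchg-based fetch_or()/set_nr_and_not_polling() pattern that the kernel/sched/core.c hunk introduces. This is not kernel code: the flag names, the use of <stdatomic.h>, and the main() driver are illustrative assumptions; it only demonstrates how atomically OR-ing in a "need resched" bit while reading back the old flags lets the caller decide whether an IPI is needed.

/* Sketch only -- assumes C11 atomics; NEED_RESCHED/POLLING stand in for
 * _TIF_NEED_RESCHED/_TIF_POLLING_NRFLAG. */
#include <stdatomic.h>
#include <stdio.h>

#define NEED_RESCHED (1u << 0)
#define POLLING      (1u << 1)

/* Like the kernel's fetch_or(): atomically set 'val' in *ptr and return
 * the value that was there before. */
static unsigned int fetch_or(atomic_uint *ptr, unsigned int val)
{
	unsigned int old = atomic_load(ptr);

	/* On failure, atomic_compare_exchange_weak reloads 'old' with the
	 * current value, so we simply retry until old | val is installed. */
	while (!atomic_compare_exchange_weak(ptr, &old, old | val))
		;
	return old;
}

int main(void)
{
	atomic_uint flags = POLLING;
	int need_ipi;

	/* Mirrors set_nr_and_not_polling(): set NEED_RESCHED and report
	 * whether the target was *not* polling, i.e. whether the caller
	 * would still have to send a reschedule IPI. */
	need_ipi = !(fetch_or(&flags, NEED_RESCHED) & POLLING);

	printf("need_ipi=%d flags=%#x\n", need_ipi,
	       (unsigned int)atomic_load(&flags));
	return 0;
}

Because the old flag word comes back from the same atomic operation that sets NEED_RESCHED, there is no window between "set the bit" and "test polling", which is exactly the race the removed smp_mb()/tsk_is_polling() sequence had to guard against.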
