Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--	kernel/sched/core.c	324
1 file changed, 176 insertions, 148 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a62a7dec3986..913c6d6cc2c1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -522,6 +522,39 @@ static inline void init_hrtick(void)
 #endif	/* CONFIG_SCHED_HRTICK */
 
 /*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, val)						\
+({	typeof(*(ptr)) __old, __val = *(ptr);				\
+ 	for (;;) {							\
+ 		__old = cmpxchg((ptr), __val, __val | (val));		\
+ 		if (__old == __val)					\
+ 			break;						\
+ 		__val = __old;						\
+ 	}								\
+ 	__old;								\
+})
+
+#ifdef TIF_POLLING_NRFLAG
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+	struct thread_info *ti = task_thread_info(p);
+	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+	set_tsk_need_resched(p);
+	return true;
+}
+#endif
+
+/*
  * resched_task - mark a task 'to be rescheduled now'.
  *
  * On UP this means the setting of the need_resched flag, on SMP it
@@ -537,17 +570,15 @@ void resched_task(struct task_struct *p)
 	if (test_tsk_need_resched(p))
 		return;
 
-	set_tsk_need_resched(p);
-
 	cpu = task_cpu(p);
+
 	if (cpu == smp_processor_id()) {
+		set_tsk_need_resched(p);
 		set_preempt_need_resched();
 		return;
 	}
 
-	/* NEED_RESCHED must be visible before we test polling */
-	smp_mb();
-	if (!tsk_is_polling(p))
+	if (set_nr_and_not_polling(p))
 		smp_send_reschedule(cpu);
 }
 
@@ -3018,7 +3049,7 @@ EXPORT_SYMBOL(set_user_nice);
 int can_nice(const struct task_struct *p, const int nice)
 {
 	/* convert nice value [19,-20] to rlimit style value [1,40] */
-	int nice_rlim = 20 - nice;
+	int nice_rlim = nice_to_rlimit(nice);
 
 	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
 		capable(CAP_SYS_NICE));
@@ -3042,17 +3073,10 @@ SYSCALL_DEFINE1(nice, int, increment)
 	 * We don't have to worry. Conceptually one call occurs first
 	 * and we have a single winner.
 	 */
-	if (increment < -40)
-		increment = -40;
-	if (increment > 40)
-		increment = 40;
-
+	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
 	nice = task_nice(current) + increment;
-	if (nice < MIN_NICE)
-		nice = MIN_NICE;
-	if (nice > MAX_NICE)
-		nice = MAX_NICE;
+	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
 
 	if (increment < 0 && !can_nice(current, nice))
 		return -EPERM;
@@ -3642,13 +3666,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
 	 */
 	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
 
-out:
-	return ret;
+	return 0;
 
 err_size:
 	put_user(sizeof(*attr), &uattr->size);
-	ret = -E2BIG;
-	goto out;
+	return -E2BIG;
 }
 
 /**
@@ -3808,7 +3830,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
 		for (; addr < end; addr++) {
 			if (*addr)
-				goto err_size;
+				return -EFBIG;
 		}
 
 		attr->size = usize;
@@ -3818,12 +3840,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
 	if (ret)
 		return -EFAULT;
 
-out:
-	return ret;
-
-err_size:
-	ret = -E2BIG;
-	goto out;
+	return 0;
 }
 
 /**
@@ -5093,10 +5110,20 @@ static struct notifier_block migration_notifier = {
 	.priority = CPU_PRI_MIGRATION,
 };
 
+static void __cpuinit set_cpu_rq_start_time(void)
+{
+	int cpu = smp_processor_id();
+	struct rq *rq = cpu_rq(cpu);
+	rq->age_stamp = sched_clock_cpu(cpu);
+}
+
 static int sched_cpu_active(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_STARTING:
+		set_cpu_rq_start_time();
+		return NOTIFY_OK;
 	case CPU_DOWN_FAILED:
 		set_cpu_active((long)hcpu, true);
 		return NOTIFY_OK;
@@ -5305,7 +5332,8 @@ static int sd_degenerate(struct sched_domain *sd)
 			 SD_BALANCE_FORK |
 			 SD_BALANCE_EXEC |
 			 SD_SHARE_CPUPOWER |
-			 SD_SHARE_PKG_RESOURCES)) {
+			 SD_SHARE_PKG_RESOURCES |
+			 SD_SHARE_POWERDOMAIN)) {
 		if (sd->groups != sd->groups->next)
 			return 0;
 	}
@@ -5336,7 +5364,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 				SD_BALANCE_EXEC |
 				SD_SHARE_CPUPOWER |
 				SD_SHARE_PKG_RESOURCES |
-				SD_PREFER_SIBLING);
+				SD_PREFER_SIBLING |
+				SD_SHARE_POWERDOMAIN);
 		if (nr_node_ids == 1)
 			pflags &= ~SD_SERIALIZE;
 	}
@@ -5610,17 +5639,6 @@ static int __init isolated_cpu_setup(char *str)
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-static const struct cpumask *cpu_cpu_mask(int cpu)
-{
-	return cpumask_of_node(cpu_to_node(cpu));
-}
-
-struct sd_data {
-	struct sched_domain **__percpu sd;
-	struct sched_group **__percpu sg;
-	struct sched_group_power **__percpu sgp;
-};
-
 struct s_data {
 	struct sched_domain ** __percpu sd;
 	struct root_domain	*rd;
@@ -5633,21 +5651,6 @@ enum s_alloc {
 	sa_none,
 };
 
-struct sched_domain_topology_level;
-
-typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-
-#define SDTL_OVERLAP	0x01
-
-struct sched_domain_topology_level {
-	sched_domain_init_f init;
-	sched_domain_mask_f mask;
-	int		    flags;
-	int		    numa_level;
-	struct sd_data      data;
-};
-
 /*
  * Build an iteration mask that can exclude certain CPUs from the upwards
  * domain traversal.
@@ -5815,8 +5818,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 			continue;
 
 		group = get_group(i, sdd, &sg);
-		cpumask_clear(sched_group_cpus(sg));
-		sg->sgp->power = 0;
 		cpumask_setall(sched_group_mask(sg));
 
 		for_each_cpu(j, span) {
@@ -5866,44 +5867,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 	atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
 }
 
-int __weak arch_sd_sibling_asym_packing(void)
-{
-       return 0*SD_ASYM_PACKING;
-}
-
 /*
  * Initializers for schedule domains
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  */
-#ifdef CONFIG_SCHED_DEBUG
-# define SD_INIT_NAME(sd, type)		sd->name = #type
-#else
-# define SD_INIT_NAME(sd, type)		do { } while (0)
-#endif
-
-#define SD_INIT_FUNC(type)						\
-static noinline struct sched_domain *					\
-sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\
-{									\
-	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);	\
-	*sd = SD_##type##_INIT;						\
-	SD_INIT_NAME(sd, type);						\
-	sd->private = &tl->data;					\
-	return sd;							\
-}
-
-SD_INIT_FUNC(CPU)
-#ifdef CONFIG_SCHED_SMT
- SD_INIT_FUNC(SIBLING)
-#endif
-#ifdef CONFIG_SCHED_MC
- SD_INIT_FUNC(MC)
-#endif
-#ifdef CONFIG_SCHED_BOOK
- SD_INIT_FUNC(BOOK)
-#endif
-
 static int default_relax_domain_level = -1;
 int sched_domain_level_max;
 
@@ -5991,99 +5959,154 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 		*per_cpu_ptr(sdd->sgp, cpu) = NULL;
 }
 
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *cpu_smt_mask(int cpu)
-{
-	return topology_thread_cpumask(cpu);
-}
-#endif
-
-/*
- * Topology list, bottom-up.
- */
-static struct sched_domain_topology_level default_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-	{ sd_init_SIBLING, cpu_smt_mask, },
-#endif
-#ifdef CONFIG_SCHED_MC
-	{ sd_init_MC, cpu_coregroup_mask, },
-#endif
-#ifdef CONFIG_SCHED_BOOK
-	{ sd_init_BOOK, cpu_book_mask, },
-#endif
-	{ sd_init_CPU, cpu_cpu_mask, },
-	{ NULL, },
-};
-
-static struct sched_domain_topology_level *sched_domain_topology = default_topology;
-
-#define for_each_sd_topology(tl)			\
-	for (tl = sched_domain_topology; tl->init; tl++)
-
 #ifdef CONFIG_NUMA
-
 static int sched_domains_numa_levels;
 static int *sched_domains_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
+#endif
 
-static inline int sd_local_flags(int level)
-{
-	if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
-		return 0;
-
-	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
-}
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * SD_SHARE_CPUPOWER      - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA                - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING        - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS		\
+	(SD_SHARE_CPUPOWER |		\
+	 SD_SHARE_PKG_RESOURCES |	\
+	 SD_NUMA |			\
+	 SD_ASYM_PACKING |		\
+	 SD_SHARE_POWERDOMAIN)
 
 static struct sched_domain *
-sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl, int cpu)
 {
 	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
-	int level = tl->numa_level;
-	int sd_weight = cpumask_weight(
-			sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+	int sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+	/*
+	 * Ugly hack to pass state to sd_numa_mask()...
+	 */
+	sched_domains_curr_level = tl->numa_level;
+#endif
+
+	sd_weight = cpumask_weight(tl->mask(cpu));
+
+	if (tl->sd_flags)
+		sd_flags = (*tl->sd_flags)();
+	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+			"wrong sd_flags in topology description\n"))
+		sd_flags &= ~TOPOLOGY_SD_FLAGS;
 
 	*sd = (struct sched_domain){
 		.min_interval		= sd_weight,
 		.max_interval		= 2*sd_weight,
 		.busy_factor		= 32,
 		.imbalance_pct		= 125,
-		.cache_nice_tries	= 2,
-		.busy_idx		= 3,
-		.idle_idx		= 2,
+
+		.cache_nice_tries	= 0,
+		.busy_idx		= 0,
+		.idle_idx		= 0,
 		.newidle_idx		= 0,
 		.wake_idx		= 0,
 		.forkexec_idx		= 0,
 
 		.flags			= 1*SD_LOAD_BALANCE
 					| 1*SD_BALANCE_NEWIDLE
-					| 0*SD_BALANCE_EXEC
-					| 0*SD_BALANCE_FORK
+					| 1*SD_BALANCE_EXEC
+					| 1*SD_BALANCE_FORK
 					| 0*SD_BALANCE_WAKE
-					| 0*SD_WAKE_AFFINE
+					| 1*SD_WAKE_AFFINE
 					| 0*SD_SHARE_CPUPOWER
 					| 0*SD_SHARE_PKG_RESOURCES
-					| 1*SD_SERIALIZE
+					| 0*SD_SERIALIZE
 					| 0*SD_PREFER_SIBLING
-					| 1*SD_NUMA
-					| sd_local_flags(level)
+					| 0*SD_NUMA
+					| sd_flags
 					,
+
 		.last_balance		= jiffies,
 		.balance_interval	= sd_weight,
+		.smt_gain		= 0,
 		.max_newidle_lb_cost	= 0,
 		.next_decay_max_lb_cost	= jiffies,
+#ifdef CONFIG_SCHED_DEBUG
+		.name			= tl->name,
+#endif
 	};
-	SD_INIT_NAME(sd, NUMA);
-	sd->private = &tl->data;
 
 	/*
-	 * Ugly hack to pass state to sd_numa_mask()...
+	 * Convert topological properties into behaviour.
 	 */
-	sched_domains_curr_level = tl->numa_level;
+
+	if (sd->flags & SD_SHARE_CPUPOWER) {
+		sd->imbalance_pct = 110;
+		sd->smt_gain = 1178; /* ~15% */
+
+	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+		sd->imbalance_pct = 117;
+		sd->cache_nice_tries = 1;
+		sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+	} else if (sd->flags & SD_NUMA) {
+		sd->cache_nice_tries = 2;
+		sd->busy_idx = 3;
+		sd->idle_idx = 2;
+
+		sd->flags |= SD_SERIALIZE;
+		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+			sd->flags &= ~(SD_BALANCE_EXEC |
+				       SD_BALANCE_FORK |
+				       SD_WAKE_AFFINE);
+		}
+
+#endif
+	} else {
+		sd->flags |= SD_PREFER_SIBLING;
+		sd->cache_nice_tries = 1;
+		sd->busy_idx = 2;
+		sd->idle_idx = 1;
+	}
+
+	sd->private = &tl->data;
 
 	return sd;
 }
 
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ NULL, },
+};
+
+struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+#define for_each_sd_topology(tl)			\
+	for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+	sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
 static const struct cpumask *sd_numa_mask(int cpu)
 {
 	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
@@ -6227,7 +6250,10 @@ static void sched_init_numa(void)
 		}
 	}
 
-	tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+	/* Compute default topology size */
+	for (i = 0; sched_domain_topology[i].mask; i++);
+
+	tl = kzalloc((i + level + 1) *
 			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
 	if (!tl)
 		return;
@@ -6235,18 +6261,19 @@ static void sched_init_numa(void)
 	/*
 	 * Copy the default topology bits..
 	 */
-	for (i = 0; default_topology[i].init; i++)
-		tl[i] = default_topology[i];
+	for (i = 0; sched_domain_topology[i].mask; i++)
+		tl[i] = sched_domain_topology[i];
 
 	/*
 	 * .. and append 'j' levels of NUMA goodness.
 	 */
	for (j = 0; j < level; i++, j++) {
 		tl[i] = (struct sched_domain_topology_level){
-			.init = sd_numa_init,
 			.mask = sd_numa_mask,
+			.sd_flags = cpu_numa_flags,
 			.flags = SDTL_OVERLAP,
 			.numa_level = j,
+			SD_INIT_NAME(NUMA)
 		};
 	}
@@ -6404,7 +6431,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 		struct sched_domain *child, int cpu)
 {
-	struct sched_domain *sd = tl->init(tl, cpu);
+	struct sched_domain *sd = sd_init(tl, cpu);
 	if (!sd)
 		return child;
@@ -6974,7 +7001,7 @@ void __init sched_init(void)
 	if (cpu_isolated_map == NULL)
 		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 	idle_thread_set_boot_cpu();
+	set_cpu_rq_start_time();
 #endif
 	init_sched_fair_class();
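The topology rework above drops the per-type SD_INIT_FUNC() boilerplate: a topology level is now just a { mask, sd_flags, SD_INIT_NAME() } entry, sd_init() converts the declared topology flags into scheduler behaviour, and an architecture can replace the default table through the new set_sched_topology() hook. Below is a minimal sketch of how an architecture might describe one extra level and register its table; the table and function names and the cpu_book_mask() helper are illustrative assumptions, not part of this patch.

/*
 * Illustrative only: an arch-specific table modelled on default_topology[]
 * above. cpu_book_mask() stands in for whatever per-CPU mask helper the
 * architecture provides for its extra level.
 */
static struct sched_domain_topology_level my_arch_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_book_mask, SD_INIT_NAME(BOOK) },	/* extra, arch-specific level */
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

/* Called from arch setup code before the scheduler domains are built. */
void __init my_arch_register_topology(void)
{
	set_sched_topology(my_arch_topology);
}

Because sd_init() rejects anything outside TOPOLOGY_SD_FLAGS with a WARN_ONCE(), such a table only states what the hardware looks like; the tuning (imbalance_pct, cache_nice_tries, busy_idx, ...) stays centralised in sd_init().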

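The fetch_or()/set_nr_and_not_polling() change at the top of the diff hinges on getting the old flag word back: TIF_NEED_RESCHED is set and, in the same atomic operation, the caller learns whether TIF_POLLING_NRFLAG was already set; if it was, the polling (idle) CPU will notice the resched flag on its own and the IPI, together with the old smp_mb()/tsk_is_polling() pair, can be skipped. The following is a standalone userspace sketch of that cmpxchg loop, using GCC's __sync_val_compare_and_swap() in place of the kernel's cmpxchg(); it illustrates the pattern only and is not kernel code.

#include <stdio.h>

/* Atomically OR 'val' into '*ptr' and return the value *ptr held before. */
static unsigned long fetch_or_ulong(unsigned long *ptr, unsigned long val)
{
	unsigned long old, cur = *ptr;

	for (;;) {
		old = __sync_val_compare_and_swap(ptr, cur, cur | val);
		if (old == cur)		/* nobody changed *ptr under us */
			break;
		cur = old;		/* retry with the value we actually saw */
	}
	return old;
}

int main(void)
{
	unsigned long flags = 0x2;	/* pretend bit 1 is the "polling" flag */
	unsigned long prev;

	/* Set bit 0 ("need resched") and see whether bit 1 was already set. */
	prev = fetch_or_ulong(&flags, 0x1);
	printf("prev=%#lx now=%#lx polling was set: %d\n",
	       prev, flags, !!(prev & 0x2));
	return 0;
}

A newer compiler builtin such as __atomic_fetch_or() would do this in one call; the explicit loop mirrors the kernel macro, which has to work for any integer type and whatever cmpxchg() the architecture implements.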