Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c  245
1 file changed, 104 insertions, 141 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78d8facba456..625bc9897f62 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7,7 +7,6 @@
  */
 #include "sched.h"
 
-#include <linux/kthread.h>
 #include <linux/nospec.h>
 
 #include <linux/kcov.h>
@@ -18,6 +17,8 @@
 #include "../workqueue_internal.h"
 #include "../smpboot.h"
 
+#include "pelt.h"
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
@@ -46,14 +47,6 @@ const_debug unsigned int sysctl_sched_features =
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * period over which we average the RT time consumption, measured
- * in ms.
- *
- * default: 1s
- */
-const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
-
-/*
  * period over which we measure -rt task CPU usage in us.
  * default: 1s
  */
@@ -184,9 +177,9 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 
 	rq->clock_task += delta;
 
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+#ifdef HAVE_SCHED_AVG_IRQ
 	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
-		sched_rt_avg_update(rq, irq_delta + steal);
+		update_irq_load_avg(rq, irq_delta + steal);
 #endif
 }
 
@@ -413,8 +406,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
 	 * its already queued (either by us or someone else) and will get the
 	 * wakeup due to that.
 	 *
-	 * This cmpxchg() implies a full barrier, which pairs with the write
-	 * barrier implied by the wakeup in wake_up_q().
+	 * This cmpxchg() executes a full barrier, which pairs with the full
+	 * barrier executed by the wakeup in wake_up_q().
 	 */
 	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
 		return;
@@ -442,8 +435,8 @@ void wake_up_q(struct wake_q_head *head)
 		task->wake_q.next = NULL;
 
 		/*
-		 * wake_up_process() implies a wmb() to pair with the queueing
-		 * in wake_q_add() so as not to miss wakeups.
+		 * wake_up_process() executes a full barrier, which pairs with
+		 * the queueing in wake_q_add() so as not to miss wakeups.
 		 */
 		wake_up_process(task);
 		put_task_struct(task);
@@ -650,23 +643,6 @@ bool sched_can_stop_tick(struct rq *rq)
 	return true;
 }
 #endif /* CONFIG_NO_HZ_FULL */
-
-void sched_avg_update(struct rq *rq)
-{
-	s64 period = sched_avg_period();
-
-	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
-		/*
-		 * Inline assembly required to prevent the compiler
-		 * optimising this loop into a divmod call.
-		 * See __iter_div_u64_rem() for another example of this.
-		 */
-		asm("" : "+rm" (rq->age_stamp));
-		rq->age_stamp += period;
-		rq->rt_avg /= 2;
-	}
-}
-
 #endif /* CONFIG_SMP */
 
 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -1200,6 +1176,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	__set_task_cpu(p, new_cpu);
 }
 
+#ifdef CONFIG_NUMA_BALANCING
 static void __migrate_swap_task(struct task_struct *p, int cpu)
 {
 	if (task_on_rq_queued(p)) {
@@ -1281,16 +1258,17 @@ unlock:
 /*
  * Cross migrate two tasks
  */
-int migrate_swap(struct task_struct *cur, struct task_struct *p)
+int migrate_swap(struct task_struct *cur, struct task_struct *p,
+		int target_cpu, int curr_cpu)
 {
 	struct migration_swap_arg arg;
 	int ret = -EINVAL;
 
 	arg = (struct migration_swap_arg){
 		.src_task = cur,
-		.src_cpu = task_cpu(cur),
+		.src_cpu = curr_cpu,
 		.dst_task = p,
-		.dst_cpu = task_cpu(p),
+		.dst_cpu = target_cpu,
 	};
 
 	if (arg.src_cpu == arg.dst_cpu)
@@ -1315,6 +1293,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
 out:
 	return ret;
 }
+#endif /* CONFIG_NUMA_BALANCING */
 
 /*
  * wait_task_inactive - wait for a thread to unschedule.
@@ -1880,8 +1859,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  *     rq(c1)->lock (if not at the same time, then in that order).
  *  C) LOCK of the rq(c1)->lock scheduling in task
  *
- * Transitivity guarantees that B happens after A and C after B.
- * Note: we only require RCpc transitivity.
+ * Release/acquire chaining guarantees that B happens after A and C after B.
  * Note: the CPU doing B need not be c0 or c1
 *
 * Example:
@@ -1943,16 +1921,9 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 *   UNLOCK rq(0)->lock
 *
 *
- * However; for wakeups there is a second guarantee we must provide, namely we
- * must observe the state that lead to our wakeup. That is, not only must our
- * task observe its own prior state, it must also observe the stores prior to
- * its wakeup.
- *
- * This means that any means of doing remote wakeups must order the CPU doing
- * the wakeup against the CPU the task is going to end up running on. This,
- * however, is already required for the regular Program-Order guarantee above,
- * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire).
-
+ * However, for wakeups there is a second guarantee we must provide, namely we
+ * must ensure that CONDITION=1 done by the caller can not be reordered with
+ * accesses to the task state; see try_to_wake_up() and set_current_state().
 */

/**
@@ -1968,6 +1939,9 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 * Atomic against schedule() which would dequeue a task, also see
 * set_current_state().
 *
+ * This function executes a full memory barrier before accessing the task
+ * state; see set_current_state().
+ *
 * Return: %true if @p->state changes (an actual wakeup was done),
 *	   %false otherwise.
 */
@@ -1999,21 +1973,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
	 * be possible to, falsely, observe p->on_rq == 0 and get stuck
	 * in smp_cond_load_acquire() below.
	 *
-	 * sched_ttwu_pending()                 try_to_wake_up()
-	 *   [S] p->on_rq = 1;                  [L] P->state
-	 *       UNLOCK rq->lock  -----.
-	 *                              \
-	 *				 +---   RMB
-	 * schedule()                   /
-	 *       LOCK rq->lock    -----'
-	 *       UNLOCK rq->lock
+	 * sched_ttwu_pending()			try_to_wake_up()
+	 *   STORE p->on_rq = 1			  LOAD p->state
+	 *   UNLOCK rq->lock
+	 *
+	 * __schedule() (switch to task 'p')
+	 *   LOCK rq->lock			  smp_rmb();
+	 *   smp_mb__after_spinlock();
+	 *   UNLOCK rq->lock
	 *
	 * [task p]
-	 *   [S] p->state = UNINTERRUPTIBLE     [L] p->on_rq
+	 *   STORE p->state = UNINTERRUPTIBLE	  LOAD p->on_rq
	 *
-	 * Pairs with the UNLOCK+LOCK on rq->lock from the
-	 * last wakeup of our task and the schedule that got our task
-	 * current.
+	 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+	 * __schedule().  See the comment for smp_mb__after_spinlock().
	 */
	smp_rmb();
	if (p->on_rq && ttwu_remote(p, wake_flags))
@@ -2027,15 +2000,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
	 * One must be running (->on_cpu == 1) in order to remove oneself
	 * from the runqueue.
	 *
-	 *  [S] ->on_cpu = 1;	[L] ->on_rq
-	 *      UNLOCK rq->lock
-	 *			RMB
-	 *      LOCK   rq->lock
-	 *  [S] ->on_rq = 0;    [L] ->on_cpu
+	 * __schedule() (switch to task 'p')	try_to_wake_up()
+	 *   STORE p->on_cpu = 1		  LOAD p->on_rq
+	 *   UNLOCK rq->lock
+	 *
+	 * __schedule() (put 'p' to sleep)
+	 *   LOCK rq->lock			  smp_rmb();
+	 *   smp_mb__after_spinlock();
+	 *   STORE p->on_rq = 0			  LOAD p->on_cpu
	 *
-	 * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
-	 * from the consecutive calls to schedule(); the first switching to our
-	 * task, the second putting it to sleep.
+	 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+	 * __schedule().  See the comment for smp_mb__after_spinlock().
	 */
	smp_rmb();

@@ -2141,8 +2116,7 @@ out:
 *
 * Return: 1 if the process was woken up, 0 if it was already running.
 *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
+ * This function executes a full memory barrier before accessing the task state.
 */
int wake_up_process(struct task_struct *p)
{
@@ -2318,7 +2292,6 @@ static inline void init_schedstats(void) {}
 int sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
	unsigned long flags;
-	int cpu = get_cpu();

	__sched_fork(clone_flags, p);
	/*
@@ -2354,14 +2327,12 @@
		p->sched_reset_on_fork = 0;
	}

-	if (dl_prio(p->prio)) {
-		put_cpu();
+	if (dl_prio(p->prio))
		return -EAGAIN;
-	} else if (rt_prio(p->prio)) {
+	else if (rt_prio(p->prio))
		p->sched_class = &rt_sched_class;
-	} else {
+	else
		p->sched_class = &fair_sched_class;
-	}

	init_entity_runnable_average(&p->se);

@@ -2377,7 +2348,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
	 * We're setting the CPU for the first time, we don't migrate,
	 * so use __set_task_cpu().
	 */
-	__set_task_cpu(p, cpu);
+	__set_task_cpu(p, smp_processor_id());
	if (p->sched_class->task_fork)
		p->sched_class->task_fork(p);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -2394,8 +2365,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
	plist_node_init(&p->pushable_tasks, MAX_PRIO);
	RB_CLEAR_NODE(&p->pushable_dl_tasks);
 #endif
-
-	put_cpu();
	return 0;
 }

@@ -2724,28 +2693,20 @@ static struct rq *finish_task_switch(struct task_struct *prev)
		membarrier_mm_sync_core_before_usermode(mm);
		mmdrop(mm);
	}
-	if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) {
-		switch (prev_state) {
-		case TASK_DEAD:
-			if (prev->sched_class->task_dead)
-				prev->sched_class->task_dead(prev);
+	if (unlikely(prev_state == TASK_DEAD)) {
+		if (prev->sched_class->task_dead)
+			prev->sched_class->task_dead(prev);

-			/*
-			 * Remove function-return probe instances associated with this
-			 * task and put them back on the free list.
-			 */
-			kprobe_flush_task(prev);
-
-			/* Task is done with its stack. */
-			put_task_stack(prev);
+		/*
+		 * Remove function-return probe instances associated with this
+		 * task and put them back on the free list.
+		 */
+		kprobe_flush_task(prev);

-			put_task_struct(prev);
-			break;
+		/* Task is done with its stack. */
+		put_task_stack(prev);

-		case TASK_PARKED:
-			kthread_park_complete(prev);
-			break;
-		}
+		put_task_struct(prev);
	}

	tick_nohz_task_switch();
@@ -2813,6 +2774,8 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)

	if (current->set_child_tid)
		put_user(task_pid_vnr(current), current->set_child_tid);
+
+	calculate_sigpending();
 }

 /*
@@ -3113,7 +3076,9 @@ static void sched_tick_remote(struct work_struct *work)
	struct tick_work *twork = container_of(dwork, struct tick_work, work);
	int cpu = twork->cpu;
	struct rq *rq = cpu_rq(cpu);
+	struct task_struct *curr;
	struct rq_flags rf;
+	u64 delta;

	/*
	 * Handle the tick only if it appears the remote CPU is running in full
@@ -3122,25 +3087,29 @@
	 * statistics and checks timeslices in a time-independent way, regardless
	 * of when exactly it is running.
	 */
-	if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
-		struct task_struct *curr;
-		u64 delta;
+	if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
+		goto out_requeue;

-		rq_lock_irq(rq, &rf);
-		update_rq_clock(rq);
-		curr = rq->curr;
-		delta = rq_clock_task(rq) - curr->se.exec_start;
+	rq_lock_irq(rq, &rf);
+	curr = rq->curr;
+	if (is_idle_task(curr))
+		goto out_unlock;

-		/*
-		 * Make sure the next tick runs within a reasonable
-		 * amount of time.
-		 */
-		WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
-		curr->sched_class->task_tick(rq, curr, 0);
-		rq_unlock_irq(rq, &rf);
-	}
+	update_rq_clock(rq);
+	delta = rq_clock_task(rq) - curr->se.exec_start;

	/*
+	 * Make sure the next tick runs within a reasonable
+	 * amount of time.
+	 */
+	WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+	curr->sched_class->task_tick(rq, curr, 0);
+
+out_unlock:
+	rq_unlock_irq(rq, &rf);
+
+out_requeue:
+	/*
	 * Run the remote tick once per second (1Hz). This arbitrary
	 * frequency is large enough to avoid overload but short enough
	 * to keep scheduler internal stats reasonably up to date.
@@ -3192,7 +3161,7 @@ static inline void sched_tick_stop(int cpu) { }
 #endif

 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
-				defined(CONFIG_PREEMPT_TRACER))
+				defined(CONFIG_TRACE_PREEMPT_TOGGLE))
 /*
  * If the value passed in is equal to the current preempt count
  * then we just disabled preemption. Start timing the latency.
@@ -5717,13 +5686,6 @@ void set_rq_offline(struct rq *rq)
	}
 }

-static void set_cpu_rq_start_time(unsigned int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-
-	rq->age_stamp = sched_clock_cpu(cpu);
-}
-
 /*
  * used to mark begin/end of suspend/resume:
  */
@@ -5777,6 +5739,18 @@ int sched_cpu_activate(unsigned int cpu)
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;

+#ifdef CONFIG_SCHED_SMT
+	/*
+	 * The sched_smt_present static key needs to be evaluated on every
+	 * hotplug event because at boot time SMT might be disabled when
+	 * the number of booted CPUs is limited.
+	 *
+	 * If then later a sibling gets hotplugged, then the key would stay
+	 * off and SMT scheduling would never be functional.
+	 */
+	if (cpumask_weight(cpu_smt_mask(cpu)) > 1)
+		static_branch_enable_cpuslocked(&sched_smt_present);
+#endif
	set_cpu_active(cpu, true);

	if (sched_smp_initialized) {
@@ -5841,7 +5815,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)

 int sched_cpu_starting(unsigned int cpu)
 {
-	set_cpu_rq_start_time(cpu);
	sched_rq_cpu_starting(cpu);
	sched_tick_start(cpu);
	return 0;
@@ -5874,22 +5847,6 @@ int sched_cpu_dying(unsigned int cpu)
 }
 #endif

-#ifdef CONFIG_SCHED_SMT
-DEFINE_STATIC_KEY_FALSE(sched_smt_present);
-
-static void sched_init_smt(void)
-{
-	/*
-	 * We've enumerated all CPUs and will assume that if any CPU
-	 * has SMT siblings, CPU0 will too.
-	 */
-	if (cpumask_weight(cpu_smt_mask(0)) > 1)
-		static_branch_enable(&sched_smt_present);
-}
-#else
-static inline void sched_init_smt(void) { }
-#endif
-
 void __init sched_init_smp(void)
 {
	sched_init_numa();
@@ -5911,8 +5868,6 @@ void __init sched_init_smp(void)
	init_sched_rt_class();
	init_sched_dl_class();

-	sched_init_smt();
-
	sched_smp_initialized = true;
 }

@@ -5957,7 +5912,6 @@ void __init sched_init(void)
	int i, j;
	unsigned long alloc_size = 0, ptr;

-	sched_clock_init();
	wait_bit_init();

 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6109,7 +6063,6 @@ void __init sched_init(void)

 #ifdef CONFIG_SMP
	idle_thread_set_boot_cpu();
-	set_cpu_rq_start_time(smp_processor_id());
 #endif
	init_sched_fair_class();

@@ -6788,6 +6741,16 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);

+	if (schedstat_enabled() && tg != &root_task_group) {
+		u64 ws = 0;
+		int i;
+
+		for_each_possible_cpu(i)
+			ws += schedstat_val(tg->se[i]->statistics.wait_sum);
+
+		seq_printf(sf, "wait_sum %llu\n", ws);
+	}
+
	return 0;
 }
 #endif /* CONFIG_CFS_BANDWIDTH */
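The comment rework above in try_to_wake_up(), wake_up_process() and around ttwu_queue() all describes one contract: the waker's CONDITION=1 store may not be reordered with its access to the sleeper's task state, and the sleeper's state store may not be reordered with its load of CONDITION. The sketch below is illustrative only and not part of the patch; wait_for_work(), signal_work() and work_ready are made-up names, while set_current_state(), schedule() and wake_up_process() are the kernel APIs the comments refer to.

/*
 * Illustrative only: the canonical sleep/wakeup pattern the reworded
 * barrier comments describe. Names other than the kernel APIs are invented.
 */
#include <linux/compiler.h>
#include <linux/sched.h>

static int work_ready;				/* the "CONDITION" in the comments */

static void wait_for_work(void)
{
	for (;;) {
		/* Full barrier: the state store cannot pass the condition load. */
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(work_ready))
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
}

static void signal_work(struct task_struct *sleeper)
{
	WRITE_ONCE(work_ready, 1);		/* CONDITION = 1 ... */
	/* ... ordered against the sleeper's state by the barrier below. */
	wake_up_process(sleeper);
}

Because set_current_state() and wake_up_process() each execute a full barrier, the two sides cannot both miss each other: either the waker observes the TASK_UNINTERRUPTIBLE store and wakes the task, or the sleeper observes work_ready == 1 and never blocks.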
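The wording fix in wake_q_add()/wake_up_q() concerns the same pairing for deferred wakeups batched on a wake_q. Below is a hypothetical user, sketched only to show where the two halves of the pairing sit; release_waiters(), struct waiter and the waiters list are invented, and only the wake_q calls match the API visible in the diff.

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/sched/wake_q.h>
#include <linux/spinlock.h>

struct waiter {
	struct list_head	node;
	struct task_struct	*task;
};

static void release_waiters(spinlock_t *lock, struct list_head *waiters)
{
	DEFINE_WAKE_Q(wake_q);
	struct waiter *w, *tmp;

	spin_lock(lock);
	list_for_each_entry_safe(w, tmp, waiters, node) {
		list_del_init(&w->node);
		/* Only queues the task; the cmpxchg() inside is the full barrier. */
		wake_q_add(&wake_q, w->task);
	}
	spin_unlock(lock);

	/* The actual wakeups happen here, outside the lock. */
	wake_up_q(&wake_q);
}

Queueing under the lock and waking after dropping it is the point of the API; the full barrier of the cmpxchg() in wake_q_add() pairs with the full barrier executed by the wakeup in wake_up_q(), as the updated comments state, so no wakeup is lost.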
