Diffstat (limited to 'kernel/sched/ext.c')
-rw-r--r--	kernel/sched/ext.c	113
1 file changed, 76 insertions, 37 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 8857c0709bdd..5a81d9a1e31f 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -123,6 +123,19 @@ enum scx_ops_flags {
 	SCX_OPS_SWITCH_PARTIAL	= 1LLU << 3,
 
 	/*
+	 * A migration disabled task can only execute on its current CPU. By
+	 * default, such tasks are automatically put on the CPU's local DSQ with
+	 * the default slice on enqueue. If this ops flag is set, they also go
+	 * through ops.enqueue().
+	 *
+	 * A migration disabled task never invokes ops.select_cpu() as it can
+	 * only select the current CPU. Also, p->cpus_ptr will only contain its
+	 * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
+	 * and thus may disagree with cpumask_weight(p->cpus_ptr).
+	 */
+	SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
+	/*
 	 * CPU cgroup support flags
 	 */
 	SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16,	/* cpu.weight */
@@ -130,6 +143,7 @@ enum scx_ops_flags {
 	SCX_OPS_ALL_FLAGS	= SCX_OPS_KEEP_BUILTIN_IDLE |
 				  SCX_OPS_ENQ_LAST |
 				  SCX_OPS_ENQ_EXITING |
+				  SCX_OPS_ENQ_MIGRATION_DISABLED |
 				  SCX_OPS_SWITCH_PARTIAL |
 				  SCX_OPS_HAS_CGROUP_WEIGHT,
 };
@@ -416,7 +430,7 @@ struct sched_ext_ops {
 
 	/**
 	 * @update_idle: Update the idle state of a CPU
-	 * @cpu: CPU to udpate the idle state for
+	 * @cpu: CPU to update the idle state for
 	 * @idle: whether entering or exiting the idle state
 	 *
 	 * This operation is called when @rq's CPU goes or leaves the idle
@@ -882,6 +896,7 @@ static bool scx_warned_zero_slice;
 
 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
 static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
 
@@ -1214,7 +1229,7 @@ static bool scx_kf_allowed_if_unlocked(void)
 
 /**
  * nldsq_next_task - Iterate to the next task in a non-local DSQ
- * @dsq: user dsq being interated
+ * @dsq: user dsq being iterated
 * @cur: current position, %NULL to start iteration
 * @rev: walk backwards
 *
@@ -2014,6 +2029,11 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	    unlikely(p->flags & PF_EXITING))
 		goto local;
 
+	/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
+	if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
+	    is_migration_disabled(p))
+		goto local;
+
 	if (!SCX_HAS_OP(enqueue))
 		goto global;
 
@@ -2078,7 +2098,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)
 
 	/*
 	 * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
-	 * appened to the runnable_list.
+	 * appended to the runnable_list.
 	 */
 	list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
 }
@@ -2313,12 +2333,35 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
  *
  * - The BPF scheduler is bypassed while the rq is offline and we can always say
  *   no to the BPF scheduler initiated migrations while offline.
+ *
+ * The caller must ensure that @p and @rq are on different CPUs.
 */
 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
 				      bool trigger_error)
 {
 	int cpu = cpu_of(rq);
 
+	SCHED_WARN_ON(task_cpu(p) == cpu);
+
+	/*
+	 * If @p has migration disabled, @p->cpus_ptr is updated to contain only
+	 * the pinned CPU in migrate_disable_switch() while @p is being switched
+	 * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
+	 * updated and thus another CPU may see @p on a DSQ inbetween leading to
+	 * @p passing the below task_allowed_on_cpu() check while migration is
+	 * disabled.
+	 *
+	 * Test the migration disabled state first as the race window is narrow
+	 * and the BPF scheduler failing to check migration disabled state can
+	 * easily be masked if task_allowed_on_cpu() is done first.
+	 */
+	if (unlikely(is_migration_disabled(p))) {
+		if (trigger_error)
+			scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
+				      p->comm, p->pid, task_cpu(p), cpu);
+		return false;
+	}
+
 	/*
 	 * We don't require the BPF scheduler to avoid dispatching to offline
 	 * CPUs mostly for convenience but also because CPUs can go offline
@@ -2327,14 +2370,11 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
 	 */
 	if (!task_allowed_on_cpu(p, cpu)) {
 		if (trigger_error)
-			scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
-				      cpu_of(rq), p->comm, p->pid);
+			scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
+				      cpu, p->comm, p->pid);
 		return false;
 	}
 
-	if (unlikely(is_migration_disabled(p)))
-		return false;
-
 	if (!scx_rq_online(rq))
 		return false;
 
@@ -2437,7 +2477,8 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
 
 	if (dst_dsq->id == SCX_DSQ_LOCAL) {
 		dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
-		if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
+		if (src_rq != dst_rq &&
+		    unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
 			dst_dsq = find_global_dsq(p);
 			dst_rq = src_rq;
 		}
@@ -2480,7 +2521,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
 /*
  * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
  * banging on the same DSQ on a large NUMA system to the point where switching
- * to the bypass mode can take a long time. Inject artifical delays while the
+ * to the bypass mode can take a long time. Inject artificial delays while the
 * bypass mode is switching to guarantee timely completion.
 */
 static void scx_ops_breather(struct rq *rq)
@@ -2575,6 +2616,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 {
 	struct rq *src_rq = task_rq(p);
 	struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+#ifdef CONFIG_SMP
+	struct rq *locked_rq = rq;
+#endif
 
 	/*
 	 * We're synchronized against dequeue through DISPATCHING. As @p can't
@@ -2588,7 +2632,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 	}
 
 #ifdef CONFIG_SMP
-	if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
+	if (src_rq != dst_rq &&
+	    unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
 		dispatch_enqueue(find_global_dsq(p), p,
 				 enq_flags | SCX_ENQ_CLEAR_OPSS);
 		return;
@@ -2611,8 +2656,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
 
 	/* switch to @src_rq lock */
-	if (rq != src_rq) {
-		raw_spin_rq_unlock(rq);
+	if (locked_rq != src_rq) {
+		raw_spin_rq_unlock(locked_rq);
+		locked_rq = src_rq;
 		raw_spin_rq_lock(src_rq);
 	}
 
@@ -2630,6 +2676,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 		} else {
 			move_remote_task_to_local_dsq(p, enq_flags,
 						      src_rq, dst_rq);
+			/* task has been moved to dst_rq, which is now locked */
+			locked_rq = dst_rq;
 		}
 
 		/* if the destination CPU is idle, wake it up */
@@ -2638,8 +2686,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 	}
 
 	/* switch back to @rq lock */
-	if (rq != dst_rq) {
-		raw_spin_rq_unlock(dst_rq);
+	if (locked_rq != rq) {
+		raw_spin_rq_unlock(locked_rq);
 		raw_spin_rq_lock(rq);
 	}
 #else	/* CONFIG_SMP */
@@ -3144,7 +3192,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
 *
 * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
 * to implement the default task ordering. The older the timestamp, the higher
- * prority the task - the global FIFO ordering matching the default scheduling
+ * priority the task - the global FIFO ordering matching the default scheduling
 * behavior.
 *
 * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
@@ -3851,7 +3899,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
 		curr->scx.slice = 0;
 		touch_core_sched(rq, curr);
 	} else if (SCX_HAS_OP(tick)) {
-		SCX_CALL_OP(SCX_KF_REST, tick, curr);
+		SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr);
 	}
 
 	if (!curr->scx.slice)
@@ -3998,7 +4046,7 @@ static void scx_ops_disable_task(struct task_struct *p)
 	WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
 
 	if (SCX_HAS_OP(disable))
-		SCX_CALL_OP(SCX_KF_REST, disable, p);
+		SCX_CALL_OP_TASK(SCX_KF_REST, disable, p);
 	scx_set_task_state(p, SCX_TASK_READY);
 }
 
@@ -4027,7 +4075,7 @@ static void scx_ops_exit_task(struct task_struct *p)
 	}
 
 	if (SCX_HAS_OP(exit_task))
-		SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args);
+		SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args);
 	scx_set_task_state(p, SCX_TASK_NONE);
 }
 
@@ -4323,25 +4371,12 @@ err:
 	return ops_sanitize_err("cgroup_prep_move", ret);
 }
 
-void scx_move_task(struct task_struct *p)
+void scx_cgroup_move_task(struct task_struct *p)
 {
 	if (!scx_cgroup_enabled)
 		return;
 
 	/*
-	 * We're called from sched_move_task() which handles both cgroup and
-	 * autogroup moves. Ignore the latter.
-	 *
-	 * Also ignore exiting tasks, because in the exit path tasks transition
-	 * from the autogroup to the root group, so task_group_is_autogroup()
-	 * alone isn't able to catch exiting autogroup tasks. This is safe for
-	 * cgroup_move(), because cgroup migrations never happen for PF_EXITING
-	 * tasks.
-	 */
-	if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING))
-		return;
-
-	/*
 	 * @p must have ops.cgroup_prep_move() called on it and thus
 	 * cgrp_moving_from set.
 	 */
@@ -4590,7 +4625,7 @@ static int scx_cgroup_init(void)
 	cgroup_warned_missing_idle = false;
 
 	/*
-	 * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk
+	 * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
 	 * cgroups and init, all online cgroups are initialized.
 	 */
 	rcu_read_lock();
@@ -5059,6 +5094,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 		static_branch_disable(&scx_has_op[i]);
 	static_branch_disable(&scx_ops_enq_last);
 	static_branch_disable(&scx_ops_enq_exiting);
+	static_branch_disable(&scx_ops_enq_migration_disabled);
 	static_branch_disable(&scx_ops_cpu_preempt);
 	static_branch_disable(&scx_builtin_idle_enabled);
 	synchronize_rcu();
@@ -5277,9 +5313,10 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
 		  scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
 		  p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
 		  ops_state >> SCX_OPSS_QSEQ_SHIFT);
-	dump_line(s, "      sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu slice=%llu",
-		  p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf,
-		  p->scx.dsq_vtime, p->scx.slice);
+	dump_line(s, "      sticky/holding_cpu=%d/%d dsq_id=%s",
+		  p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
+	dump_line(s, "      dsq_vtime=%llu slice=%llu weight=%u",
+		  p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
 	dump_line(s, "      cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
 
 	if (SCX_HAS_OP(dump_task)) {
@@ -5667,6 +5704,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	if (ops->flags & SCX_OPS_ENQ_EXITING)
 		static_branch_enable(&scx_ops_enq_exiting);
+	if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
+		static_branch_enable(&scx_ops_enq_migration_disabled);
 	if (scx_ops.cpu_acquire || scx_ops.cpu_release)
 		static_branch_enable(&scx_ops_cpu_preempt);
 
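To make the new flag concrete, below is a minimal, illustrative BPF scheduler sketch (not part of this patch) that opts into SCX_OPS_ENQ_MIGRATION_DISABLED, keeps migration disabled tasks on their current CPU, and sends everything else to an idle CPU's local DSQ or the global DSQ. The kfuncs and macros used (scx_bpf_dsq_insert(), scx_bpf_pick_idle_cpu(), BPF_STRUCT_OPS(), SCX_SLICE_DFL) follow the in-tree example schedulers of this kernel generation and may differ on other versions, and the direct read of p->migration_disabled assumes an SMP kernel; treat the sketch as an assumption rather than an API reference.

/* SPDX-License-Identifier: GPL-2.0 */
/* Illustrative sketch only -- not part of kernel/sched/ext.c. */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

void BPF_STRUCT_OPS(migdis_sketch_enqueue, struct task_struct *p, u64 enq_flags)
{
	s32 cpu;

	/*
	 * With SCX_OPS_ENQ_MIGRATION_DISABLED set, migration disabled tasks
	 * reach ops.enqueue() instead of being force-fed to the local DSQ.
	 * They can only run on task_cpu(p), so keep them local. Note that
	 * p->nr_cpus_allowed still tracks p->user_cpus_ptr and may be > 1
	 * even though p->cpus_ptr now contains only the current CPU.
	 */
	if (p->migration_disabled) {
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
		return;
	}

	/*
	 * Everyone else may be placed on a remote CPU's local DSQ. Doing the
	 * same for a migration disabled task would now trigger the
	 * "cannot move migration disabled" ops error added above.
	 */
	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0) {
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL,
				   enq_flags);
		return;
	}

	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

SEC(".struct_ops.link")
struct sched_ext_ops migdis_sketch_ops = {
	.enqueue	= (void *)migdis_sketch_enqueue,
	.flags		= SCX_OPS_ENQ_MIGRATION_DISABLED,
	.name		= "migdis_sketch",
};

Without SCX_OPS_ENQ_MIGRATION_DISABLED in .flags, the migration disabled branch above is never exercised because such tasks are placed on the CPU's local DSQ before ops.enqueue() is called.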
