-rw-r--r--  kernel/sched/ext.c                                    466
-rw-r--r--  kernel/sched/ext_idle.c                               208
-rw-r--r--  kernel/sched/ext_idle.h                                 3
-rw-r--r--  kernel/sched/sched.h                                    4
-rw-r--r--  tools/sched_ext/include/scx/common.bpf.h                2
-rw-r--r--  tools/sched_ext/scx_show_state.py                      14
-rw-r--r--  tools/testing/selftests/sched_ext/Makefile              1
-rw-r--r--  tools/testing/selftests/sched_ext/allowed_cpus.bpf.c  121
-rw-r--r--  tools/testing/selftests/sched_ext/allowed_cpus.c       57
9 files changed, 584 insertions, 292 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index fdbf249d1c68..3cc5d7b5e3a9 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -26,7 +26,7 @@ enum scx_consts {
* Iterating all tasks may take a while. Periodically drop
* scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
*/
- SCX_OPS_TASK_ITER_BATCH = 32,
+ SCX_TASK_ITER_BATCH = 32,
};
enum scx_exit_kind {
@@ -44,9 +44,9 @@ enum scx_exit_kind {
};
/*
- * An exit code can be specified when exiting with scx_bpf_exit() or
- * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN
- * respectively. The codes are 64bit of the format:
+ * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
+ * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
+ * are 64bit of the format:
*
* Bits: [63 .. 48 47 .. 32 31 .. 0]
* [ SYS ACT ] [ SYS RSN ] [ USR ]
@@ -465,6 +465,7 @@ struct sched_ext_ops {
* idle CPU tracking and the following helpers become unavailable:
*
* - scx_bpf_select_cpu_dfl()
+ * - scx_bpf_select_cpu_and()
* - scx_bpf_test_and_clear_cpu_idle()
* - scx_bpf_pick_idle_cpu()
*
@@ -838,18 +839,18 @@ enum scx_tg_flags {
SCX_TG_INITED = 1U << 1,
};
-enum scx_ops_enable_state {
- SCX_OPS_ENABLING,
- SCX_OPS_ENABLED,
- SCX_OPS_DISABLING,
- SCX_OPS_DISABLED,
+enum scx_enable_state {
+ SCX_ENABLING,
+ SCX_ENABLED,
+ SCX_DISABLING,
+ SCX_DISABLED,
};
-static const char *scx_ops_enable_state_str[] = {
- [SCX_OPS_ENABLING] = "enabling",
- [SCX_OPS_ENABLED] = "enabled",
- [SCX_OPS_DISABLING] = "disabling",
- [SCX_OPS_DISABLED] = "disabled",
+static const char *scx_enable_state_str[] = {
+ [SCX_ENABLING] = "enabling",
+ [SCX_ENABLED] = "enabled",
+ [SCX_DISABLING] = "disabling",
+ [SCX_DISABLED] = "disabled",
};
/*
@@ -907,15 +908,15 @@ static DEFINE_SPINLOCK(scx_tasks_lock);
static LIST_HEAD(scx_tasks);
/* ops enable/disable */
-static struct kthread_worker *scx_ops_helper;
-static DEFINE_MUTEX(scx_ops_enable_mutex);
-DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
+static struct kthread_worker *scx_helper;
+static DEFINE_MUTEX(scx_enable_mutex);
+DEFINE_STATIC_KEY_FALSE(__scx_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
-static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
static unsigned long scx_in_softlockup;
-static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0);
-static int scx_ops_bypass_depth;
-static bool scx_ops_init_task_enabled;
+static atomic_t scx_breather_depth = ATOMIC_INIT(0);
+static int scx_bypass_depth;
+static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@@ -947,7 +948,7 @@ static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
/*
* The maximum amount of time in jiffies that a task may be runnable without
* being scheduled on a CPU. If this timeout is exceeded, it will trigger
- * scx_ops_error().
+ * scx_error().
*/
static unsigned long scx_watchdog_timeout;
@@ -1043,18 +1044,17 @@ static struct kobject *scx_root_kobj;
static void process_ddsp_deferred_locals(struct rq *rq);
static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
-static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
- s64 exit_code,
- const char *fmt, ...);
+static __printf(3, 4) void __scx_exit(enum scx_exit_kind kind, s64 exit_code,
+ const char *fmt, ...);
-#define scx_ops_error_kind(err, fmt, args...) \
- scx_ops_exit_kind((err), 0, fmt, ##args)
+#define __scx_error(err, fmt, args...) \
+ __scx_exit((err), 0, fmt, ##args)
-#define scx_ops_exit(code, fmt, args...) \
- scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args)
+#define scx_exit(code, fmt, args...) \
+ __scx_exit(SCX_EXIT_UNREG_KERN, (code), fmt, ##args)
-#define scx_ops_error(fmt, args...) \
- scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args)
+#define scx_error(fmt, args...) \
+ __scx_error(SCX_EXIT_ERROR, fmt, ##args)
#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)])
@@ -1187,8 +1187,8 @@ do { \
static __always_inline bool scx_kf_allowed(u32 mask)
{
if (unlikely(!(current->scx.kf_mask & mask))) {
- scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
- mask, current->scx.kf_mask);
+ scx_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
+ mask, current->scx.kf_mask);
return false;
}
@@ -1201,13 +1201,13 @@ static __always_inline bool scx_kf_allowed(u32 mask)
*/
if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
(current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
- scx_ops_error("cpu_release kfunc called from a nested operation");
+ scx_error("cpu_release kfunc called from a nested operation");
return false;
}
if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
(current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
- scx_ops_error("dispatch kfunc called from a nested operation");
+ scx_error("dispatch kfunc called from a nested operation");
return false;
}
@@ -1223,7 +1223,7 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
if (unlikely((p != current->scx.kf_tasks[0] &&
p != current->scx.kf_tasks[1]))) {
- scx_ops_error("called on a task not being operated on");
+ scx_error("called on a task not being operated on");
return false;
}
@@ -1402,15 +1402,15 @@ static void scx_task_iter_stop(struct scx_task_iter *iter)
* @iter: iterator to walk
*
* Visit the next task. See scx_task_iter_start() for details. Locks are dropped
- * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing
- * stalls by holding scx_tasks_lock for too long.
+ * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls
+ * by holding scx_tasks_lock for too long.
*/
static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
{
struct list_head *cursor = &iter->cursor.tasks_node;
struct sched_ext_entity *pos;
- if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) {
+ if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
scx_task_iter_unlock(iter);
cond_resched();
scx_task_iter_relock(iter);
@@ -1592,23 +1592,22 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz);
-static enum scx_ops_enable_state scx_ops_enable_state(void)
+static enum scx_enable_state scx_enable_state(void)
{
- return atomic_read(&scx_ops_enable_state_var);
+ return atomic_read(&scx_enable_state_var);
}
-static enum scx_ops_enable_state
-scx_ops_set_enable_state(enum scx_ops_enable_state to)
+static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to)
{
- return atomic_xchg(&scx_ops_enable_state_var, to);
+ return atomic_xchg(&scx_enable_state_var, to);
}
-static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
- enum scx_ops_enable_state from)
+static bool scx_tryset_enable_state(enum scx_enable_state to,
+ enum scx_enable_state from)
{
int from_v = from;
- return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
+ return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to);
}
static bool scx_rq_bypassing(struct rq *rq)
@@ -1647,8 +1646,7 @@ static bool ops_cpu_valid(s32 cpu, const char *where)
if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) {
return true;
} else {
- scx_ops_error("invalid CPU %d%s%s", cpu,
- where ? " " : "", where ?: "");
+ scx_error("invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
return false;
}
}
@@ -1658,7 +1656,7 @@ static bool ops_cpu_valid(s32 cpu, const char *where)
* @ops_name: operation to blame on failure
* @err: -errno value to sanitize
*
- * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return
+ * Verify @err is a valid -errno. If not, trigger scx_error() and return
* -%EPROTO. This is necessary because returning a rogue -errno up the chain can
* cause misbehaviors. For an example, a large negative return from
* ops.init_task() triggers an oops when passed up the call chain because the
@@ -1670,7 +1668,7 @@ static int ops_sanitize_err(const char *ops_name, s32 err)
if (err < 0 && err >= -MAX_ERRNO)
return err;
- scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err);
+ scx_error("ops.%s() returned an invalid errno %d", ops_name, err);
return -EPROTO;
}
@@ -1827,7 +1825,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
if (!is_local) {
raw_spin_lock(&dsq->lock);
if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
- scx_ops_error("attempting to dispatch to a destroyed dsq");
+ scx_error("attempting to dispatch to a destroyed dsq");
/* fall back to the global dsq */
raw_spin_unlock(&dsq->lock);
dsq = find_global_dsq(p);
@@ -1844,7 +1842,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
* disallow any internal DSQ from doing vtime ordering of
* tasks.
*/
- scx_ops_error("cannot use vtime ordering for built-in DSQs");
+ scx_error("cannot use vtime ordering for built-in DSQs");
enq_flags &= ~SCX_ENQ_DSQ_PRIQ;
}
@@ -1858,8 +1856,8 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
*/
if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
nldsq_next_task(dsq, NULL, false)))
- scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks",
- dsq->id);
+ scx_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks",
+ dsq->id);
p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ;
rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less);
@@ -1880,8 +1878,8 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
} else {
/* a FIFO DSQ shouldn't be using PRIQ enqueuing */
if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
- scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
- dsq->id);
+ scx_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
+ dsq->id);
if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
list_add(&p->scx.dsq_list.node, &dsq->list);
@@ -2019,8 +2017,8 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
dsq = find_user_dsq(dsq_id);
if (unlikely(!dsq)) {
- scx_ops_error("non-existent DSQ 0x%llx for %s[%d]",
- dsq_id, p->comm, p->pid);
+ scx_error("non-existent DSQ 0x%llx for %s[%d]",
+ dsq_id, p->comm, p->pid);
return find_global_dsq(p);
}
@@ -2041,12 +2039,12 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
/* @p must match the task on the enqueue path */
if (unlikely(p != ddsp_task)) {
if (IS_ERR(ddsp_task))
- scx_ops_error("%s[%d] already direct-dispatched",
- p->comm, p->pid);
+ scx_error("%s[%d] already direct-dispatched",
+ p->comm, p->pid);
else
- scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
- ddsp_task->comm, ddsp_task->pid,
- p->comm, p->pid);
+ scx_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
+ ddsp_task->comm, ddsp_task->pid,
+ p->comm, p->pid);
return;
}
@@ -2224,7 +2222,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)
}
/*
- * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
+ * list_add_tail() must be used. scx_bypass() depends on tasks being
* appended to the runnable_list.
*/
list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
@@ -2488,8 +2486,8 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
*/
if (unlikely(is_migration_disabled(p))) {
if (enforce)
- scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
- p->comm, p->pid, task_cpu(p), cpu);
+ scx_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
+ p->comm, p->pid, task_cpu(p), cpu);
return false;
}
@@ -2501,8 +2499,8 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
*/
if (!task_allowed_on_cpu(p, cpu)) {
if (enforce)
- scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
- cpu, p->comm, p->pid);
+ scx_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
+ cpu, p->comm, p->pid);
return false;
}
@@ -2658,13 +2656,13 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
* to the bypass mode can take a long time. Inject artificial delays while the
* bypass mode is switching to guarantee timely completion.
*/
-static void scx_ops_breather(struct rq *rq)
+static void scx_breather(struct rq *rq)
{
u64 until;
lockdep_assert_rq_held(rq);
- if (likely(!atomic_read(&scx_ops_breather_depth)))
+ if (likely(!atomic_read(&scx_breather_depth)))
return;
raw_spin_rq_unlock(rq);
@@ -2673,9 +2671,9 @@ static void scx_ops_breather(struct rq *rq)
do {
int cnt = 1024;
- while (atomic_read(&scx_ops_breather_depth) && --cnt)
+ while (atomic_read(&scx_breather_depth) && --cnt)
cpu_relax();
- } while (atomic_read(&scx_ops_breather_depth) &&
+ } while (atomic_read(&scx_breather_depth) &&
time_before64(ktime_get_ns(), until));
raw_spin_rq_lock(rq);
@@ -2686,12 +2684,12 @@ static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq)
struct task_struct *p;
retry:
/*
- * This retry loop can repeatedly race against scx_ops_bypass()
- * dequeueing tasks from @dsq trying to put the system into the bypass
- * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can
- * live-lock the machine into soft lockups. Give a breather.
+ * This retry loop can repeatedly race against scx_bypass() dequeueing
+ * tasks from @dsq trying to put the system into the bypass mode. On
+ * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock
+ * the machine into soft lockups. Give a breather.
*/
- scx_ops_breather(rq);
+ scx_breather(rq);
/*
* The caller can't expect to successfully consume a task if the task's
@@ -2959,8 +2957,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* scheduler wants to handle this explicitly, it should
* implement ->cpu_release().
*
- * See scx_ops_disable_workfn() for the explanation on the
- * bypassing test.
+ * See scx_disable_workfn() for the explanation on the bypassing
+ * test.
*/
if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
@@ -3283,7 +3281,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
* Can happen while enabling as SCX_RQ_BAL_PENDING assertion is
* conditional on scx_enabled() and may have been skipped.
*/
- WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
+ WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED);
keep_prev = false;
}
@@ -3395,7 +3393,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
} else {
s32 cpu;
- cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
p->scx.slice = SCX_SLICE_DFL;
p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
@@ -3448,9 +3446,9 @@ static void handle_hotplug(struct rq *rq, bool online)
else if (!online && SCX_HAS_OP(cpu_offline))
SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu);
else
- scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
- "cpu %d going %s, exiting scheduler", cpu,
- online ? "online" : "offline");
+ scx_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+ "cpu %d going %s, exiting scheduler", cpu,
+ online ? "online" : "offline");
}
void scx_rq_activate(struct rq *rq)
@@ -3489,10 +3487,9 @@ static bool check_rq_for_timeouts(struct rq *rq)
last_runnable + scx_watchdog_timeout))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
- scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
- "%s[%d] failed to run for %u.%03us",
- p->comm, p->pid,
- dur_ms / 1000, dur_ms % 1000);
+ __scx_error(SCX_EXIT_ERROR_STALL,
+ "%s[%d] failed to run for %u.%03us",
+ p->comm, p->pid, dur_ms / 1000, dur_ms % 1000);
timed_out = true;
break;
}
@@ -3530,9 +3527,9 @@ void scx_tick(struct rq *rq)
last_check + READ_ONCE(scx_watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
- scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
- "watchdog failed to check in for %u.%03us",
- dur_ms / 1000, dur_ms % 1000);
+ __scx_error(SCX_EXIT_ERROR_STALL,
+ "watchdog failed to check in for %u.%03us",
+ dur_ms / 1000, dur_ms % 1000);
}
update_other_load_avgs(rq);
@@ -3615,7 +3612,7 @@ static void scx_set_task_state(struct task_struct *p, enum scx_task_state state)
p->scx.flags |= state << SCX_TASK_STATE_SHIFT;
}
-static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork)
+static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork)
{
int ret;
@@ -3657,8 +3654,8 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
task_rq_unlock(rq, p, &rf);
} else if (p->policy == SCHED_EXT) {
- scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork",
- p->comm, p->pid);
+ scx_error("ops.init_task() set task->scx.disallow for %s[%d] during fork",
+ p->comm, p->pid);
}
}
@@ -3666,7 +3663,7 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
return 0;
}
-static void scx_ops_enable_task(struct task_struct *p)
+static void scx_enable_task(struct task_struct *p)
{
u32 weight;
@@ -3691,7 +3688,7 @@ static void scx_ops_enable_task(struct task_struct *p)
SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
}
-static void scx_ops_disable_task(struct task_struct *p)
+static void scx_disable_task(struct task_struct *p)
{
lockdep_assert_rq_held(task_rq(p));
WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
@@ -3701,7 +3698,7 @@ static void scx_ops_disable_task(struct task_struct *p)
scx_set_task_state(p, SCX_TASK_READY);
}
-static void scx_ops_exit_task(struct task_struct *p)
+static void scx_exit_task(struct task_struct *p)
{
struct scx_exit_task_args args = {
.cancelled = false,
@@ -3718,7 +3715,7 @@ static void scx_ops_exit_task(struct task_struct *p)
case SCX_TASK_READY:
break;
case SCX_TASK_ENABLED:
- scx_ops_disable_task(p);
+ scx_disable_task(p);
break;
default:
WARN_ON_ONCE(true);
@@ -3758,15 +3755,15 @@ int scx_fork(struct task_struct *p)
{
percpu_rwsem_assert_held(&scx_fork_rwsem);
- if (scx_ops_init_task_enabled)
- return scx_ops_init_task(p, task_group(p), true);
+ if (scx_init_task_enabled)
+ return scx_init_task(p, task_group(p), true);
else
return 0;
}
void scx_post_fork(struct task_struct *p)
{
- if (scx_ops_init_task_enabled) {
+ if (scx_init_task_enabled) {
scx_set_task_state(p, SCX_TASK_READY);
/*
@@ -3779,7 +3776,7 @@ void scx_post_fork(struct task_struct *p)
struct rq *rq;
rq = task_rq_lock(p, &rf);
- scx_ops_enable_task(p);
+ scx_enable_task(p);
task_rq_unlock(rq, p, &rf);
}
}
@@ -3799,7 +3796,7 @@ void scx_cancel_fork(struct task_struct *p)
rq = task_rq_lock(p, &rf);
WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
- scx_ops_exit_task(p);
+ scx_exit_task(p);
task_rq_unlock(rq, p, &rf);
}
@@ -3815,15 +3812,15 @@ void sched_ext_free(struct task_struct *p)
spin_unlock_irqrestore(&scx_tasks_lock, flags);
/*
- * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY ->
- * ENABLED transitions can't race us. Disable ops for @p.
+ * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED
+ * transitions can't race us. Disable ops for @p.
*/
if (scx_get_task_state(p) != SCX_TASK_NONE) {
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
- scx_ops_exit_task(p);
+ scx_exit_task(p);
task_rq_unlock(rq, p, &rf);
}
}
@@ -3844,7 +3841,7 @@ static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
static void switching_to_scx(struct rq *rq, struct task_struct *p)
{
- scx_ops_enable_task(p);
+ scx_enable_task(p);
/*
* set_cpus_allowed_scx() is not called while @p is associated with a
@@ -3857,7 +3854,7 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
static void switched_from_scx(struct rq *rq, struct task_struct *p)
{
- scx_ops_disable_task(p);
+ scx_disable_task(p);
}
static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
@@ -4172,8 +4169,8 @@ static void destroy_dsq(u64 dsq_id)
raw_spin_lock_irqsave(&dsq->lock, flags);
if (dsq->nr) {
- scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)",
- dsq->id, dsq->nr);
+ scx_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)",
+ dsq->id, dsq->nr);
goto out_unlock_dsq;
}
@@ -4265,7 +4262,7 @@ static int scx_cgroup_init(void)
css->cgroup, &args);
if (ret) {
css_put(css);
- scx_ops_error("ops.cgroup_init() failed (%d)", ret);
+ scx_error("ops.cgroup_init() failed (%d)", ret);
return ret;
}
tg->scx_flags |= SCX_TG_INITED;
@@ -4300,8 +4297,7 @@ static int scx_cgroup_init(void) { return 0; }
static ssize_t scx_attr_state_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return sysfs_emit(buf, "%s\n",
- scx_ops_enable_state_str[scx_ops_enable_state()]);
+ return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]);
}
SCX_ATTR(state);
@@ -4410,8 +4406,7 @@ static const struct kset_uevent_ops scx_uevent_ops = {
*/
bool task_should_scx(int policy)
{
- if (!scx_enabled() ||
- unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
+ if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING))
return false;
if (READ_ONCE(scx_switching_all))
return true;
@@ -4430,15 +4425,15 @@ bool task_should_scx(int policy)
*/
void scx_softlockup(u32 dur_s)
{
- switch (scx_ops_enable_state()) {
- case SCX_OPS_ENABLING:
- case SCX_OPS_ENABLED:
+ switch (scx_enable_state()) {
+ case SCX_ENABLING:
+ case SCX_ENABLED:
break;
default:
return;
}
- /* allow only one instance, cleared at the end of scx_ops_bypass() */
+ /* allow only one instance, cleared at the end of scx_bypass() */
if (test_and_set_bit(0, &scx_in_softlockup))
return;
@@ -4447,23 +4442,21 @@ void scx_softlockup(u32 dur_s)
/*
* Some CPUs may be trapped in the dispatch paths. Enable breather
- * immediately; otherwise, we might even be able to get to
- * scx_ops_bypass().
+ * immediately; otherwise, we might even be able to get to scx_bypass().
*/
- atomic_inc(&scx_ops_breather_depth);
+ atomic_inc(&scx_breather_depth);
- scx_ops_error("soft lockup - CPU#%d stuck for %us",
- smp_processor_id(), dur_s);
+ scx_error("soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s);
}
static void scx_clear_softlockup(void)
{
if (test_and_clear_bit(0, &scx_in_softlockup))
- atomic_dec(&scx_ops_breather_depth);
+ atomic_dec(&scx_breather_depth);
}
/**
- * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
+ * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
* @bypass: true for bypass, false for unbypass
*
* Bypassing guarantees that all runnable tasks make forward progress without
@@ -4493,7 +4486,7 @@ static void scx_clear_softlockup(void)
*
* - scx_prio_less() reverts to the default core_sched_at order.
*/
-static void scx_ops_bypass(bool bypass)
+static void scx_bypass(bool bypass)
{
static DEFINE_RAW_SPINLOCK(bypass_lock);
static unsigned long bypass_timestamp;
@@ -4503,22 +4496,22 @@ static void scx_ops_bypass(bool bypass)
raw_spin_lock_irqsave(&bypass_lock, flags);
if (bypass) {
- scx_ops_bypass_depth++;
- WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
- if (scx_ops_bypass_depth != 1)
+ scx_bypass_depth++;
+ WARN_ON_ONCE(scx_bypass_depth <= 0);
+ if (scx_bypass_depth != 1)
goto unlock;
bypass_timestamp = ktime_get_ns();
scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1);
} else {
- scx_ops_bypass_depth--;
- WARN_ON_ONCE(scx_ops_bypass_depth < 0);
- if (scx_ops_bypass_depth != 0)
+ scx_bypass_depth--;
+ WARN_ON_ONCE(scx_bypass_depth < 0);
+ if (scx_bypass_depth != 0)
goto unlock;
scx_add_event(SCX_EV_BYPASS_DURATION,
ktime_get_ns() - bypass_timestamp);
}
- atomic_inc(&scx_ops_breather_depth);
+ atomic_inc(&scx_breather_depth);
/*
* No task property is changing. We just need to make sure all currently
@@ -4576,7 +4569,7 @@ static void scx_ops_bypass(bool bypass)
raw_spin_rq_unlock(rq);
}
- atomic_dec(&scx_ops_breather_depth);
+ atomic_dec(&scx_breather_depth);
unlock:
raw_spin_unlock_irqrestore(&bypass_lock, flags);
scx_clear_softlockup();
@@ -4632,7 +4625,7 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
}
}
-static void scx_ops_disable_workfn(struct kthread_work *work)
+static void scx_disable_workfn(struct kthread_work *work)
{
struct scx_exit_info *ei = scx_exit_info;
struct scx_task_iter sti;
@@ -4657,17 +4650,16 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
ei->reason = scx_exit_reason(ei->kind);
/* guarantee forward progress by bypassing scx_ops */
- scx_ops_bypass(true);
+ scx_bypass(true);
- switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) {
- case SCX_OPS_DISABLING:
+ switch (scx_set_enable_state(SCX_DISABLING)) {
+ case SCX_DISABLING:
WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
break;
- case SCX_OPS_DISABLED:
+ case SCX_DISABLED:
pr_warn("sched_ext: ops error detected without ops (%s)\n",
scx_exit_info->msg);
- WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
- SCX_OPS_DISABLING);
+ WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
goto done;
default:
break;
@@ -4678,14 +4670,14 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
* we can safely use blocking synchronization constructs. Actually
* disable ops.
*/
- mutex_lock(&scx_ops_enable_mutex);
+ mutex_lock(&scx_enable_mutex);
static_branch_disable(&__scx_switched_all);
WRITE_ONCE(scx_switching_all, false);
/*
* Shut down cgroup support before tasks so that the cgroup attach path
- * doesn't race against scx_ops_exit_task().
+ * doesn't race against scx_exit_task().
*/
scx_cgroup_lock();
scx_cgroup_exit();
@@ -4697,7 +4689,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
*/
percpu_down_write(&scx_fork_rwsem);
- scx_ops_init_task_enabled = false;
+ scx_init_task_enabled = false;
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
@@ -4717,7 +4709,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
sched_enq_and_set_task(&ctx);
check_class_changed(task_rq(p), p, old_class, p->prio);
- scx_ops_exit_task(p);
+ scx_exit_task(p);
}
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
@@ -4732,7 +4724,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
}
/* no task is on scx, turn off all the switches and flush in-progress calls */
- static_branch_disable(&__scx_ops_enabled);
+ static_branch_disable(&__scx_enabled);
for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
static_branch_disable(&scx_has_op[i]);
static_branch_disable(&scx_ops_allow_queued_wakeup);
@@ -4792,29 +4784,28 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
free_exit_info(scx_exit_info);
scx_exit_info = NULL;
- mutex_unlock(&scx_ops_enable_mutex);
+ mutex_unlock(&scx_enable_mutex);
- WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
- SCX_OPS_DISABLING);
+ WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
done:
- scx_ops_bypass(false);
+ scx_bypass(false);
}
-static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn);
+static DEFINE_KTHREAD_WORK(scx_disable_work, scx_disable_workfn);
-static void schedule_scx_ops_disable_work(void)
+static void schedule_scx_disable_work(void)
{
- struct kthread_worker *helper = READ_ONCE(scx_ops_helper);
+ struct kthread_worker *helper = READ_ONCE(scx_helper);
/*
* We may be called spuriously before the first bpf_sched_ext_reg(). If
- * scx_ops_helper isn't set up yet, there's nothing to do.
+ * scx_helper isn't set up yet, there's nothing to do.
*/
if (helper)
- kthread_queue_work(helper, &scx_ops_disable_work);
+ kthread_queue_work(helper, &scx_disable_work);
}
-static void scx_ops_disable(enum scx_exit_kind kind)
+static void scx_disable(enum scx_exit_kind kind)
{
int none = SCX_EXIT_NONE;
@@ -4823,7 +4814,7 @@ static void scx_ops_disable(enum scx_exit_kind kind)
atomic_try_cmpxchg(&scx_exit_kind, &none, kind);
- schedule_scx_ops_disable_work();
+ schedule_scx_disable_work();
}
static void dump_newline(struct seq_buf *s)
@@ -5121,21 +5112,20 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
spin_unlock_irqrestore(&dump_lock, flags);
}
-static void scx_ops_error_irq_workfn(struct irq_work *irq_work)
+static void scx_error_irq_workfn(struct irq_work *irq_work)
{
struct scx_exit_info *ei = scx_exit_info;
if (ei->kind >= SCX_EXIT_ERROR)
scx_dump_state(ei, scx_ops.exit_dump_len);
- schedule_scx_ops_disable_work();
+ schedule_scx_disable_work();
}
-static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn);
+static DEFINE_IRQ_WORK(scx_error_irq_work, scx_error_irq_workfn);
-static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
- s64 exit_code,
- const char *fmt, ...)
+static __printf(3, 4) void __scx_exit(enum scx_exit_kind kind, s64 exit_code,
+ const char *fmt, ...)
{
struct scx_exit_info *ei = scx_exit_info;
int none = SCX_EXIT_NONE;
@@ -5155,12 +5145,12 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
/*
* Set ei->kind and ->reason for scx_dump_state(). They'll be set again
- * in scx_ops_disable_workfn().
+ * in scx_disable_workfn().
*/
ei->kind = kind;
ei->reason = scx_exit_reason(ei->kind);
- irq_work_queue(&scx_ops_error_irq_work);
+ irq_work_queue(&scx_error_irq_work);
}
static struct kthread_worker *scx_create_rt_helper(const char *name)
@@ -5185,9 +5175,9 @@ static void check_hotplug_seq(const struct sched_ext_ops *ops)
if (ops->hotplug_seq) {
global_hotplug_seq = atomic_long_read(&scx_hotplug_seq);
if (ops->hotplug_seq != global_hotplug_seq) {
- scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
- "expected hotplug seq %llu did not match actual %llu",
- ops->hotplug_seq, global_hotplug_seq);
+ scx_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+ "expected hotplug seq %llu did not match actual %llu",
+ ops->hotplug_seq, global_hotplug_seq);
}
}
}
@@ -5199,7 +5189,7 @@ static int validate_ops(const struct sched_ext_ops *ops)
* ops.enqueue() callback isn't implemented.
*/
if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) {
- scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
+ scx_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
return -EINVAL;
}
@@ -5209,7 +5199,7 @@ static int validate_ops(const struct sched_ext_ops *ops)
*/
if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
(ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
- scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
+ scx_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
return -EINVAL;
}
@@ -5219,7 +5209,7 @@ static int validate_ops(const struct sched_ext_ops *ops)
return 0;
}
-static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
{
struct scx_task_iter sti;
struct task_struct *p;
@@ -5232,7 +5222,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
return -EINVAL;
}
- mutex_lock(&scx_ops_enable_mutex);
+ mutex_lock(&scx_enable_mutex);
/*
* Clear event counters so a new scx scheduler gets
@@ -5243,10 +5233,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
memset(e, 0, sizeof(*e));
}
- if (!scx_ops_helper) {
- WRITE_ONCE(scx_ops_helper,
- scx_create_rt_helper("sched_ext_ops_helper"));
- if (!scx_ops_helper) {
+ if (!scx_helper) {
+ WRITE_ONCE(scx_helper, scx_create_rt_helper("sched_ext_helper"));
+ if (!scx_helper) {
ret = -ENOMEM;
goto err_unlock;
}
@@ -5280,7 +5269,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
global_dsqs = dsqs;
}
- if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
+ if (scx_enable_state() != SCX_DISABLED) {
ret = -EBUSY;
goto err_unlock;
}
@@ -5308,8 +5297,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
scx_ops = *ops;
- WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) !=
- SCX_OPS_DISABLED);
+ WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
atomic_set(&scx_exit_kind, SCX_EXIT_NONE);
scx_warned_zero_slice = false;
@@ -5332,7 +5320,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (ret) {
ret = ops_sanitize_err("init", ret);
cpus_read_unlock();
- scx_ops_error("ops.init() failed (%d)", ret);
+ scx_error("ops.init() failed (%d)", ret);
goto err_disable;
}
}
@@ -5371,12 +5359,12 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_watchdog_timeout / 2);
/*
- * Once __scx_ops_enabled is set, %current can be switched to SCX
- * anytime. This can lead to stalls as some BPF schedulers (e.g.
- * userspace scheduling) may not function correctly before all tasks are
- * switched. Init in bypass mode to guarantee forward progress.
+ * Once __scx_enabled is set, %current can be switched to SCX anytime.
+ * This can lead to stalls as some BPF schedulers (e.g. userspace
+ * scheduling) may not function correctly before all tasks are switched.
+ * Init in bypass mode to guarantee forward progress.
*/
- scx_ops_bypass(true);
+ scx_bypass(true);
for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
if (((void (**)(void))ops)[i])
@@ -5399,8 +5387,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
percpu_down_write(&scx_fork_rwsem);
- WARN_ON_ONCE(scx_ops_init_task_enabled);
- scx_ops_init_task_enabled = true;
+ WARN_ON_ONCE(scx_init_task_enabled);
+ scx_init_task_enabled = true;
/*
* Enable ops for every task. Fork is excluded by scx_fork_rwsem
@@ -5409,11 +5397,11 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* tasks. Prep all tasks first and then enable them with preemption
* disabled.
*
- * All cgroups should be initialized before scx_ops_init_task() so that
- * the BPF scheduler can reliably track each task's cgroup membership
- * from scx_ops_init_task(). Lock out cgroup on/offlining and task
- * migrations while tasks are being initialized so that
- * scx_cgroup_can_attach() never sees uninitialized tasks.
+ * All cgroups should be initialized before scx_init_task() so that the
+ * BPF scheduler can reliably track each task's cgroup membership from
+ * scx_init_task(). Lock out cgroup on/offlining and task migrations
+ * while tasks are being initialized so that scx_cgroup_can_attach()
+ * never sees uninitialized tasks.
*/
scx_cgroup_lock();
ret = scx_cgroup_init();
@@ -5432,13 +5420,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_task_iter_unlock(&sti);
- ret = scx_ops_init_task(p, task_group(p), false);
+ ret = scx_init_task(p, task_group(p), false);
if (ret) {
put_task_struct(p);
scx_task_iter_relock(&sti);
scx_task_iter_stop(&sti);
- scx_ops_error("ops.init_task() failed (%d) for %s[%d]",
- ret, p->comm, p->pid);
+ scx_error("ops.init_task() failed (%d) for %s[%d]",
+ ret, p->comm, p->pid);
goto err_disable_unlock_all;
}
@@ -5456,7 +5444,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* all eligible tasks.
*/
WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
- static_branch_enable(&__scx_ops_enabled);
+ static_branch_enable(&__scx_enabled);
/*
* We're fully committed and can't fail. The task READY -> ENABLED
@@ -5487,9 +5475,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
- scx_ops_bypass(false);
+ scx_bypass(false);
- if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
+ if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
goto err_disable;
}
@@ -5500,7 +5488,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
scx_ops.name, scx_switched_all() ? "" : " (partial)");
kobject_uevent(scx_root_kobj, KOBJ_ADD);
- mutex_unlock(&scx_ops_enable_mutex);
+ mutex_unlock(&scx_enable_mutex);
atomic_long_inc(&scx_enable_seq);
@@ -5516,26 +5504,26 @@ err:
scx_exit_info = NULL;
}
err_unlock:
- mutex_unlock(&scx_ops_enable_mutex);
+ mutex_unlock(&scx_enable_mutex);
return ret;
err_disable_unlock_all:
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
- scx_ops_bypass(false);
+ scx_bypass(false);
err_disable:
- mutex_unlock(&scx_ops_enable_mutex);
+ mutex_unlock(&scx_enable_mutex);
/*
* Returning an error code here would not pass all the error information
- * to userspace. Record errno using scx_ops_error() for cases
- * scx_ops_error() wasn't already invoked and exit indicating success so
- * that the error is notified through ops.exit() with all the details.
+ * to userspace. Record errno using scx_error() for cases scx_error()
+ * wasn't already invoked and exit indicating success so that the error
+ * is notified through ops.exit() with all the details.
*
- * Flush scx_ops_disable_work to ensure that error is reported before
- * init completion.
+ * Flush scx_disable_work to ensure that error is reported before init
+ * completion.
*/
- scx_ops_error("scx_ops_enable() failed (%d)", ret);
- kthread_flush_work(&scx_ops_disable_work);
+ scx_error("scx_enable() failed (%d)", ret);
+ kthread_flush_work(&scx_disable_work);
return 0;
}
@@ -5679,13 +5667,13 @@ static int bpf_scx_check_member(const struct btf_type *t,
static int bpf_scx_reg(void *kdata, struct bpf_link *link)
{
- return scx_ops_enable(kdata, link);
+ return scx_enable(kdata, link);
}
static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
{
- scx_ops_disable(SCX_EXIT_UNREG);
- kthread_flush_work(&scx_ops_disable_work);
+ scx_disable(SCX_EXIT_UNREG);
+ kthread_flush_work(&scx_disable_work);
}
static int bpf_scx_init(struct btf *btf)
@@ -5807,8 +5795,8 @@ static struct bpf_struct_ops bpf_sched_ext_ops = {
static void sysrq_handle_sched_ext_reset(u8 key)
{
- if (scx_ops_helper)
- scx_ops_disable(SCX_EXIT_SYSRQ);
+ if (scx_helper)
+ scx_disable(SCX_EXIT_SYSRQ);
else
pr_info("sched_ext: BPF scheduler not yet used\n");
}
@@ -5958,13 +5946,13 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
*/
void print_scx_info(const char *log_lvl, struct task_struct *p)
{
- enum scx_ops_enable_state state = scx_ops_enable_state();
+ enum scx_enable_state state = scx_enable_state();
const char *all = READ_ONCE(scx_switching_all) ? "+all" : "";
char runnable_at_buf[22] = "?";
struct sched_class *class;
unsigned long runnable_at;
- if (state == SCX_OPS_DISABLED)
+ if (state == SCX_DISABLED)
return;
/*
@@ -5974,7 +5962,7 @@ void print_scx_info(const char *log_lvl, struct task_struct *p)
if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
class != &ext_sched_class) {
printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name,
- scx_ops_enable_state_str[state], all);
+ scx_enable_state_str[state], all);
return;
}
@@ -5985,7 +5973,7 @@ void print_scx_info(const char *log_lvl, struct task_struct *p)
/* print everything onto one line to conserve console space */
printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
- log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all,
+ log_lvl, scx_ops.name, scx_enable_state_str[state], all,
runnable_at_buf);
}
@@ -6001,12 +5989,12 @@ static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
case PM_RESTORE_PREPARE:
- scx_ops_bypass(true);
+ scx_bypass(true);
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
case PM_POST_RESTORE:
- scx_ops_bypass(false);
+ scx_bypass(false);
break;
}
@@ -6073,12 +6061,12 @@ static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
lockdep_assert_irqs_disabled();
if (unlikely(!p)) {
- scx_ops_error("called with NULL task");
+ scx_error("called with NULL task");
return false;
}
if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
- scx_ops_error("invalid enq_flags 0x%llx", enq_flags);
+ scx_error("invalid enq_flags 0x%llx", enq_flags);
return false;
}
@@ -6098,7 +6086,7 @@ static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id,
}
if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
- scx_ops_error("dispatch buffer overflow");
+ scx_error("dispatch buffer overflow");
return;
}
@@ -6264,7 +6252,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
* cause similar live-lock conditions as consume_dispatch_q(). Insert a
* breather if necessary.
*/
- scx_ops_breather(src_rq);
+ scx_breather(src_rq);
locked_rq = src_rq;
raw_spin_lock(&src_dsq->lock);
@@ -6343,7 +6331,7 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
if (dspc->cursor > 0)
dspc->cursor--;
else
- scx_ops_error("dispatch buffer underflow");
+ scx_error("dispatch buffer underflow");
}
/**
@@ -6372,7 +6360,7 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
dsq = find_user_dsq(dsq_id);
if (unlikely(!dsq)) {
- scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id);
+ scx_error("invalid DSQ ID 0x%016llx", dsq_id);
return false;
}
@@ -6696,7 +6684,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
struct rq *target_rq = cpu_rq(cpu);
if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
- scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
+ scx_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
if (raw_spin_rq_trylock(target_rq)) {
if (can_skip_idle_kick(target_rq)) {
@@ -6884,21 +6872,20 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
(data__sz && !data)) {
- scx_ops_error("invalid data=%p and data__sz=%u",
- (void *)data, data__sz);
+ scx_error("invalid data=%p and data__sz=%u", (void *)data, data__sz);
return -EINVAL;
}
ret = copy_from_kernel_nofault(data_buf, data, data__sz);
if (ret < 0) {
- scx_ops_error("failed to read data fields (%d)", ret);
+ scx_error("failed to read data fields (%d)", ret);
return ret;
}
ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
&bprintf_data);
if (ret < 0) {
- scx_ops_error("format preparation failed (%d)", ret);
+ scx_error("format preparation failed (%d)", ret);
return ret;
}
@@ -6906,8 +6893,7 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
bprintf_data.bin_args);
bpf_bprintf_cleanup(&bprintf_data);
if (ret < 0) {
- scx_ops_error("(\"%s\", %p, %u) failed to format",
- fmt, data, data__sz);
+ scx_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
return ret;
}
@@ -6940,8 +6926,7 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s",
- scx_exit_bstr_buf.line);
+ __scx_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -6961,8 +6946,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s",
- scx_exit_bstr_buf.line);
+ __scx_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -6986,7 +6970,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
s32 ret;
if (raw_smp_processor_id() != dd->cpu) {
- scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends");
+ scx_error("scx_bpf_dump() must only be called from ops.dump() and friends");
return;
}
@@ -7072,7 +7056,7 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
{
if (unlikely(perf > SCX_CPUPERF_ONE)) {
- scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+ scx_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
return;
}
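
Aside: the exit-code comment near the top of this file's diff documents a three-field 64-bit layout (system action in bits 63..48, system reason in bits 47..32, user-defined value in bits 31..0), and the diff itself combines SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG to populate the ACT and RSN fields when exiting on hotplug events. A minimal decoding sketch follows; it is purely illustrative, assumes kernel-style integer types, and the helpers below do not exist in the kernel.

/*
 * Illustrative only: split an exit code along the documented bit layout.
 * These helpers are invented for this sketch.
 */
static inline u64 ecode_sys_act(s64 ecode) { return (u64)ecode >> 48; }			/* bits 63..48, SYS ACT */
static inline u64 ecode_sys_rsn(s64 ecode) { return ((u64)ecode >> 32) & 0xffff; }	/* bits 47..32, SYS RSN */
static inline u32 ecode_usr(s64 ecode)     { return (u32)ecode; }			/* bits 31..0,  USR */
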
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index cb343ca889e0..023ae6df5e8c 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -47,6 +47,13 @@ static struct scx_idle_cpus scx_idle_global_masks;
static struct scx_idle_cpus **scx_idle_node_masks;
/*
+ * Local per-CPU cpumasks (used to generate temporary idle cpumasks).
+ */
+static DEFINE_PER_CPU(cpumask_var_t, local_idle_cpumask);
+static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask);
+static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask);
+
+/*
* Return the idle masks associated to a target @node.
*
* NUMA_NO_NODE identifies the global idle cpumask.
@@ -392,6 +399,14 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
}
/*
+ * Return true if @p can run on all possible CPUs, false otherwise.
+ */
+static inline bool task_affinity_all(const struct task_struct *p)
+{
+ return p->nr_cpus_allowed >= num_possible_cpus();
+}
+
+/*
* Built-in CPU idle selection policy:
*
* 1. Prioritize full-idle cores:
@@ -403,13 +418,15 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
* branch prediction optimizations.
*
* 3. Pick a CPU within the same LLC (Last-Level Cache):
- * - if the above conditions aren't met, pick a CPU that shares the same LLC
- * to maintain cache locality.
+ * - if the above conditions aren't met, pick a CPU that shares the same
+ * LLC, if the LLC domain is a subset of @cpus_allowed, to maintain
+ * cache locality.
*
* 4. Pick a CPU within the same NUMA node, if enabled:
- * - choose a CPU from the same NUMA node to reduce memory access latency.
+ * - choose a CPU from the same NUMA node, if the node cpumask is a
+ * subset of @cpus_allowed, to reduce memory access latency.
*
- * 5. Pick any idle CPU usable by the task.
+ * 5. Pick any idle CPU within the @cpus_allowed domain.
*
* Step 3 and 4 are performed only if the system has, respectively,
* multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and
@@ -424,35 +441,77 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
* NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
* we never call ops.select_cpu() for them, see select_task_rq().
*/
-s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags)
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags)
{
- const struct cpumask *llc_cpus = NULL;
- const struct cpumask *numa_cpus = NULL;
+ const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL;
+ const struct cpumask *allowed = cpus_allowed ?: p->cpus_ptr;
int node = scx_cpu_node_if_enabled(prev_cpu);
s32 cpu;
+ preempt_disable();
+
+ /*
+ * Determine the subset of CPUs usable by @p within @cpus_allowed.
+ */
+ if (allowed != p->cpus_ptr) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_idle_cpumask);
+
+ if (task_affinity_all(p)) {
+ allowed = cpus_allowed;
+ } else if (cpumask_and(local_cpus, cpus_allowed, p->cpus_ptr)) {
+ allowed = local_cpus;
+ } else {
+ cpu = -EBUSY;
+ goto out_enable;
+ }
+
+ /*
+ * If @prev_cpu is not in the allowed CPUs, skip topology
+ * optimizations and try to pick any idle CPU usable by the
+ * task.
+ *
+ * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, prioritize
+ * the current node, as it may optimize some waker->wakee
+ * workloads.
+ */
+ if (!cpumask_test_cpu(prev_cpu, allowed)) {
+ node = scx_cpu_node_if_enabled(smp_processor_id());
+ cpu = scx_pick_idle_cpu(allowed, node, flags);
+ goto out_enable;
+ }
+ }
+
/*
* This is necessary to protect llc_cpus.
*/
rcu_read_lock();
/*
- * Determine the scheduling domain only if the task is allowed to run
- * on all CPUs.
+ * Determine the subset of CPUs that the task can use in its
+ * current LLC and node.
*
- * This is done primarily for efficiency, as it avoids the overhead of
- * updating a cpumask every time we need to select an idle CPU (which
- * can be costly in large SMP systems), but it also aligns logically:
- * if a task's scheduling domain is restricted by user-space (through
- * CPU affinity), the task will simply use the flat scheduling domain
- * defined by user-space.
+ * If the task can run on all CPUs, use the node and LLC cpumasks
+ * directly.
*/
- if (p->nr_cpus_allowed >= num_possible_cpus()) {
- if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
- numa_cpus = numa_span(prev_cpu);
+ if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_numa_idle_cpumask);
+ const struct cpumask *cpus = numa_span(prev_cpu);
+
+ if (allowed == p->cpus_ptr && task_affinity_all(p))
+ numa_cpus = cpus;
+ else if (cpus && cpumask_and(local_cpus, allowed, cpus))
+ numa_cpus = local_cpus;
+ }
+
+ if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_llc_idle_cpumask);
+ const struct cpumask *cpus = llc_span(prev_cpu);
- if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
- llc_cpus = llc_span(prev_cpu);
+ if (allowed == p->cpus_ptr && task_affinity_all(p))
+ llc_cpus = cpus;
+ else if (cpus && cpumask_and(local_cpus, allowed, cpus))
+ llc_cpus = local_cpus;
}
/*
@@ -490,7 +549,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
(!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
!cpumask_empty(idle_cpumask(waker_node)->cpu)) {
- if (cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (cpumask_test_cpu(cpu, allowed))
goto out_unlock;
}
}
@@ -535,7 +594,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
* begin in prev_cpu's node and proceed to other nodes in
* order of increasing distance.
*/
- cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE);
+ cpu = scx_pick_idle_cpu(allowed, node, flags | SCX_PICK_IDLE_CORE);
if (cpu >= 0)
goto out_unlock;
@@ -583,10 +642,12 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
* in prev_cpu's node and proceed to other nodes in order of
* increasing distance.
*/
- cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags);
+ cpu = scx_pick_idle_cpu(allowed, node, flags);
out_unlock:
rcu_read_unlock();
+out_enable:
+ preempt_enable();
return cpu;
}
@@ -596,7 +657,7 @@ out_unlock:
*/
void scx_idle_init_masks(void)
{
- int node;
+ int i;
/* Allocate global idle cpumasks */
BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
@@ -607,13 +668,23 @@ void scx_idle_init_masks(void)
sizeof(*scx_idle_node_masks), GFP_KERNEL);
BUG_ON(!scx_idle_node_masks);
- for_each_node(node) {
- scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks),
- GFP_KERNEL, node);
- BUG_ON(!scx_idle_node_masks[node]);
+ for_each_node(i) {
+ scx_idle_node_masks[i] = kzalloc_node(sizeof(**scx_idle_node_masks),
+ GFP_KERNEL, i);
+ BUG_ON(!scx_idle_node_masks[i]);
+
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->cpu, GFP_KERNEL, i));
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->smt, GFP_KERNEL, i));
+ }
- BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node));
- BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node));
+ /* Allocate local per-cpu idle cpumasks */
+ for_each_possible_cpu(i) {
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_llc_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_numa_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
}
}
@@ -748,7 +819,7 @@ void scx_idle_disable(void)
static int validate_node(int node)
{
if (!static_branch_likely(&scx_builtin_idle_per_node)) {
- scx_ops_error("per-node idle tracking is disabled");
+ scx_error("per-node idle tracking is disabled");
return -EOPNOTSUPP;
}
@@ -758,13 +829,13 @@ static int validate_node(int node)
/* Make sure node is in a valid range */
if (node < 0 || node >= nr_node_ids) {
- scx_ops_error("invalid node %d", node);
+ scx_error("invalid node %d", node);
return -EINVAL;
}
/* Make sure the node is part of the set of possible nodes */
if (!node_possible(node)) {
- scx_ops_error("unavailable node %d", node);
+ scx_error("unavailable node %d", node);
return -EINVAL;
}
@@ -778,7 +849,7 @@ static bool check_builtin_idle_enabled(void)
if (static_branch_likely(&scx_builtin_idle_enabled))
return true;
- scx_ops_error("built-in idle tracking is disabled");
+ scx_error("built-in idle tracking is disabled");
return false;
}
@@ -829,7 +900,7 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
goto prev_cpu;
#ifdef CONFIG_SMP
- cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
*is_idle = true;
return cpu;
@@ -842,13 +913,67 @@ prev_cpu:
}
/**
+ * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p,
+ * prioritizing those in @cpus_allowed
+ * @p: task_struct to select a CPU for
+ * @prev_cpu: CPU @p was on previously
+ * @wake_flags: %SCX_WAKE_* flags
+ * @cpus_allowed: cpumask of allowed CPUs
+ * @flags: %SCX_PICK_IDLE* flags
+ *
+ * Can only be called from ops.select_cpu() or ops.enqueue() if the
+ * built-in CPU selection is enabled: ops.update_idle() is missing or
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is set.
+ *
+ * @p, @prev_cpu and @wake_flags match ops.select_cpu().
+ *
+ * Returns the selected idle CPU, which will be automatically awakened upon
+ * returning from ops.select_cpu() and can be used for direct dispatch, or
+ * a negative value if no idle CPU is available.
+ */
+__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags)
+{
+ s32 cpu;
+
+ if (!ops_cpu_valid(prev_cpu, NULL))
+ return -EINVAL;
+
+ if (!check_builtin_idle_enabled())
+ return -EBUSY;
+
+ if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
+ return -EPERM;
+
+#ifdef CONFIG_SMP
+ /*
+ * This may also be called from ops.enqueue(), so we need to handle
+ * per-CPU tasks as well. For these tasks, we can skip all idle CPU
+ * selection optimizations and simply check whether the previously
+ * used CPU is idle and within the allowed cpumask.
+ */
+ if (p->nr_cpus_allowed == 1) {
+ if (cpumask_test_cpu(prev_cpu, cpus_allowed) &&
+ scx_idle_test_and_clear_cpu(prev_cpu))
+ return prev_cpu;
+ return -EBUSY;
+ }
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, cpus_allowed, flags);
+#else
+ cpu = -EBUSY;
+#endif
+
+ return cpu;
+}
+
+/**
* scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the
* idle-tracking per-CPU cpumask of a target NUMA node.
* @node: target NUMA node
*
* Returns an empty cpumask if idle tracking is not enabled, if @node is
* not valid, or running on a UP kernel. In this case the actual error will
- * be reported to the BPF scheduler via scx_ops_error().
+ * be reported to the BPF scheduler via scx_error().
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
{
@@ -873,7 +998,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
{
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
@@ -895,7 +1020,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
*
* Returns an empty cpumask if idle tracking is not enabled, if @node is
* not valid, or running on a UP kernel. In this case the actual error will
- * be reported to the BPF scheduler via scx_ops_error().
+ * be reported to the BPF scheduler via scx_error().
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
{
@@ -924,7 +1049,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
{
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
@@ -1032,7 +1157,7 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
u64 flags)
{
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_ops_error("per-node idle tracking is enabled");
+ scx_error("per-node idle tracking is enabled");
return -EBUSY;
}
@@ -1109,7 +1234,7 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
s32 cpu;
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_ops_error("per-node idle tracking is enabled");
+ scx_error("per-node idle tracking is enabled");
return -EBUSY;
}
@@ -1149,6 +1274,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
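
A minimal sketch of how a BPF scheduler might call the new kfunc from
ops.select_cpu(): it restricts the idle search to a scheduler-maintained
preferred mask and direct-dispatches on success. The names example_select_cpu
and prio_mask are hypothetical, SCX_PICK_IDLE_CORE merely illustrates the
%SCX_PICK_IDLE* flags, and the allowed_cpus selftest added later in this patch
exercises the same pattern:

s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	/* prio_mask: hypothetical struct bpf_cpumask __kptr kept by the scheduler */
	const struct cpumask *prio = cast_mask(prio_mask);
	s32 cpu;

	if (!prio)
		return prev_cpu;

	/* Prefer a fully idle SMT core, restricted to the preferred mask. */
	cpu = scx_bpf_select_cpu_and(p, prev_cpu, wake_flags, prio,
				     SCX_PICK_IDLE_CORE);
	if (cpu >= 0) {
		/* An idle CPU was claimed: direct-dispatch @p to it. */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
		return cpu;
	}

	/* No idle CPU in the preferred mask; fall back to the previous CPU. */
	return prev_cpu;
}
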
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
index 511cc2221f7a..37be78a7502b 100644
--- a/kernel/sched/ext_idle.h
+++ b/kernel/sched/ext_idle.h
@@ -27,7 +27,8 @@ static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node
}
#endif /* CONFIG_SMP */
-s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags);
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags);
void scx_idle_enable(struct sched_ext_ops *ops);
void scx_idle_disable(void);
int scx_idle_init(void);
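
With the extra @cpus_allowed parameter, in-kernel callers of scx_select_cpu_dfl()
now pass the cpumask that bounds the idle-CPU search explicitly. As a hedged
illustration only (example_pick_idle_for_task is a hypothetical helper, not part
of this patch), a caller that wants the search limited to the task's own
affinity could do:

static s32 example_pick_idle_for_task(struct task_struct *p, s32 prev_cpu,
				      u64 wake_flags)
{
	/* Bound the idle search to the CPUs @p is already allowed to run on. */
	return scx_select_cpu_dfl(p, prev_cpu, wake_flags, p->cpus_ptr, 0);
}
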
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 47972f34ea70..ac07f64c8f39 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1717,10 +1717,10 @@ extern struct balance_callback balance_push_callback;
#ifdef CONFIG_SCHED_CLASS_EXT
extern const struct sched_class ext_sched_class;
-DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */
+DECLARE_STATIC_KEY_FALSE(__scx_enabled); /* SCX BPF scheduler loaded */
DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */
-#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled)
+#define scx_enabled() static_branch_unlikely(&__scx_enabled)
#define scx_switched_all() static_branch_unlikely(&__scx_switched_all)
static inline void scx_rq_clock_update(struct rq *rq, u64 clock)
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 8787048c6762..d4e21558e982 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -48,6 +48,8 @@ static inline void ___vmlinux_h_sanity_check___(void)
s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
+s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags) __ksym __weak;
void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
u32 scx_bpf_dispatch_nr_slots(void) __ksym;
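
Because scx_bpf_select_cpu_and() is declared __weak here, a scheduler built
against this header can still load on kernels that predate the kfunc by probing
for it at runtime. A minimal sketch of such a compat check inside
ops.select_cpu(), using libbpf's bpf_ksym_exists(); compat_select_cpu is a
hypothetical name and the fallback policy is only an example:

s32 BPF_STRUCT_OPS(compat_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	if (bpf_ksym_exists(scx_bpf_select_cpu_and)) {
		/* Newer kernel: bound the idle search to @p's affinity. */
		cpu = scx_bpf_select_cpu_and(p, prev_cpu, wake_flags,
					     p->cpus_ptr, 0);
		if (cpu >= 0) {
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
			return cpu;
		}
		return prev_cpu;
	}

	/* Older kernel: fall back to the default idle-CPU selection. */
	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle)
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	return cpu;
}
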
diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py
index b800d4f5f2e9..7cdcc6729ea4 100644
--- a/tools/sched_ext/scx_show_state.py
+++ b/tools/sched_ext/scx_show_state.py
@@ -24,19 +24,19 @@ def read_atomic(name):
def read_static_key(name):
return prog[name].key.enabled.counter.value_()
-def ops_state_str(state):
- return prog['scx_ops_enable_state_str'][state].string_().decode()
+def state_str(state):
+ return prog['scx_enable_state_str'][state].string_().decode()
ops = prog['scx_ops']
-enable_state = read_atomic("scx_ops_enable_state_var")
+enable_state = read_atomic("scx_enable_state_var")
print(f'ops : {ops.name.string_().decode()}')
-print(f'enabled : {read_static_key("__scx_ops_enabled")}')
+print(f'enabled : {read_static_key("__scx_enabled")}')
print(f'switching_all : {read_int("scx_switching_all")}')
print(f'switched_all : {read_static_key("__scx_switched_all")}')
-print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})')
+print(f'enable_state : {state_str(enable_state)} ({enable_state})')
print(f'in_softlockup : {prog["scx_in_softlockup"].value_()}')
-print(f'breather_depth: {read_atomic("scx_ops_breather_depth")}')
-print(f'bypass_depth : {prog["scx_ops_bypass_depth"].value_()}')
+print(f'breather_depth: {read_atomic("scx_breather_depth")}')
+print(f'bypass_depth : {prog["scx_bypass_depth"].value_()}')
print(f'nr_rejected : {read_atomic("scx_nr_rejected")}')
print(f'enable_seq : {read_atomic("scx_enable_seq")}')
diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index f4531327b8e7..e9d5bc575f80 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -173,6 +173,7 @@ auto-test-targets := \
maybe_null \
minimal \
numa \
+ allowed_cpus \
prog_run \
reload_loop \
select_cpu_dfl \
diff --git a/tools/testing/selftests/sched_ext/allowed_cpus.bpf.c b/tools/testing/selftests/sched_ext/allowed_cpus.bpf.c
new file mode 100644
index 000000000000..39d57f7f7409
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/allowed_cpus.bpf.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that validates the behavior of scx_bpf_select_cpu_and() by
+ * selecting idle CPUs strictly within a subset of allowed CPUs.
+ *
+ * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com>
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+private(PREF_CPUS) struct bpf_cpumask __kptr * allowed_cpumask;
+
+static void
+validate_idle_cpu(const struct task_struct *p, const struct cpumask *allowed, s32 cpu)
+{
+ if (scx_bpf_test_and_clear_cpu_idle(cpu))
+ scx_bpf_error("CPU %d should be marked as busy", cpu);
+
+ if (bpf_cpumask_subset(allowed, p->cpus_ptr) &&
+ !bpf_cpumask_test_cpu(cpu, allowed))
+ scx_bpf_error("CPU %d not in the allowed domain for %d (%s)",
+ cpu, p->pid, p->comm);
+}
+
+s32 BPF_STRUCT_OPS(allowed_cpus_select_cpu,
+ struct task_struct *p, s32 prev_cpu, u64 wake_flags)
+{
+ const struct cpumask *allowed;
+ s32 cpu;
+
+ allowed = cast_mask(allowed_cpumask);
+ if (!allowed) {
+ scx_bpf_error("allowed domain not initialized");
+ return -EINVAL;
+ }
+
+ /*
+ * Select an idle CPU strictly within the allowed domain.
+ */
+ cpu = scx_bpf_select_cpu_and(p, prev_cpu, wake_flags, allowed, 0);
+ if (cpu >= 0) {
+ validate_idle_cpu(p, allowed, cpu);
+ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+
+ return cpu;
+ }
+
+ return prev_cpu;
+}
+
+void BPF_STRUCT_OPS(allowed_cpus_enqueue, struct task_struct *p, u64 enq_flags)
+{
+ const struct cpumask *allowed;
+ s32 prev_cpu = scx_bpf_task_cpu(p), cpu;
+
+ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+
+ allowed = cast_mask(allowed_cpumask);
+ if (!allowed) {
+ scx_bpf_error("allowed domain not initialized");
+ return;
+ }
+
+ /*
+ * Use scx_bpf_select_cpu_and() to proactively kick an idle CPU
+ * within @allowed_cpumask that is also usable by @p.
+ */
+ cpu = scx_bpf_select_cpu_and(p, prev_cpu, 0, allowed, 0);
+ if (cpu >= 0) {
+ validate_idle_cpu(p, allowed, cpu);
+ scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
+ }
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(allowed_cpus_init)
+{
+ struct bpf_cpumask *mask;
+
+ mask = bpf_cpumask_create();
+ if (!mask)
+ return -ENOMEM;
+
+ mask = bpf_kptr_xchg(&allowed_cpumask, mask);
+ if (mask)
+ bpf_cpumask_release(mask);
+
+ bpf_rcu_read_lock();
+
+ /*
+ * Assign the first online CPU to the allowed domain.
+ */
+ mask = allowed_cpumask;
+ if (mask) {
+ const struct cpumask *online = scx_bpf_get_online_cpumask();
+
+ bpf_cpumask_set_cpu(bpf_cpumask_first(online), mask);
+ scx_bpf_put_cpumask(online);
+ }
+
+ bpf_rcu_read_unlock();
+
+ return 0;
+}
+
+void BPF_STRUCT_OPS(allowed_cpus_exit, struct scx_exit_info *ei)
+{
+ UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops allowed_cpus_ops = {
+ .select_cpu = (void *)allowed_cpus_select_cpu,
+ .enqueue = (void *)allowed_cpus_enqueue,
+ .init = (void *)allowed_cpus_init,
+ .exit = (void *)allowed_cpus_exit,
+ .name = "allowed_cpus",
+};
diff --git a/tools/testing/selftests/sched_ext/allowed_cpus.c b/tools/testing/selftests/sched_ext/allowed_cpus.c
new file mode 100644
index 000000000000..a001a3a0e9f1
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/allowed_cpus.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com>
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "allowed_cpus.bpf.skel.h"
+#include "scx_test.h"
+
+static enum scx_test_status setup(void **ctx)
+{
+ struct allowed_cpus *skel;
+
+ skel = allowed_cpus__open();
+ SCX_FAIL_IF(!skel, "Failed to open");
+ SCX_ENUM_INIT(skel);
+ SCX_FAIL_IF(allowed_cpus__load(skel), "Failed to load skel");
+
+ *ctx = skel;
+
+ return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+ struct allowed_cpus *skel = ctx;
+ struct bpf_link *link;
+
+ link = bpf_map__attach_struct_ops(skel->maps.allowed_cpus_ops);
+ SCX_FAIL_IF(!link, "Failed to attach scheduler");
+
+ /* Just sleeping is fine, plenty of scheduling events happening */
+ sleep(1);
+
+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
+ bpf_link__destroy(link);
+
+ return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+ struct allowed_cpus *skel = ctx;
+
+ allowed_cpus__destroy(skel);
+}
+
+struct scx_test allowed_cpus = {
+ .name = "allowed_cpus",
+ .description = "Verify scx_bpf_select_cpu_and()",
+ .setup = setup,
+ .run = run,
+ .cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&allowed_cpus)