summaryrefslogtreecommitdiff
path: root/kernel/sched/ext.c
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2024-06-18 10:09:20 -1000
committerTejun Heo <tj@kernel.org>2024-06-18 10:09:20 -1000
commit60c27fb59f6cffa73fc8c60e3a22323c78044576 (patch)
treeaf08b7ff1bb3f61bfae6b1b6ed16ed3f0fa32c0e /kernel/sched/ext.c
parent245254f7081dbe1c8da54675d0e4ddbe74cee61b (diff)
sched_ext: Implement sched_ext_ops.cpu_online/offline()
Add ops.cpu_online/offline() which are invoked when CPUs come online and offline respectively. As the enqueue path already automatically bypasses tasks to the local dsq on a deactivated CPU, BPF schedulers are guaranteed to see tasks only on CPUs which are between online() and offline(). If the BPF scheduler doesn't implement ops.cpu_online/offline(), the scheduler is automatically exited with SCX_ECODE_RESTART | SCX_ECODE_RSN_HOTPLUG. Userspace can implement CPU hotpplug support trivially by simply reinitializing and reloading the scheduler. scx_qmap is updated to print out online CPUs on hotplug events. Other schedulers are updated to restart based on ecode. v3: - The previous implementation added @reason to sched_class.rq_on/offline() to distinguish between CPU hotplug events and topology updates. This was buggy and fragile as the methods are skipped if the current state equals the target state. Instead, add scx_rq_[de]activate() which are directly called from sched_cpu_de/activate(). This also allows ops.cpu_on/offline() to sleep which can be useful. - ops.dispatch() could be called on a CPU that the BPF scheduler was told to be offline. The dispatch patch is updated to bypass in such cases. v2: - To accommodate lock ordering change between scx_cgroup_rwsem and cpus_read_lock(), CPU hotplug operations are put into its own SCX_OPI block and enabled eariler during scx_ope_enable() so that cpus_read_lock() can be dropped before acquiring scx_cgroup_rwsem. - Auto exit with ECODE added. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: David Vernet <dvernet@meta.com> Acked-by: Josh Don <joshdon@google.com> Acked-by: Hao Luo <haoluo@google.com> Acked-by: Barret Rhoden <brho@google.com>
Diffstat (limited to 'kernel/sched/ext.c')
-rw-r--r--kernel/sched/ext.c156
1 files changed, 149 insertions, 7 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 686dab6ab592..7c2f2a542b32 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -31,6 +31,29 @@ enum scx_exit_kind {
};
/*
+ * An exit code can be specified when exiting with scx_bpf_exit() or
+ * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN
+ * respectively. The codes are 64bit of the format:
+ *
+ * Bits: [63 .. 48 47 .. 32 31 .. 0]
+ * [ SYS ACT ] [ SYS RSN ] [ USR ]
+ *
+ * SYS ACT: System-defined exit actions
+ * SYS RSN: System-defined exit reasons
+ * USR : User-defined exit codes and reasons
+ *
+ * Using the above, users may communicate intention and context by ORing system
+ * actions and/or system reasons with a user-defined exit code.
+ */
+enum scx_exit_code {
+ /* Reasons */
+ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
+
+ /* Actions */
+ SCX_ECODE_ACT_RESTART = 1LLU << 48,
+};
+
+/*
* scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
* being disabled.
*/
@@ -457,7 +480,29 @@ struct sched_ext_ops {
void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
/*
- * All online ops must come before ops.init().
+ * All online ops must come before ops.cpu_online().
+ */
+
+ /**
+ * cpu_online - A CPU became online
+ * @cpu: CPU which just came up
+ *
+ * @cpu just came online. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
+ */
+ void (*cpu_online)(s32 cpu);
+
+ /**
+ * cpu_offline - A CPU is going offline
+ * @cpu: CPU which is going offline
+ *
+ * @cpu is going offline. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
+ */
+ void (*cpu_offline)(s32 cpu);
+
+ /*
+ * All CPU hotplug ops must come before ops.init().
*/
/**
@@ -497,6 +542,15 @@ struct sched_ext_ops {
u32 exit_dump_len;
/**
+ * hotplug_seq - A sequence number that may be set by the scheduler to
+ * detect when a hotplug event has occurred during the loading process.
+ * If 0, no detection occurs. Otherwise, the scheduler will fail to
+ * load if the sequence number does not match @scx_hotplug_seq on the
+ * enable path.
+ */
+ u64 hotplug_seq;
+
+ /**
* name - BPF scheduler's name
*
* Must be a non-zero valid BPF object name including only isalnum(),
@@ -509,7 +563,9 @@ struct sched_ext_ops {
enum scx_opi {
SCX_OPI_BEGIN = 0,
SCX_OPI_NORMAL_BEGIN = 0,
- SCX_OPI_NORMAL_END = SCX_OP_IDX(init),
+ SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init),
SCX_OPI_END = SCX_OP_IDX(init),
};
@@ -694,6 +750,7 @@ static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
static struct scx_exit_info *scx_exit_info;
static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
+static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
/*
* The maximum amount of time in jiffies that a task may be runnable without
@@ -1419,11 +1476,7 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags)
static bool scx_rq_online(struct rq *rq)
{
-#ifdef CONFIG_SMP
- return likely(rq->online);
-#else
- return true;
-#endif
+ return likely(rq->scx.flags & SCX_RQ_ONLINE);
}
static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
@@ -1438,6 +1491,11 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
if (sticky_cpu == cpu_of(rq))
goto local_norefill;
+ /*
+ * If !scx_rq_online(), we already told the BPF scheduler that the CPU
+ * is offline and are just running the hotplug path. Don't bother the
+ * BPF scheduler.
+ */
if (!scx_rq_online(rq))
goto local;
@@ -2673,6 +2731,42 @@ void __scx_update_idle(struct rq *rq, bool idle)
#endif
}
+static void handle_hotplug(struct rq *rq, bool online)
+{
+ int cpu = cpu_of(rq);
+
+ atomic_long_inc(&scx_hotplug_seq);
+
+ if (online && SCX_HAS_OP(cpu_online))
+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_online, cpu);
+ else if (!online && SCX_HAS_OP(cpu_offline))
+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_offline, cpu);
+ else
+ scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+ "cpu %d going %s, exiting scheduler", cpu,
+ online ? "online" : "offline");
+}
+
+void scx_rq_activate(struct rq *rq)
+{
+ handle_hotplug(rq, true);
+}
+
+void scx_rq_deactivate(struct rq *rq)
+{
+ handle_hotplug(rq, false);
+}
+
+static void rq_online_scx(struct rq *rq)
+{
+ rq->scx.flags |= SCX_RQ_ONLINE;
+}
+
+static void rq_offline_scx(struct rq *rq)
+{
+ rq->scx.flags &= ~SCX_RQ_ONLINE;
+}
+
#else /* CONFIG_SMP */
static bool test_and_clear_cpu_idle(int cpu) { return false; }
@@ -3104,6 +3198,9 @@ DEFINE_SCHED_CLASS(ext) = {
.balance = balance_scx,
.select_task_rq = select_task_rq_scx,
.set_cpus_allowed = set_cpus_allowed_scx,
+
+ .rq_online = rq_online_scx,
+ .rq_offline = rq_offline_scx,
#endif
.task_tick = task_tick_scx,
@@ -3235,10 +3332,18 @@ static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj,
}
SCX_ATTR(nr_rejected);
+static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq));
+}
+SCX_ATTR(hotplug_seq);
+
static struct attribute *scx_global_attrs[] = {
&scx_attr_state.attr,
&scx_attr_switch_all.attr,
&scx_attr_nr_rejected.attr,
+ &scx_attr_hotplug_seq.attr,
NULL,
};
@@ -3941,6 +4046,25 @@ static struct kthread_worker *scx_create_rt_helper(const char *name)
return helper;
}
+static void check_hotplug_seq(const struct sched_ext_ops *ops)
+{
+ unsigned long long global_hotplug_seq;
+
+ /*
+ * If a hotplug event has occurred between when a scheduler was
+ * initialized, and when we were able to attach, exit and notify user
+ * space about it.
+ */
+ if (ops->hotplug_seq) {
+ global_hotplug_seq = atomic_long_read(&scx_hotplug_seq);
+ if (ops->hotplug_seq != global_hotplug_seq) {
+ scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+ "expected hotplug seq %llu did not match actual %llu",
+ ops->hotplug_seq, global_hotplug_seq);
+ }
+ }
+}
+
static int validate_ops(const struct sched_ext_ops *ops)
{
/*
@@ -4023,6 +4147,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
}
}
+ for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
+ if (((void (**)(void))ops)[i])
+ static_branch_enable_cpuslocked(&scx_has_op[i]);
+
cpus_read_unlock();
ret = validate_ops(ops);
@@ -4064,6 +4192,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
percpu_down_write(&scx_fork_rwsem);
cpus_read_lock();
+ check_hotplug_seq(ops);
+
for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
if (((void (**)(void))ops)[i])
static_branch_enable_cpuslocked(&scx_has_op[i]);
@@ -4374,6 +4504,9 @@ static int bpf_scx_init_member(const struct btf_type *t,
ops->exit_dump_len =
*(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN;
return 1;
+ case offsetof(struct sched_ext_ops, hotplug_seq):
+ ops->hotplug_seq = *(u64 *)(udata + moff);
+ return 1;
}
return 0;
@@ -4387,6 +4520,8 @@ static int bpf_scx_check_member(const struct btf_type *t,
switch (moff) {
case offsetof(struct sched_ext_ops, init_task):
+ case offsetof(struct sched_ext_ops, cpu_online):
+ case offsetof(struct sched_ext_ops, cpu_offline):
case offsetof(struct sched_ext_ops, init):
case offsetof(struct sched_ext_ops, exit):
break;
@@ -4457,6 +4592,8 @@ static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args
static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {}
static void enable_stub(struct task_struct *p) {}
static void disable_stub(struct task_struct *p) {}
+static void cpu_online_stub(s32 cpu) {}
+static void cpu_offline_stub(s32 cpu) {}
static s32 init_stub(void) { return -EINVAL; }
static void exit_stub(struct scx_exit_info *info) {}
@@ -4479,6 +4616,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
.exit_task = exit_task_stub,
.enable = enable_stub,
.disable = disable_stub,
+ .cpu_online = cpu_online_stub,
+ .cpu_offline = cpu_offline_stub,
.init = init_stub,
.exit = exit_stub,
};
@@ -4719,6 +4858,9 @@ void __init init_sched_ext_class(void)
BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
+
+ if (cpu_online(cpu))
+ cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE;
}
register_sysrq_key('S', &sysrq_sched_ext_reset_op);