diff options
author | Tejun Heo <tj@kernel.org> | 2024-06-18 10:09:20 -1000 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2024-06-18 10:09:20 -1000 |
commit | 60c27fb59f6cffa73fc8c60e3a22323c78044576 (patch) | |
tree | af08b7ff1bb3f61bfae6b1b6ed16ed3f0fa32c0e /kernel/sched/ext.c | |
parent | 245254f7081dbe1c8da54675d0e4ddbe74cee61b (diff) |
sched_ext: Implement sched_ext_ops.cpu_online/offline()
Add ops.cpu_online/offline() which are invoked when CPUs come online and
offline respectively. As the enqueue path already automatically bypasses
tasks to the local dsq on a deactivated CPU, BPF schedulers are guaranteed
to see tasks only on CPUs which are between online() and offline().
If the BPF scheduler doesn't implement ops.cpu_online/offline(), the
scheduler is automatically exited with SCX_ECODE_RESTART |
SCX_ECODE_RSN_HOTPLUG. Userspace can implement CPU hotpplug support
trivially by simply reinitializing and reloading the scheduler.
scx_qmap is updated to print out online CPUs on hotplug events. Other
schedulers are updated to restart based on ecode.
v3: - The previous implementation added @reason to
sched_class.rq_on/offline() to distinguish between CPU hotplug events
and topology updates. This was buggy and fragile as the methods are
skipped if the current state equals the target state. Instead, add
scx_rq_[de]activate() which are directly called from
sched_cpu_de/activate(). This also allows ops.cpu_on/offline() to
sleep which can be useful.
- ops.dispatch() could be called on a CPU that the BPF scheduler was
told to be offline. The dispatch patch is updated to bypass in such
cases.
v2: - To accommodate lock ordering change between scx_cgroup_rwsem and
cpus_read_lock(), CPU hotplug operations are put into its own SCX_OPI
block and enabled eariler during scx_ope_enable() so that
cpus_read_lock() can be dropped before acquiring scx_cgroup_rwsem.
- Auto exit with ECODE added.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
Diffstat (limited to 'kernel/sched/ext.c')
-rw-r--r-- | kernel/sched/ext.c | 156 |
1 files changed, 149 insertions, 7 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 686dab6ab592..7c2f2a542b32 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -31,6 +31,29 @@ enum scx_exit_kind { }; /* + * An exit code can be specified when exiting with scx_bpf_exit() or + * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN + * respectively. The codes are 64bit of the format: + * + * Bits: [63 .. 48 47 .. 32 31 .. 0] + * [ SYS ACT ] [ SYS RSN ] [ USR ] + * + * SYS ACT: System-defined exit actions + * SYS RSN: System-defined exit reasons + * USR : User-defined exit codes and reasons + * + * Using the above, users may communicate intention and context by ORing system + * actions and/or system reasons with a user-defined exit code. + */ +enum scx_exit_code { + /* Reasons */ + SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + + /* Actions */ + SCX_ECODE_ACT_RESTART = 1LLU << 48, +}; + +/* * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is * being disabled. */ @@ -457,7 +480,29 @@ struct sched_ext_ops { void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); /* - * All online ops must come before ops.init(). + * All online ops must come before ops.cpu_online(). + */ + + /** + * cpu_online - A CPU became online + * @cpu: CPU which just came up + * + * @cpu just came online. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs beforehand. + */ + void (*cpu_online)(s32 cpu); + + /** + * cpu_offline - A CPU is going offline + * @cpu: CPU which is going offline + * + * @cpu is going offline. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs afterwards. + */ + void (*cpu_offline)(s32 cpu); + + /* + * All CPU hotplug ops must come before ops.init(). */ /** @@ -497,6 +542,15 @@ struct sched_ext_ops { u32 exit_dump_len; /** + * hotplug_seq - A sequence number that may be set by the scheduler to + * detect when a hotplug event has occurred during the loading process. + * If 0, no detection occurs. Otherwise, the scheduler will fail to + * load if the sequence number does not match @scx_hotplug_seq on the + * enable path. + */ + u64 hotplug_seq; + + /** * name - BPF scheduler's name * * Must be a non-zero valid BPF object name including only isalnum(), @@ -509,7 +563,9 @@ struct sched_ext_ops { enum scx_opi { SCX_OPI_BEGIN = 0, SCX_OPI_NORMAL_BEGIN = 0, - SCX_OPI_NORMAL_END = SCX_OP_IDX(init), + SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), SCX_OPI_END = SCX_OP_IDX(init), }; @@ -694,6 +750,7 @@ static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); static struct scx_exit_info *scx_exit_info; static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); +static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); /* * The maximum amount of time in jiffies that a task may be runnable without @@ -1419,11 +1476,7 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags) static bool scx_rq_online(struct rq *rq) { -#ifdef CONFIG_SMP - return likely(rq->online); -#else - return true; -#endif + return likely(rq->scx.flags & SCX_RQ_ONLINE); } static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, @@ -1438,6 +1491,11 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (sticky_cpu == cpu_of(rq)) goto local_norefill; + /* + * If !scx_rq_online(), we already told the BPF scheduler that the CPU + * is offline and are just running the hotplug path. Don't bother the + * BPF scheduler. + */ if (!scx_rq_online(rq)) goto local; @@ -2673,6 +2731,42 @@ void __scx_update_idle(struct rq *rq, bool idle) #endif } +static void handle_hotplug(struct rq *rq, bool online) +{ + int cpu = cpu_of(rq); + + atomic_long_inc(&scx_hotplug_seq); + + if (online && SCX_HAS_OP(cpu_online)) + SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_online, cpu); + else if (!online && SCX_HAS_OP(cpu_offline)) + SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_offline, cpu); + else + scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "cpu %d going %s, exiting scheduler", cpu, + online ? "online" : "offline"); +} + +void scx_rq_activate(struct rq *rq) +{ + handle_hotplug(rq, true); +} + +void scx_rq_deactivate(struct rq *rq) +{ + handle_hotplug(rq, false); +} + +static void rq_online_scx(struct rq *rq) +{ + rq->scx.flags |= SCX_RQ_ONLINE; +} + +static void rq_offline_scx(struct rq *rq) +{ + rq->scx.flags &= ~SCX_RQ_ONLINE; +} + #else /* CONFIG_SMP */ static bool test_and_clear_cpu_idle(int cpu) { return false; } @@ -3104,6 +3198,9 @@ DEFINE_SCHED_CLASS(ext) = { .balance = balance_scx, .select_task_rq = select_task_rq_scx, .set_cpus_allowed = set_cpus_allowed_scx, + + .rq_online = rq_online_scx, + .rq_offline = rq_offline_scx, #endif .task_tick = task_tick_scx, @@ -3235,10 +3332,18 @@ static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, } SCX_ATTR(nr_rejected); +static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); +} +SCX_ATTR(hotplug_seq); + static struct attribute *scx_global_attrs[] = { &scx_attr_state.attr, &scx_attr_switch_all.attr, &scx_attr_nr_rejected.attr, + &scx_attr_hotplug_seq.attr, NULL, }; @@ -3941,6 +4046,25 @@ static struct kthread_worker *scx_create_rt_helper(const char *name) return helper; } +static void check_hotplug_seq(const struct sched_ext_ops *ops) +{ + unsigned long long global_hotplug_seq; + + /* + * If a hotplug event has occurred between when a scheduler was + * initialized, and when we were able to attach, exit and notify user + * space about it. + */ + if (ops->hotplug_seq) { + global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); + if (ops->hotplug_seq != global_hotplug_seq) { + scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "expected hotplug seq %llu did not match actual %llu", + ops->hotplug_seq, global_hotplug_seq); + } + } +} + static int validate_ops(const struct sched_ext_ops *ops) { /* @@ -4023,6 +4147,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) } } + for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) + if (((void (**)(void))ops)[i]) + static_branch_enable_cpuslocked(&scx_has_op[i]); + cpus_read_unlock(); ret = validate_ops(ops); @@ -4064,6 +4192,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) percpu_down_write(&scx_fork_rwsem); cpus_read_lock(); + check_hotplug_seq(ops); + for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) static_branch_enable_cpuslocked(&scx_has_op[i]); @@ -4374,6 +4504,9 @@ static int bpf_scx_init_member(const struct btf_type *t, ops->exit_dump_len = *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; return 1; + case offsetof(struct sched_ext_ops, hotplug_seq): + ops->hotplug_seq = *(u64 *)(udata + moff); + return 1; } return 0; @@ -4387,6 +4520,8 @@ static int bpf_scx_check_member(const struct btf_type *t, switch (moff) { case offsetof(struct sched_ext_ops, init_task): + case offsetof(struct sched_ext_ops, cpu_online): + case offsetof(struct sched_ext_ops, cpu_offline): case offsetof(struct sched_ext_ops, init): case offsetof(struct sched_ext_ops, exit): break; @@ -4457,6 +4592,8 @@ static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} static void enable_stub(struct task_struct *p) {} static void disable_stub(struct task_struct *p) {} +static void cpu_online_stub(s32 cpu) {} +static void cpu_offline_stub(s32 cpu) {} static s32 init_stub(void) { return -EINVAL; } static void exit_stub(struct scx_exit_info *info) {} @@ -4479,6 +4616,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .exit_task = exit_task_stub, .enable = enable_stub, .disable = disable_stub, + .cpu_online = cpu_online_stub, + .cpu_offline = cpu_offline_stub, .init = init_stub, .exit = exit_stub, }; @@ -4719,6 +4858,9 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); + + if (cpu_online(cpu)) + cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; } register_sysrq_key('S', &sysrq_sched_ext_reset_op); |