Diffstat (limited to 'kernel/sched/ext.c')
-rw-r--r-- | kernel/sched/ext.c | 130
1 file changed, 126 insertions, 4 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 1f5d80df263a..3dc515b3351f 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -8,6 +8,7 @@ enum scx_consts {
 	SCX_DSP_DFL_MAX_BATCH		= 32,
+	SCX_WATCHDOG_MAX_TIMEOUT	= 30 * HZ,
 
 	SCX_EXIT_BT_LEN			= 64,
 	SCX_EXIT_MSG_LEN		= 1024,
@@ -24,6 +25,7 @@ enum scx_exit_kind {
 	SCX_EXIT_ERROR = 1024,	/* runtime error, error msg contains details */
 	SCX_EXIT_ERROR_BPF,	/* ERROR but triggered through scx_bpf_error() */
+	SCX_EXIT_ERROR_STALL,	/* watchdog detected stalled runnable tasks */
 };
 
 /*
@@ -320,6 +322,15 @@ struct sched_ext_ops {
 	u64 flags;
 
 	/**
+	 * timeout_ms - The maximum amount of time, in milliseconds, that a
+	 * runnable task should be able to wait before being scheduled. The
+	 * maximum timeout may not exceed the default timeout of 30 seconds.
+	 *
+	 * Defaults to the maximum allowed timeout value of 30 seconds.
+	 */
+	u32 timeout_ms;
+
+	/**
 	 * name - BPF scheduler's name
 	 *
 	 * Must be a non-zero valid BPF object name including only isalnum(),
@@ -472,6 +483,23 @@ struct static_key_false scx_has_op[SCX_OPI_END] =
 static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
 static struct scx_exit_info *scx_exit_info;
 
+/*
+ * The maximum amount of time in jiffies that a task may be runnable without
+ * being scheduled on a CPU. If this timeout is exceeded, it will trigger
+ * scx_ops_error().
+ */
+static unsigned long scx_watchdog_timeout;
+
+/*
+ * The last time the delayed work was run. This delayed work relies on
+ * ksoftirqd being able to run to service timer interrupts, so it's possible
+ * that this work itself could get wedged. To account for this, we check that
+ * it's not stalled in the timer tick, and trigger an error if it is.
+ */
+static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
+
+static struct delayed_work scx_watchdog_work;
+
 /* idle tracking */
 #ifdef CONFIG_SMP
 #ifdef CONFIG_CPUMASK_OFFSTACK
@@ -1170,6 +1198,11 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)
 {
 	lockdep_assert_rq_held(rq);
 
+	if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) {
+		p->scx.runnable_at = jiffies;
+		p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT;
+	}
+
 	/*
 	 * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
 	 * appened to the runnable_list.
@@ -1177,9 +1210,11 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)
 	list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
 }
 
-static void clr_task_runnable(struct task_struct *p)
+static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
 {
 	list_del_init(&p->scx.runnable_node);
+	if (reset_runnable_at)
+		p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
 }
 
 static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
@@ -1217,7 +1252,8 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags)
 {
 	unsigned long opss;
 
-	clr_task_runnable(p);
+	/* dequeue is always temporary, don't reset runnable_at */
+	clr_task_runnable(p, false);
 
 	/* acquire ensures that we see the preceding updates on QUEUED */
 	opss = atomic_long_read_acquire(&p->scx.ops_state);
@@ -1826,7 +1862,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
 
 	p->se.exec_start = rq_clock_task(rq);
 
-	clr_task_runnable(p);
+	clr_task_runnable(p, true);
 }
 
 static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
@@ -2176,9 +2212,71 @@ static void reset_idle_masks(void) {}
 
 #endif	/* CONFIG_SMP */
 
-static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
+static bool check_rq_for_timeouts(struct rq *rq)
+{
+	struct task_struct *p;
+	struct rq_flags rf;
+	bool timed_out = false;
+
+	rq_lock_irqsave(rq, &rf);
+	list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
+		unsigned long last_runnable = p->scx.runnable_at;
+
+		if (unlikely(time_after(jiffies,
+					last_runnable + scx_watchdog_timeout))) {
+			u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
+
+			scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
+					   "%s[%d] failed to run for %u.%03us",
+					   p->comm, p->pid,
+					   dur_ms / 1000, dur_ms % 1000);
+			timed_out = true;
+			break;
+		}
+	}
+	rq_unlock_irqrestore(rq, &rf);
+
+	return timed_out;
+}
+
+static void scx_watchdog_workfn(struct work_struct *work)
+{
+	int cpu;
+
+	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+
+	for_each_online_cpu(cpu) {
+		if (unlikely(check_rq_for_timeouts(cpu_rq(cpu))))
+			break;
+
+		cond_resched();
+	}
+	queue_delayed_work(system_unbound_wq, to_delayed_work(work),
+			   scx_watchdog_timeout / 2);
+}
+
+void scx_tick(struct rq *rq)
 {
+	unsigned long last_check;
+
+	if (!scx_enabled())
+		return;
+
+	last_check = READ_ONCE(scx_watchdog_timestamp);
+	if (unlikely(time_after(jiffies,
+				last_check + READ_ONCE(scx_watchdog_timeout)))) {
+		u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
+
+		scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
+				   "watchdog failed to check in for %u.%03us",
+				   dur_ms / 1000, dur_ms % 1000);
+	}
+	update_other_load_avgs(rq);
+}
+
+static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
+{
 	update_curr_scx(rq);
 
 	/*
@@ -2248,6 +2346,7 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
 	scx_set_task_state(p, SCX_TASK_INIT);
 
+	p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
 	return 0;
 }
 
@@ -2326,6 +2425,7 @@ void init_scx_entity(struct sched_ext_entity *scx)
 	scx->sticky_cpu = -1;
 	scx->holding_cpu = -1;
 	INIT_LIST_HEAD(&scx->runnable_node);
+	scx->runnable_at = jiffies;
 	scx->ddsp_dsq_id = SCX_DSQ_INVALID;
 	scx->slice = SCX_SLICE_DFL;
 }
@@ -2783,6 +2883,8 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
 		return "runtime error";
 	case SCX_EXIT_ERROR_BPF:
 		return "scx_bpf_error";
+	case SCX_EXIT_ERROR_STALL:
+		return "runnable task stall";
 	default:
 		return "<UNKNOWN>";
 	}
@@ -2904,6 +3006,8 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	if (scx_ops.exit)
 		SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei);
 
+	cancel_delayed_work_sync(&scx_watchdog_work);
+
 	/*
 	 * Delete the kobject from the hierarchy eagerly in addition to just
 	 * dropping a reference. Otherwise, if the object is deleted
@@ -3026,6 +3130,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 {
 	struct scx_task_iter sti;
 	struct task_struct *p;
+	unsigned long timeout;
 	int i, ret;
 
 	mutex_lock(&scx_ops_enable_mutex);
@@ -3103,6 +3208,16 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		goto err_disable;
 	}
 
+	if (ops->timeout_ms)
+		timeout = msecs_to_jiffies(ops->timeout_ms);
+	else
+		timeout = SCX_WATCHDOG_MAX_TIMEOUT;
+
+	WRITE_ONCE(scx_watchdog_timeout, timeout);
+	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+	queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
+			   scx_watchdog_timeout / 2);
+
 	/*
 	 * Lock out forks before opening the floodgate so that they don't wander
 	 * into the operations prematurely.
@@ -3413,6 +3528,12 @@ static int bpf_scx_init_member(const struct btf_type *t,
 		if (ret == 0)
 			return -EINVAL;
 		return 1;
+	case offsetof(struct sched_ext_ops, timeout_ms):
+		if (msecs_to_jiffies(*(u32 *)(udata + moff)) >
+		    SCX_WATCHDOG_MAX_TIMEOUT)
+			return -E2BIG;
+		ops->timeout_ms = *(u32 *)(udata + moff);
+		return 1;
 	}
 
 	return 0;
@@ -3569,6 +3690,7 @@ void __init init_sched_ext_class(void)
 	}
 
 	register_sysrq_key('S', &sysrq_sched_ext_reset_op);
+	INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn);
 }
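
For context, the sketch below shows roughly how a loadable BPF scheduler would use the new timeout_ms field. It is illustrative only and not part of the patch: the "sketch" name, the 5000 ms value, and the single enqueue callback are hypothetical, and it assumes the scx/common.bpf.h helpers and kfuncs (BPF_STRUCT_OPS, scx_bpf_dispatch()) shipped with the in-tree example schedulers.

```c
/* sketch.bpf.c: hypothetical minimal scheduler exercising ops->timeout_ms */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

/* Enqueue everything on the global DSQ with the default slice. */
void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
{
	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

SEC(".struct_ops.link")
struct sched_ext_ops sketch_ops = {
	.enqueue	= (void *)sketch_enqueue,
	/*
	 * Abort the scheduler if any runnable task waits more than 5s.
	 * Leaving this 0 falls back to the 30s SCX_WATCHDOG_MAX_TIMEOUT
	 * default; anything above the maximum is rejected at load time
	 * with -E2BIG.
	 */
	.timeout_ms	= 5000U,
	.name		= "sketch",
};
```

With a 5000 ms timeout the watchdog work re-arms itself every 2500 ms (timeout / 2), so a stalled runnable task is reported within roughly 1.5x the configured window.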
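
The jiffies arithmetic above is easy to sanity-check in userspace. The following standalone sketch mirrors the load-time validation in bpf_scx_init_member() and the re-arm period chosen in scx_ops_enable(); HZ is assumed to be 250 here, and msecs_to_jiffies_sketch() is a hypothetical stand-in for the kernel's msecs_to_jiffies().

```c
#include <stdio.h>

#define HZ                       250u            /* assumed tick rate */
#define SCX_WATCHDOG_MAX_TIMEOUT (30u * HZ)      /* 30 seconds in jiffies */

/* Round up like the kernel's generic msecs_to_jiffies(). */
static unsigned int msecs_to_jiffies_sketch(unsigned int ms)
{
	return (ms * HZ + 999u) / 1000u;
}

int main(void)
{
	unsigned int vals[] = { 0u, 1000u, 30000u, 45000u };

	for (unsigned int i = 0; i < sizeof(vals) / sizeof(vals[0]); i++) {
		unsigned int ms = vals[i];
		/* timeout_ms == 0 selects the 30-second default. */
		unsigned int timeout = ms ? msecs_to_jiffies_sketch(ms)
					  : SCX_WATCHDOG_MAX_TIMEOUT;

		/* Mirrors the -E2BIG check in bpf_scx_init_member(). */
		if (msecs_to_jiffies_sketch(ms) > SCX_WATCHDOG_MAX_TIMEOUT) {
			printf("timeout_ms=%-5u -> rejected at load (-E2BIG)\n", ms);
			continue;
		}
		printf("timeout_ms=%-5u -> %u jiffies, watchdog re-queued every %u jiffies\n",
		       ms, timeout, timeout / 2);
	}
	return 0;
}
```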