Diffstat (limited to 'kernel/sched/ext.c')
-rw-r--r--  kernel/sched/ext.c | 130
1 file changed, 126 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 1f5d80df263a..3dc515b3351f 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -8,6 +8,7 @@
enum scx_consts {
SCX_DSP_DFL_MAX_BATCH = 32,
+ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
SCX_EXIT_BT_LEN = 64,
SCX_EXIT_MSG_LEN = 1024,
@@ -24,6 +25,7 @@ enum scx_exit_kind {
SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
+ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */
};
/*
@@ -320,6 +322,15 @@ struct sched_ext_ops {
u64 flags;
/**
+ * timeout_ms - The maximum amount of time, in milliseconds, that a
+ * runnable task should be able to wait before being scheduled. The
+ * maximum timeout may not exceed the default timeout of 30 seconds.
+ *
+ * Defaults to the maximum allowed timeout value of 30 seconds.
+ */
+ u32 timeout_ms;
+
+ /**
* name - BPF scheduler's name
*
* Must be a non-zero valid BPF object name including only isalnum(),
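For context on how the new field is consumed: a BPF scheduler sets timeout_ms in its struct sched_ext_ops map. The sketch below is illustrative only, following the usual sched_ext BPF struct_ops conventions; the map and callback names are hypothetical and not part of this patch.

/* Hypothetical BPF-side usage: abort the scheduler if any runnable task
 * waits more than 5 seconds.  Leaving .timeout_ms at 0 keeps the 30 second
 * default. */
SEC(".struct_ops.link")
struct sched_ext_ops minimal_ops = {
	.enqueue	= (void *)minimal_enqueue,
	.dispatch	= (void *)minimal_dispatch,
	.timeout_ms	= 5000,
	.name		= "minimal",
};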
@@ -472,6 +483,23 @@ struct static_key_false scx_has_op[SCX_OPI_END] =
static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
static struct scx_exit_info *scx_exit_info;
+/*
+ * The maximum amount of time in jiffies that a task may be runnable without
+ * being scheduled on a CPU. If this timeout is exceeded, it will trigger
+ * scx_ops_error().
+ */
+static unsigned long scx_watchdog_timeout;
+
+/*
+ * The last time the delayed work was run. This delayed work relies on
+ * ksoftirqd being able to run to service timer interrupts, so it's possible
+ * that this work itself could get wedged. To account for this, we check that
+ * it's not stalled in the timer tick, and trigger an error if it is.
+ */
+static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
+
+static struct delayed_work scx_watchdog_work;
+
/* idle tracking */
#ifdef CONFIG_SMP
#ifdef CONFIG_CPUMASK_OFFSTACK
@@ -1170,6 +1198,11 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)
{
lockdep_assert_rq_held(rq);
+ if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) {
+ p->scx.runnable_at = jiffies;
+ p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT;
+ }
+
/*
* list_add_tail() must be used. scx_ops_bypass() depends on tasks being
appended to the runnable_list.
@@ -1177,9 +1210,11 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)
list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
}
-static void clr_task_runnable(struct task_struct *p)
+static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
{
list_del_init(&p->scx.runnable_node);
+ if (reset_runnable_at)
+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
}
static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
@@ -1217,7 +1252,8 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags)
{
unsigned long opss;
- clr_task_runnable(p);
+ /* dequeue is always temporary, don't reset runnable_at */
+ clr_task_runnable(p, false);
/* acquire ensures that we see the preceding updates on QUEUED */
opss = atomic_long_read_acquire(&p->scx.ops_state);
@@ -1826,7 +1862,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
p->se.exec_start = rq_clock_task(rq);
- clr_task_runnable(p);
+ clr_task_runnable(p, true);
}
static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
@@ -2176,9 +2212,71 @@ static void reset_idle_masks(void) {}
#endif /* CONFIG_SMP */
-static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
+static bool check_rq_for_timeouts(struct rq *rq)
+{
+ struct task_struct *p;
+ struct rq_flags rf;
+ bool timed_out = false;
+
+ rq_lock_irqsave(rq, &rf);
+ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
+ unsigned long last_runnable = p->scx.runnable_at;
+
+ if (unlikely(time_after(jiffies,
+ last_runnable + scx_watchdog_timeout))) {
+ u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
+
+ scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
+ "%s[%d] failed to run for %u.%03us",
+ p->comm, p->pid,
+ dur_ms / 1000, dur_ms % 1000);
+ timed_out = true;
+ break;
+ }
+ }
+ rq_unlock_irqrestore(rq, &rf);
+
+ return timed_out;
+}
+
+static void scx_watchdog_workfn(struct work_struct *work)
+{
+ int cpu;
+
+ WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+
+ for_each_online_cpu(cpu) {
+ if (unlikely(check_rq_for_timeouts(cpu_rq(cpu))))
+ break;
+
+ cond_resched();
+ }
+ queue_delayed_work(system_unbound_wq, to_delayed_work(work),
+ scx_watchdog_timeout / 2);
+}
+
+void scx_tick(struct rq *rq)
{
+ unsigned long last_check;
+
+ if (!scx_enabled())
+ return;
+
+ last_check = READ_ONCE(scx_watchdog_timestamp);
+ if (unlikely(time_after(jiffies,
+ last_check + READ_ONCE(scx_watchdog_timeout)))) {
+ u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
+
+ scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
+ "watchdog failed to check in for %u.%03us",
+ dur_ms / 1000, dur_ms % 1000);
+ }
+
update_other_load_avgs(rq);
+}
+
+static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
+{
update_curr_scx(rq);
/*
@@ -2248,6 +2346,7 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
scx_set_task_state(p, SCX_TASK_INIT);
+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
return 0;
}
@@ -2326,6 +2425,7 @@ void init_scx_entity(struct sched_ext_entity *scx)
scx->sticky_cpu = -1;
scx->holding_cpu = -1;
INIT_LIST_HEAD(&scx->runnable_node);
+ scx->runnable_at = jiffies;
scx->ddsp_dsq_id = SCX_DSQ_INVALID;
scx->slice = SCX_SLICE_DFL;
}
@@ -2783,6 +2883,8 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
return "runtime error";
case SCX_EXIT_ERROR_BPF:
return "scx_bpf_error";
+ case SCX_EXIT_ERROR_STALL:
+ return "runnable task stall";
default:
return "<UNKNOWN>";
}
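To make the new "runnable task stall" reason concrete, consider a deliberately broken BPF scheduler (a hypothetical sketch, not part of this patch, using the BPF_STRUCT_OPS convention from the sched_ext example schedulers): its enqueue callback neither dispatches the task nor queues it anywhere it will ever be looked at again. The task stays on rq->scx.runnable_list, its runnable_at timestamp keeps aging, and check_rq_for_timeouts() eventually tears the scheduler down with SCX_EXIT_ERROR_STALL.

/* Hypothetical buggy enqueue: every task is "lost" here, so the watchdog
 * fires once scx_watchdog_timeout elapses and the reported exit reason is
 * "runnable task stall". */
void BPF_STRUCT_OPS(broken_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* Intentionally empty: p is never dispatched to any DSQ. */
}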
@@ -2904,6 +3006,8 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
if (scx_ops.exit)
SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei);
+ cancel_delayed_work_sync(&scx_watchdog_work);
+
/*
* Delete the kobject from the hierarchy eagerly in addition to just
* dropping a reference. Otherwise, if the object is deleted
@@ -3026,6 +3130,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
{
struct scx_task_iter sti;
struct task_struct *p;
+ unsigned long timeout;
int i, ret;
mutex_lock(&scx_ops_enable_mutex);
@@ -3103,6 +3208,16 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
goto err_disable;
}
+ if (ops->timeout_ms)
+ timeout = msecs_to_jiffies(ops->timeout_ms);
+ else
+ timeout = SCX_WATCHDOG_MAX_TIMEOUT;
+
+ WRITE_ONCE(scx_watchdog_timeout, timeout);
+ WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+ queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
+ scx_watchdog_timeout / 2);
+
/*
* Lock out forks before opening the floodgate so that they don't wander
* into the operations prematurely.
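As a worked example of the setup above (illustrative numbers, assuming CONFIG_HZ=1000): a scheduler that sets .timeout_ms = 8000 gets scx_watchdog_timeout = msecs_to_jiffies(8000) = 8000 jiffies, so scx_watchdog_workfn() re-arms itself every 4000 jiffies, i.e. every 4 seconds. Leaving timeout_ms at 0 selects SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, giving a 15 second re-arm period.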
@@ -3413,6 +3528,12 @@ static int bpf_scx_init_member(const struct btf_type *t,
if (ret == 0)
return -EINVAL;
return 1;
+ case offsetof(struct sched_ext_ops, timeout_ms):
+ if (msecs_to_jiffies(*(u32 *)(udata + moff)) >
+ SCX_WATCHDOG_MAX_TIMEOUT)
+ return -E2BIG;
+ ops->timeout_ms = *(u32 *)(udata + moff);
+ return 1;
}
return 0;
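For illustration of the bound enforced above: declaring .timeout_ms = 60000 (60 seconds) converts to more than SCX_WATCHDOG_MAX_TIMEOUT jiffies, so the struct_ops load fails with -E2BIG before the scheduler is ever enabled; 30000 is the largest value accepted.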
@@ -3569,6 +3690,7 @@ void __init init_sched_ext_class(void)
}
register_sysrq_key('S', &sysrq_sched_ext_reset_op);
+ INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn);
}