summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChengming Zhou <chengming.zhou@linux.dev>2024-12-27 06:19:41 +0000
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2025-02-08 09:56:55 +0100
commit10a7d3e73408b6faa63085ea6b4775cf36e67769 (patch)
treea0186224f129c4b7166b847a9bc48a1e5e706a62
parent3f1215588b26ecc6fd434104e084d5f040c2230c (diff)
psi: Fix race when task wakes up before psi_sched_switch() adjusts flags
[ Upstream commit 7d9da040575b343085287686fa902a5b2d43c7ca ] When running hackbench in a cgroup with bandwidth throttling enabled, following PSI splat was observed: psi: inconsistent task state! task=1831:hackbench cpu=8 psi_flags=14 clear=0 set=4 When investigating the series of events leading up to the splat, following sequence was observed: [008] d..2.: sched_switch: ... ==> next_comm=hackbench next_pid=1831 next_prio=120 ... [008] dN.2.: dequeue_entity(task delayed): task=hackbench pid=1831 cfs_rq->throttled=0 [008] dN.2.: pick_task_fair: check_cfs_rq_runtime() throttled cfs_rq on CPU8 # CPU8 goes into newidle balance and releases the rq lock ... # CPU15 on same LLC Domain is trying to wakeup hackbench(pid=1831) [015] d..4.: psi_flags_change: psi: task state: task=1831:hackbench cpu=8 psi_flags=14 clear=0 set=4 final=14 # Splat (cfs_rq->throttled=1) [015] d..4.: sched_wakeup: comm=hackbench pid=1831 prio=120 target_cpu=008 # Task has woken on a throttled hierarchy [008] d..2.: sched_switch: prev_comm=hackbench prev_pid=1831 prev_prio=120 prev_state=S ==> ... psi_dequeue() relies on psi_sched_switch() to set the correct PSI flags for the blocked entity, however, with the introduction of DELAY_DEQUEUE, the block task can wakeup when newidle balance drops the runqueue lock during __schedule(). If a task wakes before psi_sched_switch() adjusts the PSI flags, skip any modifications in psi_enqueue() which would still see the flags of a running task and not a blocked one. Instead, rely on psi_sched_switch() to do the right thing. Since the status returned by try_to_block_task() may no longer be true by the time schedule reaches psi_sched_switch(), check if the task is blocked or not using a combination of task_on_rq_queued() and p->se.sched_delayed checks. [ prateek: Commit message, testing, early bailout in psi_enqueue() ] Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue") # 1a6151017ee5 Signed-off-by: Chengming Zhou <chengming.zhou@linux.dev> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev> Link: https://lore.kernel.org/r/20241227061941.2315-1-kprateek.nayak@amd.com Signed-off-by: Sasha Levin <sashal@kernel.org>
-rw-r--r--kernel/sched/core.c6
-rw-r--r--kernel/sched/stats.h4
2 files changed, 7 insertions, 3 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c1d2d46feec5..aba41c69f09c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6593,7 +6593,6 @@ static void __sched notrace __schedule(int sched_mode)
* as a preemption by schedule_debug() and RCU.
*/
bool preempt = sched_mode > SM_NONE;
- bool block = false;
unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf;
@@ -6654,7 +6653,7 @@ static void __sched notrace __schedule(int sched_mode)
goto picked;
}
} else if (!preempt && prev_state) {
- block = try_to_block_task(rq, prev, prev_state);
+ try_to_block_task(rq, prev, prev_state);
switch_count = &prev->nvcsw;
}
@@ -6699,7 +6698,8 @@ picked:
migrate_disable_switch(rq, prev);
psi_account_irqtime(rq, prev, next);
- psi_sched_switch(prev, next, block);
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
+ prev->se.sched_delayed);
trace_sched_switch(preempt, prev, next, prev_state);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8ee0add5a48a..6ade91bce63e 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -138,6 +138,10 @@ static inline void psi_enqueue(struct task_struct *p, int flags)
if (flags & ENQUEUE_RESTORE)
return;
+ /* psi_sched_switch() will handle the flags */
+ if (task_on_cpu(task_rq(p), p))
+ return;
+
if (p->se.sched_delayed) {
/* CPU migration of "sleeping" task */
SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED));