summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrii Nakryiko <andrii@kernel.org>2025-04-02 11:09:25 -0700
committerAndrew Morton <akpm@linux-foundation.org>2025-05-11 17:54:03 -0700
commit3ca55ca225d7627e76fcfe2894740c4f615ff2e0 (patch)
tree8e63dc3cf3d2e1cd5c8005a8b4bdc346cdc5e331
parent82f2b0b97b36ee3fcddf0f0780a9a0825d52fec3 (diff)
exit: move and extend sched_process_exit() tracepoint
It is useful to be able to access current->mm at task exit to, say, record a bunch of VMA information right before the task exits (e.g., for stack symbolization reasons when dealing with short-lived processes that exit in the middle of profiling session). Currently, trace_sched_process_exit() is triggered after exit_mm() which resets current->mm to NULL making this tracepoint unsuitable for inspecting and recording task's mm_struct-related data when tracing process lifetimes. There is a particularly suitable place, though, right after taskstats_exit() is called, but before we do exit_mm() and other exit_*() resource teardowns. taskstats performs a similar kind of accounting that some applications do with BPF, and so co-locating them seems like a good fit. So that's where trace_sched_process_exit() is moved with this patch. Also, existing trace_sched_process_exit() tracepoint is notoriously missing `group_dead` flag that is certainly useful in practice and some of our production applications have to work around this. So plumb `group_dead` through while at it, to have a richer and more complete tracepoint. Note that we can't use sched_process_template anymore, and so we use TRACE_EVENT()-based tracepoint definition. But all the field names and order, as well as assign and output logic remain intact. We just add one extra field at the end in backwards-compatible way. [andrii@kernel.org: document sched_process_exit and sched_process_template relation] Link: https://lkml.kernel.org/r/20250403174120.4087794-1-andrii@kernel.org Link: https://lkml.kernel.org/r/20250402180925.90914-1-andrii@kernel.org Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Suggested-by: Ingo Molnar <mingo@kernel.org> Cc: Alexander Potapenko <glider@google.com> Cc: Christian Brauner <brauner@kernel.org> Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-rw-r--r--include/trace/events/sched.h34
-rw-r--r--kernel/exit.c2
2 files changed, 31 insertions, 5 deletions
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 8994e97d86c1..3bec9fb73a36 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -326,11 +326,37 @@ DEFINE_EVENT(sched_process_template, sched_process_free,
TP_ARGS(p));
/*
- * Tracepoint for a task exiting:
+ * Tracepoint for a task exiting.
+ * Note, it's a superset of sched_process_template and should be kept
+ * compatible as much as possible. sched_process_exits has an extra
+ * `group_dead` argument, so sched_process_template can't be used,
+ * unfortunately, just like sched_migrate_task above.
*/
-DEFINE_EVENT(sched_process_template, sched_process_exit,
- TP_PROTO(struct task_struct *p),
- TP_ARGS(p));
+TRACE_EVENT(sched_process_exit,
+
+ TP_PROTO(struct task_struct *p, bool group_dead),
+
+ TP_ARGS(p, group_dead),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( int, prio )
+ __field( bool, group_dead )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+ __entry->pid = p->pid;
+ __entry->prio = p->prio; /* XXX SCHED_DEADLINE */
+ __entry->group_dead = group_dead;
+ ),
+
+ TP_printk("comm=%s pid=%d prio=%d group_dead=%s",
+ __entry->comm, __entry->pid, __entry->prio,
+ __entry->group_dead ? "true" : "false"
+ )
+);
/*
* Tracepoint for waiting on task to unschedule:
diff --git a/kernel/exit.c b/kernel/exit.c
index 1b51dc099f1e..f1db86dcbeb1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -936,12 +936,12 @@ void __noreturn do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
+ trace_sched_process_exit(tsk, group_dead);
exit_mm();
if (group_dead)
acct_process();
- trace_sched_process_exit(tsk);
exit_sem(tsk);
exit_shm(tsk);