summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/gpu/drm/xe/xe_devcoredump.c21
-rw-r--r--drivers/gpu/drm/xe/xe_devcoredump.h5
-rw-r--r--drivers/gpu/drm/xe/xe_devcoredump_types.h4
-rw-r--r--drivers/gpu/drm/xe/xe_guc_submit.c14
4 files changed, 35 insertions, 9 deletions
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index 0d3a4dc87a7f..baac50f6dd7e 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -100,6 +100,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
p = drm_coredump_printer(&iter);
drm_puts(&p, "**** Xe Device Coredump ****\n");
+ drm_printf(&p, "Reason: %s\n", ss->reason);
drm_puts(&p, "kernel: " UTS_RELEASE "\n");
drm_puts(&p, "module: " KBUILD_MODNAME "\n");
@@ -107,7 +108,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
drm_printf(&p, "Snapshot time: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
ts = ktime_to_timespec64(ss->boot_time);
drm_printf(&p, "Uptime: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
- drm_printf(&p, "Process: %s\n", ss->process_name);
+ drm_printf(&p, "Process: %s [%d]\n", ss->process_name, ss->pid);
xe_device_snapshot_print(xe, &p);
drm_printf(&p, "\n**** GT #%d ****\n", ss->gt->info.id);
@@ -139,6 +140,9 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
{
int i;
+ kfree(ss->reason);
+ ss->reason = NULL;
+
xe_guc_log_snapshot_free(ss->guc.log);
ss->guc.log = NULL;
@@ -259,8 +263,11 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
ss->snapshot_time = ktime_get_real();
ss->boot_time = ktime_get_boottime();
- if (q->vm && q->vm->xef)
+ if (q->vm && q->vm->xef) {
process_name = q->vm->xef->process_name;
+ ss->pid = q->vm->xef->pid;
+ }
+
strscpy(ss->process_name, process_name);
ss->gt = q->gt;
@@ -298,15 +305,18 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
* xe_devcoredump - Take the required snapshots and initialize coredump device.
* @q: The faulty xe_exec_queue, where the issue was detected.
* @job: The faulty xe_sched_job, where the issue was detected.
+ * @fmt: Printf format + args to describe the reason for the core dump
*
* This function should be called at the crash time within the serialized
* gt_reset. It is skipped if we still have the core dump device available
* with the information of the 'first' snapshot.
*/
-void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job)
+__printf(3, 4)
+void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, const char *fmt, ...)
{
struct xe_device *xe = gt_to_xe(q->gt);
struct xe_devcoredump *coredump = &xe->devcoredump;
+ va_list varg;
if (coredump->captured) {
drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n");
@@ -314,6 +324,11 @@ void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job)
}
coredump->captured = true;
+
+ va_start(varg, fmt);
+ coredump->snapshot.reason = kvasprintf(GFP_ATOMIC, fmt, varg);
+ va_end(varg);
+
devcoredump_snapshot(coredump, q, job);
drm_info(&xe->drm, "Xe device coredump has been created\n");
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.h b/drivers/gpu/drm/xe/xe_devcoredump.h
index c04a534e3384..6a17e6d60102 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.h
+++ b/drivers/gpu/drm/xe/xe_devcoredump.h
@@ -14,11 +14,12 @@ struct xe_exec_queue;
struct xe_sched_job;
#ifdef CONFIG_DEV_COREDUMP
-void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job);
+void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, const char *fmt, ...);
int xe_devcoredump_init(struct xe_device *xe);
#else
static inline void xe_devcoredump(struct xe_exec_queue *q,
- struct xe_sched_job *job)
+ struct xe_sched_job *job,
+ const char *fmt, ...)
{
}
diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
index be4d59ea9ac8..e6234e887102 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
+++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
@@ -28,6 +28,10 @@ struct xe_devcoredump_snapshot {
ktime_t boot_time;
/** @process_name: Name of process that triggered this gpu hang */
char process_name[TASK_COMM_LEN];
+ /** @pid: Process id of process that triggered this gpu hang */
+ pid_t pid;
+ /** @reason: The reason the coredump was triggered */
+ char *reason;
/** @gt: Affected GT, used by forcewake for delayed capture */
struct xe_gt *gt;
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index d00af8d8acbe..9c36329fe857 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -901,7 +901,8 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
if (!ret) {
xe_gt_warn(q->gt, "Schedule disable failed to respond, guc_id=%d\n",
q->guc->id);
- xe_devcoredump(q, NULL);
+ xe_devcoredump(q, NULL, "Schedule disable failed to respond, guc_id=%d\n",
+ q->guc->id);
xe_sched_submission_start(sched);
xe_gt_reset_async(q->gt);
return;
@@ -909,7 +910,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
}
if (!exec_queue_killed(q) && !xe_lrc_ring_is_idle(q->lrc[0]))
- xe_devcoredump(q, NULL);
+ xe_devcoredump(q, NULL, "LR job cleanup, guc_id=%d", q->guc->id);
xe_sched_submission_start(sched);
}
@@ -1136,7 +1137,9 @@ trigger_reset:
xe_gt_warn(guc_to_gt(guc),
"Schedule disable failed to respond, guc_id=%d",
q->guc->id);
- xe_devcoredump(q, job);
+ xe_devcoredump(q, job,
+ "Schedule disable failed to respond, guc_id=%d, ret=%d, guc_read=%d",
+ q->guc->id, ret, xe_guc_read_stopped(guc));
set_exec_queue_extra_ref(q);
xe_exec_queue_get(q); /* GT reset owns this */
set_exec_queue_banned(q);
@@ -1166,7 +1169,10 @@ trigger_reset:
trace_xe_sched_job_timedout(job);
if (!exec_queue_killed(q))
- xe_devcoredump(q, job);
+ xe_devcoredump(q, job,
+ "Timedout job - seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
+ xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
+ q->guc->id, q->flags);
/*
* Kernel jobs should never fail, nor should VM jobs if they do