diff options
-rw-r--r-- | drivers/gpu/drm/xe/xe_devcoredump.c | 21 | ||||
-rw-r--r-- | drivers/gpu/drm/xe/xe_devcoredump.h | 5 | ||||
-rw-r--r-- | drivers/gpu/drm/xe/xe_devcoredump_types.h | 4 | ||||
-rw-r--r-- | drivers/gpu/drm/xe/xe_guc_submit.c | 14 |
4 files changed, 35 insertions, 9 deletions
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c index 0d3a4dc87a7f..baac50f6dd7e 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.c +++ b/drivers/gpu/drm/xe/xe_devcoredump.c @@ -100,6 +100,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count, p = drm_coredump_printer(&iter); drm_puts(&p, "**** Xe Device Coredump ****\n"); + drm_printf(&p, "Reason: %s\n", ss->reason); drm_puts(&p, "kernel: " UTS_RELEASE "\n"); drm_puts(&p, "module: " KBUILD_MODNAME "\n"); @@ -107,7 +108,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count, drm_printf(&p, "Snapshot time: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec); ts = ktime_to_timespec64(ss->boot_time); drm_printf(&p, "Uptime: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec); - drm_printf(&p, "Process: %s\n", ss->process_name); + drm_printf(&p, "Process: %s [%d]\n", ss->process_name, ss->pid); xe_device_snapshot_print(xe, &p); drm_printf(&p, "\n**** GT #%d ****\n", ss->gt->info.id); @@ -139,6 +140,9 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss) { int i; + kfree(ss->reason); + ss->reason = NULL; + xe_guc_log_snapshot_free(ss->guc.log); ss->guc.log = NULL; @@ -259,8 +263,11 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump, ss->snapshot_time = ktime_get_real(); ss->boot_time = ktime_get_boottime(); - if (q->vm && q->vm->xef) + if (q->vm && q->vm->xef) { process_name = q->vm->xef->process_name; + ss->pid = q->vm->xef->pid; + } + strscpy(ss->process_name, process_name); ss->gt = q->gt; @@ -298,15 +305,18 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump, * xe_devcoredump - Take the required snapshots and initialize coredump device. * @q: The faulty xe_exec_queue, where the issue was detected. * @job: The faulty xe_sched_job, where the issue was detected. + * @fmt: Printf format + args to describe the reason for the core dump * * This function should be called at the crash time within the serialized * gt_reset. It is skipped if we still have the core dump device available * with the information of the 'first' snapshot. */ -void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job) +__printf(3, 4) +void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, const char *fmt, ...) { struct xe_device *xe = gt_to_xe(q->gt); struct xe_devcoredump *coredump = &xe->devcoredump; + va_list varg; if (coredump->captured) { drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n"); @@ -314,6 +324,11 @@ void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job) } coredump->captured = true; + + va_start(varg, fmt); + coredump->snapshot.reason = kvasprintf(GFP_ATOMIC, fmt, varg); + va_end(varg); + devcoredump_snapshot(coredump, q, job); drm_info(&xe->drm, "Xe device coredump has been created\n"); diff --git a/drivers/gpu/drm/xe/xe_devcoredump.h b/drivers/gpu/drm/xe/xe_devcoredump.h index c04a534e3384..6a17e6d60102 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.h +++ b/drivers/gpu/drm/xe/xe_devcoredump.h @@ -14,11 +14,12 @@ struct xe_exec_queue; struct xe_sched_job; #ifdef CONFIG_DEV_COREDUMP -void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job); +void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, const char *fmt, ...); int xe_devcoredump_init(struct xe_device *xe); #else static inline void xe_devcoredump(struct xe_exec_queue *q, - struct xe_sched_job *job) + struct xe_sched_job *job, + const char *fmt, ...) { } diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h index be4d59ea9ac8..e6234e887102 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h @@ -28,6 +28,10 @@ struct xe_devcoredump_snapshot { ktime_t boot_time; /** @process_name: Name of process that triggered this gpu hang */ char process_name[TASK_COMM_LEN]; + /** @pid: Process id of process that triggered this gpu hang */ + pid_t pid; + /** @reason: The reason the coredump was triggered */ + char *reason; /** @gt: Affected GT, used by forcewake for delayed capture */ struct xe_gt *gt; diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index d00af8d8acbe..9c36329fe857 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -901,7 +901,8 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w) if (!ret) { xe_gt_warn(q->gt, "Schedule disable failed to respond, guc_id=%d\n", q->guc->id); - xe_devcoredump(q, NULL); + xe_devcoredump(q, NULL, "Schedule disable failed to respond, guc_id=%d\n", + q->guc->id); xe_sched_submission_start(sched); xe_gt_reset_async(q->gt); return; @@ -909,7 +910,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w) } if (!exec_queue_killed(q) && !xe_lrc_ring_is_idle(q->lrc[0])) - xe_devcoredump(q, NULL); + xe_devcoredump(q, NULL, "LR job cleanup, guc_id=%d", q->guc->id); xe_sched_submission_start(sched); } @@ -1136,7 +1137,9 @@ trigger_reset: xe_gt_warn(guc_to_gt(guc), "Schedule disable failed to respond, guc_id=%d", q->guc->id); - xe_devcoredump(q, job); + xe_devcoredump(q, job, + "Schedule disable failed to respond, guc_id=%d, ret=%d, guc_read=%d", + q->guc->id, ret, xe_guc_read_stopped(guc)); set_exec_queue_extra_ref(q); xe_exec_queue_get(q); /* GT reset owns this */ set_exec_queue_banned(q); @@ -1166,7 +1169,10 @@ trigger_reset: trace_xe_sched_job_timedout(job); if (!exec_queue_killed(q)) - xe_devcoredump(q, job); + xe_devcoredump(q, job, + "Timedout job - seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx", + xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job), + q->guc->id, q->flags); /* * Kernel jobs should never fail, nor should VM jobs if they do |