drm/i915/guc: Capture error state on context reset

We receive notification of an engine reset from GuC at its completion. Meaning GuC has potentially cleared any HW state we may have been interested in capturing. GuC resumes scheduling on the engine post-reset, as the resets are meant to be transparent, further muddling our error state. There is ongoing work to define an API for a GuC debug state dump. The suggestion for now is to manually disable FW initiated resets in cases where debug state is needed. Signed-off-by: Matthew Brost <matthew.brost@intel.com> Reviewed-by: John Harrison <John.C.Harrison@Intel.com> Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20210727002348.97202-19-matthew.brost@intel.com
author: Matthew Brost <matthew.brost@intel.com> 2021-07-26 17:23:33 -0700
committer: John Harrison <John.C.Harrison@Intel.com> 2021-07-27 17:31:59 -0700
commit: 573ba126aef37c8315e5bb68d2dad515efa96994 (patch)
tree: 671ca22ec14a9b93c075ad9f47bc8566f68fb161 /drivers/gpu/drm/i915/i915_gpu_error.c
parent: c17b637928f030caac2d1c737959b9627011ac49 (diff)
1 files changed, 21 insertions, 4 deletions
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index a2c58b54a5928..0f08bcfbe9641 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1429,20 +1429,37 @@ capture_engine(struct intel_engine_cs *engine,
 {
 	struct intel_engine_capture_vma *capture = NULL;
 	struct intel_engine_coredump *ee;
-	struct i915_request *rq;
+	struct intel_context *ce;
+	struct i915_request *rq = NULL;
 	unsigned long flags;
 
 	ee = intel_engine_coredump_alloc(engine, GFP_KERNEL);
 	if (!ee)
 		return NULL;
 
-	spin_lock_irqsave(&engine->sched_engine->lock, flags);
-	rq = intel_engine_find_active_request(engine);
+	ce = intel_engine_get_hung_context(engine);
+	if (ce) {
+		intel_engine_clear_hung_context(engine);
+		rq = intel_context_find_active_request(ce);
+		if (!rq || !i915_request_started(rq))
+			goto no_request_capture;
+	} else {
+		/*
+		 * Getting here with GuC enabled means it is a forced error capture
+		 * with no actual hang. So, no need to attempt the execlist search.
+		 */
+		if (!intel_uc_uses_guc_submission(&engine->gt->uc)) {
+			spin_lock_irqsave(&engine->sched_engine->lock, flags);
+			rq = intel_engine_execlist_find_hung_request(engine);
+			spin_unlock_irqrestore(&engine->sched_engine->lock,
+					       flags);
+		}
+	}
 	if (rq)
 		capture = intel_engine_coredump_add_request(ee, rq,
 							    ATOMIC_MAYFAIL);
-	spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
 	if (!capture) {
+no_request_capture:
 		kfree(ee);
 		return NULL;
 	}
author	Matthew Brost <matthew.brost@intel.com>	2021-07-26 17:23:33 -0700
committer	John Harrison <John.C.Harrison@Intel.com>	2021-07-27 17:31:59 -0700
commit	573ba126aef37c8315e5bb68d2dad515efa96994 (patch)
tree	671ca22ec14a9b93c075ad9f47bc8566f68fb161 /drivers/gpu/drm/i915/i915_gpu_error.c
parent	c17b637928f030caac2d1c737959b9627011ac49 (diff)