summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/xe/xe_guc_submit.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/xe/xe_guc_submit.c')
-rw-r--r--drivers/gpu/drm/xe/xe_guc_submit.c106
1 files changed, 81 insertions, 25 deletions
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 769781d577df6..cafb47711e9b3 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -229,6 +229,17 @@ static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
static void guc_submit_fini(struct drm_device *drm, void *arg)
{
struct xe_guc *guc = arg;
+ struct xe_device *xe = guc_to_xe(guc);
+ struct xe_gt *gt = guc_to_gt(guc);
+ int ret;
+
+ ret = wait_event_timeout(guc->submission_state.fini_wq,
+ xa_empty(&guc->submission_state.exec_queue_lookup),
+ HZ * 5);
+
+ drain_workqueue(xe->destroy_wq);
+
+ xe_gt_assert(gt, ret);
xa_destroy(&guc->submission_state.exec_queue_lookup);
}
@@ -300,6 +311,8 @@ int xe_guc_submit_init(struct xe_guc *guc, unsigned int num_ids)
primelockdep(guc);
+ guc->submission_state.initialized = true;
+
return drmm_add_action_or_reset(&xe->drm, guc_submit_fini, guc);
}
@@ -485,6 +498,15 @@ static void __register_mlrc_exec_queue(struct xe_guc *guc,
action[len++] = upper_32_bits(xe_lrc_descriptor(lrc));
}
+ /* explicitly checks some fields that we might fixup later */
+ xe_gt_assert(guc_to_gt(guc), info->wq_desc_lo ==
+ action[XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_5_WQ_DESC_ADDR_LOWER]);
+ xe_gt_assert(guc_to_gt(guc), info->wq_base_lo ==
+ action[XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_7_WQ_BUF_BASE_LOWER]);
+ xe_gt_assert(guc_to_gt(guc), q->width ==
+ action[XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_10_NUM_CTXS]);
+ xe_gt_assert(guc_to_gt(guc), info->hwlrca_lo ==
+ action[XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_11_HW_LRC_ADDR]);
xe_gt_assert(guc_to_gt(guc), len <= MAX_MLRC_REG_SIZE);
#undef MAX_MLRC_REG_SIZE
@@ -509,6 +531,14 @@ static void __register_exec_queue(struct xe_guc *guc,
info->hwlrca_hi,
};
+ /* explicitly checks some fields that we might fixup later */
+ xe_gt_assert(guc_to_gt(guc), info->wq_desc_lo ==
+ action[XE_GUC_REGISTER_CONTEXT_DATA_5_WQ_DESC_ADDR_LOWER]);
+ xe_gt_assert(guc_to_gt(guc), info->wq_base_lo ==
+ action[XE_GUC_REGISTER_CONTEXT_DATA_7_WQ_BUF_BASE_LOWER]);
+ xe_gt_assert(guc_to_gt(guc), info->hwlrca_lo ==
+ action[XE_GUC_REGISTER_CONTEXT_DATA_10_HW_LRC_ADDR]);
+
xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action), 0, 0);
}
@@ -834,6 +864,13 @@ void xe_guc_submit_wedge(struct xe_guc *guc)
xe_gt_assert(guc_to_gt(guc), guc_to_xe(guc)->wedged.mode);
+ /*
+ * If device is being wedged even before submission_state is
+ * initialized, there's nothing to do here.
+ */
+ if (!guc->submission_state.initialized)
+ return;
+
err = devm_add_action_or_reset(guc_to_xe(guc)->drm.dev,
guc_submit_wedged_fini, guc);
if (err) {
@@ -871,12 +908,13 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
struct xe_exec_queue *q = ge->q;
struct xe_guc *guc = exec_queue_to_guc(q);
struct xe_gpu_scheduler *sched = &ge->sched;
- bool wedged;
+ bool wedged = false;
xe_gt_assert(guc_to_gt(guc), xe_exec_queue_is_lr(q));
trace_xe_exec_queue_lr_cleanup(q);
- wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
+ if (!exec_queue_killed(q))
+ wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
/* Kill the run_job / process_msg entry points */
xe_sched_submission_stop(sched);
@@ -950,10 +988,7 @@ static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
*/
xe_gt_assert(gt, timeout_ms < 100 * MSEC_PER_SEC);
- if (ctx_timestamp < ctx_job_timestamp)
- diff = ctx_timestamp + U32_MAX - ctx_job_timestamp;
- else
- diff = ctx_timestamp - ctx_job_timestamp;
+ diff = ctx_timestamp - ctx_job_timestamp;
/*
* Ensure timeout is within 5% to account for an GuC scheduling latency
@@ -1050,7 +1085,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
int err = -ETIME;
pid_t pid = -1;
int i = 0;
- bool wedged, skip_timeout_check;
+ bool wedged = false, skip_timeout_check;
/*
* TDR has fired before free job worker. Common if exec queue
@@ -1058,12 +1093,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
* list so job can be freed and kick scheduler ensuring free job is not
* lost.
*/
- if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags)) {
- xe_sched_add_pending_job(sched, job);
- xe_sched_submission_start(sched);
-
- return DRM_GPU_SCHED_STAT_NOMINAL;
- }
+ if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags))
+ return DRM_GPU_SCHED_STAT_NO_HANG;
/* Kill the run_job entry point */
xe_sched_submission_stop(sched);
@@ -1096,7 +1127,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
* doesn't work for SRIOV. For now assuming timeouts in wedged mode are
* genuine timeouts.
*/
- wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
+ if (!exec_queue_killed(q))
+ wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
/* Engine state now stable, disable scheduling to check timestamp */
if (!wedged && exec_queue_registered(q)) {
@@ -1170,9 +1202,12 @@ trigger_reset:
process_name = q->vm->xef->process_name;
pid = q->vm->xef->pid;
}
- xe_gt_notice(guc_to_gt(guc), "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx in %s [%d]",
- xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
- q->guc->id, q->flags, process_name, pid);
+
+ if (!exec_queue_killed(q))
+ xe_gt_notice(guc_to_gt(guc),
+ "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx in %s [%d]",
+ xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
+ q->guc->id, q->flags, process_name, pid);
trace_xe_sched_job_timedout(job);
@@ -1228,7 +1263,7 @@ trigger_reset:
/* Start fence signaling */
xe_hw_fence_irq_start(q->fence_irq);
- return DRM_GPU_SCHED_STAT_NOMINAL;
+ return DRM_GPU_SCHED_STAT_RESET;
sched_enable:
enable_scheduling(q);
@@ -1238,10 +1273,8 @@ rearm:
* but there is not currently an easy way to do in DRM scheduler. With
* some thought, do this in a follow up.
*/
- xe_sched_add_pending_job(sched, job);
xe_sched_submission_start(sched);
-
- return DRM_GPU_SCHED_STAT_NOMINAL;
+ return DRM_GPU_SCHED_STAT_NO_HANG;
}
static void __guc_exec_queue_fini_async(struct work_struct *w)
@@ -1262,7 +1295,11 @@ static void __guc_exec_queue_fini_async(struct work_struct *w)
xe_sched_entity_fini(&ge->entity);
xe_sched_fini(&ge->sched);
- kfree(ge);
+ /*
+ * RCU free due sched being exported via DRM scheduler fences
+ * (timeline name).
+ */
+ kfree_rcu(ge, rcu);
xe_exec_queue_fini(q);
xe_pm_runtime_put(guc_to_xe(guc));
}
@@ -1445,6 +1482,7 @@ static int guc_exec_queue_init(struct xe_exec_queue *q)
q->guc = ge;
ge->q = q;
+ init_rcu_head(&ge->rcu);
init_waitqueue_head(&ge->suspend_wait);
for (i = 0; i < MAX_STATIC_MSG_TYPE; ++i)
@@ -1739,6 +1777,9 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
{
int ret;
+ if (!guc->submission_state.initialized)
+ return 0;
+
/*
* Using an atomic here rather than submission_state.lock as this
* function can be called while holding the CT lock (engine reset
@@ -2045,12 +2086,16 @@ int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
struct xe_gt *gt = guc_to_gt(guc);
struct xe_exec_queue *q;
u32 guc_id;
+ u32 type = XE_GUC_CAT_ERR_TYPE_INVALID;
- if (unlikely(len < 1))
+ if (unlikely(!len || len > 2))
return -EPROTO;
guc_id = msg[0];
+ if (len == 2)
+ type = msg[1];
+
if (guc_id == GUC_ID_UNKNOWN) {
/*
* GuC uses GUC_ID_UNKNOWN if it can not map the CAT fault to any PF/VF
@@ -2064,8 +2109,19 @@ int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
if (unlikely(!q))
return -EPROTO;
- xe_gt_dbg(gt, "Engine memory cat error: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
- xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
+ /*
+ * The type is HW-defined and changes based on platform, so we don't
+ * decode it in the kernel and only check if it is valid.
+ * See bspec 54047 and 72187 for details.
+ */
+ if (type != XE_GUC_CAT_ERR_TYPE_INVALID)
+ xe_gt_dbg(gt,
+ "Engine memory CAT error [%u]: class=%s, logical_mask: 0x%x, guc_id=%d",
+ type, xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
+ else
+ xe_gt_dbg(gt,
+ "Engine memory CAT error: class=%s, logical_mask: 0x%x, guc_id=%d",
+ xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
trace_xe_exec_queue_memory_cat_error(q);