summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Karol Wachowski <karol.wachowski@intel.com> 2025-05-28 17:42:53 +0200
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2025-07-06 11:01:38 +0200
commit 6420a8d27ef3ffc9ee7b4c0bc7d6194f56ac0825 (patch)
tree 5f67f14872bf50162f9caf31360e33beff0bc852
parent 397f3a7402faed3a3a212df1ccde8cea381bb8c2 (diff)
accel/ivpu: Trigger device recovery on engine reset/resume failure
[ Upstream commit a47e36dc5d90dc664cac87304c17d50f1595d634 ]

Trigger full device recovery when the driver fails to restore device state via engine reset and resume operations. This is necessary because, even if submissions from a faulty context are blocked, the NPU may still process previously submitted faulty jobs if the engine reset fails to abort them. Such jobs can continue to generate faults and occupy device resources. When engine reset is ineffective, the only way to recover is to perform a full device recovery.

Fixes: dad945c27a42 ("accel/ivpu: Add handling of VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW")
Cc: stable@vger.kernel.org # v6.15+
Signed-off-by: Karol Wachowski <karol.wachowski@intel.com>
Reviewed-by: Lizhi Hou <lizhi.hou@amd.com>
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Link: https://lore.kernel.org/r/20250528154253.500556-1-jacek.lawrynowicz@linux.intel.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
-rw-r--r-- drivers/accel/ivpu/ivpu_job.c 6
-rw-r--r-- drivers/accel/ivpu/ivpu_jsm_msg.c 9
2 files changed, 11 insertions, 4 deletions
diff --git a/drivers/accel/ivpu/ivpu_job.c b/drivers/accel/ivpu/ivpu_job.c
index e57acae3b42e..e631098718b1 100644
--- a/drivers/accel/ivpu/ivpu_job.c
+++ b/drivers/accel/ivpu/ivpu_job.c
@@ -849,7 +849,8 @@ void ivpu_context_abort_thread_handler(struct work_struct *work)
unsigned long id;
if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW)
- ivpu_jsm_reset_engine(vdev, 0);
+ if (ivpu_jsm_reset_engine(vdev, 0))
+ return;
mutex_lock(&vdev->context_list_lock);
xa_for_each(&vdev->context_xa, ctx_id, file_priv) {
@@ -865,7 +866,8 @@ void ivpu_context_abort_thread_handler(struct work_struct *work)
if (vdev->fw->sched_mode != VPU_SCHEDULING_MODE_HW)
return;
- ivpu_jsm_hws_resume_engine(vdev, 0);
+ if (ivpu_jsm_hws_resume_engine(vdev, 0))
+ return;
/*
* In hardware scheduling mode NPU already has stopped processing jobs
* and won't send us any further notifications, thus we have to free job related resources
diff --git a/drivers/accel/ivpu/ivpu_jsm_msg.c b/drivers/accel/ivpu/ivpu_jsm_msg.c
index 21018feb4597..7c08308d5725 100644
--- a/drivers/accel/ivpu/ivpu_jsm_msg.c
+++ b/drivers/accel/ivpu/ivpu_jsm_msg.c
@@ -7,6 +7,7 @@
#include "ivpu_hw.h"
#include "ivpu_ipc.h"
#include "ivpu_jsm_msg.h"
+#include "ivpu_pm.h"
#include "vpu_jsm_api.h"
const char *ivpu_jsm_msg_type_to_str(enum vpu_ipc_msg_type type)
@@ -163,8 +164,10 @@ int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine)
ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_ENGINE_RESET_DONE, &resp,
VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm);
- if (ret)
+ if (ret) {
ivpu_err_ratelimited(vdev, "Failed to reset engine %d: %d\n", engine, ret);
+ ivpu_pm_trigger_recovery(vdev, "Engine reset failed");
+ }
return ret;
}
@@ -354,8 +357,10 @@ int ivpu_jsm_hws_resume_engine(struct ivpu_device *vdev, u32 engine)
ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_HWS_RESUME_ENGINE_DONE, &resp,
VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm);
- if (ret)
+ if (ret) {
ivpu_err_ratelimited(vdev, "Failed to resume engine %d: %d\n", engine, ret);
+ ivpu_pm_trigger_recovery(vdev, "Engine resume failed");
+ }
return ret;
}