drm/msm: Mark VM as unusable on GPU hangs

If userspace has opted-in to VM_BIND, then GPU hangs and VM_BIND errors will mark the VM as unusable. Signed-off-by: Rob Clark <robdclark@chromium.org> Signed-off-by: Rob Clark <robin.clark@oss.qualcomm.com> Tested-by: Antonino Maniscalco <antomani103@gmail.com> Reviewed-by: Antonino Maniscalco <antomani103@gmail.com> Patchwork: https://patchwork.freedesktop.org/patch/661499/
author: Rob Clark <robdclark@chromium.org> 2025-06-29 13:13:06 -0700
committer: Rob Clark <robin.clark@oss.qualcomm.com> 2025-07-04 17:48:36 -0700
commit: 6a4d287a1ae6e49f8ef57fcb2a512c2b0bbef966 (patch)
tree: d2062a91e21f1668a3ada199e658bdb90c6ff91c
parent: feb8ef4636a457a1fd916a3ae575f552935e69b9 (diff)
3 files changed, 34 insertions, 2 deletions
diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h
index b5bf21f62f9d..f2631a8c62b9 100644
--- a/drivers/gpu/drm/msm/msm_gem.h
+++ b/drivers/gpu/drm/msm/msm_gem.h
@@ -76,6 +76,23 @@ struct msm_gem_vm {
 
 	/** @managed: is this a kernel managed VM? */
 	bool managed;
+
+	/**
+	 * @unusable: True if the VM has turned unusable because something
+	 * bad happened during an asynchronous request.
+	 *
+	 * We don't try to recover from such failures, because this implies
+	 * informing userspace about the specific operation that failed, and
+	 * hoping the userspace driver can replay things from there. This all
+	 * sounds very complicated for little gain.
+	 *
+	 * Instead, we should just flag the VM as unusable, and fail any
+	 * further request targeting this VM.
+	 *
+	 * As an analogy, this would be mapped to a VK_ERROR_DEVICE_LOST
+	 * situation, where the logical device needs to be re-created.
+	 */
+	bool unusable;
 };
 #define to_msm_vm(x) container_of(x, struct msm_gem_vm, base)
 
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
index daf05c380e84..50fafcacf035 100644
--- a/drivers/gpu/drm/msm/msm_gem_submit.c
+++ b/drivers/gpu/drm/msm/msm_gem_submit.c
@@ -681,6 +681,9 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
 	if (args->pad)
 		return -EINVAL;
 
+	if (to_msm_vm(ctx->vm)->unusable)
+		return UERR(EPIPE, dev, "context is unusable");
+
 	/* for now, we just have 3d pipe.. eventually this would need to
 	 * be more clever to dispatch to appropriate gpu module:
 	 */
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index c08c942d85a0..0846f6c5169f 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -389,8 +389,20 @@ static void recover_worker(struct kthread_work *work)
 
 	/* Increment the fault counts */
 	submit->queue->faults++;
-	if (submit->vm)
-		to_msm_vm(submit->vm)->faults++;
+	if (submit->vm) {
+		struct msm_gem_vm *vm = to_msm_vm(submit->vm);
+
+		vm->faults++;
+
+		/*
+		 * If userspace has opted-in to VM_BIND (and therefore userspace
+		 * management of the VM), faults mark the VM as unusuable.  This
+		 * matches vulkan expectations (vulkan is the main target for
+		 * VM_BIND)
+		 */
+		if (!vm->managed)
+			vm->unusable = true;
+	}
 
 	get_comm_cmdline(submit, &comm, &cmd);
author	Rob Clark <robdclark@chromium.org>	2025-06-29 13:13:06 -0700
committer	Rob Clark <robin.clark@oss.qualcomm.com>	2025-07-04 17:48:36 -0700
commit	6a4d287a1ae6e49f8ef57fcb2a512c2b0bbef966 (patch)
tree	d2062a91e21f1668a3ada199e658bdb90c6ff91c
parent	feb8ef4636a457a1fd916a3ae575f552935e69b9 (diff)