summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJesse.Zhang <Jesse.Zhang@amd.com>2025-05-15 15:02:13 +0800
committerAlex Deucher <alexander.deucher@amd.com>2025-05-22 12:00:52 -0400
commit0132ba7ff0f613915d332a30fcf14cf66e317f98 (patch)
tree359c181902e47e9cfb55572e3b75c0918b501ddc
parentb2c11e27080d8556664c20c30ca3527ffa99bec4 (diff)
drm/amdgpu: Fix eviction fence worker race during fd close
The current cleanup order during file descriptor close can lead to a race condition where the eviction fence worker attempts to access a destroyed mutex from the user queue manager: [ 517.294055] DEBUG_LOCKS_WARN_ON(lock->magic != lock) [ 517.294060] WARNING: CPU: 8 PID: 2030 at kernel/locking/mutex.c:564 [ 517.294094] Workqueue: events amdgpu_eviction_fence_suspend_worker [amdgpu] The issue occurs because: 1. We destroy the user queue manager (including its mutex) first 2. Then try to destroy eviction fences which may have pending work 3. The eviction fence worker may try to access the already-destroyed mutex Fix this by reordering the cleanup to: 1. First mark the fd as closing and destroy eviction fences, which flushes any pending work 2. Then safely destroy the user queue manager after we're certain no more fence work will be executed The copy in amdgpu_driver_postclose_kms() needs to be removed (Christian) Reviewed-by: Christian König <christian.koenig@amd.com> Reviewed-by: Prike Liang <Prike.Liang@amd.com> Reviewed-by: Arvind Yadav <Arvind.Yadav@amd.com> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c5
2 files changed, 1 insertions, 6 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 4ddd08ce88852..4db92e0a60da7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2913,8 +2913,8 @@ static int amdgpu_drm_release(struct inode *inode, struct file *filp)
if (fpriv) {
fpriv->evf_mgr.fd_closing = true;
- amdgpu_userq_mgr_fini(&fpriv->userq_mgr);
amdgpu_eviction_fence_destroy(&fpriv->evf_mgr);
+ amdgpu_userq_mgr_fini(&fpriv->userq_mgr);
}
return drm_release(inode, filp);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 9fbb04aee97bc..d2ce7d86dbc8e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1502,11 +1502,6 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
amdgpu_bo_unreserve(pd);
}
- if (!fpriv->evf_mgr.fd_closing) {
- fpriv->evf_mgr.fd_closing = true;
- amdgpu_userq_mgr_fini(&fpriv->userq_mgr);
- amdgpu_eviction_fence_destroy(&fpriv->evf_mgr);
- }
amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
amdgpu_vm_fini(adev, &fpriv->vm);