From 54f329cc7a7a7ea265c45b206d45e3d09192aba7 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Fri, 17 Dec 2021 13:05:15 -0500 Subject: drm/amdgpu: Serialize non TDR gpu recovery with TDRs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use reset domain wq also for non TDR gpu recovery triggers such as sysfs and RAS. We must serialize all possible GPU recoveries to guarantee no concurrency there. For TDR call the original recovery function directly since it's already executed from within the wq. For others just use a wrapper to queue work and wait on it to finish. v2: Rename to amdgpu_recover_work_struct Signed-off-by: Andrey Grodzovsky Reviewed-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74113.html --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index bfc47bea23db..38c9fd7b7ad4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -63,7 +63,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) ti.process_name, ti.tgid, ti.task_name, ti.pid); if (amdgpu_device_should_recover_gpu(ring->adev)) { - amdgpu_device_gpu_recover(ring->adev, job); + amdgpu_device_gpu_recover_imp(ring->adev, job); } else { drm_sched_suspend_timeout(&ring->sched); if (amdgpu_sriov_vf(adev)) -- cgit v1.2.3 From 7258fa31eabd882f6c8ed4d6d281f6657a33ef94 Mon Sep 17 00:00:00 2001 From: Surbhi Kakarya Date: Wed, 26 Jan 2022 12:04:39 -0500 Subject: drm/amdgpu: Handle the GPU recovery failure in SRIOV environment. This patch handles the GPU recovery failure in sriov environment by retrying the reset if the first reset fails. 
To determine the condition of retry, a new macro AMDGPU_RETRY_SRIOV_RESET is added which returns true if failure is due to ETIMEDOUT, EINVAL or EBUSY, otherwise returns false. A new macro AMDGPU_MAX_RETRY_LIMIT is used to limit the retry to 2. It also handles the return status in Post Asic Reset by updating the return code with asic_reset_res and eventually returns the return code in amdgpu_job_timedout(). Signed-off-by: Surbhi Kakarya Reviewed-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 +++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 6 +++++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7838b5f9ad19..aff9d70347ad 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -88,6 +88,8 @@ MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin"); #define AMDGPU_RESUME_MS 2000 +#define AMDGPU_MAX_RETRY_LIMIT 2 +#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) const char *amdgpu_asic_name[] = { "TAHITI", @@ -4366,7 +4368,9 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, { int r; struct amdgpu_hive_info *hive = NULL; + int retry_limit = 0; +retry: amdgpu_amdkfd_pre_reset(adev); amdgpu_amdkfd_pre_reset(adev); @@ -4415,6 +4419,14 @@ error: } amdgpu_virt_release_full_gpu(adev, true); + if (AMDGPU_RETRY_SRIOV_RESET(r)) { + if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { + retry_limit++; + goto retry; + } else + DRM_ERROR("GPU reset retry is beyond the retry limit\n"); + } + return r; } @@ -5206,6 +5218,9 @@ skip_hw_reset: drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); } + if (tmp_adev->asic_reset_res) + r = tmp_adev->asic_reset_res; + tmp_adev->asic_reset_res = 0; if (r) { diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index bfc47bea23db..4870e093213d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -37,6 +37,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) struct amdgpu_task_info ti; struct amdgpu_device *adev = ring->adev; int idx; + int r; if (!drm_dev_enter(adev_to_drm(adev), &idx)) { DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s", @@ -63,7 +64,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) ti.process_name, ti.tgid, ti.task_name, ti.pid); if (amdgpu_device_should_recover_gpu(ring->adev)) { - amdgpu_device_gpu_recover(ring->adev, job); + r = amdgpu_device_gpu_recover(ring->adev, job); + if (r) + DRM_ERROR("GPU Recovery Failed: %d\n", r); + } else { drm_sched_suspend_timeout(&ring->sched); if (amdgpu_sriov_vf(adev)) -- cgit v1.2.3 From 6103b2f24e4a9716ca3f5de654964f2a083086be Mon Sep 17 00:00:00 2001 From: Christian König Date: Tue, 1 Mar 2022 09:57:41 +0100 Subject: drm/amdgpu: properly embed the IBs into the job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We now have standard macros for that. 
Signed-off-by: Christian König Reviewed-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 7 +------ drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 6 ++++-- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index d970336d2261..67f66f2f1809 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -81,14 +81,10 @@ exit: int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, struct amdgpu_job **job, struct amdgpu_vm *vm) { - size_t size = sizeof(struct amdgpu_job); - if (num_ibs == 0) return -EINVAL; - size += sizeof(struct amdgpu_ib) * num_ibs; - - *job = kzalloc(size, GFP_KERNEL); + *job = kzalloc(struct_size(*job, ibs, num_ibs), GFP_KERNEL); if (!*job) return -ENOMEM; @@ -98,7 +94,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, */ (*job)->base.sched = &adev->rings[0]->sched; (*job)->vm = vm; - (*job)->ibs = (void *)&(*job)[1]; (*job)->num_ibs = num_ibs; amdgpu_sync_create(&(*job)->sync); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h index 6d704772ff42..d599c0540b46 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h @@ -25,6 +25,7 @@ #include #include "amdgpu_sync.h" +#include "amdgpu_ring.h" /* bit set means command submit involves a preamble IB */ #define AMDGPU_PREAMBLE_IB_PRESENT (1 << 0) @@ -48,12 +49,10 @@ struct amdgpu_job { struct amdgpu_vm *vm; struct amdgpu_sync sync; struct amdgpu_sync sched_sync; - struct amdgpu_ib *ibs; struct dma_fence hw_fence; struct dma_fence *external_hw_fence; uint32_t preamble_status; uint32_t preemption_status; - uint32_t num_ibs; bool vm_needs_flush; uint64_t vm_pd_addr; unsigned vmid; @@ -69,6 +68,9 @@ struct amdgpu_job { /* job_run_counter >= 1 means a resubmit job 
*/ uint32_t job_run_counter; + + uint32_t num_ibs; + struct amdgpu_ib ibs[]; }; int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, -- cgit v1.2.3