diff options
author | Takashi Iwai <tiwai@suse.de> | 2022-04-19 17:26:01 +0200 |
---|---|---|
committer | Takashi Iwai <tiwai@suse.de> | 2022-04-19 17:26:01 +0200 |
commit | 0aea30a07ec6b50de0fc5f5b2ec34a68ead86b61 (patch) | |
tree | ee7d7d116570f39e47399c8f691a5a7565077eeb /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | |
parent | 4ddef9c4d70aae0c9029bdec7c3f7f1c1c51ff8c (diff) | |
parent | 5b933c7262c5b0ea11ea3c3b3ea81add04895954 (diff) |
Merge tag 'asoc-fix-v5.18-rc3' of https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound into for-linus
ASoC: Fixes for v5.18
A collection of fixes that came in since the merge window, plus one new
device ID for an x86 laptop. Nothing that really stands out with
particularly big impact outside of the affected device.
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 468 |
1 files changed, 288 insertions, 180 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ed077de426d9..3987ecb24ef4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -31,6 +31,7 @@ #include <linux/console.h> #include <linux/slab.h> #include <linux/iommu.h> +#include <linux/pci.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_probe_helper.h> @@ -55,7 +56,6 @@ #include "soc15.h" #include "nv.h" #include "bif/bif_4_1_d.h" -#include <linux/pci.h> #include <linux/firmware.h> #include "amdgpu_vf_error.h" @@ -80,14 +80,11 @@ MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); -MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); -MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); -MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); -MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin"); -MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin"); #define AMDGPU_RESUME_MS 2000 +#define AMDGPU_MAX_RETRY_LIMIT 2 +#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) const char *amdgpu_asic_name[] = { "TAHITI", @@ -424,10 +421,10 @@ bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) * the lock. */ if (in_task()) { - if (down_read_trylock(&adev->reset_sem)) - up_read(&adev->reset_sem); + if (down_read_trylock(&adev->reset_domain->sem)) + up_read(&adev->reset_domain->sem); else - lockdep_assert_held(&adev->reset_sem); + lockdep_assert_held(&adev->reset_domain->sem); } #endif return false; @@ -453,9 +450,9 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, if ((reg * 4) < adev->rmmio_size) { if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && - down_read_trylock(&adev->reset_sem)) { + down_read_trylock(&adev->reset_domain->sem)) { ret = amdgpu_kiq_rreg(adev, reg); - up_read(&adev->reset_sem); + up_read(&adev->reset_domain->sem); } else { ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); } @@ -538,9 +535,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, if ((reg * 4) < adev->rmmio_size) { if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && - down_read_trylock(&adev->reset_sem)) { + down_read_trylock(&adev->reset_domain->sem)) { amdgpu_kiq_wreg(adev, reg, v); - up_read(&adev->reset_sem); + up_read(&adev->reset_domain->sem); } else { writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); } @@ -554,7 +551,11 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, /** * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range * - * this function is invoked only the debugfs register access + * @adev: amdgpu_device pointer + * @reg: mmio/rlc register + * @v: value to write + * + * this function is invoked only for the debugfs register access */ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v) @@ -566,7 +567,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, adev->gfx.rlc.funcs && adev->gfx.rlc.funcs->is_rlcg_access_range) { if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) - return adev->gfx.rlc.funcs->sriov_wreg(adev, reg, v, 0, 0); + return amdgpu_sriov_wreg(adev, reg, v, 0, 0); } else if ((reg * 4) >= adev->rmmio_size) { adev->pcie_wreg(adev, reg * 4, v); } else { @@ -1312,6 +1313,31 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev) return true; } +/** + * amdgpu_device_should_use_aspm - check if the device should program ASPM + * + * @adev: amdgpu_device pointer + * + * Confirm whether the module parameter and pcie bridge agree that ASPM should + * be set for this device. + * + * Returns true if it should be used or false if not. + */ +bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) +{ + switch (amdgpu_aspm) { + case -1: + break; + case 0: + return false; + case 1: + return true; + default: + return false; + } + return pcie_aspm_enabled(adev->pdev); +} + /* if we get transitioned to only one device, take VGA back */ /** * amdgpu_device_vga_set_decode - enable/disable vga decode @@ -1446,7 +1472,8 @@ static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) case CHIP_YELLOW_CARP: break; case CHIP_CYAN_SKILLFISH: - if (adev->pdev->device == 0x13FE) + if ((adev->pdev->device == 0x13FE) || + (adev->pdev->device == 0x143F)) adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; break; default: @@ -1507,6 +1534,11 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); } + if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { + dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); + amdgpu_reset_method = -1; + } + amdgpu_device_check_smu_prv_buffer_size(adev); amdgpu_device_check_vm_size(adev); @@ -1517,7 +1549,6 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) amdgpu_gmc_tmz_set(adev); - amdgpu_gmc_noretry_set(adev); return 0; } @@ -1955,27 +1986,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) case CHIP_ARCTURUS: chip_name = "arcturus"; break; - case CHIP_RENOIR: - if (adev->apu_flags & AMD_APU_IS_RENOIR) - chip_name = "renoir"; - else - chip_name = "green_sardine"; - break; - case CHIP_NAVI10: - chip_name = "navi10"; - break; - case CHIP_NAVI14: - chip_name = "navi14"; - break; case CHIP_NAVI12: chip_name = "navi12"; break; - case CHIP_VANGOGH: - chip_name = "vangogh"; - break; - case CHIP_YELLOW_CARP: - chip_name = "yellow_carp"; - break; } snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); @@ -2073,6 +2086,8 @@ out: */ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) { + struct drm_device *dev = adev_to_drm(adev); + struct pci_dev *parent; int i, r; amdgpu_device_enable_virtual_display(adev); @@ -2137,6 +2152,18 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) break; } + if (amdgpu_has_atpx() && + (amdgpu_is_atpx_hybrid() || + amdgpu_has_atpx_dgpu_power_cntl()) && + ((adev->flags & AMD_IS_APU) == 0) && + !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) + adev->flags |= AMD_IS_PX; + + if (!(adev->flags & AMD_IS_APU)) { + parent = pci_upstream_bridge(adev->pdev); + adev->has_pr3 = parent ? pci_pr3_present(parent) : false; + } + amdgpu_amdkfd_device_probe(adev); adev->pm.pp_feature = amdgpu_pp_feature_mask; @@ -2287,6 +2314,49 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev) return r; } +static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) +{ + long timeout; + int r, i; + + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + /* No need to setup the GPU scheduler for rings that don't need it */ + if (!ring || ring->no_scheduler) + continue; + + switch (ring->funcs->type) { + case AMDGPU_RING_TYPE_GFX: + timeout = adev->gfx_timeout; + break; + case AMDGPU_RING_TYPE_COMPUTE: + timeout = adev->compute_timeout; + break; + case AMDGPU_RING_TYPE_SDMA: + timeout = adev->sdma_timeout; + break; + default: + timeout = adev->video_timeout; + break; + } + + r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, + ring->num_hw_submission, amdgpu_job_hang_limit, + timeout, adev->reset_domain->wq, + ring->sched_score, ring->name, + adev->dev); + if (r) { + DRM_ERROR("Failed to create scheduler on ring %s.\n", + ring->name); + return r; + } + } + + return 0; +} + + /** * amdgpu_device_ip_init - run init for hardware IPs * @@ -2398,8 +2468,28 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (r) goto init_failed; - if (adev->gmc.xgmi.num_physical_nodes > 1) - amdgpu_xgmi_add_device(adev); + /** + * In case of XGMI grab extra reference for reset domain for this device + */ + if (adev->gmc.xgmi.num_physical_nodes > 1) { + if (amdgpu_xgmi_add_device(adev) == 0) { + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + + if (!hive->reset_domain || + !amdgpu_reset_get_reset_domain(hive->reset_domain)) { + r = -ENOENT; + goto init_failed; + } + + /* Drop the early temporary reset domain we created for device */ + amdgpu_reset_put_reset_domain(adev->reset_domain); + adev->reset_domain = hive->reset_domain; + } + } + + r = amdgpu_device_init_schedulers(adev); + if (r) + goto init_failed; /* Don't init kfd if whole hive need to be reset during init */ if (!adev->gmc.xgmi.pending_reset) @@ -2610,6 +2700,12 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) adev->ip_blocks[i].status.late_initialized = true; } + r = amdgpu_ras_late_init(adev); + if (r) { + DRM_ERROR("amdgpu_ras_late_init failed %d", r); + return r; + } + amdgpu_ras_set_error_query_ready(adev, true); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); @@ -2624,7 +2720,7 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1)|| adev->asic_type == CHIP_ALDEBARAN )) - smu_handle_passthrough_sbr(&adev->smu, true); + amdgpu_dpm_handle_passthrough_sbr(adev, true); if (adev->gmc.xgmi.num_physical_nodes > 1) { mutex_lock(&mgpu_info.mutex); @@ -2708,11 +2804,11 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) } } - amdgpu_amdkfd_suspend(adev, false); - amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); + amdgpu_amdkfd_suspend(adev, false); + /* Workaroud for ASICs need to disable SMC first */ amdgpu_device_smu_fini_early(adev); @@ -2881,7 +2977,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) int i, r; if (adev->in_s0ix) - amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); + amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); for (i = adev->num_ip_blocks - 1; i >= 0; i--) { if (!adev->ip_blocks[i].status.valid) @@ -3307,9 +3403,9 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (adev->asic_reset_res) goto fail; - if (adev->mmhub.ras_funcs && - adev->mmhub.ras_funcs->reset_ras_error_count) - adev->mmhub.ras_funcs->reset_ras_error_count(adev); + if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && + adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) + adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); } else { task_barrier_full(&hive->tb); @@ -3493,12 +3589,12 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->mn_lock); mutex_init(&adev->virt.vf_errors.lock); hash_init(adev->mn_hash); - atomic_set(&adev->in_gpu_reset, 0); - init_rwsem(&adev->reset_sem); mutex_init(&adev->psp.mutex); mutex_init(&adev->notifier_lock); + mutex_init(&adev->pm.stable_pstate_ctx_lock); + mutex_init(&adev->benchmark_mutex); - amdgpu_device_init_apu_flags(adev); + amdgpu_device_init_apu_flags(adev); r = amdgpu_device_check_arguments(adev); if (r) @@ -3519,6 +3615,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, INIT_LIST_HEAD(&adev->reset_list); + INIT_LIST_HEAD(&adev->ras_list); + INIT_DELAYED_WORK(&adev->delayed_init_work, amdgpu_device_delayed_init_work_handler); INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, @@ -3568,6 +3666,15 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) adev->enable_mes = true; + /* + * Reset domain needs to be present early, before XGMI hive discovered + * (if any) and intitialized to use reset sem and in_gpu reset flag + * early on during init and before calling to RREG32. + */ + adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); + if (!adev->reset_domain) + return -ENOMEM; + /* detect hw virtualization here */ amdgpu_detect_virtualization(adev); @@ -3582,6 +3689,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (r) return r; + amdgpu_gmc_noretry_set(adev); /* Need to get xgmi info early to decide the reset behavior*/ if (adev->gmc.xgmi.supported) { r = adev->gfxhub.funcs->get_xgmi_info(adev); @@ -3749,19 +3857,6 @@ fence_driver_init: } else adev->ucode_sysfs_en = true; - if ((amdgpu_testing & 1)) { - if (adev->accel_working) - amdgpu_test_moves(adev); - else - DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); - } - if (amdgpu_benchmarking) { - if (adev->accel_working) - amdgpu_benchmark(adev, amdgpu_benchmarking); - else - DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); - } - /* * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. * Otherwise the mgpu fan boost feature will be skipped due to the @@ -3953,6 +4048,9 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) if (adev->mman.discovery_bin) amdgpu_discovery_fini(adev); + amdgpu_reset_put_reset_domain(adev->reset_domain); + adev->reset_domain = NULL; + kfree(adev->pci_state); } @@ -4044,7 +4142,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) return 0; if (adev->in_s0ix) - amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); + amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); /* post card */ if (amdgpu_device_need_post(adev)) { @@ -4347,7 +4445,9 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, { int r; struct amdgpu_hive_info *hive = NULL; + int retry_limit = 0; +retry: amdgpu_amdkfd_pre_reset(adev); amdgpu_amdkfd_pre_reset(adev); @@ -4396,6 +4496,14 @@ error: } amdgpu_virt_release_full_gpu(adev, true); + if (AMDGPU_RETRY_SRIOV_RESET(r)) { + if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { + retry_limit++; + goto retry; + } else + DRM_ERROR("GPU reset retry is beyond the retry limit\n"); + } + return r; } @@ -4587,6 +4695,22 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, return r; } +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) +{ + uint32_t reg_value; + int i; + + lockdep_assert_held(&adev->reset_domain->sem); + dump_stack(); + + for (i = 0; i < adev->num_regs; i++) { + reg_value = RREG32(adev->reset_dump_reg_list[i]); + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value); + } + + return 0; +} + int amdgpu_do_asic_reset(struct list_head *device_list_handle, struct amdgpu_reset_context *reset_context) { @@ -4597,6 +4721,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, /* Try reset handler method first */ tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, reset_list); + amdgpu_reset_reg_dumps(tmp_adev); r = amdgpu_reset_perform_reset(tmp_adev, reset_context); /* If reset handler not implemented, continue; otherwise return */ if (r == -ENOSYS) @@ -4645,9 +4770,9 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, if (!r && amdgpu_ras_intr_triggered()) { list_for_each_entry(tmp_adev, device_list_handle, reset_list) { - if (tmp_adev->mmhub.ras_funcs && - tmp_adev->mmhub.ras_funcs->reset_ras_error_count) - tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev); + if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && + tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) + tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); } amdgpu_ras_intr_cleared(); @@ -4754,17 +4879,8 @@ end: return r; } -static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, - struct amdgpu_hive_info *hive) +static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) { - if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) - return false; - - if (hive) { - down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); - } else { - down_write(&adev->reset_sem); - } switch (amdgpu_asic_reset_method(adev)) { case AMD_RESET_METHOD_MODE1: @@ -4777,56 +4893,12 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, adev->mp1_state = PP_MP1_STATE_NONE; break; } - - return true; } -static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) +static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) { amdgpu_vf_error_trans_all(adev); adev->mp1_state = PP_MP1_STATE_NONE; - atomic_set(&adev->in_gpu_reset, 0); - up_write(&adev->reset_sem); -} - -/* - * to lockup a list of amdgpu devices in a hive safely, if not a hive - * with multiple nodes, it will be similar as amdgpu_device_lock_adev. - * - * unlock won't require roll back. - */ -static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) -{ - struct amdgpu_device *tmp_adev = NULL; - - if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { - if (!hive) { - dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes"); - return -ENODEV; - } - list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { - if (!amdgpu_device_lock_adev(tmp_adev, hive)) - goto roll_back; - } - } else if (!amdgpu_device_lock_adev(adev, hive)) - return -EAGAIN; - - return 0; -roll_back: - if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) { - /* - * if the lockup iteration break in the middle of a hive, - * it may means there may has a race issue, - * or a hive device locked up independently. - * we may be in trouble and may not, so will try to roll back - * the lock and give out a warnning. - */ - dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock"); - list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) { - amdgpu_device_unlock_adev(tmp_adev); - } - } - return -EAGAIN; } static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) @@ -4960,7 +5032,7 @@ retry: } /** - * amdgpu_device_gpu_recover - reset the asic and recover scheduler + * amdgpu_device_gpu_recover_imp - reset the asic and recover scheduler * * @adev: amdgpu_device pointer * @job: which job trigger hang @@ -4970,7 +5042,7 @@ retry: * Returns 0 for success or an error on failure. */ -int amdgpu_device_gpu_recover(struct amdgpu_device *adev, +int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, struct amdgpu_job *job) { struct list_head device_list, *device_list_handle = NULL; @@ -5004,26 +5076,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, dev_info(adev->dev, "GPU %s begin!\n", need_emergency_restart ? "jobs stop":"reset"); - /* - * Here we trylock to avoid chain of resets executing from - * either trigger by jobs on different adevs in XGMI hive or jobs on - * different schedulers for same device while this TO handler is running. - * We always reset all schedulers for device and all devices for XGMI - * hive so that should take care of them too. - */ if (!amdgpu_sriov_vf(adev)) hive = amdgpu_get_xgmi_hive(adev); - if (hive) { - if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { - DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", - job ? job->base.id : -1, hive->hive_id); - amdgpu_put_xgmi_hive(hive); - if (job && job->vm) - drm_sched_increase_karma(&job->base); - return 0; - } + if (hive) mutex_lock(&hive->hive_lock); - } reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; @@ -5032,22 +5088,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); /* - * lock the device before we try to operate the linked list - * if didn't get the device lock, don't touch the linked list since - * others may iterating it. - */ - r = amdgpu_device_lock_hive_adev(adev, hive); - if (r) { - dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", - job ? job->base.id : -1); - - /* even we skipped this reset, still need to set the job to guilty */ - if (job && job->vm) - drm_sched_increase_karma(&job->base); - goto skip_recovery; - } - - /* * Build list of devices to reset. * In case we are in XGMI hive mode, resort the device list * to put adev in the 1st position. @@ -5064,8 +5104,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, device_list_handle = &device_list; } + /* We need to lock reset domain only once both for XGMI and single device */ + tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, + reset_list); + amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); + /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + + amdgpu_device_set_mp1_state(tmp_adev); + /* * Try to put the audio codec into suspend state * before gpu reset started. @@ -5187,6 +5235,9 @@ skip_hw_reset: drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); } + if (tmp_adev->asic_reset_res) + r = tmp_adev->asic_reset_res; + tmp_adev->asic_reset_res = 0; if (r) { @@ -5214,21 +5265,55 @@ skip_sched_resume: if (audio_suspended) amdgpu_device_resume_display_audio(tmp_adev); - amdgpu_device_unlock_adev(tmp_adev); + + amdgpu_device_unset_mp1_state(tmp_adev); } -skip_recovery: + tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, + reset_list); + amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); + if (hive) { - atomic_set(&hive->in_reset, 0); mutex_unlock(&hive->hive_lock); amdgpu_put_xgmi_hive(hive); } - if (r && r != -EAGAIN) + if (r) dev_info(adev->dev, "GPU reset end with ret = %d\n", r); return r; } +struct amdgpu_recover_work_struct { + struct work_struct base; + struct amdgpu_device *adev; + struct amdgpu_job *job; + int ret; +}; + +static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work) +{ + struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base); + + recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job); +} +/* + * Serialize gpu recover into reset domain single threaded wq + */ +int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + struct amdgpu_job *job) +{ + struct amdgpu_recover_work_struct work = {.adev = adev, .job = job}; + + INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work); + + if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base)) + return -EAGAIN; + + flush_work(&work.base); + + return work.ret; +} + /** * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot * @@ -5416,20 +5501,6 @@ int amdgpu_device_baco_exit(struct drm_device *dev) return 0; } -static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) -{ - int i; - - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { - struct amdgpu_ring *ring = adev->rings[i]; - - if (!ring || !ring->sched.thread) - continue; - - cancel_delayed_work_sync(&ring->sched.work_tdr); - } -} - /** * amdgpu_pci_error_detected - Called when a PCI error is detected. * @pdev: PCI device struct @@ -5460,14 +5531,11 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta /* Fatal error, prepare for slot reset */ case pci_channel_io_frozen: /* - * Cancel and wait for all TDRs in progress if failing to - * set adev->in_gpu_reset in amdgpu_device_lock_adev - * - * Locking adev->reset_sem will prevent any external access + * Locking adev->reset_domain->sem will prevent any external access * to GPU during PCI error recovery */ - while (!amdgpu_device_lock_adev(adev, NULL)) - amdgpu_cancel_all_tdr(adev); + amdgpu_device_lock_reset_domain(adev->reset_domain); + amdgpu_device_set_mp1_state(adev); /* * Block any work scheduling as we do for regular GPU reset @@ -5574,7 +5642,8 @@ out: DRM_INFO("PCIe error recovery succeeded\n"); } else { DRM_ERROR("PCIe error recovery failed, err:%d", r); - amdgpu_device_unlock_adev(adev); + amdgpu_device_unset_mp1_state(adev); + amdgpu_device_unlock_reset_domain(adev->reset_domain); } return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; @@ -5611,7 +5680,8 @@ void amdgpu_pci_resume(struct pci_dev *pdev) drm_sched_start(&ring->sched, true); } - amdgpu_device_unlock_adev(adev); + amdgpu_device_unset_mp1_state(adev); + amdgpu_device_unlock_reset_domain(adev->reset_domain); } bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) @@ -5688,6 +5758,11 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, amdgpu_asic_invalidate_hdp(adev, ring); } +int amdgpu_in_reset(struct amdgpu_device *adev) +{ + return atomic_read(&adev->reset_domain->in_gpu_reset); + } + /** * amdgpu_device_halt() - bring hardware to some kind of halt state * @@ -5726,3 +5801,36 @@ void amdgpu_device_halt(struct amdgpu_device *adev) pci_disable_device(pdev); pci_wait_for_pending_transaction(pdev); } + +u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, + u32 reg) +{ + unsigned long flags, address, data; + u32 r; + + address = adev->nbio.funcs->get_pcie_port_index_offset(adev); + data = adev->nbio.funcs->get_pcie_port_data_offset(adev); + + spin_lock_irqsave(&adev->pcie_idx_lock, flags); + WREG32(address, reg * 4); + (void)RREG32(address); + r = RREG32(data); + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); + return r; +} + +void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, + u32 reg, u32 v) +{ + unsigned long flags, address, data; + + address = adev->nbio.funcs->get_pcie_port_index_offset(adev); + data = adev->nbio.funcs->get_pcie_port_data_offset(adev); + + spin_lock_irqsave(&adev->pcie_idx_lock, flags); + WREG32(address, reg * 4); + (void)RREG32(address); + WREG32(data, v); + (void)RREG32(data); + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); +} |