summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
diff options
context:
space:
mode:
authorTakashi Iwai <tiwai@suse.de>2022-04-19 17:26:01 +0200
committerTakashi Iwai <tiwai@suse.de>2022-04-19 17:26:01 +0200
commit0aea30a07ec6b50de0fc5f5b2ec34a68ead86b61 (patch)
treeee7d7d116570f39e47399c8f691a5a7565077eeb /drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
parent4ddef9c4d70aae0c9029bdec7c3f7f1c1c51ff8c (diff)
parent5b933c7262c5b0ea11ea3c3b3ea81add04895954 (diff)
Merge tag 'asoc-fix-v5.18-rc3' of https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound into for-linus
ASoC: Fixes for v5.18 A collection of fixes that came in since the merge window, plus one new device ID for an x86 laptop. Nothing that really stands out with particularly big impact outside of the affected device.
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c143
1 files changed, 72 insertions, 71 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index e8b8f28c2f72..1b108d03e785 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -32,7 +32,8 @@
#include "wafl/wafl2_4_0_0_smn.h"
#include "wafl/wafl2_4_0_0_sh_mask.h"
-#define smnPCS_XGMI23_PCS_ERROR_STATUS 0x11a01210
+#include "amdgpu_reset.h"
+
#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
#define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210
@@ -67,17 +68,6 @@ static const int wafl_pcs_err_status_reg_arct[] = {
smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};
-static const int xgmi23_pcs_err_status_reg_aldebaran[] = {
- smnPCS_XGMI23_PCS_ERROR_STATUS,
- smnPCS_XGMI23_PCS_ERROR_STATUS + 0x100000,
- smnPCS_XGMI23_PCS_ERROR_STATUS + 0x200000,
- smnPCS_XGMI23_PCS_ERROR_STATUS + 0x300000,
- smnPCS_XGMI23_PCS_ERROR_STATUS + 0x400000,
- smnPCS_XGMI23_PCS_ERROR_STATUS + 0x500000,
- smnPCS_XGMI23_PCS_ERROR_STATUS + 0x600000,
- smnPCS_XGMI23_PCS_ERROR_STATUS + 0x700000
-};
-
static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = {
smnPCS_XGMI3X16_PCS_ERROR_STATUS,
smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000,
@@ -227,6 +217,9 @@ static void amdgpu_xgmi_hive_release(struct kobject *kobj)
struct amdgpu_hive_info *hive = container_of(
kobj, struct amdgpu_hive_info, kobj);
+ amdgpu_reset_put_reset_domain(hive->reset_domain);
+ hive->reset_domain = NULL;
+
mutex_destroy(&hive->hive_lock);
kfree(hive);
}
@@ -398,15 +391,35 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
goto pro_end;
}
+ /**
+ * Avoid recreating reset domain when hive is reconstructed for the case
+ * of reset the devices in the XGMI hive during probe for SRIOV
+ * See https://www.spinics.net/lists/amd-gfx/msg58836.html
+ */
+ if (adev->reset_domain->type != XGMI_HIVE) {
+ hive->reset_domain = amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
+ if (!hive->reset_domain) {
+ dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n");
+ ret = -ENOMEM;
+ kobject_put(&hive->kobj);
+ kfree(hive);
+ hive = NULL;
+ goto pro_end;
+ }
+ } else {
+ amdgpu_reset_get_reset_domain(adev->reset_domain);
+ hive->reset_domain = adev->reset_domain;
+ }
+
hive->hive_id = adev->gmc.xgmi.hive_id;
INIT_LIST_HEAD(&hive->device_list);
INIT_LIST_HEAD(&hive->node);
mutex_init(&hive->hive_lock);
- atomic_set(&hive->in_reset, 0);
atomic_set(&hive->number_devices, 0);
task_barrier_init(&hive->tb);
hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
hive->hi_req_gpu = NULL;
+
/*
* hive pstate on boot is high in vega20 so we have to go to low
* pstate on after boot.
@@ -732,53 +745,15 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
return psp_xgmi_terminate(&adev->psp);
}
-static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
+static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
- int r;
- struct ras_ih_if ih_info = {
- .cb = NULL,
- };
- struct ras_fs_if fs_info = {
- .sysfs_name = "xgmi_wafl_err_count",
- };
-
if (!adev->gmc.xgmi.supported ||
adev->gmc.xgmi.num_physical_nodes == 0)
return 0;
- adev->gmc.xgmi.ras_funcs->reset_ras_error_count(adev);
-
- if (!adev->gmc.xgmi.ras_if) {
- adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
- if (!adev->gmc.xgmi.ras_if)
- return -ENOMEM;
- adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
- adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
- adev->gmc.xgmi.ras_if->sub_block_index = 0;
- }
- ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if;
- r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if,
- &fs_info, &ih_info);
- if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) {
- kfree(adev->gmc.xgmi.ras_if);
- adev->gmc.xgmi.ras_if = NULL;
- }
-
- return r;
-}
+ adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);
-static void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev)
-{
- if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) &&
- adev->gmc.xgmi.ras_if) {
- struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if;
- struct ras_ih_if ih_info = {
- .cb = NULL,
- };
-
- amdgpu_ras_late_fini(adev, ras_if, &ih_info);
- kfree(ras_if);
- }
+ return amdgpu_ras_block_late_init(adev, ras_block);
}
uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
@@ -810,9 +785,6 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
xgmi_pcs_err_status_reg_vg20[i]);
break;
case CHIP_ALDEBARAN:
- for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++)
- pcs_clear_status(adev,
- xgmi23_pcs_err_status_reg_aldebaran[i]);
for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++)
pcs_clear_status(adev,
xgmi3x16_pcs_err_status_reg_aldebaran[i]);
@@ -865,7 +837,7 @@ static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
return 0;
}
-static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
+static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status)
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
@@ -874,7 +846,7 @@ static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
uint32_t ue_cnt = 0, ce_cnt = 0;
if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
- return -EINVAL;
+ return ;
err_data->ue_count = 0;
err_data->ce_count = 0;
@@ -913,13 +885,6 @@ static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
}
break;
case CHIP_ALDEBARAN:
- /* check xgmi23 pcs error */
- for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) {
- data = RREG32_PCIE(xgmi23_pcs_err_status_reg_aldebaran[i]);
- if (data)
- amdgpu_xgmi_query_pcs_error_status(adev,
- data, &ue_cnt, &ce_cnt, true);
- }
/* check xgmi3x16 pcs error */
for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) {
data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]);
@@ -940,17 +905,53 @@ static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
break;
}
- adev->gmc.xgmi.ras_funcs->reset_ras_error_count(adev);
+ adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);
err_data->ue_count += ue_cnt;
err_data->ce_count += ce_cnt;
+}
- return 0;
+/* Trigger XGMI/WAFL error */
+static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *inject_if)
+{
+ int ret = 0;
+ struct ta_ras_trigger_error_input *block_info =
+ (struct ta_ras_trigger_error_input *)inject_if;
+
+ if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
+ dev_warn(adev->dev, "Failed to disallow df cstate");
+
+ if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
+ dev_warn(adev->dev, "Failed to disallow XGMI power down");
+
+ ret = psp_ras_trigger_error(&adev->psp, block_info);
+
+ if (amdgpu_ras_intr_triggered())
+ return ret;
+
+ if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
+ dev_warn(adev->dev, "Failed to allow XGMI power down");
+
+ if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
+ dev_warn(adev->dev, "Failed to allow df cstate");
+
+ return ret;
}
-const struct amdgpu_xgmi_ras_funcs xgmi_ras_funcs = {
- .ras_late_init = amdgpu_xgmi_ras_late_init,
- .ras_fini = amdgpu_xgmi_ras_fini,
+struct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = {
.query_ras_error_count = amdgpu_xgmi_query_ras_error_count,
.reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count,
+ .ras_error_inject = amdgpu_ras_error_inject_xgmi,
+};
+
+struct amdgpu_xgmi_ras xgmi_ras = {
+ .ras_block = {
+ .ras_comm = {
+ .name = "xgmi_wafl",
+ .block = AMDGPU_RAS_BLOCK__XGMI_WAFL,
+ .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
+ },
+ .hw_ops = &xgmi_ras_hw_ops,
+ .ras_late_init = amdgpu_xgmi_ras_late_init,
+ },
};