author     Takashi Iwai <tiwai@suse.de>    2022-04-19 17:26:01 +0200
committer  Takashi Iwai <tiwai@suse.de>    2022-04-19 17:26:01 +0200
commit     0aea30a07ec6b50de0fc5f5b2ec34a68ead86b61 (patch)
tree       ee7d7d116570f39e47399c8f691a5a7565077eeb /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent     4ddef9c4d70aae0c9029bdec7c3f7f1c1c51ff8c (diff)
parent     5b933c7262c5b0ea11ea3c3b3ea81add04895954 (diff)
Merge tag 'asoc-fix-v5.18-rc3' of https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound into for-linus
ASoC: Fixes for v5.18
A collection of fixes that came in since the merge window, plus one new
device ID for an x86 laptop. Nothing that really stands out with
particularly big impact outside of the affected device.
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  548
1 file changed, 308 insertions, 240 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8f47c14ecbc7..424c22a841f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -75,6 +75,13 @@ const char *ras_mca_block_string[] = {
 	"mca_iohc",
 };
 
+struct amdgpu_ras_block_list {
+	/* ras block link */
+	struct list_head node;
+
+	struct amdgpu_ras_block_object *ras_obj;
+};
+
 const char *get_ras_block_str(struct ras_common_if *ras_block)
 {
 	if (!ras_block)
@@ -89,6 +96,9 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
 	return ras_block_string[ras_block->block];
 }
 
+#define ras_block_str(_BLOCK_) \
+	(((_BLOCK_) < ARRAY_SIZE(ras_block_string)) ? ras_block_string[_BLOCK_] : "Out Of Range")
+
 #define ras_err_str(i) (ras_error_string[ffs(i)])
 
 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
@@ -155,14 +165,9 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
 	}
 
 	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
-
-	err_rec.address = address;
-	err_rec.retired_page = address >> AMDGPU_GPU_PAGE_SHIFT;
-	err_rec.ts = (uint64_t)ktime_get_real_seconds();
-	err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
-
 	err_data.err_addr = &err_rec;
-	err_data.err_addr_cnt = 1;
+	amdgpu_umc_fill_error_record(&err_data, address,
+			(address >> AMDGPU_GPU_PAGE_SHIFT), 0, 0);
 
 	if (amdgpu_bad_page_threshold != 0) {
 		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
@@ -452,7 +457,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
 	}
 
 	if (ret)
-		return -EINVAL;
+		return ret;
 
 	return size;
 }
@@ -866,30 +871,47 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
 }
 /* feature ctl end */
 
+static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj,
+		enum amdgpu_ras_block block)
+{
+	if (!block_obj)
+		return -EINVAL;
 
-static void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
-					      struct ras_common_if *ras_block,
-					      struct ras_err_data *err_data)
+	if (block_obj->ras_comm.block == block)
+		return 0;
+
+	return -EINVAL;
+}
+
+static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
+					enum amdgpu_ras_block block, uint32_t sub_block_index)
 {
-	switch (ras_block->sub_block_index) {
-	case AMDGPU_RAS_MCA_BLOCK__MP0:
-		if (adev->mca.mp0.ras_funcs &&
-		    adev->mca.mp0.ras_funcs->query_ras_error_count)
-			adev->mca.mp0.ras_funcs->query_ras_error_count(adev, &err_data);
-		break;
-	case AMDGPU_RAS_MCA_BLOCK__MP1:
-		if (adev->mca.mp1.ras_funcs &&
-		    adev->mca.mp1.ras_funcs->query_ras_error_count)
-			adev->mca.mp1.ras_funcs->query_ras_error_count(adev, &err_data);
-		break;
-	case AMDGPU_RAS_MCA_BLOCK__MPIO:
-		if (adev->mca.mpio.ras_funcs &&
-		    adev->mca.mpio.ras_funcs->query_ras_error_count)
-			adev->mca.mpio.ras_funcs->query_ras_error_count(adev, &err_data);
-		break;
-	default:
-		break;
+	struct amdgpu_ras_block_list *node, *tmp;
+	struct amdgpu_ras_block_object *obj;
+
+	if (block >= AMDGPU_RAS_BLOCK__LAST)
+		return NULL;
+
+	if (!amdgpu_ras_is_supported(adev, block))
+		return NULL;
+
+	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
+		if (!node->ras_obj) {
+			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
+			continue;
+		}
+
+		obj = node->ras_obj;
+		if (obj->ras_block_match) {
+			if (obj->ras_block_match(obj, block, sub_block_index) == 0)
+				return obj;
+		} else {
+			if (amdgpu_ras_block_match_default(obj, block) == 0)
+				return obj;
+		}
 	}
+
+	return NULL;
 }
 
 static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
@@ -901,26 +923,26 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
 	 * choosing right query method according to
 	 * whether smu support query error information
 	 */
-	ret = smu_get_ecc_info(&adev->smu, (void *)&(ras->umc_ecc));
+	ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc));
 	if (ret == -EOPNOTSUPP) {
-		if (adev->umc.ras_funcs &&
-		    adev->umc.ras_funcs->query_ras_error_count)
-			adev->umc.ras_funcs->query_ras_error_count(adev, err_data);
+		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
+		    adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
+			adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data);
 
 		/* umc query_ras_error_address is also responsible for clearing
 		 * error status
 		 */
-		if (adev->umc.ras_funcs &&
-		    adev->umc.ras_funcs->query_ras_error_address)
-			adev->umc.ras_funcs->query_ras_error_address(adev, err_data);
+		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
+		    adev->umc.ras->ras_block.hw_ops->query_ras_error_address)
+			adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data);
 	} else if (!ret) {
-		if (adev->umc.ras_funcs &&
-		    adev->umc.ras_funcs->ecc_info_query_ras_error_count)
-			adev->umc.ras_funcs->ecc_info_query_ras_error_count(adev, err_data);
+		if (adev->umc.ras &&
+		    adev->umc.ras->ecc_info_query_ras_error_count)
+			adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data);
 
-		if (adev->umc.ras_funcs &&
-		    adev->umc.ras_funcs->ecc_info_query_ras_error_address)
-			adev->umc.ras_funcs->ecc_info_query_ras_error_address(adev, err_data);
+		if (adev->umc.ras &&
+		    adev->umc.ras->ecc_info_query_ras_error_address)
+			adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data);
 	}
 }
@@ -928,62 +950,32 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
 int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 		struct ras_query_if *info)
 {
+	struct amdgpu_ras_block_object *block_obj = NULL;
 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
 	struct ras_err_data err_data = {0, 0, 0, NULL};
-	int i;
 
 	if (!obj)
 		return -EINVAL;
 
-	switch (info->head.block) {
-	case AMDGPU_RAS_BLOCK__UMC:
+	if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
 		amdgpu_ras_get_ecc_info(adev, &err_data);
-		break;
-	case AMDGPU_RAS_BLOCK__SDMA:
-		if (adev->sdma.funcs->query_ras_error_count) {
-			for (i = 0; i < adev->sdma.num_instances; i++)
-				adev->sdma.funcs->query_ras_error_count(adev, i,
-									&err_data);
+	} else {
+		block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
+		if (!block_obj || !block_obj->hw_ops) {
+			dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
+				     get_ras_block_str(&info->head));
+			return -EINVAL;
 		}
-		break;
-	case AMDGPU_RAS_BLOCK__GFX:
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->query_ras_error_count)
-			adev->gfx.ras_funcs->query_ras_error_count(adev, &err_data);
-
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->query_ras_error_status)
-			adev->gfx.ras_funcs->query_ras_error_status(adev);
-		break;
-	case AMDGPU_RAS_BLOCK__MMHUB:
-		if (adev->mmhub.ras_funcs &&
-		    adev->mmhub.ras_funcs->query_ras_error_count)
-			adev->mmhub.ras_funcs->query_ras_error_count(adev, &err_data);
-
-		if (adev->mmhub.ras_funcs &&
-		    adev->mmhub.ras_funcs->query_ras_error_status)
-			adev->mmhub.ras_funcs->query_ras_error_status(adev);
-		break;
-	case AMDGPU_RAS_BLOCK__PCIE_BIF:
-		if (adev->nbio.ras_funcs &&
-		    adev->nbio.ras_funcs->query_ras_error_count)
-			adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data);
-		break;
-	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
-		if (adev->gmc.xgmi.ras_funcs &&
-		    adev->gmc.xgmi.ras_funcs->query_ras_error_count)
-			adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data);
-		break;
-	case AMDGPU_RAS_BLOCK__HDP:
-		if (adev->hdp.ras_funcs &&
-		    adev->hdp.ras_funcs->query_ras_error_count)
-			adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data);
-		break;
-	case AMDGPU_RAS_BLOCK__MCA:
-		amdgpu_ras_mca_query_error_status(adev, &info->head, &err_data);
-		break;
-	default:
-		break;
+
+		if (block_obj->hw_ops->query_ras_error_count)
+			block_obj->hw_ops->query_ras_error_count(adev, &err_data);
+
+		if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
+		    (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
+		    (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
+			if (block_obj->hw_ops->query_ras_error_status)
+				block_obj->hw_ops->query_ras_error_status(adev);
+		}
 	}
 
 	obj->err_data.ue_count += err_data.ue_count;
@@ -1040,68 +1032,27 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
 		enum amdgpu_ras_block block)
 {
+	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
+
 	if (!amdgpu_ras_is_supported(adev, block))
 		return -EINVAL;
 
-	switch (block) {
-	case AMDGPU_RAS_BLOCK__GFX:
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->reset_ras_error_count)
-			adev->gfx.ras_funcs->reset_ras_error_count(adev);
-
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->reset_ras_error_status)
-			adev->gfx.ras_funcs->reset_ras_error_status(adev);
-		break;
-	case AMDGPU_RAS_BLOCK__MMHUB:
-		if (adev->mmhub.ras_funcs &&
-		    adev->mmhub.ras_funcs->reset_ras_error_count)
-			adev->mmhub.ras_funcs->reset_ras_error_count(adev);
-
-		if (adev->mmhub.ras_funcs &&
-		    adev->mmhub.ras_funcs->reset_ras_error_status)
-			adev->mmhub.ras_funcs->reset_ras_error_status(adev);
-		break;
-	case AMDGPU_RAS_BLOCK__SDMA:
-		if (adev->sdma.funcs->reset_ras_error_count)
-			adev->sdma.funcs->reset_ras_error_count(adev);
-		break;
-	case AMDGPU_RAS_BLOCK__HDP:
-		if (adev->hdp.ras_funcs &&
-		    adev->hdp.ras_funcs->reset_ras_error_count)
-			adev->hdp.ras_funcs->reset_ras_error_count(adev);
-		break;
-	default:
-		break;
+	if (!block_obj || !block_obj->hw_ops) {
+		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
+				ras_block_str(block));
+		return -EINVAL;
 	}
 
-	return 0;
-}
-
-/* Trigger XGMI/WAFL error */
-static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
-				 struct ta_ras_trigger_error_input *block_info)
-{
-	int ret;
-
-	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
-		dev_warn(adev->dev, "Failed to disallow df cstate");
-
-	if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
-		dev_warn(adev->dev, "Failed to disallow XGMI power down");
-
-	ret = psp_ras_trigger_error(&adev->psp, block_info);
+	if (block_obj->hw_ops->reset_ras_error_count)
+		block_obj->hw_ops->reset_ras_error_count(adev);
 
-	if (amdgpu_ras_intr_triggered())
-		return ret;
-
-	if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
-		dev_warn(adev->dev, "Failed to allow XGMI power down");
-
-	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
-		dev_warn(adev->dev, "Failed to allow df cstate");
+	if ((block == AMDGPU_RAS_BLOCK__GFX) ||
+	    (block == AMDGPU_RAS_BLOCK__MMHUB)) {
+		if (block_obj->hw_ops->reset_ras_error_status)
+			block_obj->hw_ops->reset_ras_error_status(adev);
+	}
 
-	return ret;
+	return 0;
 }
 
 /* wrapper of psp_ras_trigger_error */
@@ -1116,11 +1067,20 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 		.address = info->address,
 		.value = info->value,
 	};
-	int ret = 0;
+	int ret = -EINVAL;
+	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev,
+							info->head.block,
+							info->head.sub_block_index);
 
 	if (!obj)
 		return -EINVAL;
 
+	if (!block_obj || !block_obj->hw_ops) {
+		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
+			     get_ras_block_str(&info->head));
+		return -EINVAL;
+	}
+
 	/* Calculate XGMI relative offset */
 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
 		block_info.address =
@@ -1128,28 +1088,15 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 				block_info.address);
 	}
 
-	switch (info->head.block) {
-	case AMDGPU_RAS_BLOCK__GFX:
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->ras_error_inject)
-			ret = adev->gfx.ras_funcs->ras_error_inject(adev, info);
-		else
-			ret = -EINVAL;
-		break;
-	case AMDGPU_RAS_BLOCK__UMC:
-	case AMDGPU_RAS_BLOCK__SDMA:
-	case AMDGPU_RAS_BLOCK__MMHUB:
-	case AMDGPU_RAS_BLOCK__PCIE_BIF:
-	case AMDGPU_RAS_BLOCK__MCA:
-		ret = psp_ras_trigger_error(&adev->psp, &block_info);
-		break;
-	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
-		ret = amdgpu_ras_error_inject_xgmi(adev, &block_info);
-		break;
-	default:
-		dev_info(adev->dev, "%s error injection is not supported yet\n",
-			 get_ras_block_str(&info->head));
-		ret = -EINVAL;
+	if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
+		if (block_obj->hw_ops->ras_error_inject)
+			ret = block_obj->hw_ops->ras_error_inject(adev, info);
+	} else {
+		/* If defined special ras_error_inject(e.g: xgmi), implement special ras_error_inject */
+		if (block_obj->hw_ops->ras_error_inject)
+			ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
+		else /*If not defined .ras_error_inject, use default ras_error_inject*/
+			ret = psp_ras_trigger_error(&adev->psp, &block_info);
 	}
 
 	if (ret)
@@ -1329,18 +1276,17 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
 }
 
 int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
-		struct ras_fs_if *head)
+		struct ras_common_if *head)
 {
-	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
+	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
 
 	if (!obj || obj->attr_inuse)
 		return -EINVAL;
 
 	get_obj(obj);
 
-	memcpy(obj->fs_data.sysfs_name,
-			head->sysfs_name,
-			sizeof(obj->fs_data.sysfs_name));
+	snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name),
+		"%s_err_count", head->name);
 
 	obj->sysfs_attr = (struct device_attribute){
 		.attr = {
@@ -1647,9 +1593,9 @@ int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
 }
 
 int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
-		struct ras_ih_if *info)
+		struct ras_common_if *head)
 {
-	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
+	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
 	struct ras_ih_data *data;
 
 	if (!obj)
@@ -1669,24 +1615,27 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
 }
 
 int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
-		struct ras_ih_if *info)
+		struct ras_common_if *head)
 {
-	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
+	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
 	struct ras_ih_data *data;
+	struct amdgpu_ras_block_object *ras_obj;
 
 	if (!obj) {
 		/* in case we registe the IH before enable ras feature */
-		obj = amdgpu_ras_create_obj(adev, &info->head);
+		obj = amdgpu_ras_create_obj(adev, head);
 		if (!obj)
 			return -EINVAL;
 	} else
 		get_obj(obj);
 
+	ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm);
+
 	data = &obj->ih_data;
 	/* add the callback.etc */
 	*data = (struct ras_ih_data) {
 		.inuse = 0,
-		.cb = info->cb,
+		.cb = ras_obj->ras_cb,
 		.element_size = sizeof(struct amdgpu_iv_entry),
 		.rptr = 0,
 		.wptr = 0,
@@ -1715,10 +1664,7 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
 	struct ras_manager *obj, *tmp;
 
 	list_for_each_entry_safe(obj, tmp, &con->head, node) {
-		struct ras_ih_if info = {
-			.head = obj->head,
-		};
-		amdgpu_ras_interrupt_remove_handler(adev, &info);
+		amdgpu_ras_interrupt_remove_handler(adev, &obj->head);
 	}
 
 	return 0;
@@ -1766,24 +1712,28 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
 static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
 		struct ras_query_if *info)
 {
+	struct amdgpu_ras_block_object *block_obj;
 	/*
 	 * Only two block need to query read/write
 	 * RspStatus at current state
 	 */
-	switch (info->head.block) {
-	case AMDGPU_RAS_BLOCK__GFX:
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->query_ras_error_status)
-			adev->gfx.ras_funcs->query_ras_error_status(adev);
-		break;
-	case AMDGPU_RAS_BLOCK__MMHUB:
-		if (adev->mmhub.ras_funcs &&
-		    adev->mmhub.ras_funcs->query_ras_error_status)
-			adev->mmhub.ras_funcs->query_ras_error_status(adev);
-		break;
-	default:
-		break;
+	if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) &&
+		(info->head.block != AMDGPU_RAS_BLOCK__MMHUB))
+		return;
+
+	block_obj = amdgpu_ras_get_ras_block(adev,
+					info->head.block,
+					info->head.sub_block_index);
+
+	if (!block_obj || !block_obj->hw_ops) {
+		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
+			     get_ras_block_str(&info->head));
+		return;
 	}
+
+	if (block_obj->hw_ops->query_ras_error_status)
+		block_obj->hw_ops->query_ras_error_status(adev);
+
 }
 
 static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
@@ -2118,6 +2068,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	mutex_init(&con->recovery_lock);
 	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
 	atomic_set(&con->in_recovery, 0);
+	con->eeprom_control.bad_channel_bitmap = 0;
 
 	max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
 	amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
@@ -2141,8 +2092,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 		if (ret)
 			goto free;
 
-		if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num)
-			adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs);
+		amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
+
+		if (con->update_channel_flag == true) {
+			amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
+			con->update_channel_flag = false;
+		}
 	}
 
 #ifdef CONFIG_X86_MCE_AMD
@@ -2336,6 +2291,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 		goto release_con;
 	}
 
+	con->update_channel_flag = false;
 	con->features = 0;
 	INIT_LIST_HEAD(&con->head);
 	/* Might need get this flag from vbios. */
@@ -2348,24 +2304,27 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	case CHIP_VEGA20:
 	case CHIP_ARCTURUS:
 	case CHIP_ALDEBARAN:
-		if (!adev->gmc.xgmi.connected_to_cpu)
-			adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs;
+		if (!adev->gmc.xgmi.connected_to_cpu) {
+			adev->nbio.ras = &nbio_v7_4_ras;
+			amdgpu_ras_register_ras_block(adev, &adev->nbio.ras->ras_block);
+			adev->nbio.ras_if = &adev->nbio.ras->ras_block.ras_comm;
+		}
 		break;
 	default:
 		/* nbio ras is not available */
 		break;
 	}
 
-	if (adev->nbio.ras_funcs &&
-	    adev->nbio.ras_funcs->init_ras_controller_interrupt) {
-		r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev);
+	if (adev->nbio.ras &&
+	    adev->nbio.ras->init_ras_controller_interrupt) {
+		r = adev->nbio.ras->init_ras_controller_interrupt(adev);
 		if (r)
 			goto release_con;
 	}
 
-	if (adev->nbio.ras_funcs &&
-	    adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) {
-		r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev);
+	if (adev->nbio.ras &&
+	    adev->nbio.ras->init_ras_err_event_athub_interrupt) {
+		r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev);
 		if (r)
 			goto release_con;
 	}
@@ -2377,12 +2336,12 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	} else if (adev->df.funcs &&
 	    adev->df.funcs->query_ras_poison_mode &&
-	    adev->umc.ras_funcs &&
-	    adev->umc.ras_funcs->query_ras_poison_mode) {
+	    adev->umc.ras &&
+	    adev->umc.ras->query_ras_poison_mode) {
 		df_poison =
 			adev->df.funcs->query_ras_poison_mode(adev);
 		umc_poison =
-			adev->umc.ras_funcs->query_ras_poison_mode(adev);
+			adev->umc.ras->query_ras_poison_mode(adev);
 		/* Only poison is set in both DF and UMC, we can support it */
 		if (df_poison && umc_poison)
 			con->poison_supported = true;
@@ -2445,11 +2404,10 @@ bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev)
 }
 
 /* helper function to handle common stuff in ip late init phase */
-int amdgpu_ras_late_init(struct amdgpu_device *adev,
-			 struct ras_common_if *ras_block,
-			 struct ras_fs_if *fs_info,
-			 struct ras_ih_if *ih_info)
+int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
+			 struct ras_common_if *ras_block)
 {
+	struct amdgpu_ras_block_object *ras_obj = NULL;
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	unsigned long ue_count, ce_count;
 	int r;
@@ -2477,15 +2435,16 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
 	if (adev->in_suspend || amdgpu_in_reset(adev))
 		return 0;
 
-	if (ih_info->cb) {
-		r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
+	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
+	if (ras_obj->ras_cb) {
+		r = amdgpu_ras_interrupt_add_handler(adev, ras_block);
 		if (r)
-			goto interrupt;
+			goto cleanup;
 	}
 
-	r = amdgpu_ras_sysfs_create(adev, fs_info);
+	r = amdgpu_ras_sysfs_create(adev, ras_block);
 	if (r)
-		goto sysfs;
+		goto interrupt;
 
 	/* Those are the cached values at init.
 	 */
@@ -2495,27 +2454,40 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
 	}
 
 	return 0;
-cleanup:
-	amdgpu_ras_sysfs_remove(adev, ras_block);
-sysfs:
-	if (ih_info->cb)
-		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
+
+interrupt:
+	if (ras_obj->ras_cb)
+		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
+cleanup:
 	amdgpu_ras_feature_enable(adev, ras_block, 0);
 	return r;
 }
 
+static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
+			 struct ras_common_if *ras_block)
+{
+	return amdgpu_ras_block_late_init(adev, ras_block);
+}
+
 /* helper function to remove ras fs node and interrupt handler */
-void amdgpu_ras_late_fini(struct amdgpu_device *adev,
-			  struct ras_common_if *ras_block,
-			  struct ras_ih_if *ih_info)
+void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
+			  struct ras_common_if *ras_block)
 {
-	if (!ras_block || !ih_info)
+	struct amdgpu_ras_block_object *ras_obj;
+	if (!ras_block)
 		return;
 
 	amdgpu_ras_sysfs_remove(adev, ras_block);
-	if (ih_info->cb)
-		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
+
+	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
+	if (ras_obj->ras_cb)
+		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
+}
+
+static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
+			 struct ras_common_if *ras_block)
+{
+	return amdgpu_ras_block_late_fini(adev, ras_block);
 }
 
 /* do some init work after IP late init as dependence.
@@ -2568,6 +2540,33 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev)
 	amdgpu_ras_disable_all_features(adev, 1);
 }
 
+int amdgpu_ras_late_init(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras_block_list *node, *tmp;
+	struct amdgpu_ras_block_object *obj;
+	int r;
+
+	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
+		if (!node->ras_obj) {
+			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
+			continue;
+		}
+
+		obj = node->ras_obj;
+		if (obj->ras_late_init) {
+			r = obj->ras_late_init(adev, &obj->ras_comm);
+			if (r) {
+				dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n",
+					obj->ras_comm.name, r);
+				return r;
+			}
+		} else
+			amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
+	}
+
+	return 0;
+}
+
 /* do some fini work before IP fini as dependence */
 int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
 {
@@ -2585,11 +2584,28 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
 
 int amdgpu_ras_fini(struct amdgpu_device *adev)
 {
+	struct amdgpu_ras_block_list *ras_node, *tmp;
+	struct amdgpu_ras_block_object *obj = NULL;
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
 	if (!adev->ras_enabled || !con)
 		return 0;
 
+	list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
+		if (ras_node->ras_obj) {
+			obj = ras_node->ras_obj;
+			if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
+			    obj->ras_fini)
+				obj->ras_fini(adev, &obj->ras_comm);
+			else
+				amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
+		}
+
+		/* Clear ras blocks from ras_list and free ras block list node */
+		list_del(&ras_node->node);
+		kfree(ras_node);
+	}
+
 	amdgpu_ras_fs_fini(adev);
 	amdgpu_ras_interrupt_remove_all(adev);
@@ -2717,8 +2733,6 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
 	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
 		 umc_inst, ch_inst);
 
-	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
-
 	/*
 	 * Translate UMC channel address to Physical address
	 */
@@ -2730,16 +2744,10 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
 			ADDR_OF_256B_BLOCK(channel_index) |
 			OFFSET_IN_256B_BLOCK(m->addr);
 
-	err_rec.address = m->addr;
-	err_rec.retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
-	err_rec.ts = (uint64_t)ktime_get_real_seconds();
-	err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
-	err_rec.cu = 0;
-	err_rec.mem_channel = channel_index;
-	err_rec.mcumc_id = umc_inst;
-
+	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
 	err_data.err_addr = &err_rec;
-	err_data.err_addr_cnt = 1;
+	amdgpu_umc_fill_error_record(&err_data, m->addr,
+			retired_page, channel_index, umc_inst);
 
 	if (amdgpu_bad_page_threshold != 0) {
 		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
@@ -2777,3 +2785,63 @@ static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
 	}
 }
 #endif
+
+struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev)
+{
+	if (!adev)
+		return NULL;
+
+	return adev->psp.ras_context.ras;
+}
+
+int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con)
+{
+	if (!adev)
+		return -EINVAL;
+
+	adev->psp.ras_context.ras = ras_con;
+	return 0;
+}
+
+/* check if ras is supported on block, say, sdma, gfx */
+int amdgpu_ras_is_supported(struct amdgpu_device *adev,
+		unsigned int block)
+{
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+	if (block >= AMDGPU_RAS_BLOCK_COUNT)
+		return 0;
+	return ras && (adev->ras_enabled & (1 << block));
+}
+
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
+		schedule_work(&ras->recovery_work);
+	return 0;
+}
+
+
+/* Register each ip ras block into amdgpu ras */
+int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
+		struct amdgpu_ras_block_object *ras_block_obj)
+{
+	struct amdgpu_ras_block_list *ras_node;
+	if (!adev || !ras_block_obj)
+		return -EINVAL;
+
+	if (!amdgpu_ras_asic_supported(adev))
+		return 0;
+
+	ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
+	if (!ras_node)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&ras_node->node);
+	ras_node->ras_obj = ras_block_obj;
+	list_add_tail(&ras_node->node, &adev->ras_list);
+
+	return 0;
+}
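The core of the amdgpu_ras.c change above is the replacement of per-IP switch statements with a registered list of RAS block objects that is walked at query, init and fini time. The short, self-contained C sketch below is an illustration only, not kernel code: the demo_* names and struct demo_ras_block_node are invented here, and a plain next pointer stands in for the kernel's struct list_head. It mirrors the registration and lookup pattern that amdgpu_ras_register_ras_block(), amdgpu_ras_block_match_default() and amdgpu_ras_get_ras_block() implement in the patch.

/* Standalone sketch of the list-based block registration/lookup pattern. */
#include <stdio.h>
#include <stdlib.h>

enum demo_ras_block { DEMO_BLOCK_UMC, DEMO_BLOCK_GFX, DEMO_BLOCK_NBIO, DEMO_BLOCK_LAST };

struct demo_ras_block_object {
	enum demo_ras_block block;
	const char *name;
	/* optional override, mirrors ->ras_block_match() in the patch */
	int (*match)(struct demo_ras_block_object *obj, enum demo_ras_block block);
};

struct demo_ras_block_node {
	struct demo_ras_block_object *obj;
	struct demo_ras_block_node *next;
};

static struct demo_ras_block_node *demo_ras_list;

/* mirrors amdgpu_ras_register_ras_block(): allocate a list node and queue it */
static int demo_register_block(struct demo_ras_block_object *obj)
{
	struct demo_ras_block_node *node = calloc(1, sizeof(*node));

	if (!node)
		return -1;
	node->obj = obj;
	node->next = demo_ras_list;
	demo_ras_list = node;
	return 0;
}

/* mirrors amdgpu_ras_block_match_default(): compare the block id only */
static int demo_match_default(struct demo_ras_block_object *obj, enum demo_ras_block block)
{
	return (obj && obj->block == block) ? 0 : -1;
}

/* mirrors amdgpu_ras_get_ras_block(): walk the list, preferring a custom match */
static struct demo_ras_block_object *demo_get_block(enum demo_ras_block block)
{
	struct demo_ras_block_node *node;

	if (block >= DEMO_BLOCK_LAST)
		return NULL;
	for (node = demo_ras_list; node; node = node->next) {
		struct demo_ras_block_object *obj = node->obj;

		if (obj->match ? !obj->match(obj, block) : !demo_match_default(obj, block))
			return obj;
	}
	return NULL;
}

int main(void)
{
	static struct demo_ras_block_object gfx = { .block = DEMO_BLOCK_GFX, .name = "gfx" };
	static struct demo_ras_block_object umc = { .block = DEMO_BLOCK_UMC, .name = "umc" };
	struct demo_ras_block_object *found;

	demo_register_block(&gfx);
	demo_register_block(&umc);
	found = demo_get_block(DEMO_BLOCK_GFX);
	printf("lookup gfx -> %s\n", found ? found->name : "(none)");
	return 0;
}

The kernel version additionally returns NULL for blocks rejected by amdgpu_ras_is_supported() and warns about list nodes with a NULL ras_obj, but the control flow is the same.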