Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--	drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c	548
1 file changed, 308 insertions(+), 240 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8f47c14ecbc7..424c22a841f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -75,6 +75,13 @@ const char *ras_mca_block_string[] = {
 	"mca_iohc",
 };
 
+struct amdgpu_ras_block_list {
+	/* ras block link */
+	struct list_head node;
+
+	struct amdgpu_ras_block_object *ras_obj;
+};
+
 const char *get_ras_block_str(struct ras_common_if *ras_block)
 {
 	if (!ras_block)
@@ -89,6 +96,9 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
 	return ras_block_string[ras_block->block];
 }
 
+#define ras_block_str(_BLOCK_) \
+	(((_BLOCK_) < ARRAY_SIZE(ras_block_string)) ? ras_block_string[_BLOCK_] : "Out Of Range")
+
 #define ras_err_str(i) (ras_error_string[ffs(i)])
 
 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
@@ -155,14 +165,9 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
 	}
 
 	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
-
-	err_rec.address = address;
-	err_rec.retired_page = address >> AMDGPU_GPU_PAGE_SHIFT;
-	err_rec.ts = (uint64_t)ktime_get_real_seconds();
-	err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
-
 	err_data.err_addr = &err_rec;
-	err_data.err_addr_cnt = 1;
+	amdgpu_umc_fill_error_record(&err_data, address,
+			(address >> AMDGPU_GPU_PAGE_SHIFT), 0, 0);
 
 	if (amdgpu_bad_page_threshold != 0) {
 		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
@@ -452,7 +457,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
 	}
 
 	if (ret)
-		return -EINVAL;
+		return ret;
 
 	return size;
 }
@@ -866,30 +871,47 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
 }
 /* feature ctl end */
 
+static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj,
+		enum amdgpu_ras_block block)
+{
+	if (!block_obj)
+		return -EINVAL;
-
-static void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
-					      struct ras_common_if *ras_block,
-					      struct ras_err_data  *err_data)
+	if (block_obj->ras_comm.block == block)
+		return 0;
+
+	return -EINVAL;
+}
+
+static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
+					enum amdgpu_ras_block block, uint32_t sub_block_index)
 {
-	switch (ras_block->sub_block_index) {
-	case AMDGPU_RAS_MCA_BLOCK__MP0:
-		if (adev->mca.mp0.ras_funcs &&
-		    adev->mca.mp0.ras_funcs->query_ras_error_count)
-			adev->mca.mp0.ras_funcs->query_ras_error_count(adev, &err_data);
-		break;
-	case AMDGPU_RAS_MCA_BLOCK__MP1:
-		if (adev->mca.mp1.ras_funcs &&
-		    adev->mca.mp1.ras_funcs->query_ras_error_count)
-			adev->mca.mp1.ras_funcs->query_ras_error_count(adev, &err_data);
-		break;
-	case AMDGPU_RAS_MCA_BLOCK__MPIO:
-		if (adev->mca.mpio.ras_funcs &&
-		    adev->mca.mpio.ras_funcs->query_ras_error_count)
-			adev->mca.mpio.ras_funcs->query_ras_error_count(adev, &err_data);
-		break;
-	default:
-		break;
+	struct amdgpu_ras_block_list *node, *tmp;
+	struct amdgpu_ras_block_object *obj;
+
+	if (block >= AMDGPU_RAS_BLOCK__LAST)
+		return NULL;
+
+	if (!amdgpu_ras_is_supported(adev, block))
+		return NULL;
+
+	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
+		if (!node->ras_obj) {
+			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
+			continue;
+		}
+
+		obj = node->ras_obj;
+		if (obj->ras_block_match) {
+			if (obj->ras_block_match(obj, block, sub_block_index) == 0)
+				return obj;
+		} else {
+			if (amdgpu_ras_block_match_default(obj, block) == 0)
+				return obj;
+		}
 	}
+
+	return NULL;
 }
 
 static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
@@ -901,26 +923,26 @@
 	 * choosing right query method according to
 	 * whether smu support query error information
 	 */
-	ret = smu_get_ecc_info(&adev->smu, (void *)&(ras->umc_ecc));
+	ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc));
 	if (ret == -EOPNOTSUPP) {
-		if (adev->umc.ras_funcs &&
-			adev->umc.ras_funcs->query_ras_error_count)
-			adev->umc.ras_funcs->query_ras_error_count(adev, err_data);
+		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
+			adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
+			adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data);
 
 		/* umc query_ras_error_address is also responsible for clearing
 		 * error status
 		 */
-		if (adev->umc.ras_funcs &&
-		    adev->umc.ras_funcs->query_ras_error_address)
-			adev->umc.ras_funcs->query_ras_error_address(adev, err_data);
+		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
+		    adev->umc.ras->ras_block.hw_ops->query_ras_error_address)
+			adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data);
 	} else if (!ret) {
-		if (adev->umc.ras_funcs &&
-			adev->umc.ras_funcs->ecc_info_query_ras_error_count)
-			adev->umc.ras_funcs->ecc_info_query_ras_error_count(adev, err_data);
+		if (adev->umc.ras &&
+			adev->umc.ras->ecc_info_query_ras_error_count)
+			adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data);
 
-		if (adev->umc.ras_funcs &&
-			adev->umc.ras_funcs->ecc_info_query_ras_error_address)
-			adev->umc.ras_funcs->ecc_info_query_ras_error_address(adev, err_data);
+		if (adev->umc.ras &&
+			adev->umc.ras->ecc_info_query_ras_error_address)
+			adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data);
 	}
 }
 
@@ -928,62 +950,32 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
 int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 				  struct ras_query_if *info)
 {
+	struct amdgpu_ras_block_object *block_obj = NULL;
 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
 	struct ras_err_data err_data = {0, 0, 0, NULL};
-	int i;
 
 	if (!obj)
 		return -EINVAL;
 
-	switch (info->head.block) {
-	case AMDGPU_RAS_BLOCK__UMC:
+	if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
 		amdgpu_ras_get_ecc_info(adev, &err_data);
-		break;
-	case AMDGPU_RAS_BLOCK__SDMA:
-		if (adev->sdma.funcs->query_ras_error_count) {
-			for (i = 0; i < adev->sdma.num_instances; i++)
-				adev->sdma.funcs->query_ras_error_count(adev, i,
-									&err_data);
+	} else {
+		block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
+		if (!block_obj || !block_obj->hw_ops)   {
+			dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
				     get_ras_block_str(&info->head));
+			return -EINVAL;
 		}
-		break;
-	case AMDGPU_RAS_BLOCK__GFX:
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->query_ras_error_count)
-			adev->gfx.ras_funcs->query_ras_error_count(adev, &err_data);
-
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->query_ras_error_status)
-			adev->gfx.ras_funcs->query_ras_error_status(adev);
-		break;
-	case AMDGPU_RAS_BLOCK__MMHUB:
-		if (adev->mmhub.ras_funcs &&
-		    adev->mmhub.ras_funcs->query_ras_error_count)
-			adev->mmhub.ras_funcs->query_ras_error_count(adev, &err_data);
-
-		if (adev->mmhub.ras_funcs &&
-		    adev->mmhub.ras_funcs->query_ras_error_status)
-			adev->mmhub.ras_funcs->query_ras_error_status(adev);
-		break;
-	case AMDGPU_RAS_BLOCK__PCIE_BIF:
-		if (adev->nbio.ras_funcs &&
-		    adev->nbio.ras_funcs->query_ras_error_count)
-			adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data);
-		break;
-	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
-		if (adev->gmc.xgmi.ras_funcs &&
-		    adev->gmc.xgmi.ras_funcs->query_ras_error_count)
-			adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data);
-		break;
-	case AMDGPU_RAS_BLOCK__HDP:
-		if (adev->hdp.ras_funcs &&
-		    adev->hdp.ras_funcs->query_ras_error_count)
-			adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data);
-		break;
-	case AMDGPU_RAS_BLOCK__MCA:
-		amdgpu_ras_mca_query_error_status(adev, &info->head, &err_data);
-		break;
-	default:
-		break;
+
+		if (block_obj->hw_ops->query_ras_error_count)
+			block_obj->hw_ops->query_ras_error_count(adev, &err_data);
+
+		if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
+		    (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
+		    (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
+				if (block_obj->hw_ops->query_ras_error_status)
+					block_obj->hw_ops->query_ras_error_status(adev);
+			}
 	}
 
 	obj->err_data.ue_count += err_data.ue_count;
@@ -1040,68 +1032,27 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
 		enum amdgpu_ras_block block)
 {
+	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
+
 	if (!amdgpu_ras_is_supported(adev, block))
 		return -EINVAL;
 
-	switch (block) {
-	case AMDGPU_RAS_BLOCK__GFX:
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->reset_ras_error_count)
-			adev->gfx.ras_funcs->reset_ras_error_count(adev);
-
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->reset_ras_error_status)
-			adev->gfx.ras_funcs->reset_ras_error_status(adev);
-		break;
-	case AMDGPU_RAS_BLOCK__MMHUB:
-		if (adev->mmhub.ras_funcs &&
-		    adev->mmhub.ras_funcs->reset_ras_error_count)
-			adev->mmhub.ras_funcs->reset_ras_error_count(adev);
-
-		if (adev->mmhub.ras_funcs &&
-		    adev->mmhub.ras_funcs->reset_ras_error_status)
-			adev->mmhub.ras_funcs->reset_ras_error_status(adev);
-		break;
-	case AMDGPU_RAS_BLOCK__SDMA:
-		if (adev->sdma.funcs->reset_ras_error_count)
-			adev->sdma.funcs->reset_ras_error_count(adev);
-		break;
-	case AMDGPU_RAS_BLOCK__HDP:
-		if (adev->hdp.ras_funcs &&
-		    adev->hdp.ras_funcs->reset_ras_error_count)
-			adev->hdp.ras_funcs->reset_ras_error_count(adev);
-		break;
-	default:
-		break;
+	if (!block_obj || !block_obj->hw_ops)   {
+		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
			     ras_block_str(block));
+		return -EINVAL;
 	}
 
-	return 0;
-}
-
-/* Trigger XGMI/WAFL error */
-static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
-				 struct ta_ras_trigger_error_input *block_info)
-{
-	int ret;
-
-	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
-		dev_warn(adev->dev, "Failed to disallow df cstate");
-
-	if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
-		dev_warn(adev->dev, "Failed to disallow XGMI power down");
-
-	ret = psp_ras_trigger_error(&adev->psp, block_info);
+	if (block_obj->hw_ops->reset_ras_error_count)
+		block_obj->hw_ops->reset_ras_error_count(adev);
 
-	if (amdgpu_ras_intr_triggered())
-		return ret;
-
-	if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
-		dev_warn(adev->dev, "Failed to allow XGMI power down");
-
-	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
-		dev_warn(adev->dev, "Failed to allow df cstate");
+	if ((block == AMDGPU_RAS_BLOCK__GFX) ||
+	    (block == AMDGPU_RAS_BLOCK__MMHUB)) {
+		if (block_obj->hw_ops->reset_ras_error_status)
+			block_obj->hw_ops->reset_ras_error_status(adev);
+	}
 
-	return ret;
+	return 0;
 }
 
 /* wrapper of psp_ras_trigger_error */
@@ -1116,11 +1067,20 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 		.address = info->address,
 		.value = info->value,
 	};
-	int ret = 0;
+	int ret = -EINVAL;
+	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev,
+							info->head.block,
+							info->head.sub_block_index);
 
 	if (!obj)
 		return -EINVAL;
 
+	if (!block_obj || !block_obj->hw_ops)	{
+		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
			     get_ras_block_str(&info->head));
+		return -EINVAL;
+	}
+
 	/* Calculate XGMI relative offset */
 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
 		block_info.address =
@@ -1128,28 +1088,15 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 							  block_info.address);
 	}
 
-	switch (info->head.block) {
-	case AMDGPU_RAS_BLOCK__GFX:
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->ras_error_inject)
-			ret = adev->gfx.ras_funcs->ras_error_inject(adev, info);
-		else
-			ret = -EINVAL;
-		break;
-	case AMDGPU_RAS_BLOCK__UMC:
-	case AMDGPU_RAS_BLOCK__SDMA:
-	case AMDGPU_RAS_BLOCK__MMHUB:
-	case AMDGPU_RAS_BLOCK__PCIE_BIF:
-	case AMDGPU_RAS_BLOCK__MCA:
-		ret = psp_ras_trigger_error(&adev->psp, &block_info);
-		break;
-	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
-		ret = amdgpu_ras_error_inject_xgmi(adev, &block_info);
-		break;
-	default:
-		dev_info(adev->dev, "%s error injection is not supported yet\n",
-			 get_ras_block_str(&info->head));
-		ret = -EINVAL;
+	if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
+		if (block_obj->hw_ops->ras_error_inject)
+			ret = block_obj->hw_ops->ras_error_inject(adev, info);
+	} else {
+		/* If defined special ras_error_inject(e.g: xgmi), implement special ras_error_inject */
+		if (block_obj->hw_ops->ras_error_inject)
+			ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
+		else  /*If not defined .ras_error_inject, use default ras_error_inject*/
+			ret = psp_ras_trigger_error(&adev->psp, &block_info);
 	}
 
 	if (ret)
@@ -1329,18 +1276,17 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
 }
 
 int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
-		struct ras_fs_if *head)
+		struct ras_common_if *head)
 {
-	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
+	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
 
 	if (!obj || obj->attr_inuse)
 		return -EINVAL;
 
 	get_obj(obj);
 
-	memcpy(obj->fs_data.sysfs_name,
-			head->sysfs_name,
-			sizeof(obj->fs_data.sysfs_name));
+	snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name),
+		"%s_err_count", head->name);
 
 	obj->sysfs_attr = (struct device_attribute){
 		.attr = {
@@ -1647,9 +1593,9 @@ int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
 }
 
 int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
-		struct ras_ih_if *info)
+		struct ras_common_if *head)
 {
-	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
+	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
 	struct ras_ih_data *data;
 
 	if (!obj)
@@ -1669,24 +1615,27 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
 }
 
 int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
-		struct ras_ih_if *info)
+		struct ras_common_if *head)
 {
-	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
+	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
 	struct ras_ih_data *data;
+	struct amdgpu_ras_block_object *ras_obj;
 
 	if (!obj) {
 		/* in case we registe the IH before enable ras feature */
-		obj = amdgpu_ras_create_obj(adev, &info->head);
+		obj = amdgpu_ras_create_obj(adev, head);
 		if (!obj)
 			return -EINVAL;
 	} else
 		get_obj(obj);
 
+	ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm);
+
 	data = &obj->ih_data;
 	/* add the callback.etc */
 	*data = (struct ras_ih_data) {
 		.inuse = 0,
-		.cb = info->cb,
+		.cb = ras_obj->ras_cb,
 		.element_size = sizeof(struct amdgpu_iv_entry),
 		.rptr = 0,
 		.wptr = 0,
@@ -1715,10 +1664,7 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
 	struct ras_manager *obj, *tmp;
 
 	list_for_each_entry_safe(obj, tmp, &con->head, node) {
-		struct ras_ih_if info = {
-			.head = obj->head,
-		};
-		amdgpu_ras_interrupt_remove_handler(adev, &info);
+		amdgpu_ras_interrupt_remove_handler(adev, &obj->head);
 	}
 
 	return 0;
@@ -1766,24 +1712,28 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
 static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
 					  struct ras_query_if *info)
 {
+	struct amdgpu_ras_block_object *block_obj;
 	/*
 	 * Only two block need to query read/write
 	 * RspStatus at current state
 	 */
-	switch (info->head.block) {
-	case AMDGPU_RAS_BLOCK__GFX:
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->query_ras_error_status)
-			adev->gfx.ras_funcs->query_ras_error_status(adev);
-		break;
-	case AMDGPU_RAS_BLOCK__MMHUB:
-		if (adev->mmhub.ras_funcs &&
-		    adev->mmhub.ras_funcs->query_ras_error_status)
-			adev->mmhub.ras_funcs->query_ras_error_status(adev);
-		break;
-	default:
-		break;
+	if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) &&
+		(info->head.block != AMDGPU_RAS_BLOCK__MMHUB))
+		return;
+
+	block_obj = amdgpu_ras_get_ras_block(adev,
+					info->head.block,
+					info->head.sub_block_index);
+
+	if (!block_obj || !block_obj->hw_ops) {
+		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
			     get_ras_block_str(&info->head));
+		return;
 	}
+
+	if (block_obj->hw_ops->query_ras_error_status)
+		block_obj->hw_ops->query_ras_error_status(adev);
+
 }
 
 static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
@@ -2118,6 +2068,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	mutex_init(&con->recovery_lock);
 	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
 	atomic_set(&con->in_recovery, 0);
+	con->eeprom_control.bad_channel_bitmap = 0;
 
 	max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
 	amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
@@ -2141,8 +2092,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 		if (ret)
 			goto free;
 
-		if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num)
-			adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs);
+		amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
+
+		if (con->update_channel_flag == true) {
+			amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
+			con->update_channel_flag = false;
+		}
 	}
 
 #ifdef CONFIG_X86_MCE_AMD
@@ -2336,6 +2291,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 		goto release_con;
 	}
 
+	con->update_channel_flag = false;
 	con->features = 0;
 	INIT_LIST_HEAD(&con->head);
 	/* Might need get this flag from vbios. */
@@ -2348,24 +2304,27 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	case CHIP_VEGA20:
 	case CHIP_ARCTURUS:
 	case CHIP_ALDEBARAN:
-		if (!adev->gmc.xgmi.connected_to_cpu)
-			adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs;
+		if (!adev->gmc.xgmi.connected_to_cpu) {
+			adev->nbio.ras = &nbio_v7_4_ras;
+			amdgpu_ras_register_ras_block(adev, &adev->nbio.ras->ras_block);
+			adev->nbio.ras_if = &adev->nbio.ras->ras_block.ras_comm;
+		}
 		break;
 	default:
 		/* nbio ras is not available */
 		break;
 	}
 
-	if (adev->nbio.ras_funcs &&
-	    adev->nbio.ras_funcs->init_ras_controller_interrupt) {
-		r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev);
+	if (adev->nbio.ras &&
+	    adev->nbio.ras->init_ras_controller_interrupt) {
+		r = adev->nbio.ras->init_ras_controller_interrupt(adev);
 		if (r)
 			goto release_con;
 	}
 
-	if (adev->nbio.ras_funcs &&
-	    adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) {
-		r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev);
+	if (adev->nbio.ras &&
+	    adev->nbio.ras->init_ras_err_event_athub_interrupt) {
+		r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev);
 		if (r)
 			goto release_con;
 	}
@@ -2377,12 +2336,12 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	}
 	else if (adev->df.funcs &&
 	    adev->df.funcs->query_ras_poison_mode &&
-	    adev->umc.ras_funcs &&
-	    adev->umc.ras_funcs->query_ras_poison_mode) {
+	    adev->umc.ras &&
+	    adev->umc.ras->query_ras_poison_mode) {
 		df_poison =
 			adev->df.funcs->query_ras_poison_mode(adev);
 		umc_poison =
-			adev->umc.ras_funcs->query_ras_poison_mode(adev);
+			adev->umc.ras->query_ras_poison_mode(adev);
 		/* Only poison is set in both DF and UMC, we can support it */
 		if (df_poison && umc_poison)
 			con->poison_supported = true;
@@ -2445,11 +2404,10 @@ bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev)
 }
 
 /* helper function to handle common stuff in ip late init phase */
-int amdgpu_ras_late_init(struct amdgpu_device *adev,
-			 struct ras_common_if *ras_block,
-			 struct ras_fs_if *fs_info,
-			 struct ras_ih_if *ih_info)
+int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
			 struct ras_common_if *ras_block)
 {
+	struct amdgpu_ras_block_object *ras_obj = NULL;
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	unsigned long ue_count, ce_count;
 	int r;
@@ -2477,15 +2435,16 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
 	if (adev->in_suspend || amdgpu_in_reset(adev))
 		return 0;
 
-	if (ih_info->cb) {
-		r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
+	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
+	if (ras_obj->ras_cb) {
+		r = amdgpu_ras_interrupt_add_handler(adev, ras_block);
 		if (r)
-			goto interrupt;
+			goto cleanup;
 	}
 
-	r = amdgpu_ras_sysfs_create(adev, fs_info);
+	r = amdgpu_ras_sysfs_create(adev, ras_block);
 	if (r)
-		goto sysfs;
+		goto interrupt;
 
 	/* Those are the cached values at init.
 	 */
@@ -2495,27 +2454,40 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
 	}
 
 	return 0;
-cleanup:
-	amdgpu_ras_sysfs_remove(adev, ras_block);
-sysfs:
-	if (ih_info->cb)
-		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
+
 interrupt:
+	if (ras_obj->ras_cb)
+		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
+cleanup:
 	amdgpu_ras_feature_enable(adev, ras_block, 0);
 	return r;
 }
 
+static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
			 struct ras_common_if *ras_block)
+{
+	return amdgpu_ras_block_late_init(adev, ras_block);
+}
+
 /* helper function to remove ras fs node and interrupt handler */
-void amdgpu_ras_late_fini(struct amdgpu_device *adev,
-			  struct ras_common_if *ras_block,
-			  struct ras_ih_if *ih_info)
+void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
			  struct ras_common_if *ras_block)
 {
-	if (!ras_block || !ih_info)
+	struct amdgpu_ras_block_object *ras_obj;
+	if (!ras_block)
 		return;
 
 	amdgpu_ras_sysfs_remove(adev, ras_block);
-	if (ih_info->cb)
-		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
+
+	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
+	if (ras_obj->ras_cb)
+		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
+}
+
+static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
			  struct ras_common_if *ras_block)
+{
+	return amdgpu_ras_block_late_fini(adev, ras_block);
 }
 
 /* do some init work after IP late init as dependence.
@@ -2568,6 +2540,33 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev)
 		amdgpu_ras_disable_all_features(adev, 1);
 }
 
+int amdgpu_ras_late_init(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras_block_list *node, *tmp;
+	struct amdgpu_ras_block_object *obj;
+	int r;
+
+	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
+		if (!node->ras_obj) {
+			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
+			continue;
+		}
+
+		obj = node->ras_obj;
+		if (obj->ras_late_init) {
+			r = obj->ras_late_init(adev, &obj->ras_comm);
+			if (r) {
+				dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n",
+					obj->ras_comm.name, r);
+				return r;
+			}
+		} else
+			amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
+	}
+
+	return 0;
+}
+
 /* do some fini work before IP fini as dependence */
 int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
 {
@@ -2585,11 +2584,28 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
 
 int amdgpu_ras_fini(struct amdgpu_device *adev)
 {
+	struct amdgpu_ras_block_list *ras_node, *tmp;
+	struct amdgpu_ras_block_object *obj = NULL;
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
 	if (!adev->ras_enabled || !con)
 		return 0;
 
+	list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
+		if (ras_node->ras_obj) {
+			obj = ras_node->ras_obj;
+			if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
+			    obj->ras_fini)
+				obj->ras_fini(adev, &obj->ras_comm);
+			else
+				amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
+		}
+
+		/* Clear ras blocks from ras_list and free ras block list node */
+		list_del(&ras_node->node);
+		kfree(ras_node);
+	}
+
 	amdgpu_ras_fs_fini(adev);
 	amdgpu_ras_interrupt_remove_all(adev);
 
@@ -2717,8 +2733,6 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
 	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
			     umc_inst, ch_inst);
 
-	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
-
 	/*
 	 * Translate UMC channel address to Physical address
 	 */
@@ -2730,16 +2744,10 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
 			ADDR_OF_256B_BLOCK(channel_index) |
 			OFFSET_IN_256B_BLOCK(m->addr);
 
-	err_rec.address = m->addr;
-	err_rec.retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
-	err_rec.ts = (uint64_t)ktime_get_real_seconds();
-	err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
-	err_rec.cu = 0;
-	err_rec.mem_channel = channel_index;
-	err_rec.mcumc_id = umc_inst;
-
+	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
 	err_data.err_addr = &err_rec;
-	err_data.err_addr_cnt = 1;
+	amdgpu_umc_fill_error_record(&err_data, m->addr,
			retired_page, channel_index, umc_inst);
 
 	if (amdgpu_bad_page_threshold != 0) {
 		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
@@ -2777,3 +2785,63 @@ static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
 	}
 }
 #endif
+
+struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev)
+{
+	if (!adev)
+		return NULL;
+
+	return adev->psp.ras_context.ras;
+}
+
+int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con)
+{
+	if (!adev)
+		return -EINVAL;
+
+	adev->psp.ras_context.ras = ras_con;
+	return 0;
+}
+
+/* check if ras is supported on block, say, sdma, gfx */
+int amdgpu_ras_is_supported(struct amdgpu_device *adev,
+		unsigned int block)
+{
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+	if (block >= AMDGPU_RAS_BLOCK_COUNT)
+		return 0;
+	return ras && (adev->ras_enabled & (1 << block));
+}
+
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
+		schedule_work(&ras->recovery_work);
+	return 0;
+}
+
+
+/* Register each ip ras block into amdgpu ras */
+int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
+		struct amdgpu_ras_block_object *ras_block_obj)
+{
+	struct amdgpu_ras_block_list *ras_node;
+	if (!adev || !ras_block_obj)
+		return -EINVAL;
+
+	if (!amdgpu_ras_asic_supported(adev))
+		return 0;
+
+	ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
+	if (!ras_node)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&ras_node->node);
+	ras_node->ras_obj = ras_block_obj;
+	list_add_tail(&ras_node->node, &adev->ras_list);
+
+	return 0;
+}
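
For reference, a minimal sketch of how an IP block could hook into the list-based registration added by this patch. The instance name "foo_ras", its block choice, and foo_ras_hw_ops are illustrative placeholders and are not part of the diff; only the field names and helper functions come from the patch itself.

/*
 * Illustrative sketch only: a hypothetical IP block describing itself with
 * the amdgpu_ras_block_object layout used above and registering on the
 * device's ras_list.  "foo", AMDGPU_RAS_BLOCK__SDMA and foo_ras_hw_ops are
 * placeholders assumed to be defined elsewhere.
 */
static struct amdgpu_ras_block_object foo_ras = {
	.ras_comm = {
		.name = "foo",
		.block = AMDGPU_RAS_BLOCK__SDMA,	/* placeholder block */
	},
	.hw_ops = &foo_ras_hw_ops,	/* query/reset/inject callbacks */
	/* .ras_late_init / .ras_fini left NULL: the new defaults are used */
};

static int foo_ras_sw_init(struct amdgpu_device *adev)
{
	/* add the block to adev->ras_list during the IP block's init */
	return amdgpu_ras_register_ras_block(adev, &foo_ras);
}

With registration in place, the common helpers (amdgpu_ras_query_error_status(), amdgpu_ras_reset_error_status(), amdgpu_ras_late_init(), amdgpu_ras_fini()) look the block up on adev->ras_list via amdgpu_ras_get_ras_block() and dispatch through hw_ops, which is what allows this patch to drop the per-IP switch statements.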
