diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 87 | 
1 files changed, 73 insertions, 14 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index aad3c8b4c810..f76c19fc0392 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -22,6 +22,59 @@
  */
 
 #include "amdgpu.h"
+#include "umc_v6_7.h"
+
+static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
+				    struct ras_err_data *err_data, uint64_t err_addr,
+				    uint32_t ch_inst, uint32_t umc_inst)
+{
+	switch (adev->ip_versions[UMC_HWIP][0]) {
+	case IP_VERSION(6, 7, 0):
+		umc_v6_7_convert_error_address(adev,
+				err_data, err_addr, ch_inst, umc_inst);
+		break;
+	default:
+		dev_warn(adev->dev,
+			 "UMC address to Physical address translation is not supported\n");
+		return AMDGPU_RAS_FAIL;
+	}
+
+	return AMDGPU_RAS_SUCCESS;
+}
+
+int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
+			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
+{
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	int ret = AMDGPU_RAS_FAIL;
+
+	err_data.err_addr =
+		kcalloc(adev->umc.max_ras_err_cnt_per_query,
+			sizeof(struct eeprom_table_record), GFP_KERNEL);
+	if (!err_data.err_addr) {
+		dev_warn(adev->dev,
+			"Failed to alloc memory for umc error record in MCA notifier!\n");
+		return AMDGPU_RAS_FAIL;
+	}
+
+	/*
+	 * Translate UMC channel address to Physical address
+	 */
+	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
+					ch_inst, umc_inst);
+	if (ret)
+		goto out;
+
+	if (amdgpu_bad_page_threshold != 0) {
+		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+						err_data.err_addr_cnt);
+		amdgpu_ras_save_bad_pages(adev);
+	}
+
+out:
+	kfree(err_data.err_addr);
+	return ret;
+}
 
 static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 		void *ras_error_status,
@@ -112,23 +165,29 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 	return AMDGPU_RAS_SUCCESS;
 }
 
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-		void *ras_error_status,
-		bool reset)
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
 {
-	int ret;
-	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
-	struct ras_common_if head = {
-		.block = AMDGPU_RAS_BLOCK__UMC,
-	};
-	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+	int ret = AMDGPU_RAS_SUCCESS;
 
-	ret =
-		amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
+	if (!adev->gmc.xgmi.connected_to_cpu) {
+		struct ras_err_data err_data = {0, 0, 0, NULL};
+		struct ras_common_if head = {
+			.block = AMDGPU_RAS_BLOCK__UMC,
+		};
+		struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
 
-	if (ret == AMDGPU_RAS_SUCCESS && obj) {
-		obj->err_data.ue_count += err_data->ue_count;
-		obj->err_data.ce_count += err_data->ce_count;
+		ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
+
+		if (ret == AMDGPU_RAS_SUCCESS && obj) {
+			obj->err_data.ue_count += err_data.ue_count;
+			obj->err_data.ce_count += err_data.ce_count;
+		}
+	} else if (reset) {
+		/* MCA poison handler is only responsible for GPU reset,
+		 * let MCA notifier do page retirement.
+		 */
+		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+		amdgpu_ras_reset_gpu(adev);
 	}
 
 	return ret;
