diff options
| author | John Clements <john.clements@amd.com> | 2021-08-26 13:55:52 +0800 | 
|---|---|---|
| committer | Alex Deucher <alexander.deucher@amd.com> | 2021-08-26 13:56:14 -0400 | 
| commit | 3c4ff2dcc0dffbfa79f7f55237f502a74ed018b7 (patch) | |
| tree | 18647c5db76cb193e584537c35683f7252ef03b8 /drivers/gpu/drm/amd | |
| parent | 1ec06c2dee679e9f089e78ed20cb74ee90155f61 (diff) | |
drm/amdgpu: Add support for RAS XGMI err query
Update XGMI RAS to support error count query and reset on Aldebaran, covering the XGMI23, XGMI3X16 and WAFL PCS error status registers.
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 65 | 
1 file changed, 65 insertions, 0 deletions
| diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index e2a8dfea003c..978ac927ac11 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -32,6 +32,10 @@  #include "wafl/wafl2_4_0_0_smn.h"  #include "wafl/wafl2_4_0_0_sh_mask.h" +#define smnPCS_XGMI23_PCS_ERROR_STATUS   0x11a01210 +#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c +#define smnPCS_GOPX1_PCS_ERROR_STATUS    0x12200210 +  static DEFINE_MUTEX(xgmi_mutex);  #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE		4 @@ -63,6 +67,33 @@ static const int wafl_pcs_err_status_reg_arct[] = {  	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,  }; +static const int xgmi23_pcs_err_status_reg_aldebaran[] = { +	smnPCS_XGMI23_PCS_ERROR_STATUS, +	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x100000, +	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x200000, +	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x300000, +	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x400000, +	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x500000, +	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x600000, +	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x700000 +}; + +static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = { +	smnPCS_XGMI3X16_PCS_ERROR_STATUS, +	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000, +	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000, +	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000, +	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000, +	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000, +	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000, +	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000 +}; + +static const int walf_pcs_err_status_reg_aldebaran[] = { +	smnPCS_GOPX1_PCS_ERROR_STATUS, +	smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000 +}; +  static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {  	{"XGMI PCS DataLossErr",  	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)}, @@ -771,6 +802,17 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)  			pcs_clear_status(adev,  					 
xgmi_pcs_err_status_reg_vg20[i]);  		break; +	case CHIP_ALDEBARAN: +		for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) +			pcs_clear_status(adev, +					 xgmi23_pcs_err_status_reg_aldebaran[i]); +		for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) +			pcs_clear_status(adev, +					 xgmi23_pcs_err_status_reg_aldebaran[i]); +		for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) +			pcs_clear_status(adev, +					 walf_pcs_err_status_reg_aldebaran[i]); +		break;  	default:  		break;  	} @@ -863,6 +905,29 @@ static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,  						data, &ue_cnt, &ce_cnt, false);  		}  		break; +	case CHIP_ALDEBARAN: +		/* check xgmi23 pcs error */ +		for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) { +			data = RREG32_PCIE(xgmi23_pcs_err_status_reg_aldebaran[i]); +			if (data) +				amdgpu_xgmi_query_pcs_error_status(adev, +						data, &ue_cnt, &ce_cnt, true); +		} +		/* check xgmi3x16 pcs error */ +		for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) { +			data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]); +			if (data) +				amdgpu_xgmi_query_pcs_error_status(adev, +						data, &ue_cnt, &ce_cnt, true); +		} +		/* check wafl pcs error */ +		for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) { +			data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]); +			if (data) +				amdgpu_xgmi_query_pcs_error_status(adev, +						data, &ue_cnt, &ce_cnt, false); +		} +		break;  	default:  		dev_warn(adev->dev, "XGMI RAS error query not supported");  		break; | 
