Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   345
1 file changed, 308 insertions, 37 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 4c9fa24dd972..f0924aa3f4e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -36,6 +36,7 @@
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include "nbio_v4_3.h"
+#include "nbif_v6_3_1.h"
 #include "nbio_v7_9.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
@@ -192,7 +193,7 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
 
 	if (amdgpu_bad_page_threshold != 0) {
 		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
-					 err_data.err_addr_cnt);
+					 err_data.err_addr_cnt, false);
 		amdgpu_ras_save_bad_pages(adev, NULL);
 	}
 
@@ -2015,6 +2016,7 @@ static bool amdgpu_ras_aca_is_supported(struct amdgpu_device *adev)
 
 	switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
 	case IP_VERSION(13, 0, 6):
+	case IP_VERSION(13, 0, 12):
 	case IP_VERSION(13, 0, 14):
 		ret = true;
 		break;
@@ -2156,6 +2158,16 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
 	/* Fatal error events are handled on host side */
 	if (amdgpu_sriov_vf(adev))
 		return;
+	/**
+	 * If the current interrupt is caused by a non-fatal RAS error, skip
+	 * check for fatal error. For fatal errors, FED status of all devices
+	 * in XGMI hive gets set when the first device gets fatal error
+	 * interrupt. The error gets propagated to other devices as well, so
+	 * make sure to ack the interrupt regardless of FED status.
+	 */
+	if (!amdgpu_ras_get_fed_status(adev) &&
+	    amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY))
+		return;
 
 	if (adev->nbio.ras &&
 	    adev->nbio.ras->handle_ras_controller_intr_no_bifring)
@@ -2185,6 +2197,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
 	if (ret)
 		return;
 
+	amdgpu_ras_set_err_poison(adev, block_obj->ras_comm.block);
 	/* both query_poison_status and handle_poison_consumption are optional,
 	 * but at least one of them should be implemented if we need poison
 	 * consumption handler
@@ -2717,40 +2730,203 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
 	return 0;
 }
 
+static int amdgpu_ras_mca2pa_by_idx(struct amdgpu_device *adev,
+			struct eeprom_table_record *bps,
+			struct ras_err_data *err_data)
+{
+	struct ta_ras_query_address_input addr_in;
+	uint32_t socket = 0;
+	int ret = 0;
+
+	if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
+		socket = adev->smuio.funcs->get_socket_id(adev);
+
+	/* reinit err_data */
+	err_data->err_addr_cnt = 0;
+	err_data->err_addr_len = adev->umc.retire_unit;
+
+	memset(&addr_in, 0, sizeof(addr_in));
+	addr_in.ma.err_addr = bps->address;
+	addr_in.ma.socket_id = socket;
+	addr_in.ma.ch_inst = bps->mem_channel;
+	/* tell RAS TA the node instance is not used */
+	addr_in.ma.node_inst = TA_RAS_INV_NODE;
+
+	if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
+		ret = adev->umc.ras->convert_ras_err_addr(adev, err_data,
+				&addr_in, NULL, false);
+
+	return ret;
+}
+
+static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
+			struct eeprom_table_record *bps,
+			struct ras_err_data *err_data)
+{
+	struct ta_ras_query_address_input addr_in;
+	uint32_t die_id, socket = 0;
+
+	if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
+		socket = adev->smuio.funcs->get_socket_id(adev);
+
+	/* although die id is gotten from PA in nps1 mode, the id is
+	 * fitable for any nps mode
+	 */
+	if (adev->umc.ras && adev->umc.ras->get_die_id_from_pa)
+		die_id = adev->umc.ras->get_die_id_from_pa(adev, bps->address,
+					bps->retired_page << AMDGPU_GPU_PAGE_SHIFT);
+	else
+		return -EINVAL;
+
+	/* reinit err_data */
+	err_data->err_addr_cnt = 0;
+	err_data->err_addr_len = adev->umc.retire_unit;
+
+	memset(&addr_in, 0, sizeof(addr_in));
+	addr_in.ma.err_addr = bps->address;
+	addr_in.ma.ch_inst = bps->mem_channel;
+	addr_in.ma.umc_inst = bps->mcumc_id;
+	addr_in.ma.node_inst = die_id;
+	addr_in.ma.socket_id = socket;
+
+	if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
+		return adev->umc.ras->convert_ras_err_addr(adev, err_data,
+					&addr_in, NULL, false);
+	else
+		return  -EINVAL;
+}
+
 /* it deal with vram only. */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
-		struct eeprom_table_record *bps, int pages)
+		struct eeprom_table_record *bps, int pages, bool from_rom)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data;
+	struct ras_err_data err_data;
+	struct eeprom_table_record *err_rec;
+	struct amdgpu_ras_eeprom_control *control =
+			&adev->psp.ras_context.ras->eeprom_control;
+	enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
 	int ret = 0;
-	uint32_t i;
+	uint32_t i, j, loop_cnt = 1;
+	bool find_pages_per_pa = false;
 
 	if (!con || !con->eh_data || !bps || pages <= 0)
 		return 0;
 
+	if (from_rom) {
+		err_data.err_addr =
+			kcalloc(adev->umc.retire_unit,
+				sizeof(struct eeprom_table_record), GFP_KERNEL);
+		if (!err_data.err_addr) {
+			dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		err_rec = err_data.err_addr;
+		loop_cnt = adev->umc.retire_unit;
+		if (adev->gmc.gmc_funcs->query_mem_partition_mode)
+			nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
+	}
+
 	mutex_lock(&con->recovery_lock);
 	data = con->eh_data;
-	if (!data)
-		goto out;
+	if (!data) {
+		/* Returning 0 as the absence of eh_data is acceptable */
+		goto free;
+	}
 
 	for (i = 0; i < pages; i++) {
-		if (amdgpu_ras_check_bad_page_unlock(con,
-			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
-			continue;
+		if (from_rom &&
+		    control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA) {
+			if (!find_pages_per_pa) {
+				if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
+					if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
+						/* may use old RAS TA, use PA to find pages in
+						 * one row
+						 */
+						if (amdgpu_umc_pages_in_a_row(adev, &err_data,
+									      bps[i].retired_page <<
+									      AMDGPU_GPU_PAGE_SHIFT)) {
+							ret = -EINVAL;
+							goto free;
+						} else {
+							find_pages_per_pa = true;
+						}
+					} else {
+						/* unsupported cases */
+						ret = -EOPNOTSUPP;
+						goto free;
+					}
+				}
+			} else {
+				if (amdgpu_umc_pages_in_a_row(adev, &err_data,
+						bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) {
+					ret = -EINVAL;
+					goto free;
+				}
+			}
+		} else {
+			if (from_rom && !find_pages_per_pa) {
+				if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) {
+					/* bad page in any NPS mode in eeprom */
+					if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
+						ret = -EINVAL;
+						goto free;
+					}
+				} else {
+					/* legacy bad page in eeprom, generated only in
+					 * NPS1 mode
+					 */
+					if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) {
+						/* old RAS TA or ASICs which don't support to
+						 * convert addrss via mca address
+						 */
+						if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
+							find_pages_per_pa = true;
+							err_rec = &bps[i];
+							loop_cnt = 1;
+						} else {
+							/* non-nps1 mode, old RAS TA
+							 * can't support it
+							 */
+							ret = -EOPNOTSUPP;
+							goto free;
+						}
+					}
+				}
 
-		if (!data->space_left &&
-			amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
-			ret = -ENOMEM;
-			goto out;
+				if (!find_pages_per_pa)
+					i += (adev->umc.retire_unit - 1);
+			} else {
+				err_rec = &bps[i];
+			}
 		}
 
-		amdgpu_ras_reserve_page(adev, bps[i].retired_page);
+		for (j = 0; j < loop_cnt; j++) {
+			if (amdgpu_ras_check_bad_page_unlock(con,
+				err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+				continue;
+
+			if (!data->space_left &&
+			    amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
+				ret = -ENOMEM;
+				goto free;
+			}
 
-		memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
-		data->count++;
-		data->space_left--;
+			amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);
+
+			memcpy(&data->bps[data->count], &(err_rec[j]),
+					sizeof(struct eeprom_table_record));
+			data->count++;
+			data->space_left--;
+		}
 	}
+
+free:
+	if (from_rom)
+		kfree(err_data.err_addr);
 out:
 	mutex_unlock(&con->recovery_lock);
 
@@ -2768,7 +2944,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data;
 	struct amdgpu_ras_eeprom_control *control;
-	int save_count;
+	int save_count, unit_num, bad_page_num, i;
 
 	if (!con || !con->eh_data) {
 		if (new_cnt)
@@ -2780,19 +2956,32 @@
 	mutex_lock(&con->recovery_lock);
 	control = &con->eeprom_control;
 	data = con->eh_data;
-	save_count = data->count - control->ras_num_recs;
+	bad_page_num = control->ras_num_bad_pages;
+	save_count = data->count - bad_page_num;
 	mutex_unlock(&con->recovery_lock);
 
+	unit_num = save_count / adev->umc.retire_unit;
 	if (new_cnt)
-		*new_cnt = save_count / adev->umc.retire_unit;
+		*new_cnt = unit_num;
 
 	/* only new entries are saved */
 	if (save_count > 0) {
-		if (amdgpu_ras_eeprom_append(control,
-					     &data->bps[control->ras_num_recs],
-					     save_count)) {
-			dev_err(adev->dev, "Failed to save EEPROM table data!");
-			return -EIO;
+		if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA) {
+			if (amdgpu_ras_eeprom_append(control,
+						     &data->bps[control->ras_num_recs],
+						     save_count)) {
+				dev_err(adev->dev, "Failed to save EEPROM table data!");
+				return -EIO;
+			}
+		} else {
+			for (i = 0; i < unit_num; i++) {
+				if (amdgpu_ras_eeprom_append(control,
+						&data->bps[bad_page_num + i * adev->umc.retire_unit],
+						1)) {
+					dev_err(adev->dev, "Failed to save EEPROM table data!");
+					return -EIO;
+				}
+			}
 		}
 
 		dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
@@ -2821,11 +3010,32 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
 		return -ENOMEM;
 
 	ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs);
-	if (ret)
+	if (ret) {
 		dev_err(adev->dev, "Failed to load EEPROM table records!");
-	else
-		ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs);
+	} else {
+		if (control->ras_num_recs > 1 &&
+		    adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
+			if ((bps[0].address == bps[1].address) &&
+			    (bps[0].mem_channel == bps[1].mem_channel))
+				control->rec_type = AMDGPU_RAS_EEPROM_REC_PA;
+			else
+				control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA;
+		}
+
+		ret = amdgpu_ras_eeprom_check(control);
+		if (ret)
+			goto out;
+
+		/* HW not usable */
+		if (amdgpu_ras_is_rma(adev)) {
+			ret = -EHWPOISON;
+			goto out;
+		}
+		ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true);
+	}
 
+out:
 	kfree(bps);
 	return ret;
 }
@@ -3205,31 +3415,36 @@ static int amdgpu_ras_page_retirement_thread(void *param)
 int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct amdgpu_ras_eeprom_control *control;
 	int ret;
 
 	if (!con || amdgpu_sriov_vf(adev))
 		return 0;
 
-	ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
-
+	control = &con->eeprom_control;
+	ret = amdgpu_ras_eeprom_init(control);
 	if (ret)
 		return ret;
 
-	/* HW not usable */
-	if (amdgpu_ras_is_rma(adev))
-		return -EHWPOISON;
+	if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
+		control->rec_type = AMDGPU_RAS_EEPROM_REC_PA;
+
+	/* default status is MCA storage */
+	if (control->ras_num_recs <= 1 &&
+	    adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
+		control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA;
 
-	if (con->eeprom_control.ras_num_recs) {
+	if (control->ras_num_recs) {
 		ret = amdgpu_ras_load_bad_pages(adev);
 		if (ret)
 			return ret;
 
 		amdgpu_dpm_send_hbm_bad_pages_num(
-			adev, con->eeprom_control.ras_num_recs);
+			adev, control->ras_num_bad_pages);
 
 		if (con->update_channel_flag == true) {
 			amdgpu_dpm_send_hbm_bad_channel_flag(
-				adev, con->eeprom_control.bad_channel_bitmap);
+				adev, control->bad_channel_bitmap);
 			con->update_channel_flag = false;
 		}
 	}
@@ -3366,6 +3581,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
 		switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
 		case IP_VERSION(13, 0, 2):
 		case IP_VERSION(13, 0, 6):
+		case IP_VERSION(13, 0, 12):
 		case IP_VERSION(13, 0, 14):
			return true;
 		default:
@@ -3378,7 +3594,9 @@
 		case IP_VERSION(13, 0, 0):
 		case IP_VERSION(13, 0, 6):
 		case IP_VERSION(13, 0, 10):
+		case IP_VERSION(13, 0, 12):
 		case IP_VERSION(13, 0, 14):
+		case IP_VERSION(14, 0, 3):
 			return true;
 		default:
 			return false;
@@ -3629,6 +3847,7 @@ static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
 	switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
 	case IP_VERSION(13, 0, 2):
 	case IP_VERSION(13, 0, 6):
+	case IP_VERSION(13, 0, 12):
 	case IP_VERSION(13, 0, 14):
 		con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE;
 		break;
@@ -3704,7 +3923,19 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			 * check DF RAS */
 			adev->nbio.ras = &nbio_v4_3_ras;
 		break;
+	case IP_VERSION(6, 3, 1):
+		if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF))
+			/* unlike other generation of nbio ras,
+			 * nbif v6_3_1 only support fatal error interrupt
+			 * to inform software that DF is freezed due to
+			 * system fatal error event. driver should not
+			 * enable nbio ras in such case. Instead,
+			 * check DF RAS
+			 */
+			adev->nbio.ras = &nbif_v6_3_1_ras;
+		break;
 	case IP_VERSION(7, 9, 0):
+	case IP_VERSION(7, 9, 1):
 		if (!adev->gmc.is_app_apu)
 			adev->nbio.ras = &nbio_v7_9_ras;
 		break;
@@ -4083,7 +4314,7 @@ bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev)
 	if (!ras)
 		return false;
 
-	return atomic_read(&ras->fed);
+	return test_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
 }
 
 void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
@@ -4091,8 +4322,48 @@ void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
 	struct amdgpu_ras *ras;
 
 	ras = amdgpu_ras_get_context(adev);
+	if (ras) {
+		if (status)
+			set_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
+		else
+			clear_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
+	}
+}
+
+void amdgpu_ras_clear_err_state(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *ras;
+
+	ras = amdgpu_ras_get_context(adev);
+	if (ras)
+		ras->ras_err_state = 0;
+}
+
+void amdgpu_ras_set_err_poison(struct amdgpu_device *adev,
+			       enum amdgpu_ras_block block)
+{
+	struct amdgpu_ras *ras;
+
+	ras = amdgpu_ras_get_context(adev);
 	if (ras)
-		atomic_set(&ras->fed, !!status);
+		set_bit(block, &ras->ras_err_state);
+}
+
+bool amdgpu_ras_is_err_state(struct amdgpu_device *adev, int block)
+{
+	struct amdgpu_ras *ras;
+
+	ras = amdgpu_ras_get_context(adev);
+	if (ras) {
+		if (block == AMDGPU_RAS_BLOCK__ANY)
+			return (ras->ras_err_state != 0);
+		else
+			return test_bit(block, &ras->ras_err_state) ||
+			       test_bit(AMDGPU_RAS_BLOCK__LAST,
					&ras->ras_err_state);
+	}
+
+	return false;
 }
 
 static struct ras_event_manager *__get_ras_event_mgr(struct amdgpu_device *adev)
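The core state-tracking change in the hunks above: the single atomic "fed" flag becomes a per-block bitmap, ras_err_state, where the bit at AMDGPU_RAS_BLOCK__LAST is reused as the fatal-error (FED) flag and a per-block query also reports an error while that fatal bit is set. The standalone sketch below illustrates only that scheme; it is plain C with invented enum values and helper names standing in for the kernel's set_bit()/test_bit(), not code from the patch.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the kernel enum; values are invented for illustration. */
enum ras_block { BLOCK_UMC, BLOCK_SDMA, BLOCK_GFX, BLOCK_LAST, BLOCK_ANY = -1 };

static unsigned long ras_err_state;	/* mirrors the new amdgpu_ras::ras_err_state */

/* Poison consumed on a block: set that block's bit. */
static void set_err_poison(enum ras_block b)
{
	ras_err_state |= 1UL << b;
}

/* Fatal error: the bit at BLOCK_LAST doubles as the FED flag. */
static void set_fed(bool status)
{
	if (status)
		ras_err_state |= 1UL << BLOCK_LAST;
	else
		ras_err_state &= ~(1UL << BLOCK_LAST);
}

/* A block is in error state if its own bit or the fatal bit is set;
 * BLOCK_ANY asks whether anything at all is set. */
static bool is_err_state(int b)
{
	if (b == BLOCK_ANY)
		return ras_err_state != 0;
	return (ras_err_state & (1UL << b)) ||
	       (ras_err_state & (1UL << BLOCK_LAST));
}

int main(void)
{
	set_err_poison(BLOCK_UMC);
	printf("umc=%d gfx=%d any=%d\n",
	       is_err_state(BLOCK_UMC), is_err_state(BLOCK_GFX),
	       is_err_state(BLOCK_ANY));	/* umc=1 gfx=0 any=1 */

	set_fed(true);	/* fatal: every block now reports an error */
	printf("gfx=%d\n", is_err_state(BLOCK_GFX));	/* gfx=1 */
	return 0;
}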
