Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
 -rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 191
 1 file changed, 53 insertions(+), 138 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 4e36551ab50b..c136bd449744 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -80,6 +80,8 @@ enum amdgpu_ras_retire_page_reservation {
 
 atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
 
+static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
+				uint64_t addr);
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 				uint64_t addr);
 
@@ -516,9 +518,9 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
 /* obj end */
 
 static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev,
-				  const char* 		invoke_type,
-				  const char* 		block_name,
-				  enum ta_ras_status 	ret)
+					 const char* invoke_type,
+					 const char* block_name,
+					 enum ta_ras_status ret)
 {
 	switch (ret) {
 	case TA_RAS_STATUS__SUCCESS:
@@ -607,7 +609,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 	if (!con)
 		return -EINVAL;
 
-        info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
+	info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
 	if (!info)
 		return -ENOMEM;
 
@@ -903,13 +905,6 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 	return ret;
 }
 
-int amdgpu_ras_error_cure(struct amdgpu_device *adev,
-		struct ras_cure_if *info)
-{
-	/* psp fw has no cure interface for now. */
-	return 0;
-}
-
 /* get the total error counts on all IPs */
 unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 		bool is_ce)
@@ -953,7 +948,7 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
 	case AMDGPU_RAS_RETIRE_PAGE_FAULT:
 	default:
 		return "F";
-	};
+	}
 }
 
 /**
@@ -1172,7 +1167,7 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
 			con->dir, &con->disable_ras_err_cnt_harvest);
 }
 
-void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
+static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
 		struct ras_fs_if *head)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -1194,7 +1189,6 @@ void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
 
 void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
 {
-#if defined(CONFIG_DEBUG_FS)
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_manager *obj;
 	struct ras_fs_if fs_info;
@@ -1203,7 +1197,7 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
 	 * it won't be called in resume path, no need to check
 	 * suspend and gpu reset status
 	 */
-	if (!con)
+	if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con)
 		return;
 
 	amdgpu_ras_debugfs_create_ctrl_node(adev);
@@ -1217,10 +1211,9 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
 			amdgpu_ras_debugfs_create(adev, &fs_info);
 		}
 	}
-#endif
 }
 
-void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
+static void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
 		struct ras_common_if *head)
 {
 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
@@ -1234,7 +1227,6 @@ void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
 
 static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
 {
-#if defined(CONFIG_DEBUG_FS)
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_manager *obj, *tmp;
 
@@ -1243,7 +1235,6 @@ static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
 	}
 
 	con->dir = NULL;
-#endif
 }
 /* debugfs end */
 
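
The debugfs hunks above drop the `#if defined(CONFIG_DEBUG_FS)` / `#endif` regions in favor of a single IS_ENABLED(CONFIG_DEBUG_FS) guard in amdgpu_ras_debugfs_create_all(). IS_ENABLED() expands to a compile-time constant, so the debugfs path is still parsed and type-checked in every configuration but is discarded as dead code when the option is off; the change removes preprocessor clutter without changing behavior. Below is a minimal userspace sketch of the idea; the MY_* macros are simplified stand-ins for the kernel's kconfig machinery (which also handles `=m` options), not its actual implementation.

/* Sketch: a compile-time-constant guard instead of #ifdef.
 * MY_CONFIG_DEBUG_FS plays the role of a Kconfig option.
 */
#include <stdio.h>

#define MY_CONFIG_DEBUG_FS 1		/* 0 models CONFIG_DEBUG_FS=n */
#define MY_IS_ENABLED(option) (option)	/* simplified IS_ENABLED() */

static void debugfs_create_all(void)
{
	/* One readable early-out, mirroring the patched
	 * amdgpu_ras_debugfs_create_all(); when the constant is 0 the
	 * compiler eliminates everything after it as dead code. */
	if (!MY_IS_ENABLED(MY_CONFIG_DEBUG_FS))
		return;

	printf("creating debugfs nodes\n");
}

int main(void)
{
	debugfs_create_all();
	return 0;
}

amdgpu_ras_fs_fini() in the next hunk keeps an explicit IS_ENABLED() check around the remove-all call for the same reason: the callee stays compiled-in, the call simply folds away.
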
@@ -1291,7 +1282,8 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
 
 static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
 {
-	amdgpu_ras_debugfs_remove_all(adev);
+	if (IS_ENABLED(CONFIG_DEBUG_FS))
+		amdgpu_ras_debugfs_remove_all(adev);
 	amdgpu_ras_sysfs_remove_all(adev);
 	return 0;
 }
@@ -1477,8 +1469,8 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
 }
 
 /* Parse RdRspStatus and WrRspStatus */
-void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
-		struct ras_query_if *info)
+static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
+					  struct ras_query_if *info)
 {
 	/*
 	 * Only two block need to query read/write
@@ -1551,10 +1543,12 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
 			.size = AMDGPU_GPU_PAGE_SIZE,
 			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
 		};
-
-		if (data->last_reserved <= i)
+		ret = amdgpu_vram_mgr_query_page_status(
+				ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
+				data->bps[i].retired_page);
+		if (ret == -EBUSY)
 			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
-		else if (data->bps_bo[i] == NULL)
+		else if (ret == -ENOENT)
 			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
 	}
 
@@ -1606,12 +1600,9 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
 	unsigned int new_space = old_space + pages;
 	unsigned int align_space = ALIGN(new_space, 512);
 	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
-	struct amdgpu_bo **bps_bo =
-			kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);
 
-	if (!bps || !bps_bo) {
+	if (!bps) {
 		kfree(bps);
-		kfree(bps_bo);
 		return -ENOMEM;
 	}
 
@@ -1620,14 +1611,8 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
 				data->count * sizeof(*data->bps));
 		kfree(data->bps);
 	}
-	if (data->bps_bo) {
-		memcpy(bps_bo, data->bps_bo,
-				data->count * sizeof(*data->bps_bo));
-		kfree(data->bps_bo);
-	}
 
 	data->bps = bps;
-	data->bps_bo = bps_bo;
 	data->space_left += align_space - old_space;
 	return 0;
 }
@@ -1639,6 +1624,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data;
 	int ret = 0;
+	uint32_t i;
 
 	if (!con || !con->eh_data || !bps || pages <= 0)
 		return 0;
@@ -1648,16 +1634,26 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 	if (!data)
 		goto out;
 
-	if (data->space_left <= pages)
-		if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
+	for (i = 0; i < pages; i++) {
+		if (amdgpu_ras_check_bad_page_unlock(con,
+			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+			continue;
+
+		if (!data->space_left &&
+			amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
 			ret = -ENOMEM;
 			goto out;
 		}
 
-	memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
-	data->count += pages;
-	data->space_left -= pages;
+		amdgpu_vram_mgr_reserve_range(
+			ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
+			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
+			AMDGPU_GPU_PAGE_SIZE);
+		memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
+		data->count++;
+		data->space_left--;
+	}
 
 out:
 	mutex_unlock(&con->recovery_lock);
 
@@ -1668,7 +1664,7 @@ out:
  * write error record array to eeprom, the function should be
  * protected by recovery_lock
  */
-static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
+int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data;
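
The amdgpu_ras_add_bad_pages() rewrite above replaces the bulk memcpy of the incoming array with a per-page loop: each page is first checked against the records already collected (so a duplicate error injection is skipped), the record array grows in 256-entry chunks only when it is actually full, and the page is reserved through amdgpu_vram_mgr_reserve_range() rather than pinned with a buffer object. The following userspace model shows just that control flow; every name in it is invented for illustration, and the VRAM-manager call is reduced to a comment.

/* Userspace model of the patched amdgpu_ras_add_bad_pages() flow:
 * deduplicate per page, grow the record array on demand in fixed
 * chunks, account one slot per accepted page. All names are
 * illustrative; the real reservation call goes to the VRAM manager.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct bad_page { uint64_t retired_page; };

struct eh_data {
	struct bad_page *bps;
	uint32_t count;		/* records in use */
	uint32_t space_left;	/* free slots in bps */
};

static bool page_already_recorded(const struct eh_data *data, uint64_t page)
{
	for (uint32_t i = 0; i < data->count; i++)
		if (data->bps[i].retired_page == page)
			return true;
	return false;
}

static int grow_space(struct eh_data *data, uint32_t pages)
{
	uint32_t new_space = data->count + data->space_left + pages;
	struct bad_page *bps = realloc(data->bps, new_space * sizeof(*bps));

	if (!bps)
		return -1;
	data->bps = bps;
	data->space_left += pages;
	return 0;
}

static int add_bad_pages(struct eh_data *data,
			 const struct bad_page *bps, uint32_t pages)
{
	for (uint32_t i = 0; i < pages; i++) {
		if (page_already_recorded(data, bps[i].retired_page))
			continue;	/* duplicate injection: skip */

		/* Grow only when the array is actually full, in the same
		 * 256-entry chunks the diff uses. */
		if (!data->space_left && grow_space(data, 256))
			return -1;

		/* The driver reserves the VRAM page here via
		 * amdgpu_vram_mgr_reserve_range(); omitted in this model. */
		data->bps[data->count++] = bps[i];
		data->space_left--;
	}
	return 0;
}

int main(void)
{
	struct eh_data data = { 0 };
	const struct bad_page in[] = { { 0x10 }, { 0x10 }, { 0x20 } };

	add_bad_pages(&data, in, 3);
	printf("recorded %u unique bad pages\n", data.count);	/* 2 */
	free(data.bps);
	return 0;
}

Running it records two pages, not three: the duplicate 0x10 entry is filtered by the dedup check, which is what the new amdgpu_ras_check_bad_page_unlock() call accomplishes in the driver.
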
@@ -1730,6 +1726,20 @@ out:
 	return ret;
 }
 
+static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
+				uint64_t addr)
+{
+	struct ras_err_handler_data *data = con->eh_data;
+	int i;
+
+	addr >>= AMDGPU_GPU_PAGE_SHIFT;
+	for (i = 0; i < data->count; i++)
+		if (addr == data->bps[i].retired_page)
+			return true;
+
+	return false;
+}
+
 /*
  * check if an address belongs to bad page
  *
@@ -1739,26 +1749,13 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 				uint64_t addr)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	struct ras_err_handler_data *data;
-	int i;
 	bool ret = false;
 
 	if (!con || !con->eh_data)
 		return ret;
 
 	mutex_lock(&con->recovery_lock);
-	data = con->eh_data;
-	if (!data)
-		goto out;
-
-	addr >>= AMDGPU_GPU_PAGE_SHIFT;
-	for (i = 0; i < data->count; i++)
-		if (addr == data->bps[i].retired_page) {
-			ret = true;
-			goto out;
-		}
-
-out:
+	ret = amdgpu_ras_check_bad_page_unlock(con, addr);
 	mutex_unlock(&con->recovery_lock);
 	return ret;
 }
@@ -1804,80 +1801,6 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
 	}
 }
 
-/* called in gpu recovery/init */
-int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
-{
-	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	struct ras_err_handler_data *data;
-	uint64_t bp;
-	struct amdgpu_bo *bo = NULL;
-	int i, ret = 0;
-
-	/* Not reserve bad page when amdgpu_bad_page_threshold == 0. */
-	if (!con || !con->eh_data || (amdgpu_bad_page_threshold == 0))
-		return 0;
-
-	mutex_lock(&con->recovery_lock);
-	data = con->eh_data;
-	if (!data)
-		goto out;
-	/* reserve vram at driver post stage. */
-	for (i = data->last_reserved; i < data->count; i++) {
-		bp = data->bps[i].retired_page;
-
-		/* There are two cases of reserve error should be ignored:
-		 * 1) a ras bad page has been allocated (used by someone);
-		 * 2) a ras bad page has been reserved (duplicate error injection
-		 *    for one page);
-		 */
-		if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
-					       AMDGPU_GPU_PAGE_SIZE,
-					       AMDGPU_GEM_DOMAIN_VRAM,
-					       &bo, NULL))
-			dev_warn(adev->dev, "RAS WARN: reserve vram for "
-					"retired page %llx fail\n", bp);
-
-		data->bps_bo[i] = bo;
-		data->last_reserved = i + 1;
-		bo = NULL;
-	}
-
-	/* continue to save bad pages to eeprom even reesrve_vram fails */
-	ret = amdgpu_ras_save_bad_pages(adev);
-out:
-	mutex_unlock(&con->recovery_lock);
-	return ret;
-}
-
-/* called when driver unload */
-static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
-{
-	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	struct ras_err_handler_data *data;
-	struct amdgpu_bo *bo;
-	int i;
-
-	if (!con || !con->eh_data)
-		return 0;
-
-	mutex_lock(&con->recovery_lock);
-	data = con->eh_data;
-	if (!data)
-		goto out;
-
-	for (i = data->last_reserved - 1; i >= 0; i--) {
-		bo = data->bps_bo[i];
-
-		amdgpu_bo_free_kernel(&bo, NULL, NULL);
-
-		data->bps_bo[i] = bo;
-		data->last_reserved = i;
-	}
-out:
-	mutex_unlock(&con->recovery_lock);
-	return 0;
-}
-
 int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
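
amdgpu_ras_check_bad_page_unlock() above follows the kernel's usual locked/unlocked helper split: the _unlock variant assumes the caller already holds recovery_lock (as the add-bad-pages loop does), while amdgpu_ras_check_bad_page() stays the public entry point that takes the lock itself. This is what lets the dedup check run inside the loop without self-deadlocking on the mutex. A standalone sketch of the convention, with pthreads standing in for the kernel mutex and all names illustrative:

/* The *_unlock convention: the unlocked helper requires the caller to
 * hold the lock; the public wrapper takes it. pthread stands in for
 * the kernel mutex; all names are illustrative.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

struct ras_state {
	pthread_mutex_t lock;
	const uint64_t *pages;	/* recorded bad pages */
	int count;
};

/* Caller must hold state->lock. */
static bool check_bad_page_unlock(const struct ras_state *s, uint64_t page)
{
	for (int i = 0; i < s->count; i++)
		if (s->pages[i] == page)
			return true;
	return false;
}

/* Public entry point: takes the lock, then reuses the helper, as
 * amdgpu_ras_check_bad_page() does after this patch. */
static bool check_bad_page(struct ras_state *s, uint64_t page)
{
	bool ret;

	pthread_mutex_lock(&s->lock);
	ret = check_bad_page_unlock(s, page);
	pthread_mutex_unlock(&s->lock);
	return ret;
}

int main(void)
{
	const uint64_t pages[] = { 0x1000 };
	struct ras_state s = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.pages = pages,
		.count = 1,
	};

	return check_bad_page(&s, 0x1000) ? 0 : 1;
}

The naming convention carries the locking contract: any future caller of the _unlock variant is on notice that it must already hold recovery_lock.
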
@@ -1917,18 +1840,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 		ret = amdgpu_ras_load_bad_pages(adev);
 		if (ret)
 			goto free;
-		ret = amdgpu_ras_reserve_bad_pages(adev);
-		if (ret)
-			goto release;
 	}
 
 	return 0;
 
-release:
-	amdgpu_ras_release_bad_pages(adev);
 free:
 	kfree((*data)->bps);
-	kfree((*data)->bps_bo);
 	kfree(*data);
 	con->eh_data = NULL;
 out:
@@ -1956,12 +1873,10 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 		return 0;
 
 	cancel_work_sync(&con->recovery_work);
-	amdgpu_ras_release_bad_pages(adev);
 
 	mutex_lock(&con->recovery_lock);
 	con->eh_data = NULL;
 
 	kfree(data->bps);
-	kfree(data->bps_bo);
 	kfree(data);
 	mutex_unlock(&con->recovery_lock);
@@ -2156,7 +2071,7 @@ void amdgpu_ras_late_fini(struct amdgpu_device *adev,
 
 	amdgpu_ras_sysfs_remove(adev, ras_block);
 	if (ih_info->cb)
-                amdgpu_ras_interrupt_remove_handler(adev, ih_info);
+		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
 	amdgpu_ras_feature_enable(adev, ras_block, 0);
 }
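
With the reserve/release pair removed, amdgpu_ras_badpages_read() (the -1551 hunk above) derives each retired page's state from amdgpu_vram_mgr_query_page_status() instead of the old bps_bo/last_reserved bookkeeping: -EBUSY means the reservation is still pending, -ENOENT means the page was never reserved (a fault), and anything else leaves the default "reserved" flag in place. A compact sketch of that mapping follows; the enum and classify() are illustrative stand-ins, and the R/P/F letters are the single-character codes this file's flags-to-string helper exposes through sysfs.

/* Sketch of the page-state classification in the reworked
 * amdgpu_ras_badpages_read(). The real query is
 * amdgpu_vram_mgr_query_page_status(); everything here is a stand-in.
 */
#include <errno.h>
#include <stdio.h>

enum retire_page_state {
	PAGE_RESERVED,	/* reported as "R" */
	PAGE_PENDING,	/* reported as "P" */
	PAGE_FAULT,	/* reported as "F" */
};

static enum retire_page_state classify(int query_ret)
{
	/* Mirrors the diff: the flags default to "reserved" and are only
	 * overridden for the two explicit error codes. */
	switch (query_ret) {
	case -EBUSY:
		return PAGE_PENDING;
	case -ENOENT:
		return PAGE_FAULT;
	default:
		return PAGE_RESERVED;
	}
}

int main(void)
{
	printf("%d %d %d\n",
	       classify(0), classify(-EBUSY), classify(-ENOENT));
	return 0;
}
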
