summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c504
1 files changed, 337 insertions, 167 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 0e16683876aa..c2c791ca00f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -33,6 +33,7 @@
#include "amdgpu_atomfirmware.h"
#include "amdgpu_xgmi.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
+#include "atom.h"
static const char *RAS_FS_NAME = "ras";
@@ -99,6 +100,49 @@ static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
return false;
}
+static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
+{
+ struct ras_err_data err_data = {0, 0, 0, NULL};
+ struct eeprom_table_record err_rec;
+
+ if ((address >= adev->gmc.mc_vram_size) ||
+ (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
+ dev_warn(adev->dev,
+ "RAS WARN: input address 0x%llx is invalid.\n",
+ address);
+ return -EINVAL;
+ }
+
+ if (amdgpu_ras_check_bad_page(adev, address)) {
+ dev_warn(adev->dev,
+ "RAS WARN: 0x%llx has already been marked as bad page!\n",
+ address);
+ return 0;
+ }
+
+ memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
+
+ err_rec.address = address;
+ err_rec.retired_page = address >> AMDGPU_GPU_PAGE_SHIFT;
+ err_rec.ts = (uint64_t)ktime_get_real_seconds();
+ err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+
+ err_data.err_addr = &err_rec;
+ err_data.err_addr_cnt = 1;
+
+ if (amdgpu_bad_page_threshold != 0) {
+ amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+ err_data.err_addr_cnt);
+ amdgpu_ras_save_bad_pages(adev);
+ }
+
+ dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
+ dev_warn(adev->dev, "Clear EEPROM:\n");
+ dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");
+
+ return 0;
+}
+
static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
size_t size, loff_t *pos)
{
@@ -178,11 +222,24 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
op = 1;
else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
op = 2;
+ else if (strstr(str, "retire_page") != NULL)
+ op = 3;
else if (str[0] && str[1] && str[2] && str[3])
/* ascii string, but commands are not matched. */
return -EINVAL;
if (op != -1) {
+ if (op == 3) {
+ if (sscanf(str, "%*s 0x%llx", &address) != 1 &&
+ sscanf(str, "%*s %llu", &address) != 1)
+ return -EINVAL;
+
+ data->op = op;
+ data->inject.address = address;
+
+ return 0;
+ }
+
if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
return -EINVAL;
@@ -198,11 +255,11 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
data->op = op;
if (op == 2) {
- if (sscanf(str, "%*s %*s %*s %u %llu %llu",
- &sub_block, &address, &value) != 3)
- if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
- &sub_block, &address, &value) != 3)
- return -EINVAL;
+ if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
+ &sub_block, &address, &value) != 3 &&
+ sscanf(str, "%*s %*s %*s %u %llu %llu",
+ &sub_block, &address, &value) != 3)
+ return -EINVAL;
data->head.sub_block_index = sub_block;
data->inject.address = address;
data->inject.value = value;
@@ -221,7 +278,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
/**
* DOC: AMDGPU RAS debugfs control interface
*
- * It accepts struct ras_debug_if who has two members.
+ * The control interface accepts struct ras_debug_if which has two members.
*
* First member: ras_debug_if::head or ras_debug_if::inject.
*
@@ -246,32 +303,36 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
*
* How to use the interface?
*
- * Programs
+ * In a program
*
- * Copy the struct ras_debug_if in your codes and initialize it.
- * Write the struct to the control node.
+ * Copy the struct ras_debug_if in your code and initialize it.
+ * Write the struct to the control interface.
*
- * Shells
+ * From shell
*
* .. code-block:: bash
*
- * echo op block [error [sub_block address value]] > .../ras/ras_ctrl
+ * echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
+ * echo "enable <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
+ * echo "inject <block> <error> <sub-block> <address> <value> > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
+ *
+ * Where N, is the card which you want to affect.
*
- * Parameters:
+ * "disable" requires only the block.
+ * "enable" requires the block and error type.
+ * "inject" requires the block, error type, address, and value.
*
- * op: disable, enable, inject
- * disable: only block is needed
- * enable: block and error are needed
- * inject: error, address, value are needed
- * block: umc, sdma, gfx, .........
+ * The block is one of: umc, sdma, gfx, etc.
* see ras_block_string[] for details
- * error: ue, ce
- * ue: multi_uncorrectable
- * ce: single_correctable
- * sub_block:
- * sub block index, pass 0 if there is no sub block
*
- * here are some examples for bash commands:
+ * The error type is one of: ue, ce, where,
+ * ue is multi-uncorrectable
+ * ce is single-correctable
+ *
+ * The sub-block is a the sub-block index, pass 0 if there is no sub-block.
+ * The address and value are hexadecimal numbers, leading 0x is optional.
+ *
+ * For instance,
*
* .. code-block:: bash
*
@@ -279,17 +340,17 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
* echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
* echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
*
- * How to check the result?
+ * How to check the result of the operation?
*
- * For disable/enable, please check ras features at
+ * To check disable/enable, see "ras" features at,
* /sys/class/drm/card[0/1/2...]/device/ras/features
*
- * For inject, please check corresponding err count at
- * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
+ * To check inject, see the corresponding error count at,
+ * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count
*
* .. note::
* Operations are only allowed on blocks which are supported.
- * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask
+ * Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask
* to see which blocks support RAS on a particular asic.
*
*/
@@ -310,6 +371,14 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
if (ret)
return -EINVAL;
+ if (data.op == 3) {
+ ret = amdgpu_reserve_page_direct(adev, data.inject.address);
+ if (!ret)
+ return size;
+ else
+ return ret;
+ }
+
if (!amdgpu_ras_is_supported(adev, data.head.block))
return -EINVAL;
@@ -431,15 +500,19 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
};
if (!amdgpu_ras_get_error_query_ready(obj->adev))
- return snprintf(buf, PAGE_SIZE,
- "Query currently inaccessible\n");
+ return sysfs_emit(buf, "Query currently inaccessible\n");
if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL;
- return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
- "ue", info.ue_count,
- "ce", info.ce_count);
+
+ if (obj->adev->asic_type == CHIP_ALDEBARAN) {
+ if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
+ DRM_WARN("Failed to reset error counter and error status");
+ }
+
+ return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
+ "ce", info.ce_count);
}
/* obj begin */
@@ -449,11 +522,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
static inline void put_obj(struct ras_manager *obj)
{
- if (obj && --obj->use == 0)
+ if (obj && (--obj->use == 0))
list_del(&obj->node);
- if (obj && obj->use < 0) {
- DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
- }
+ if (obj && (obj->use < 0))
+ DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
}
/* make one obj and return it. */
@@ -463,7 +535,7 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
- if (!adev->ras_features || !con)
+ if (!adev->ras_enabled || !con)
return NULL;
if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
@@ -490,7 +562,7 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
struct ras_manager *obj;
int i;
- if (!adev->ras_features || !con)
+ if (!adev->ras_enabled || !con)
return NULL;
if (head) {
@@ -517,36 +589,11 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
}
/* obj end */
-static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev,
- const char* invoke_type,
- const char* block_name,
- enum ta_ras_status ret)
-{
- switch (ret) {
- case TA_RAS_STATUS__SUCCESS:
- return;
- case TA_RAS_STATUS__ERROR_RAS_NOT_AVAILABLE:
- dev_warn(adev->dev,
- "RAS WARN: %s %s currently unavailable\n",
- invoke_type,
- block_name);
- break;
- default:
- dev_err(adev->dev,
- "RAS ERROR: %s %s error failed ret 0x%X\n",
- invoke_type,
- block_name,
- ret);
- }
-}
-
/* feature ctl begin */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
- struct ras_common_if *head)
+ struct ras_common_if *head)
{
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-
- return con->hw_supported & BIT(head->block);
+ return adev->ras_hw_enabled & BIT(head->block);
}
static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
@@ -590,11 +637,7 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
con->features |= BIT(head->block);
} else {
if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
- /* skip clean gfx ras context feature for VEGA20 Gaming.
- * will clean later
- */
- if (!(!adev->ras_features && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)))
- con->features &= ~BIT(head->block);
+ con->features &= ~BIT(head->block);
put_obj(obj);
}
}
@@ -640,15 +683,10 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
if (!amdgpu_ras_intr_triggered()) {
ret = psp_ras_enable_features(&adev->psp, info, enable);
if (ret) {
- amdgpu_ras_parse_status_code(adev,
- enable ? "enable":"disable",
- ras_block_str(head->block),
- (enum ta_ras_status)ret);
- if (ret == TA_RAS_STATUS__RESET_NEEDED)
- ret = -EAGAIN;
- else
- ret = -EINVAL;
-
+ dev_err(adev->dev, "ras %s %s failed %d\n",
+ enable ? "enable":"disable",
+ ras_block_str(head->block),
+ ret);
goto out;
}
}
@@ -702,6 +740,10 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
con->features |= BIT(head->block);
ret = amdgpu_ras_feature_enable(adev, head, 0);
+
+ /* clean gfx block ras features flag */
+ if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX)
+ con->features &= ~BIT(head->block);
}
} else
ret = amdgpu_ras_feature_enable(adev, head, enable);
@@ -777,13 +819,15 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__UMC:
- if (adev->umc.funcs->query_ras_error_count)
- adev->umc.funcs->query_ras_error_count(adev, &err_data);
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_count)
+ adev->umc.ras_funcs->query_ras_error_count(adev, &err_data);
/* umc query_ras_error_address is also responsible for clearing
* error status
*/
- if (adev->umc.funcs->query_ras_error_address)
- adev->umc.funcs->query_ras_error_address(adev, &err_data);
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_address)
+ adev->umc.ras_funcs->query_ras_error_address(adev, &err_data);
break;
case AMDGPU_RAS_BLOCK__SDMA:
if (adev->sdma.funcs->query_ras_error_count) {
@@ -793,25 +837,37 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
}
break;
case AMDGPU_RAS_BLOCK__GFX:
- if (adev->gfx.funcs->query_ras_error_count)
- adev->gfx.funcs->query_ras_error_count(adev, &err_data);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->query_ras_error_count)
+ adev->gfx.ras_funcs->query_ras_error_count(adev, &err_data);
- if (adev->gfx.funcs->query_ras_error_status)
- adev->gfx.funcs->query_ras_error_status(adev);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->query_ras_error_status)
+ adev->gfx.ras_funcs->query_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__MMHUB:
- if (adev->mmhub.funcs->query_ras_error_count)
- adev->mmhub.funcs->query_ras_error_count(adev, &err_data);
+ if (adev->mmhub.ras_funcs &&
+ adev->mmhub.ras_funcs->query_ras_error_count)
+ adev->mmhub.ras_funcs->query_ras_error_count(adev, &err_data);
- if (adev->mmhub.funcs->query_ras_error_status)
- adev->mmhub.funcs->query_ras_error_status(adev);
+ if (adev->mmhub.ras_funcs &&
+ adev->mmhub.ras_funcs->query_ras_error_status)
+ adev->mmhub.ras_funcs->query_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__PCIE_BIF:
- if (adev->nbio.funcs->query_ras_error_count)
- adev->nbio.funcs->query_ras_error_count(adev, &err_data);
+ if (adev->nbio.ras_funcs &&
+ adev->nbio.ras_funcs->query_ras_error_count)
+ adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data);
break;
case AMDGPU_RAS_BLOCK__XGMI_WAFL:
- amdgpu_xgmi_query_ras_error_count(adev, &err_data);
+ if (adev->gmc.xgmi.ras_funcs &&
+ adev->gmc.xgmi.ras_funcs->query_ras_error_count)
+ adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data);
+ break;
+ case AMDGPU_RAS_BLOCK__HDP:
+ if (adev->hdp.ras_funcs &&
+ adev->hdp.ras_funcs->query_ras_error_count)
+ adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data);
break;
default:
break;
@@ -824,17 +880,42 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
info->ce_count = obj->err_data.ce_count;
if (err_data.ce_count) {
- dev_info(adev->dev, "%ld correctable hardware errors "
+ if (adev->smuio.funcs &&
+ adev->smuio.funcs->get_socket_id &&
+ adev->smuio.funcs->get_die_id) {
+ dev_info(adev->dev, "socket: %d, die: %d "
+ "%ld correctable hardware errors "
"detected in %s block, no user "
"action is needed.\n",
+ adev->smuio.funcs->get_socket_id(adev),
+ adev->smuio.funcs->get_die_id(adev),
obj->err_data.ce_count,
ras_block_str(info->head.block));
+ } else {
+ dev_info(adev->dev, "%ld correctable hardware errors "
+ "detected in %s block, no user "
+ "action is needed.\n",
+ obj->err_data.ce_count,
+ ras_block_str(info->head.block));
+ }
}
if (err_data.ue_count) {
- dev_info(adev->dev, "%ld uncorrectable hardware errors "
+ if (adev->smuio.funcs &&
+ adev->smuio.funcs->get_socket_id &&
+ adev->smuio.funcs->get_die_id) {
+ dev_info(adev->dev, "socket: %d, die: %d "
+ "%ld uncorrectable hardware errors "
+ "detected in %s block\n",
+ adev->smuio.funcs->get_socket_id(adev),
+ adev->smuio.funcs->get_die_id(adev),
+ obj->err_data.ue_count,
+ ras_block_str(info->head.block));
+ } else {
+ dev_info(adev->dev, "%ld uncorrectable hardware errors "
"detected in %s block\n",
obj->err_data.ue_count,
ras_block_str(info->head.block));
+ }
}
return 0;
@@ -848,20 +929,32 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
switch (block) {
case AMDGPU_RAS_BLOCK__GFX:
- if (adev->gfx.funcs->reset_ras_error_count)
- adev->gfx.funcs->reset_ras_error_count(adev);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->reset_ras_error_count)
+ adev->gfx.ras_funcs->reset_ras_error_count(adev);
- if (adev->gfx.funcs->reset_ras_error_status)
- adev->gfx.funcs->reset_ras_error_status(adev);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->reset_ras_error_status)
+ adev->gfx.ras_funcs->reset_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__MMHUB:
- if (adev->mmhub.funcs->reset_ras_error_count)
- adev->mmhub.funcs->reset_ras_error_count(adev);
+ if (adev->mmhub.ras_funcs &&
+ adev->mmhub.ras_funcs->reset_ras_error_count)
+ adev->mmhub.ras_funcs->reset_ras_error_count(adev);
+
+ if (adev->mmhub.ras_funcs &&
+ adev->mmhub.ras_funcs->reset_ras_error_status)
+ adev->mmhub.ras_funcs->reset_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__SDMA:
if (adev->sdma.funcs->reset_ras_error_count)
adev->sdma.funcs->reset_ras_error_count(adev);
break;
+ case AMDGPU_RAS_BLOCK__HDP:
+ if (adev->hdp.ras_funcs &&
+ adev->hdp.ras_funcs->reset_ras_error_count)
+ adev->hdp.ras_funcs->reset_ras_error_count(adev);
+ break;
default:
break;
}
@@ -921,12 +1014,14 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__GFX:
- if (adev->gfx.funcs->ras_error_inject)
- ret = adev->gfx.funcs->ras_error_inject(adev, info);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->ras_error_inject)
+ ret = adev->gfx.ras_funcs->ras_error_inject(adev, info);
else
ret = -EINVAL;
break;
case AMDGPU_RAS_BLOCK__UMC:
+ case AMDGPU_RAS_BLOCK__SDMA:
case AMDGPU_RAS_BLOCK__MMHUB:
case AMDGPU_RAS_BLOCK__PCIE_BIF:
ret = psp_ras_trigger_error(&adev->psp, &block_info);
@@ -940,10 +1035,9 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
ret = -EINVAL;
}
- amdgpu_ras_parse_status_code(adev,
- "inject",
- ras_block_str(info->head.block),
- (enum ta_ras_status)ret);
+ if (ret)
+ dev_err(adev->dev, "ras inject %s failed %d\n",
+ ras_block_str(info->head.block), ret);
return ret;
}
@@ -956,7 +1050,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
struct ras_manager *obj;
struct ras_err_data data = {0, 0};
- if (!adev->ras_features || !con)
+ if (!adev->ras_enabled || !con)
return 0;
list_for_each_entry(obj, &con->head, node) {
@@ -1183,14 +1277,18 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- struct dentry *dir;
- struct drm_minor *minor = adev_to_drm(adev)->primary;
+ struct drm_minor *minor = adev_to_drm(adev)->primary;
+ struct dentry *dir;
dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev,
&amdgpu_ras_debugfs_ctrl_ops);
debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev,
&amdgpu_ras_debugfs_eeprom_ops);
+ debugfs_create_u32("bad_page_cnt_threshold", 0444, dir,
+ &con->bad_page_cnt_threshold);
+ debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
+ debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
/*
* After one uncorrectable error happens, usually GPU recovery will
@@ -1477,7 +1575,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
- if (!adev->ras_features || !con)
+ if (!adev->ras_enabled || !con)
return;
list_for_each_entry(obj, &con->head, node) {
@@ -1508,12 +1606,14 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
*/
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__GFX:
- if (adev->gfx.funcs->query_ras_error_status)
- adev->gfx.funcs->query_ras_error_status(adev);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->query_ras_error_status)
+ adev->gfx.ras_funcs->query_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__MMHUB:
- if (adev->mmhub.funcs->query_ras_error_status)
- adev->mmhub.funcs->query_ras_error_status(adev);
+ if (adev->mmhub.ras_funcs &&
+ adev->mmhub.ras_funcs->query_ras_error_status)
+ adev->mmhub.ras_funcs->query_ras_error_status(adev);
break;
default:
break;
@@ -1525,7 +1625,7 @@ static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
- if (!adev->ras_features || !con)
+ if (!adev->ras_enabled || !con)
return;
list_for_each_entry(obj, &con->head, node) {
@@ -1839,7 +1939,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
bool exc_err_limit = false;
int ret;
- if (adev->ras_features && con)
+ if (adev->ras_enabled && con)
data = &con->eh_data;
else
return 0;
@@ -1933,15 +2033,30 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
return 0;
}
-static int amdgpu_ras_check_asic_type(struct amdgpu_device *adev)
+static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
{
- if (adev->asic_type != CHIP_VEGA10 &&
- adev->asic_type != CHIP_VEGA20 &&
- adev->asic_type != CHIP_ARCTURUS &&
- adev->asic_type != CHIP_SIENNA_CICHLID)
- return 1;
- else
- return 0;
+ return adev->asic_type == CHIP_VEGA10 ||
+ adev->asic_type == CHIP_VEGA20 ||
+ adev->asic_type == CHIP_ARCTURUS ||
+ adev->asic_type == CHIP_ALDEBARAN ||
+ adev->asic_type == CHIP_SIENNA_CICHLID;
+}
+
+/*
+ * this is workaround for vega20 workstation sku,
+ * force enable gfx ras, ignore vbios gfx ras flag
+ * due to GC EDC can not write
+ */
+static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
+{
+ struct atom_context *ctx = adev->mode_info.atom_context;
+
+ if (!ctx)
+ return;
+
+ if (strnstr(ctx->vbios_version, "D16406",
+ sizeof(ctx->vbios_version)))
+ adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
}
/*
@@ -1953,36 +2068,45 @@ static int amdgpu_ras_check_asic_type(struct amdgpu_device *adev)
* we have to initialize ras as normal. but need check if operation is
* allowed or not in each function.
*/
-static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
- uint32_t *hw_supported, uint32_t *supported)
+static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
{
- *hw_supported = 0;
- *supported = 0;
+ adev->ras_hw_enabled = adev->ras_enabled = 0;
if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
- amdgpu_ras_check_asic_type(adev))
+ !amdgpu_ras_asic_supported(adev))
return;
- if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
- dev_info(adev->dev, "MEM ECC is active.\n");
- *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC |
- 1 << AMDGPU_RAS_BLOCK__DF);
- } else
- dev_info(adev->dev, "MEM ECC is not presented.\n");
+ if (!adev->gmc.xgmi.connected_to_cpu) {
+ if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
+ dev_info(adev->dev, "MEM ECC is active.\n");
+ adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
+ 1 << AMDGPU_RAS_BLOCK__DF);
+ } else {
+ dev_info(adev->dev, "MEM ECC is not presented.\n");
+ }
- if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
- dev_info(adev->dev, "SRAM ECC is active.\n");
- *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
- 1 << AMDGPU_RAS_BLOCK__DF);
- } else
- dev_info(adev->dev, "SRAM ECC is not presented.\n");
+ if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
+ dev_info(adev->dev, "SRAM ECC is active.\n");
+ adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+ 1 << AMDGPU_RAS_BLOCK__DF);
+ } else {
+ dev_info(adev->dev, "SRAM ECC is not presented.\n");
+ }
+ } else {
+ /* driver only manages a few IP blocks RAS feature
+ * when GPU is connected cpu through XGMI */
+ adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX |
+ 1 << AMDGPU_RAS_BLOCK__SDMA |
+ 1 << AMDGPU_RAS_BLOCK__MMHUB);
+ }
+
+ amdgpu_ras_get_quirks(adev);
/* hw_supported needs to be aligned with RAS block mask. */
- *hw_supported &= AMDGPU_RAS_BLOCK_MASK;
+ adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;
- *supported = amdgpu_ras_enable == 0 ?
- 0 : *hw_supported & amdgpu_ras_mask;
- adev->ras_features = *supported;
+ adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
+ adev->ras_hw_enabled & amdgpu_ras_mask;
}
int amdgpu_ras_init(struct amdgpu_device *adev)
@@ -2003,13 +2127,13 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
amdgpu_ras_set_context(adev, con);
- amdgpu_ras_check_supported(adev, &con->hw_supported,
- &con->supported);
- if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) {
+ amdgpu_ras_check_supported(adev);
+
+ if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) {
/* set gfx block ras context feature for VEGA20 Gaming
* send ras disable cmd to ras ta during ras late init.
*/
- if (!adev->ras_features && adev->asic_type == CHIP_VEGA20) {
+ if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) {
con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);
return 0;
@@ -2024,14 +2148,31 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
/* Might need get this flag from vbios. */
con->flags = RAS_DEFAULT_FLAGS;
- if (adev->nbio.funcs->init_ras_controller_interrupt) {
- r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
+ /* initialize nbio ras function ahead of any other
+ * ras functions so hardware fatal error interrupt
+ * can be enabled as early as possible */
+ switch (adev->asic_type) {
+ case CHIP_VEGA20:
+ case CHIP_ARCTURUS:
+ case CHIP_ALDEBARAN:
+ if (!adev->gmc.xgmi.connected_to_cpu)
+ adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs;
+ break;
+ default:
+ /* nbio ras is not available */
+ break;
+ }
+
+ if (adev->nbio.ras_funcs &&
+ adev->nbio.ras_funcs->init_ras_controller_interrupt) {
+ r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev);
if (r)
goto release_con;
}
- if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
- r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev);
+ if (adev->nbio.ras_funcs &&
+ adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) {
+ r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev);
if (r)
goto release_con;
}
@@ -2042,8 +2183,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
}
dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
- "hardware ability[%x] ras_mask[%x]\n",
- con->hw_supported, con->supported);
+ "hardware ability[%x] ras_mask[%x]\n",
+ adev->ras_hw_enabled, adev->ras_enabled);
+
return 0;
release_con:
amdgpu_ras_set_context(adev, NULL);
@@ -2052,6 +2194,32 @@ release_con:
return r;
}
+int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
+{
+ if (adev->gmc.xgmi.connected_to_cpu)
+ return 1;
+ return 0;
+}
+
+static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
+ struct ras_common_if *ras_block)
+{
+ struct ras_query_if info = {
+ .head = *ras_block,
+ };
+
+ if (!amdgpu_persistent_edc_harvesting_supported(adev))
+ return 0;
+
+ if (amdgpu_ras_query_error_status(adev, &info) != 0)
+ DRM_WARN("RAS init harvest failure");
+
+ if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
+ DRM_WARN("RAS init harvest reset failure");
+
+ return 0;
+}
+
/* helper function to handle common stuff in ip late init phase */
int amdgpu_ras_late_init(struct amdgpu_device *adev,
struct ras_common_if *ras_block,
@@ -2081,6 +2249,9 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
return r;
}
+ /* check for errors on warm reset edc persisant supported ASIC */
+ amdgpu_persistent_edc_harvesting(adev, ras_block);
+
/* in resume phase, no need to create ras fs node */
if (adev->in_suspend || amdgpu_in_reset(adev))
return 0;
@@ -2128,7 +2299,7 @@ void amdgpu_ras_resume(struct amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj, *tmp;
- if (!adev->ras_features || !con) {
+ if (!adev->ras_enabled || !con) {
/* clean ras context for VEGA20 Gaming after send ras disable cmd */
amdgpu_release_ras_context(adev);
@@ -2174,7 +2345,7 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- if (!adev->ras_features || !con)
+ if (!adev->ras_enabled || !con)
return;
amdgpu_ras_disable_all_features(adev, 0);
@@ -2188,9 +2359,10 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- if (!adev->ras_features || !con)
+ if (!adev->ras_enabled || !con)
return 0;
+
/* Need disable ras on all IPs here before ip [hw/sw]fini */
amdgpu_ras_disable_all_features(adev, 0);
amdgpu_ras_recovery_fini(adev);
@@ -2201,7 +2373,7 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- if (!adev->ras_features || !con)
+ if (!adev->ras_enabled || !con)
return 0;
amdgpu_ras_fs_fini(adev);
@@ -2220,10 +2392,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{
- uint32_t hw_supported, supported;
-
- amdgpu_ras_check_supported(adev, &hw_supported, &supported);
- if (!hw_supported)
+ amdgpu_ras_check_supported(adev);
+ if (!adev->ras_hw_enabled)
return;
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
@@ -2252,7 +2422,7 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev)
if (!con)
return;
- if (!adev->ras_features && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) {
+ if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) {
con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX);
amdgpu_ras_set_context(adev, NULL);
kfree(con);