summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
author: Jakub Kicinski <kuba@kernel.org> 2023-03-09 22:18:59 -0800
committer: Jakub Kicinski <kuba@kernel.org> 2023-03-09 22:22:11 -0800
commit: d0ddf5065ffef45f8fce4001abe0206081c7ff10 (patch)
tree: ea83817cbe9fc25261eae87b85afd9fe086f479e /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent: db47fa2e4cbf180a39d8e6d6170962bd7d82e52d (diff)
parent: 44889ba56cbb3d51154660ccd15818bc77276696 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net

Documentation/bpf/bpf_devel_QA.rst
  b7abcd9c656b ("bpf, doc: Link to submitting-patches.rst for general patch submission info")
  d56b0c461d19 ("bpf, docs: Fix link to netdev-FAQ target")
https://lore.kernel.org/all/20230307095812.236eb1be@canb.auug.org.au/

Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 23
1 files changed, 17 insertions, 6 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6e543558386d..63dfcc98152d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -176,7 +176,7 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
if (amdgpu_bad_page_threshold != 0) {
amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
err_data.err_addr_cnt);
- amdgpu_ras_save_bad_pages(adev);
+ amdgpu_ras_save_bad_pages(adev, NULL);
}
dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
@@ -2084,22 +2084,32 @@ out:
/*
* write error record array to eeprom, the function should be
* protected by recovery_lock
+ * new_cnt: new added UE count, excluding reserved bad pages, can be NULL
*/
-int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
+int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
+ unsigned long *new_cnt)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data;
struct amdgpu_ras_eeprom_control *control;
int save_count;
- if (!con || !con->eh_data)
+ if (!con || !con->eh_data) {
+ if (new_cnt)
+ *new_cnt = 0;
+
return 0;
+ }
mutex_lock(&con->recovery_lock);
control = &con->eeprom_control;
data = con->eh_data;
save_count = data->count - control->ras_num_recs;
mutex_unlock(&con->recovery_lock);
+
+ if (new_cnt)
+ *new_cnt = save_count / adev->umc.retire_unit;
+
/* only new entries are saved */
if (save_count > 0) {
if (amdgpu_ras_eeprom_append(control,
@@ -2186,11 +2196,12 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
/*
* Justification of value bad_page_cnt_threshold in ras structure
*
- * Generally, -1 <= amdgpu_bad_page_threshold <= max record length
- * in eeprom, and introduce two scenarios accordingly.
+ * Generally, 0 <= amdgpu_bad_page_threshold <= max record length
+ * in eeprom or amdgpu_bad_page_threshold == -2, introduce two
+ * scenarios accordingly.
*
* Bad page retirement enablement:
- * - If amdgpu_bad_page_threshold = -1,
+ * - If amdgpu_bad_page_threshold = -2,
* bad_page_cnt_threshold = typical value by formula.
*
* - When the value from user is 0 < amdgpu_bad_page_threshold <