summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-07-16 11:14:54 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-07-16 11:14:54 -0700
commit786cb0a2f9bba267c8a80caf906b94c76d18f7e8 (patch)
treebbdd10fb607885e44798ea72bd9d0ce9bb9de3a3 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent6e442d06621f2af87fc0bf352976694db547c780 (diff)
parent876d98e5511d8cfd12fc617a6717e7a8ea07be17 (diff)
Merge tag 'drm-fixes-2021-07-16' of git://anongit.freedesktop.org/drm/drm
Pull drm fixes from Dave Airlie: "Regular rc2 fixes though a bit more than usual at rc2 stage, people must have been testing early or else some fixes from last week got a bit laggy. There is one larger change in the amd fixes to amalgamate some power management code on the newer chips with the code from the older chips, it should only affects chips where support was introduced in rc1 and it should make future fixes easier to maintain probably a good idea to merge it now. Otherwise it's mostly fixes across the board. dma-buf: - Fix fence leak in sync_file_merge() error code drm/panel: - nt35510: Don't fail on DSI reads fbdev: - Avoid use-after-free by not deleting current video mode ttm: - Avoid NULL-ptr deref in ttm_range_man_fini() vmwgfx: - Fix a merge commit qxl: - fix a TTM regression amdgpu: - SR-IOV fixes - RAS fixes - eDP fixes - SMU13 code unification to facilitate fixes in the future - Add new renoir DID - Yellow Carp fixes - Beige Goby fixes - Revert a bunch of TLB fixes that caused regressions - Revert an LTTPR display regression amdkfd - Fix VRAM access regression - SVM fixes i915: - Fix -EDEADLK handling regression - Drop the page table optimisation" * tag 'drm-fixes-2021-07-16' of git://anongit.freedesktop.org/drm/drm: (29 commits) drm/amdgpu: add another Renoir DID drm/ttm: add a check against null pointer dereference drm/i915/gtt: drop the page table optimisation drm/i915/gt: Fix -EDEADLK handling regression drm/amd/pm: Add waiting for response of mode-reset message for yellow carp Revert "drm/amdkfd: Add heavy-weight TLB flush after unmapping" Revert "drm/amdgpu: Add table_freed parameter to amdgpu_vm_bo_update" Revert "drm/amdkfd: Make TLB flush conditional on mapping" Revert "drm/amdgpu: Fix warning of Function parameter or member not described" Revert "drm/amdkfd: Add memory sync before TLB flush on unmap" drm/amd/pm: Fix BACO state setting for Beige_Goby drm/amdgpu: Restore msix after FLR drm/amdkfd: Allow CPU access for all VRAM BOs drm/amdgpu/display - only update eDP's backlight level when necessary drm/amdkfd: handle fault counters on invalid address drm/amdgpu: Correct the irq numbers for virtual crtc drm/amd/display: update header file name drm/amd/pm: drop smu_v13_0_1.c|h files for yellow carp drm/amd/display: remove faulty assert Revert "drm/amd/display: Always write repeater mode regardless of LTTPR" ...
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c49
1 files changed, 35 insertions, 14 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c13b02caf8c3..fc66aca28594 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -809,7 +809,7 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
/* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
- struct ras_query_if *info)
+ struct ras_query_if *info)
{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
struct ras_err_data err_data = {0, 0, 0, NULL};
@@ -1043,17 +1043,32 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
return ret;
}
-/* get the total error counts on all IPs */
-void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
- unsigned long *ce_count,
- unsigned long *ue_count)
+/**
+ * amdgpu_ras_query_error_count -- Get error counts of all IPs
+ * adev: pointer to AMD GPU device
+ * ce_count: pointer to an integer to be set to the count of correctible errors.
+ * ue_count: pointer to an integer to be set to the count of uncorrectible
+ * errors.
+ *
+ * If set, @ce_count or @ue_count, count and return the corresponding
+ * error counts in those integer pointers. Return 0 if the device
+ * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS.
+ */
+int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+ unsigned long *ce_count,
+ unsigned long *ue_count)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
unsigned long ce, ue;
if (!adev->ras_enabled || !con)
- return;
+ return -EOPNOTSUPP;
+
+ /* Don't count since no reporting.
+ */
+ if (!ce_count && !ue_count)
+ return 0;
ce = 0;
ue = 0;
@@ -1061,9 +1076,11 @@ void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
struct ras_query_if info = {
.head = obj->head,
};
+ int res;
- if (amdgpu_ras_query_error_status(adev, &info))
- return;
+ res = amdgpu_ras_query_error_status(adev, &info);
+ if (res)
+ return res;
ce += info.ce_count;
ue += info.ue_count;
@@ -1074,6 +1091,8 @@ void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
if (ue_count)
*ue_count = ue;
+
+ return 0;
}
/* query/inject/cure end */
@@ -2137,9 +2156,10 @@ static void amdgpu_ras_counte_dw(struct work_struct *work)
/* Cache new values.
*/
- amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);
- atomic_set(&con->ras_ce_count, ce_count);
- atomic_set(&con->ras_ue_count, ue_count);
+ if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
+ atomic_set(&con->ras_ce_count, ce_count);
+ atomic_set(&con->ras_ue_count, ue_count);
+ }
pm_runtime_mark_last_busy(dev->dev);
Out:
@@ -2312,9 +2332,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
/* Those are the cached values at init.
*/
- amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);
- atomic_set(&con->ras_ce_count, ce_count);
- atomic_set(&con->ras_ue_count, ue_count);
+ if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
+ atomic_set(&con->ras_ce_count, ce_count);
+ atomic_set(&con->ras_ue_count, ue_count);
+ }
return 0;
cleanup: