summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYiPeng Chai <YiPeng.Chai@amd.com>2025-07-21 15:14:03 +0800
committerAlex Deucher <alexander.deucher@amd.com>2025-10-20 18:18:35 -0400
commit408bd841ad24fd9891a1cd9ca11cf4f48d9667dc (patch)
tree9fb77c5f5f04b1f54fbdb3143c121a78f54c4b66
parent3d72d2e5f4c47d88f292c915ce31ef14092df3f2 (diff)
drm/amdgpu: Improve ras fatal error handling function
In multi-gpu case, a fatal error will generate several fatal error interrupts. After improving this function, the ras module can reuse this function to only handle the first interrupt. V3: Initialize event_id using RAS_EVENT_INVALID_ID. Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c16
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h2
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c5
3 files changed, 14 insertions, 9 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 9aa4b93ac6af..9e632adc7746 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4650,20 +4650,18 @@ u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type
return id;
}
-void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
+int amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
enum ras_event_type type = RAS_EVENT_TYPE_FATAL;
- u64 event_id;
+ u64 event_id = RAS_EVENT_INVALID_ID;
- if (amdgpu_ras_mark_ras_event(adev, type)) {
- dev_err(adev->dev,
- "uncorrectable hardware error (ERREVENT_ATHUB_INTERRUPT) detected!\n");
- return;
- }
+ if (amdgpu_uniras_enabled(adev))
+ return 0;
- event_id = amdgpu_ras_acquire_event_id(adev, type);
+ if (!amdgpu_ras_mark_ras_event(adev, type))
+ event_id = amdgpu_ras_acquire_event_id(adev, type);
RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
"(ERREVENT_ATHUB_INTERRUPT) detected!\n");
@@ -4672,6 +4670,8 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
amdgpu_ras_reset_gpu(adev);
}
+
+ return -EBUSY;
}
bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 9f21b6cf8724..556cf4d7b5ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -910,7 +910,7 @@ static inline void amdgpu_ras_intr_cleared(void)
atomic_set(&amdgpu_ras_in_intr, 0);
}
-void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
+int amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready);
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
index 40071b876333..f21cd55a25be 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
@@ -29,8 +29,13 @@
static int amdgpu_ras_sys_detect_fatal_event(struct ras_core_context *ras_core, void *data)
{
struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ int ret;
uint64_t seq_no;
+ ret = amdgpu_ras_global_ras_isr(adev);
+ if (ret)
+ return ret;
+
seq_no = amdgpu_ras_mgr_gen_ras_event_seqno(adev, RAS_SEQNO_TYPE_UE);
RAS_DEV_INFO(adev,
"{%llu} Uncorrectable hardware error(ERREVENT_ATHUB_INTERRUPT) detected!\n",