diff options
| author | YiPeng Chai <YiPeng.Chai@amd.com> | 2025-10-28 16:18:31 +0800 |
|---|---|---|
| committer | Alex Deucher <alexander.deucher@amd.com> | 2025-11-04 11:53:59 -0500 |
| commit | d95ca7f515cfd2e721de07e86aa79adb17575a52 (patch) | |
| tree | 726550907388bb73193175e1da23f00e7e8a80ea /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
| parent | d4432f16d3393180e8f0b344b21839e553f7938b (diff) | |
drm/amdgpu: suspend ras module before gpu reset
During gpu reset, all GPU-related resources are
inaccessible. To avoid affecting ras functionality,
suspend ras module before gpu reset and resume
it after gpu reset is complete.
V2:
Rename functions to avoid misunderstanding.
V3:
Move flush_delayed_work to amdgpu_ras_process_pause,
Move schedule_delayed_work to amdgpu_ras_process_unpause.
V4:
Rename functions.
V5:
Move the function to amdgpu_ras.c.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 30 |
1 files changed, 28 insertions, 2 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 37999b367957..62d2f988d88f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2921,8 +2921,12 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) type = amdgpu_ras_get_fatal_error_event(adev); list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) { - amdgpu_ras_query_err_status(remote_adev); - amdgpu_ras_log_on_err_counter(remote_adev, type); + if (amdgpu_uniras_enabled(remote_adev)) { + amdgpu_ras_mgr_update_ras_ecc(remote_adev); + } else { + amdgpu_ras_query_err_status(remote_adev); + amdgpu_ras_log_on_err_counter(remote_adev, type); + } } } @@ -5673,3 +5677,25 @@ bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr return ret; } + +void amdgpu_ras_pre_reset(struct amdgpu_device *adev, + struct list_head *device_list) +{ + struct amdgpu_device *tmp_adev = NULL; + + list_for_each_entry(tmp_adev, device_list, reset_list) { + if (amdgpu_uniras_enabled(tmp_adev)) + amdgpu_ras_mgr_pre_reset(tmp_adev); + } +} + +void amdgpu_ras_post_reset(struct amdgpu_device *adev, + struct list_head *device_list) +{ + struct amdgpu_device *tmp_adev = NULL; + + list_for_each_entry(tmp_adev, device_list, reset_list) { + if (amdgpu_uniras_enabled(tmp_adev)) + amdgpu_ras_mgr_post_reset(tmp_adev); + } +} |