| field | value | date |
|---|---|---|
| author | Christian König <ckoenig.leichtzumerken@gmail.com> | 2025-05-02 18:17:16 +0200 |
| committer | Alex Deucher <alexander.deucher@amd.com> | 2025-06-30 11:57:21 -0400 |
| commit | 821aacb2dcf0d1fbc3c0f7803b6089b01addb8bf | |
| tree | 240f594228bec09d4dc90429507eda0d5bee7f3e /drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | |
| parent | 787e2ce10fdce2b95a9ad81cc244ef22b0c200fe | |
drm/amdgpu: rework queue reset scheduler interaction
Stopping the scheduler for queue reset is generally a good idea because
it prevents any worker from touching the ring buffer.
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
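
As an illustration of the reworked flow described above, here is a minimal, compilable user-space sketch of the timeout path this patch moves to. The stub functions (`sched_wqueue_stop()`, `ring_reset()`, `fence_set_error()`, and so on) are hypothetical stand-ins for the real driver and scheduler primitives, and the guilty-job bookkeeping (reset counter, forced fence completion) is omitted; this is a model of the control flow, not the kernel code itself.

```c
#include <stdbool.h>
#include <stdio.h>

/*
 * Hypothetical stand-ins for the real driver/scheduler primitives
 * (drm_sched_wqueue_stop(), amdgpu_ring_reset(), dma_fence_set_error(), ...).
 * They only print what the real calls would do.
 */
static void sched_wqueue_stop(const char *ring)  { printf("%s: scheduler worker stopped\n", ring); }
static void sched_wqueue_start(const char *ring) { printf("%s: scheduler worker restarted\n", ring); }
static int  ring_reset(const char *ring)         { printf("%s: per-ring reset\n", ring); return 0; }
static void fence_set_error(int err)             { printf("finished fence error set to %d\n", err); }
static void full_gpu_reset(void)                 { printf("falling back to full GPU reset\n"); }

/*
 * Models the reworked timeout path: park the scheduler worker first so
 * nothing else touches the ring buffer, try a per-ring reset, and restart
 * the worker only if that reset succeeded.  The fence error is set at most
 * once, tracked by set_error, matching the patch.
 */
static void handle_timeout(const char *ring, bool is_guilty)
{
	bool set_error = false;

	sched_wqueue_stop(ring);

	if (is_guilty) {
		fence_set_error(-62 /* -ETIME on Linux */);
		set_error = true;
	}

	if (ring_reset(ring) == 0) {
		/* Guilty-job bookkeeping (reset counter, forced fence
		 * completion) omitted for brevity. */
		sched_wqueue_start(ring);
		printf("%s: ring reset succeeded\n", ring);
		return;
	}

	/* Ring reset failed: make sure the job carries an error before the
	 * full GPU reset path runs, but don't set it twice. */
	if (!set_error)
		fence_set_error(-62 /* -ETIME on Linux */);
	full_gpu_reset();
}

int main(void)
{
	handle_timeout("gfx_0.0.0", true);
	return 0;
}
```

The points mirrored from the patch: the scheduler worker is stopped before the ring is touched, it is only restarted when the per-ring reset succeeded, and the fence error is set at most once via the `set_error` flag.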
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 35 |

1 file changed, 20 insertions, 15 deletions
```diff
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index a7ff1fa4c778..93413be59e08 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -91,8 +91,8 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 	struct amdgpu_job *job = to_amdgpu_job(s_job);
 	struct amdgpu_task_info *ti;
 	struct amdgpu_device *adev = ring->adev;
-	int idx;
-	int r;
+	bool set_error = false;
+	int idx, r;
 
 	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
 		dev_info(adev->dev, "%s - device unplugged skipping recovery on scheduler:%s",
@@ -136,10 +136,12 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 	} else if (amdgpu_gpu_recovery && ring->funcs->reset) {
 		bool is_guilty;
 
-		dev_err(adev->dev, "Starting %s ring reset\n", s_job->sched->name);
-		/* stop the scheduler, but don't mess with the
-		 * bad job yet because if ring reset fails
-		 * we'll fall back to full GPU reset.
+		dev_err(adev->dev, "Starting %s ring reset\n",
+			s_job->sched->name);
+
+		/*
+		 * Stop the scheduler to prevent anybody else from touching the
+		 * ring buffer.
 		 */
 		drm_sched_wqueue_stop(&ring->sched);
 
@@ -152,26 +154,29 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		else
 			is_guilty = true;
 
-		if (is_guilty)
+		if (is_guilty) {
 			dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
+			set_error = true;
+		}
 
 		r = amdgpu_ring_reset(ring, job->vmid, NULL);
 		if (!r) {
-			if (amdgpu_ring_sched_ready(ring))
-				drm_sched_stop(&ring->sched, s_job);
 			if (is_guilty) {
 				atomic_inc(&ring->adev->gpu_reset_counter);
 				amdgpu_fence_driver_force_completion(ring);
 			}
-			if (amdgpu_ring_sched_ready(ring))
-				drm_sched_start(&ring->sched, 0);
-			dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name);
-			drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
+			drm_sched_wqueue_start(&ring->sched);
+			dev_err(adev->dev, "Ring %s reset succeeded\n",
+				ring->sched.name);
+			drm_dev_wedged_event(adev_to_drm(adev),
+					     DRM_WEDGE_RECOVERY_NONE);
 			goto exit;
 		}
-		dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name);
+		dev_err(adev->dev, "Ring %s reset failed\n", ring->sched.name);
 	}
-	dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
+
+	if (!set_error)
+		dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
 
 	if (amdgpu_device_should_recover_gpu(ring->adev)) {
 		struct amdgpu_reset_context reset_context;
```
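Two details of the change are worth noting. The success path now pairs `drm_sched_wqueue_stop()` with `drm_sched_wqueue_start()` instead of the conditional `drm_sched_stop()`/`drm_sched_start()` calls; roughly speaking, the scheduler's submission worker is simply halted and resumed around the ring reset rather than the pending-job handling done by the heavier pair. And the new `set_error` flag ensures `dma_fence_set_error(..., -ETIME)` is applied to the job's finished fence at most once, whether the per-ring reset succeeds or execution falls through to the full GPU reset path.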