author     Philip Yang <Philip.Yang@amd.com>          2024-08-02 11:28:45 -0400
committer  Alex Deucher <alexander.deucher@amd.com>   2024-08-13 10:43:16 -0400
commit     a1fc9f584c4aaf8bc1ebfa459fc57a3f26a290d8 (patch)
tree       92f73c1a86f4892b86270f03f518285bae2d2360 /drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
parent     70f83e7706e57200edb8ffa36883b2f43d214142 (diff)
drm/amdkfd: Handle queue destroy buffer access race
Add a helper function, kfd_queue_unreference_buffers, that only drops the queue buffer refcounts, separating that step from releasing the queue buffers.

Holding dqm_lock while taking the vm lock would be circular locking, so kfd_ioctl_destroy_queue should take the vm lock and unreference the queue buffers first, without releasing them; this also keeps the error path simple if the vm lock cannot be taken. It should then hold dqm_lock to remove the queue from the queue list, and only afterwards release the queue buffers.

The restore-process worker restores queues while holding dqm_lock, so it will always find queues whose buffers are still valid.

v2 (Felix):
- renamed kfd_queue_unreference_buffer(s) to kfd_queue_unref_bo_va(s)
- added two FIXME comments for follow-up

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Signed-off-by: Felix Kuehling <felix.kuehling@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
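The locking order is easier to see as code. Below is a minimal sketch of the destroy sequence the message describes, not the patch itself: the helpers kfd_queue_unref_bo_vas, kfd_queue_release_buffers, pqm_clean_queue_resource and uninit_queue appear in the diff below, while the function shape, the explicit dqm_lock()/dqm_unlock() calls and the error handling are simplified assumptions.

/*
 * Minimal sketch of the two-phase destroy ordering (illustrative only).
 * Helper names come from the patch; the dqm_lock()/dqm_unlock() placement,
 * the function shape and the error handling are simplified assumptions.
 */
static int destroy_queue_sketch(struct process_queue_manager *pqm,
				struct process_queue_node *pqn,
				struct kfd_process_device *pdd)
{
	int retval;

	/*
	 * Phase 1: take the vm lock and unreference the queue buffers'
	 * BO virtual addresses. This must happen before dqm_lock is held,
	 * because taking the vm lock under dqm_lock would invert the lock
	 * order. On failure the queue is still fully intact.
	 */
	retval = kfd_queue_unref_bo_vas(pdd, &pqn->q->properties);
	if (retval)
		return retval;

	/*
	 * Phase 2: under dqm_lock, remove the queue from the queue list.
	 * The restore worker also runs under dqm_lock, so any queue it
	 * can still find has valid, unreleased buffers.
	 */
	dqm_lock(pqn->q->device->dqm);
	/* ... unlink pqn->q from the device's queue list ... */
	dqm_unlock(pqn->q->device->dqm);

	/* Phase 3: the queue is unreachable; now release its buffers. */
	kfd_queue_release_buffers(pdd, &pqn->q->properties);
	pqm_clean_queue_resource(pqm, pqn);
	uninit_queue(pqn->q);

	return 0;
}

The pqm_destroy_queue hunks below show exactly this split: the single kfd_queue_release_buffers call becomes an early kfd_queue_unref_bo_vas, with kfd_queue_release_buffers moved after the queue has been destroyed.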
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c')
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index f732ee35b531..20ea745729ee 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -217,6 +217,7 @@ void pqm_uninit(struct process_queue_manager *pqm)
 	list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
 		if (pqn->q) {
 			pdd = kfd_get_process_device_data(pqn->q->device, pqm->process);
+			kfd_queue_unref_bo_vas(pdd, &pqn->q->properties);
 			kfd_queue_release_buffers(pdd, &pqn->q->properties);
 			pqm_clean_queue_resource(pqm, pqn);
 		}
@@ -512,7 +513,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
 	}
 
 	if (pqn->q) {
-		retval = kfd_queue_release_buffers(pdd, &pqn->q->properties);
+		retval = kfd_queue_unref_bo_vas(pdd, &pqn->q->properties);
 		if (retval)
 			goto err_destroy_queue;
@@ -526,7 +527,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
 			if (retval != -ETIME)
 				goto err_destroy_queue;
 		}
-
+		kfd_queue_release_buffers(pdd, &pqn->q->properties);
 		pqm_clean_queue_resource(pqm, pqn);
 		uninit_queue(pqn->q);
 	}
@@ -579,7 +580,8 @@ int pqm_update_queue_properties(struct process_queue_manager *pqm,
 		return -EFAULT;
 	}
 
-	kfd_queue_buffer_put(vm, &pqn->q->properties.ring_bo);
+	kfd_queue_unref_bo_va(vm, &pqn->q->properties.ring_bo);
+	kfd_queue_buffer_put(&pqn->q->properties.ring_bo);
 	amdgpu_bo_unreserve(vm->root.bo);
 	pqn->q->properties.ring_bo = p->ring_bo;
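Note that in this last hunk kfd_queue_buffer_put no longer takes the vm: the VA unreference moves into the new kfd_queue_unref_bo_va helper, leaving kfd_queue_buffer_put to drop only the BO reference. A hedged sketch of the resulting ring-buffer swap follows; the amdgpu_bo_reserve() placement and the origin of p->ring_bo are assumptions, while the two helper calls and the unreserve come from the hunk above.

	/*
	 * Illustrative only: the ring BO swap in pqm_update_queue_properties
	 * after this patch. The amdgpu_bo_reserve() placement and p->ring_bo
	 * origin are assumptions, not taken from this diff.
	 */
	r = amdgpu_bo_reserve(vm->root.bo, false);		/* takes the vm lock */
	if (r)
		return r;

	kfd_queue_unref_bo_va(vm, &pqn->q->properties.ring_bo);	/* unmap the VA */
	kfd_queue_buffer_put(&pqn->q->properties.ring_bo);	/* drop the BO ref */
	amdgpu_bo_unreserve(vm->root.bo);			/* vm lock released */

	pqn->q->properties.ring_bo = p->ring_bo;		/* adopt the new BO */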