summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
authorJani Nikula <jani.nikula@intel.com>2025-11-11 12:32:07 +0200
committerJani Nikula <jani.nikula@intel.com>2025-11-11 12:32:07 +0200
commit1c1960f57151e36f550d1a53352f832d5aaa5f8f (patch)
tree102cdd74d069bfcc1c83f99db00ae59f82515c5c /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parente109f644b871df8440c886a69cdce971ed533088 (diff)
parente237dfe70867f02de223e36340fe5f8b0fe0eada (diff)
Merge drm/drm-next into drm-intel-next
Primarily sync with the drm_print.h changes from drm-misc. Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c267
1 files changed, 156 insertions, 111 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7a899fb4de29..654f4844b7ad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1882,6 +1882,13 @@ static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device
static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
+ /* Enabling ASPM causes randoms hangs on Tahiti and Oland on Zen4.
+ * It's unclear if this is a platform-specific or GPU-specific issue.
+ * Disable ASPM on SI for the time being.
+ */
+ if (adev->family == AMDGPU_FAMILY_SI)
+ return true;
+
#if IS_ENABLED(CONFIG_X86)
struct cpuinfo_x86 *c = &cpu_data(0);
@@ -2380,7 +2387,7 @@ int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
}
/**
- * amdgpu_device_ip_is_valid - is the hardware IP enabled
+ * amdgpu_device_ip_is_hw - is the hardware IP enabled
*
* @adev: amdgpu_device pointer
* @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
@@ -2388,6 +2395,27 @@ int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
* Check if the hardware IP is enable or not.
* Returns true if it the IP is enable, false if not.
*/
+bool amdgpu_device_ip_is_hw(struct amdgpu_device *adev,
+ enum amd_ip_block_type block_type)
+{
+ int i;
+
+ for (i = 0; i < adev->num_ip_blocks; i++) {
+ if (adev->ip_blocks[i].version->type == block_type)
+ return adev->ip_blocks[i].status.hw;
+ }
+ return false;
+}
+
+/**
+ * amdgpu_device_ip_is_valid - is the hardware IP valid
+ *
+ * @adev: amdgpu_device pointer
+ * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
+ *
+ * Check if the hardware IP is valid or not.
+ * Returns true if it the IP is valid, false if not.
+ */
bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
enum amd_ip_block_type block_type)
{
@@ -2626,7 +2654,7 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
chip_name = "arcturus";
break;
case CHIP_NAVI12:
- if (adev->mman.discovery_bin)
+ if (adev->discovery.bin)
return 0;
chip_name = "navi12";
break;
@@ -2754,6 +2782,10 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
r = amdgpu_virt_request_full_gpu(adev, true);
if (r)
return r;
+
+ r = amdgpu_virt_init_critical_region(adev);
+ if (r)
+ return r;
}
switch (adev->asic_type) {
@@ -3773,7 +3805,6 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
continue;
- /* XXX handle errors */
r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
if (r)
return r;
@@ -3856,9 +3887,9 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
continue;
- /* XXX handle errors */
r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
- adev->ip_blocks[i].status.hw = false;
+ if (r)
+ return r;
/* handle putting the SMC in the appropriate state */
if (!amdgpu_sriov_vf(adev)) {
@@ -3888,7 +3919,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
* in each IP into a state suitable for suspend.
* Returns 0 on success, negative error code on failure.
*/
-int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
+static int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
{
int r;
@@ -4184,7 +4215,6 @@ bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev,
#else
return false;
#endif
- case CHIP_BONAIRE:
case CHIP_KAVERI:
case CHIP_KABINI:
case CHIP_MULLINS:
@@ -4278,58 +4308,53 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
long timeout;
int ret = 0;
- /*
- * By default timeout for jobs is 10 sec
- */
- adev->compute_timeout = adev->gfx_timeout = msecs_to_jiffies(10000);
- adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
+ /* By default timeout for all queues is 2 sec */
+ adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
+ adev->video_timeout = msecs_to_jiffies(2000);
- if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
- while ((timeout_setting = strsep(&input, ",")) &&
- strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
- ret = kstrtol(timeout_setting, 0, &timeout);
- if (ret)
- return ret;
+ if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
+ return 0;
- if (timeout == 0) {
- index++;
- continue;
- } else if (timeout < 0) {
- timeout = MAX_SCHEDULE_TIMEOUT;
- dev_warn(adev->dev, "lockup timeout disabled");
- add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
- } else {
- timeout = msecs_to_jiffies(timeout);
- }
+ while ((timeout_setting = strsep(&input, ",")) &&
+ strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
+ ret = kstrtol(timeout_setting, 0, &timeout);
+ if (ret)
+ return ret;
- switch (index++) {
- case 0:
- adev->gfx_timeout = timeout;
- break;
- case 1:
- adev->compute_timeout = timeout;
- break;
- case 2:
- adev->sdma_timeout = timeout;
- break;
- case 3:
- adev->video_timeout = timeout;
- break;
- default:
- break;
- }
+ if (timeout == 0) {
+ index++;
+ continue;
+ } else if (timeout < 0) {
+ timeout = MAX_SCHEDULE_TIMEOUT;
+ dev_warn(adev->dev, "lockup timeout disabled");
+ add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
+ } else {
+ timeout = msecs_to_jiffies(timeout);
}
- /*
- * There is only one value specified and
- * it should apply to all non-compute jobs.
- */
- if (index == 1) {
- adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
- if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
- adev->compute_timeout = adev->gfx_timeout;
+
+ switch (index++) {
+ case 0:
+ adev->gfx_timeout = timeout;
+ break;
+ case 1:
+ adev->compute_timeout = timeout;
+ break;
+ case 2:
+ adev->sdma_timeout = timeout;
+ break;
+ case 3:
+ adev->video_timeout = timeout;
+ break;
+ default:
+ break;
}
}
+ /* When only one value specified apply it to all queues. */
+ if (index == 1)
+ adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
+ adev->video_timeout = timeout;
+
return ret;
}
@@ -4384,6 +4409,55 @@ static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
dev_info(adev->dev, "MCBP is enabled\n");
}
+static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev)
+{
+ int r;
+
+ r = amdgpu_atombios_sysfs_init(adev);
+ if (r)
+ drm_err(&adev->ddev,
+ "registering atombios sysfs failed (%d).\n", r);
+
+ r = amdgpu_pm_sysfs_init(adev);
+ if (r)
+ dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r);
+
+ r = amdgpu_ucode_sysfs_init(adev);
+ if (r) {
+ adev->ucode_sysfs_en = false;
+ dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r);
+ } else
+ adev->ucode_sysfs_en = true;
+
+ r = amdgpu_device_attr_sysfs_init(adev);
+ if (r)
+ dev_err(adev->dev, "Could not create amdgpu device attr\n");
+
+ r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
+ if (r)
+ dev_err(adev->dev,
+ "Could not create amdgpu board attributes\n");
+
+ amdgpu_fru_sysfs_init(adev);
+ amdgpu_reg_state_sysfs_init(adev);
+ amdgpu_xcp_sysfs_init(adev);
+
+ return r;
+}
+
+static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev)
+{
+ if (adev->pm.sysfs_initialized)
+ amdgpu_pm_sysfs_fini(adev);
+ if (adev->ucode_sysfs_en)
+ amdgpu_ucode_sysfs_fini(adev);
+ amdgpu_device_attr_sysfs_fini(adev);
+ amdgpu_fru_sysfs_fini(adev);
+
+ amdgpu_reg_state_sysfs_fini(adev);
+ amdgpu_xcp_sysfs_fini(adev);
+}
+
/**
* amdgpu_device_init - initialize the driver
*
@@ -4483,7 +4557,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->gfx.userq_sch_mutex);
mutex_init(&adev->gfx.workload_profile_mutex);
mutex_init(&adev->vcn.workload_profile_mutex);
- mutex_init(&adev->userq_mutex);
amdgpu_device_init_apu_flags(adev);
@@ -4511,7 +4584,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
INIT_LIST_HEAD(&adev->pm.od_kobj_list);
- INIT_LIST_HEAD(&adev->userq_mgr_list);
+ xa_init(&adev->userq_doorbell_xa);
INIT_DELAYED_WORK(&adev->delayed_init_work,
amdgpu_device_delayed_init_work_handler);
@@ -4807,39 +4880,14 @@ fence_driver_init:
flush_delayed_work(&adev->delayed_init_work);
}
+ if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
+ amdgpu_xgmi_reset_on_init(adev);
/*
* Place those sysfs registering after `late_init`. As some of those
* operations performed in `late_init` might affect the sysfs
* interfaces creating.
*/
- r = amdgpu_atombios_sysfs_init(adev);
- if (r)
- drm_err(&adev->ddev,
- "registering atombios sysfs failed (%d).\n", r);
-
- r = amdgpu_pm_sysfs_init(adev);
- if (r)
- dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r);
-
- r = amdgpu_ucode_sysfs_init(adev);
- if (r) {
- adev->ucode_sysfs_en = false;
- dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r);
- } else
- adev->ucode_sysfs_en = true;
-
- r = amdgpu_device_attr_sysfs_init(adev);
- if (r)
- dev_err(adev->dev, "Could not create amdgpu device attr\n");
-
- r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
- if (r)
- dev_err(adev->dev,
- "Could not create amdgpu board attributes\n");
-
- amdgpu_fru_sysfs_init(adev);
- amdgpu_reg_state_sysfs_init(adev);
- amdgpu_xcp_sysfs_init(adev);
+ r = amdgpu_device_sys_interface_init(adev);
if (IS_ENABLED(CONFIG_PERF_EVENTS))
r = amdgpu_pmu_init(adev);
@@ -4867,9 +4915,6 @@ fence_driver_init:
if (px)
vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
- if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
- amdgpu_xgmi_reset_on_init(adev);
-
amdgpu_device_check_iommu_direct_map(adev);
adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
@@ -4961,15 +5006,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
}
amdgpu_fence_driver_hw_fini(adev);
- if (adev->pm.sysfs_initialized)
- amdgpu_pm_sysfs_fini(adev);
- if (adev->ucode_sysfs_en)
- amdgpu_ucode_sysfs_fini(adev);
- amdgpu_device_attr_sysfs_fini(adev);
- amdgpu_fru_sysfs_fini(adev);
-
- amdgpu_reg_state_sysfs_fini(adev);
- amdgpu_xcp_sysfs_fini(adev);
+ amdgpu_device_sys_interface_fini(adev);
/* disable ras feature must before hw fini */
amdgpu_ras_pre_fini(adev);
@@ -5044,7 +5081,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
if (IS_ENABLED(CONFIG_PERF_EVENTS))
amdgpu_pmu_fini(adev);
- if (adev->mman.discovery_bin)
+ if (adev->discovery.bin)
amdgpu_discovery_fini(adev);
amdgpu_reset_put_reset_domain(adev->reset_domain);
@@ -5212,16 +5249,20 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
dev_warn(adev->dev, "smart shift update failed\n");
if (notify_clients)
- drm_client_dev_suspend(adev_to_drm(adev), false);
+ drm_client_dev_suspend(adev_to_drm(adev));
cancel_delayed_work_sync(&adev->delayed_init_work);
amdgpu_ras_suspend(adev);
- amdgpu_device_ip_suspend_phase1(adev);
+ r = amdgpu_device_ip_suspend_phase1(adev);
+ if (r)
+ return r;
amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
- amdgpu_userq_suspend(adev);
+ r = amdgpu_userq_suspend(adev);
+ if (r)
+ return r;
r = amdgpu_device_evict_resources(adev);
if (r)
@@ -5231,7 +5272,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
amdgpu_fence_driver_hw_fini(adev);
- amdgpu_device_ip_suspend_phase2(adev);
+ r = amdgpu_device_ip_suspend_phase2(adev);
+ if (r)
+ return r;
if (amdgpu_sriov_vf(adev))
amdgpu_virt_release_full_gpu(adev, false);
@@ -5346,7 +5389,7 @@ exit:
flush_delayed_work(&adev->delayed_init_work);
if (notify_clients)
- drm_client_dev_resume(adev_to_drm(adev), false);
+ drm_client_dev_resume(adev_to_drm(adev));
amdgpu_ras_resume(adev);
@@ -5802,11 +5845,6 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
if (!amdgpu_ring_sched_ready(ring))
continue;
- /* Clear job fence from fence drv to avoid force_completion
- * leave NULL and vm flush fence in fence drv
- */
- amdgpu_fence_driver_clear_job_fences(ring);
-
/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
amdgpu_fence_driver_force_completion(ring);
}
@@ -5951,7 +5989,7 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
if (r)
goto out;
- drm_client_dev_resume(adev_to_drm(tmp_adev), false);
+ drm_client_dev_resume(adev_to_drm(tmp_adev));
/*
* The GPU enters bad state once faulty pages
@@ -6286,7 +6324,7 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
*/
amdgpu_unregister_gpu_instance(tmp_adev);
- drm_client_dev_suspend(adev_to_drm(tmp_adev), false);
+ drm_client_dev_suspend(adev_to_drm(tmp_adev));
/* disable ras on ALL IPs */
if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) &&
@@ -6535,7 +6573,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
*
* job->base holds a reference to parent fence
*/
- if (job && dma_fence_is_signaled(&job->hw_fence.base)) {
+ if (job && dma_fence_is_signaled(&job->hw_fence->base)) {
job_signaled = true;
dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
goto skip_hw_reset;
@@ -7279,10 +7317,17 @@ void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
if (adev->gmc.xgmi.connected_to_cpu)
return;
- if (ring && ring->funcs->emit_hdp_flush)
+ if (ring && ring->funcs->emit_hdp_flush) {
amdgpu_ring_emit_hdp_flush(ring);
- else
- amdgpu_asic_flush_hdp(adev, ring);
+ return;
+ }
+
+ if (!ring && amdgpu_sriov_runtime(adev)) {
+ if (!amdgpu_kiq_hdp_flush(adev))
+ return;
+ }
+
+ amdgpu_asic_flush_hdp(adev, ring);
}
void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,