summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c344
1 files changed, 201 insertions, 143 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index bdfb80377e6a..54ca8e2683fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1882,6 +1882,13 @@ static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device
static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
+ /* Enabling ASPM causes randoms hangs on Tahiti and Oland on Zen4.
+ * It's unclear if this is a platform-specific or GPU-specific issue.
+ * Disable ASPM on SI for the time being.
+ */
+ if (adev->family == AMDGPU_FAMILY_SI)
+ return true;
+
#if IS_ENABLED(CONFIG_X86)
struct cpuinfo_x86 *c = &cpu_data(0);
@@ -2380,7 +2387,7 @@ int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
}
/**
- * amdgpu_device_ip_is_valid - is the hardware IP enabled
+ * amdgpu_device_ip_is_hw - is the hardware IP enabled
*
* @adev: amdgpu_device pointer
* @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
@@ -2388,6 +2395,27 @@ int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
* Check if the hardware IP is enable or not.
* Returns true if it the IP is enable, false if not.
*/
+bool amdgpu_device_ip_is_hw(struct amdgpu_device *adev,
+ enum amd_ip_block_type block_type)
+{
+ int i;
+
+ for (i = 0; i < adev->num_ip_blocks; i++) {
+ if (adev->ip_blocks[i].version->type == block_type)
+ return adev->ip_blocks[i].status.hw;
+ }
+ return false;
+}
+
+/**
+ * amdgpu_device_ip_is_valid - is the hardware IP valid
+ *
+ * @adev: amdgpu_device pointer
+ * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
+ *
+ * Check if the hardware IP is valid or not.
+ * Returns true if it the IP is valid, false if not.
+ */
bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
enum amd_ip_block_type block_type)
{
@@ -2626,7 +2654,7 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
chip_name = "arcturus";
break;
case CHIP_NAVI12:
- if (adev->mman.discovery_bin)
+ if (adev->discovery.bin)
return 0;
chip_name = "navi12";
break;
@@ -2754,6 +2782,10 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
r = amdgpu_virt_request_full_gpu(adev, true);
if (r)
return r;
+
+ r = amdgpu_virt_init_critical_region(adev);
+ if (r)
+ return r;
}
switch (adev->asic_type) {
@@ -3773,7 +3805,6 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
continue;
- /* XXX handle errors */
r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
if (r)
return r;
@@ -3856,9 +3887,9 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
continue;
- /* XXX handle errors */
r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
- adev->ip_blocks[i].status.hw = false;
+ if (r)
+ return r;
/* handle putting the SMC in the appropriate state */
if (!amdgpu_sriov_vf(adev)) {
@@ -3888,7 +3919,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
* in each IP into a state suitable for suspend.
* Returns 0 on success, negative error code on failure.
*/
-int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
+static int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
{
int r;
@@ -4278,58 +4309,53 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
long timeout;
int ret = 0;
- /*
- * By default timeout for jobs is 10 sec
- */
- adev->compute_timeout = adev->gfx_timeout = msecs_to_jiffies(10000);
- adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
+ /* By default timeout for all queues is 2 sec */
+ adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
+ adev->video_timeout = msecs_to_jiffies(2000);
- if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
- while ((timeout_setting = strsep(&input, ",")) &&
- strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
- ret = kstrtol(timeout_setting, 0, &timeout);
- if (ret)
- return ret;
+ if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
+ return 0;
- if (timeout == 0) {
- index++;
- continue;
- } else if (timeout < 0) {
- timeout = MAX_SCHEDULE_TIMEOUT;
- dev_warn(adev->dev, "lockup timeout disabled");
- add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
- } else {
- timeout = msecs_to_jiffies(timeout);
- }
+ while ((timeout_setting = strsep(&input, ",")) &&
+ strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
+ ret = kstrtol(timeout_setting, 0, &timeout);
+ if (ret)
+ return ret;
- switch (index++) {
- case 0:
- adev->gfx_timeout = timeout;
- break;
- case 1:
- adev->compute_timeout = timeout;
- break;
- case 2:
- adev->sdma_timeout = timeout;
- break;
- case 3:
- adev->video_timeout = timeout;
- break;
- default:
- break;
- }
+ if (timeout == 0) {
+ index++;
+ continue;
+ } else if (timeout < 0) {
+ timeout = MAX_SCHEDULE_TIMEOUT;
+ dev_warn(adev->dev, "lockup timeout disabled");
+ add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
+ } else {
+ timeout = msecs_to_jiffies(timeout);
}
- /*
- * There is only one value specified and
- * it should apply to all non-compute jobs.
- */
- if (index == 1) {
- adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
- if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
- adev->compute_timeout = adev->gfx_timeout;
+
+ switch (index++) {
+ case 0:
+ adev->gfx_timeout = timeout;
+ break;
+ case 1:
+ adev->compute_timeout = timeout;
+ break;
+ case 2:
+ adev->sdma_timeout = timeout;
+ break;
+ case 3:
+ adev->video_timeout = timeout;
+ break;
+ default:
+ break;
}
}
+ /* When only one value specified apply it to all queues. */
+ if (index == 1)
+ adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
+ adev->video_timeout = timeout;
+
return ret;
}
@@ -4384,6 +4410,55 @@ static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
dev_info(adev->dev, "MCBP is enabled\n");
}
+static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev)
+{
+ int r;
+
+ r = amdgpu_atombios_sysfs_init(adev);
+ if (r)
+ drm_err(&adev->ddev,
+ "registering atombios sysfs failed (%d).\n", r);
+
+ r = amdgpu_pm_sysfs_init(adev);
+ if (r)
+ dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r);
+
+ r = amdgpu_ucode_sysfs_init(adev);
+ if (r) {
+ adev->ucode_sysfs_en = false;
+ dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r);
+ } else
+ adev->ucode_sysfs_en = true;
+
+ r = amdgpu_device_attr_sysfs_init(adev);
+ if (r)
+ dev_err(adev->dev, "Could not create amdgpu device attr\n");
+
+ r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
+ if (r)
+ dev_err(adev->dev,
+ "Could not create amdgpu board attributes\n");
+
+ amdgpu_fru_sysfs_init(adev);
+ amdgpu_reg_state_sysfs_init(adev);
+ amdgpu_xcp_sysfs_init(adev);
+
+ return r;
+}
+
+static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev)
+{
+ if (adev->pm.sysfs_initialized)
+ amdgpu_pm_sysfs_fini(adev);
+ if (adev->ucode_sysfs_en)
+ amdgpu_ucode_sysfs_fini(adev);
+ amdgpu_device_attr_sysfs_fini(adev);
+ amdgpu_fru_sysfs_fini(adev);
+
+ amdgpu_reg_state_sysfs_fini(adev);
+ amdgpu_xcp_sysfs_fini(adev);
+}
+
/**
* amdgpu_device_init - initialize the driver
*
@@ -4807,39 +4882,14 @@ fence_driver_init:
flush_delayed_work(&adev->delayed_init_work);
}
+ if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
+ amdgpu_xgmi_reset_on_init(adev);
/*
* Place those sysfs registering after `late_init`. As some of those
* operations performed in `late_init` might affect the sysfs
* interfaces creating.
*/
- r = amdgpu_atombios_sysfs_init(adev);
- if (r)
- drm_err(&adev->ddev,
- "registering atombios sysfs failed (%d).\n", r);
-
- r = amdgpu_pm_sysfs_init(adev);
- if (r)
- dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r);
-
- r = amdgpu_ucode_sysfs_init(adev);
- if (r) {
- adev->ucode_sysfs_en = false;
- dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r);
- } else
- adev->ucode_sysfs_en = true;
-
- r = amdgpu_device_attr_sysfs_init(adev);
- if (r)
- dev_err(adev->dev, "Could not create amdgpu device attr\n");
-
- r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
- if (r)
- dev_err(adev->dev,
- "Could not create amdgpu board attributes\n");
-
- amdgpu_fru_sysfs_init(adev);
- amdgpu_reg_state_sysfs_init(adev);
- amdgpu_xcp_sysfs_init(adev);
+ r = amdgpu_device_sys_interface_init(adev);
if (IS_ENABLED(CONFIG_PERF_EVENTS))
r = amdgpu_pmu_init(adev);
@@ -4867,9 +4917,6 @@ fence_driver_init:
if (px)
vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
- if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
- amdgpu_xgmi_reset_on_init(adev);
-
amdgpu_device_check_iommu_direct_map(adev);
adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
@@ -4961,15 +5008,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
}
amdgpu_fence_driver_hw_fini(adev);
- if (adev->pm.sysfs_initialized)
- amdgpu_pm_sysfs_fini(adev);
- if (adev->ucode_sysfs_en)
- amdgpu_ucode_sysfs_fini(adev);
- amdgpu_device_attr_sysfs_fini(adev);
- amdgpu_fru_sysfs_fini(adev);
-
- amdgpu_reg_state_sysfs_fini(adev);
- amdgpu_xcp_sysfs_fini(adev);
+ amdgpu_device_sys_interface_fini(adev);
/* disable ras feature must before hw fini */
amdgpu_ras_pre_fini(adev);
@@ -5044,7 +5083,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
if (IS_ENABLED(CONFIG_PERF_EVENTS))
amdgpu_pmu_fini(adev);
- if (adev->mman.discovery_bin)
+ if (adev->discovery.bin)
amdgpu_discovery_fini(adev);
amdgpu_reset_put_reset_domain(adev->reset_domain);
@@ -5072,6 +5111,10 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
if (!adev->in_s4 && (adev->flags & AMD_IS_APU))
return 0;
+ /* No need to evict when going to S5 through S4 callbacks */
+ if (system_state == SYSTEM_POWER_OFF)
+ return 0;
+
ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
if (ret) {
dev_warn(adev->dev, "evicting device resources failed\n");
@@ -5196,7 +5239,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
adev->in_suspend = true;
if (amdgpu_sriov_vf(adev)) {
- if (!adev->in_s0ix && !adev->in_runpm)
+ if (!adev->in_runpm)
amdgpu_amdkfd_suspend_process(adev);
amdgpu_virt_fini_data_exchange(adev);
r = amdgpu_virt_request_full_gpu(adev, false);
@@ -5208,18 +5251,20 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
dev_warn(adev->dev, "smart shift update failed\n");
if (notify_clients)
- drm_client_dev_suspend(adev_to_drm(adev), false);
+ drm_client_dev_suspend(adev_to_drm(adev));
cancel_delayed_work_sync(&adev->delayed_init_work);
amdgpu_ras_suspend(adev);
- amdgpu_device_ip_suspend_phase1(adev);
+ r = amdgpu_device_ip_suspend_phase1(adev);
+ if (r)
+ return r;
- if (!adev->in_s0ix) {
- amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
- amdgpu_userq_suspend(adev);
- }
+ amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
+ r = amdgpu_userq_suspend(adev);
+ if (r)
+ return r;
r = amdgpu_device_evict_resources(adev);
if (r)
@@ -5229,7 +5274,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
amdgpu_fence_driver_hw_fini(adev);
- amdgpu_device_ip_suspend_phase2(adev);
+ r = amdgpu_device_ip_suspend_phase2(adev);
+ if (r)
+ return r;
if (amdgpu_sriov_vf(adev))
amdgpu_virt_release_full_gpu(adev, false);
@@ -5314,15 +5361,13 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
goto exit;
}
- if (!adev->in_s0ix) {
- r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
- if (r)
- goto exit;
+ r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
+ if (r)
+ goto exit;
- r = amdgpu_userq_resume(adev);
- if (r)
- goto exit;
- }
+ r = amdgpu_userq_resume(adev);
+ if (r)
+ goto exit;
r = amdgpu_device_ip_late_init(adev);
if (r)
@@ -5335,7 +5380,7 @@ exit:
amdgpu_virt_init_data_exchange(adev);
amdgpu_virt_release_full_gpu(adev, true);
- if (!adev->in_s0ix && !r && !adev->in_runpm)
+ if (!r && !adev->in_runpm)
r = amdgpu_amdkfd_resume_process(adev);
}
@@ -5346,7 +5391,7 @@ exit:
flush_delayed_work(&adev->delayed_init_work);
if (notify_clients)
- drm_client_dev_resume(adev_to_drm(adev), false);
+ drm_client_dev_resume(adev_to_drm(adev));
amdgpu_ras_resume(adev);
@@ -5802,11 +5847,6 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
if (!amdgpu_ring_sched_ready(ring))
continue;
- /* Clear job fence from fence drv to avoid force_completion
- * leave NULL and vm flush fence in fence drv
- */
- amdgpu_fence_driver_clear_job_fences(ring);
-
/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
amdgpu_fence_driver_force_completion(ring);
}
@@ -5951,7 +5991,7 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
if (r)
goto out;
- drm_client_dev_resume(adev_to_drm(tmp_adev), false);
+ drm_client_dev_resume(adev_to_drm(tmp_adev));
/*
* The GPU enters bad state once faulty pages
@@ -6286,7 +6326,7 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
*/
amdgpu_unregister_gpu_instance(tmp_adev);
- drm_client_dev_suspend(adev_to_drm(tmp_adev), false);
+ drm_client_dev_suspend(adev_to_drm(tmp_adev));
/* disable ras on ALL IPs */
if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) &&
@@ -6389,23 +6429,28 @@ static int amdgpu_device_sched_resume(struct list_head *device_list,
if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
- if (tmp_adev->asic_reset_res)
- r = tmp_adev->asic_reset_res;
-
- tmp_adev->asic_reset_res = 0;
-
- if (r) {
+ if (tmp_adev->asic_reset_res) {
/* bad news, how to tell it to userspace ?
* for ras error, we should report GPU bad status instead of
* reset failure
*/
if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
!amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
- dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
- atomic_read(&tmp_adev->gpu_reset_counter));
- amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
+ dev_info(
+ tmp_adev->dev,
+ "GPU reset(%d) failed with error %d \n",
+ atomic_read(
+ &tmp_adev->gpu_reset_counter),
+ tmp_adev->asic_reset_res);
+ amdgpu_vf_error_put(tmp_adev,
+ AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0,
+ tmp_adev->asic_reset_res);
+ if (!r)
+ r = tmp_adev->asic_reset_res;
+ tmp_adev->asic_reset_res = 0;
} else {
- dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
+ dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n",
+ atomic_read(&tmp_adev->gpu_reset_counter));
if (amdgpu_acpi_smart_shift_update(tmp_adev,
AMDGPU_SS_DEV_D0))
dev_warn(tmp_adev->dev,
@@ -6530,7 +6575,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
*
* job->base holds a reference to parent fence
*/
- if (job && dma_fence_is_signaled(&job->hw_fence.base)) {
+ if (job && dma_fence_is_signaled(&job->hw_fence->base)) {
job_signaled = true;
dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
goto skip_hw_reset;
@@ -6937,7 +6982,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
{
struct drm_device *dev = pci_get_drvdata(pdev);
struct amdgpu_device *adev = drm_to_adev(dev);
- struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+ struct amdgpu_hive_info *hive __free(xgmi_put_hive) =
+ amdgpu_get_xgmi_hive(adev);
struct amdgpu_reset_context reset_context;
struct list_head device_list;
@@ -6976,10 +7022,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
amdgpu_device_recovery_get_reset_lock(adev, &device_list);
amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
hive, false);
- if (hive) {
+ if (hive)
mutex_unlock(&hive->hive_lock);
- amdgpu_put_xgmi_hive(hive);
- }
return PCI_ERS_RESULT_NEED_RESET;
case pci_channel_io_perm_failure:
/* Permanent error, prepare for device removal */
@@ -7158,28 +7202,35 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev)
{
- struct pci_dev *parent = pci_upstream_bridge(adev->pdev);
+ struct pci_dev *swus, *swds;
int r;
- if (parent->vendor != PCI_VENDOR_ID_ATI)
+ swds = pci_upstream_bridge(adev->pdev);
+ if (!swds || swds->vendor != PCI_VENDOR_ID_ATI ||
+ pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM)
+ return;
+ swus = pci_upstream_bridge(swds);
+ if (!swus ||
+ (swus->vendor != PCI_VENDOR_ID_ATI &&
+ swus->vendor != PCI_VENDOR_ID_AMD) ||
+ pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM)
return;
/* If already saved, return */
if (adev->pcie_reset_ctx.swus)
return;
/* Upstream bridge is ATI, assume it's SWUS/DS architecture */
- r = pci_save_state(parent);
+ r = pci_save_state(swds);
if (r)
return;
- adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(parent);
+ adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds);
- parent = pci_upstream_bridge(parent);
- r = pci_save_state(parent);
+ r = pci_save_state(swus);
if (r)
return;
- adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(parent);
+ adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus);
- adev->pcie_reset_ctx.swus = parent;
+ adev->pcie_reset_ctx.swus = swus;
}
static void amdgpu_device_load_switch_state(struct amdgpu_device *adev)
@@ -7268,10 +7319,17 @@ void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
if (adev->gmc.xgmi.connected_to_cpu)
return;
- if (ring && ring->funcs->emit_hdp_flush)
+ if (ring && ring->funcs->emit_hdp_flush) {
amdgpu_ring_emit_hdp_flush(ring);
- else
- amdgpu_asic_flush_hdp(adev, ring);
+ return;
+ }
+
+ if (!ring && amdgpu_sriov_runtime(adev)) {
+ if (!amdgpu_kiq_hdp_flush(adev))
+ return;
+ }
+
+ amdgpu_asic_flush_hdp(adev, ring);
}
void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,