diff options
| author | Dave Airlie <airlied@redhat.com> | 2025-12-03 09:43:09 +1000 |
|---|---|---|
| committer | Dave Airlie <airlied@redhat.com> | 2025-12-03 09:43:49 +1000 |
| commit | 0692602defb0c273f80dec9c564ca50726404aca (patch) | |
| tree | 2fea4ab096be220f377eab9934c39790cb453c83 /drivers/gpu/drm/amd/amdgpu | |
| parent | b3239df349c2c2c94686674489c9629c89ca49a1 (diff) | |
| parent | 3925683515e93844be204381d2d5a1df5de34f31 (diff) | |
Merge tag 'amd-drm-next-6.19-2025-12-02' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.19-2025-12-02:
amdgpu:
- Unified MES fix
- SMU 11 unbalanced irq fix
- Fix for driver reloading on APUs
- pp_table sysfs fix
- Fix memory leak in fence handling
- HDMI fix
- DC cursor fixes
- eDP panel parsing fix
- Brightness fix
- DC analog fixes
- EDID retry fixes
- UserQ fixes
- RAS fixes
- IP discovery fix
- Add missing locking in amdgpu_ttm_access_memory_sdma()
- Smart Power OLED fix
- PRT and page fault fixes for GC 6-8
- VMID reservation fix
- ACP platform device fix
- Add missing vm fault handling for GC 11-12
- VPE fix
amdkfd:
- Partitioning fix
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patch.msgid.link/20251202220101.2039347-1-alexander.deucher@amd.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
26 files changed, 228 insertions, 28 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c index 4926996f94da..381ef205b0df 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c @@ -302,17 +302,19 @@ static int acp_hw_init(struct amdgpu_ip_block *ip_block) adev->acp.acp_res[2].end = adev->acp.acp_res[2].start; adev->acp.acp_cell[0].name = "acp_audio_dma"; + adev->acp.acp_cell[0].id = 0; adev->acp.acp_cell[0].num_resources = 3; adev->acp.acp_cell[0].resources = &adev->acp.acp_res[0]; adev->acp.acp_cell[0].platform_data = &adev->asic_type; adev->acp.acp_cell[0].pdata_size = sizeof(adev->asic_type); adev->acp.acp_cell[1].name = "designware-i2s"; + adev->acp.acp_cell[1].id = 1; adev->acp.acp_cell[1].num_resources = 1; adev->acp.acp_cell[1].resources = &adev->acp.acp_res[1]; adev->acp.acp_cell[1].platform_data = &i2s_pdata[0]; adev->acp.acp_cell[1].pdata_size = sizeof(struct i2s_platform_data); - r = mfd_add_hotplug_devices(adev->acp.parent, adev->acp.acp_cell, 2); + r = mfd_add_devices(adev->acp.parent, 0, adev->acp.acp_cell, 2, NULL, 0, NULL); if (r) goto failure; r = device_for_each_child(adev->acp.parent, &adev->acp.acp_genpd->gpd, @@ -410,30 +412,34 @@ static int acp_hw_init(struct amdgpu_ip_block *ip_block) adev->acp.acp_res[4].end = adev->acp.acp_res[4].start; adev->acp.acp_cell[0].name = "acp_audio_dma"; + adev->acp.acp_cell[0].id = 0; adev->acp.acp_cell[0].num_resources = 5; adev->acp.acp_cell[0].resources = &adev->acp.acp_res[0]; adev->acp.acp_cell[0].platform_data = &adev->asic_type; adev->acp.acp_cell[0].pdata_size = sizeof(adev->asic_type); adev->acp.acp_cell[1].name = "designware-i2s"; + adev->acp.acp_cell[1].id = 1; adev->acp.acp_cell[1].num_resources = 1; adev->acp.acp_cell[1].resources = &adev->acp.acp_res[1]; adev->acp.acp_cell[1].platform_data = &i2s_pdata[0]; adev->acp.acp_cell[1].pdata_size = sizeof(struct i2s_platform_data); adev->acp.acp_cell[2].name = "designware-i2s"; + adev->acp.acp_cell[2].id = 2; adev->acp.acp_cell[2].num_resources = 1; adev->acp.acp_cell[2].resources = &adev->acp.acp_res[2]; adev->acp.acp_cell[2].platform_data = &i2s_pdata[1]; adev->acp.acp_cell[2].pdata_size = sizeof(struct i2s_platform_data); adev->acp.acp_cell[3].name = "designware-i2s"; + adev->acp.acp_cell[3].id = 3; adev->acp.acp_cell[3].num_resources = 1; adev->acp.acp_cell[3].resources = &adev->acp.acp_res[3]; adev->acp.acp_cell[3].platform_data = &i2s_pdata[2]; adev->acp.acp_cell[3].pdata_size = sizeof(struct i2s_platform_data); - r = mfd_add_hotplug_devices(adev->acp.parent, adev->acp.acp_cell, ACP_DEVS); + r = mfd_add_devices(adev->acp.parent, 0, adev->acp.acp_cell, ACP_DEVS, NULL, 0, NULL); if (r) goto failure; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index bfbf874a1000..903c4706040d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2665,6 +2665,8 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) chip_name = "navi12"; break; case CHIP_CYAN_SKILLFISH: + if (adev->discovery.bin) + return 0; chip_name = "cyan_skillfish"; break; } @@ -3680,6 +3682,20 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) "failed to release exclusive mode on fini\n"); } + /* + * Driver reload on the APU can fail due to firmware validation because + * the PSP is always running, as it is shared across the whole SoC. + * This same issue does not occur on dGPU because it has a mechanism + * that checks whether the PSP is running. A solution for those issues + * in the APU is to trigger a GPU reset, but this should be done during + * the unload phase to avoid adding boot latency and screen flicker. + */ + if ((adev->flags & AMD_IS_APU) && !adev->gmc.is_app_apu) { + r = amdgpu_asic_reset(adev); + if (r) + dev_err(adev->dev, "asic reset on %s failed\n", __func__); + } + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index 9dcf51991b5b..869bceb0fe2c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -597,6 +597,9 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev) /* reserve engine 5 for firmware */ if (adev->enable_mes) vm_inv_engs[i] &= ~(1 << 5); + /* reserve engine 6 for uni mes */ + if (adev->enable_uni_mes) + vm_inv_engs[i] &= ~(1 << 6); /* reserve mmhub engine 3 for firmware */ if (adev->enable_umsch_mm) vm_inv_engs[i] &= ~(1 << 3); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index 55097ca10738..727342689d4b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -86,6 +86,11 @@ enum amdgpu_memory_partition { #define AMDGPU_MAX_MEM_RANGES 8 +#define AMDGPU_GMC9_FAULT_SOURCE_DATA_RETRY 0x80 +#define AMDGPU_GMC9_FAULT_SOURCE_DATA_READ 0x40 +#define AMDGPU_GMC9_FAULT_SOURCE_DATA_WRITE 0x20 +#define AMDGPU_GMC9_FAULT_SOURCE_DATA_EXE 0x10 + /* * GMC page fault information */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 7d8ef7ae10c2..0a0dcbf0798d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -224,6 +224,7 @@ err_fence: kfree((*job)->hw_fence); err_job: kfree(*job); + *job = NULL; return r; } @@ -245,7 +246,10 @@ int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, if (r) { if (entity) drm_sched_job_cleanup(&(*job)->base); + kfree((*job)->hw_vm_fence); + kfree((*job)->hw_fence); kfree(*job); + *job = NULL; } return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 9e2e098af86c..2a6cf7963dde 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -150,6 +150,8 @@ static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev); #ifdef CONFIG_X86_MCE_AMD static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev); +static void +amdgpu_unregister_bad_pages_mca_notifier(struct amdgpu_device *adev); struct mce_notifier_adev_list { struct amdgpu_device *devs[MAX_GPU_INSTANCE]; int num_gpu; @@ -3954,7 +3956,9 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) mutex_unlock(&con->recovery_lock); amdgpu_ras_critical_region_init(adev); - +#ifdef CONFIG_X86_MCE_AMD + amdgpu_unregister_bad_pages_mca_notifier(adev); +#endif return 0; } /* recovery end */ @@ -4988,6 +4992,28 @@ static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev) notifier_registered = true; } } +static void amdgpu_unregister_bad_pages_mca_notifier(struct amdgpu_device *adev) +{ + int i, j; + + if (!notifier_registered && !mce_adev_list.num_gpu) + return; + for (i = 0, j = 0; i < mce_adev_list.num_gpu; i++) { + if (mce_adev_list.devs[i] == adev) + mce_adev_list.devs[i] = NULL; + if (!mce_adev_list.devs[i]) + ++j; + } + + if (j == mce_adev_list.num_gpu) { + mce_adev_list.num_gpu = 0; + /* Unregister x86 notifier with MCE subsystem. */ + if (notifier_registered) { + mce_unregister_decode_chain(&amdgpu_bad_page_nb); + notifier_registered = false; + } + } +} #endif struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index cd8873c6931a..c596b6df2e2d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -520,9 +520,14 @@ static ssize_t amdgpu_ras_cper_debugfs_read(struct file *f, char __user *buf, return -ENOMEM; if (!(*offset)) { + /* Need at least 12 bytes for the header on the first read */ + if (size < ring_header_size) + return -EINVAL; + if (copy_to_user(buf, ring_header, ring_header_size)) return -EFAULT; buf += ring_header_size; + size -= ring_header_size; } r = amdgpu_ras_mgr_handle_ras_cmd(ring->adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 421c2bbe2b6a..2b931e855abd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1329,7 +1329,7 @@ uint64_t amdgpu_ttm_tt_pde_flags(struct ttm_tt *ttm, struct ttm_resource *mem) mem->mem_type == AMDGPU_PL_MMIO_REMAP)) { flags |= AMDGPU_PTE_SYSTEM; - if (ttm->caching == ttm_cached) + if (ttm && ttm->caching == ttm_cached) flags |= AMDGPU_PTE_SNOOPED; } @@ -1486,6 +1486,7 @@ static int amdgpu_ttm_access_memory_sdma(struct ttm_buffer_object *bo, if (r) goto out; + mutex_lock(&adev->mman.gtt_window_lock); amdgpu_res_first(abo->tbo.resource, offset, len, &src_mm); src_addr = amdgpu_ttm_domain_start(adev, bo->resource->mem_type) + src_mm.start; @@ -1500,6 +1501,7 @@ static int amdgpu_ttm_access_memory_sdma(struct ttm_buffer_object *bo, WARN_ON(job->ibs[0].length_dw > num_dw); fence = amdgpu_job_submit(job); + mutex_unlock(&adev->mman.gtt_window_lock); if (!dma_fence_wait_timeout(fence, false, adev->sdma_timeout)) r = -ETIMEDOUT; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 7fc081e88b6a..a67285118c37 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1069,7 +1069,7 @@ amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params, } /* Prepare a TLB flush fence to be attached to PTs */ - if (!params->unlocked && vm->is_compute_context) { + if (!params->unlocked) { amdgpu_vm_tlb_fence_create(params->adev, vm, fence); /* Makes sure no PD/PT is freed before the flush */ @@ -2093,7 +2093,7 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev, struct amdgpu_bo *bo = before->bo_va->base.bo; amdgpu_vm_it_insert(before, &vm->va); - if (before->flags & AMDGPU_PTE_PRT_FLAG(adev)) + if (before->flags & AMDGPU_VM_PAGE_PRT) amdgpu_vm_prt_get(adev); if (amdgpu_vm_is_bo_always_valid(vm, bo) && @@ -2108,7 +2108,7 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev, struct amdgpu_bo *bo = after->bo_va->base.bo; amdgpu_vm_it_insert(after, &vm->va); - if (after->flags & AMDGPU_PTE_PRT_FLAG(adev)) + if (after->flags & AMDGPU_VM_PAGE_PRT) amdgpu_vm_prt_get(adev); if (amdgpu_vm_is_bo_always_valid(vm, bo) && @@ -2916,8 +2916,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) switch (args->in.op) { case AMDGPU_VM_OP_RESERVE_VMID: /* We only have requirement to reserve vmid from gfxhub */ - amdgpu_vmid_alloc_reserved(adev, vm, AMDGPU_GFXHUB(0)); - break; + return amdgpu_vmid_alloc_reserved(adev, vm, AMDGPU_GFXHUB(0)); case AMDGPU_VM_OP_UNRESERVE_VMID: amdgpu_vmid_free_reserved(adev, vm, AMDGPU_GFXHUB(0)); break; diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c b/drivers/gpu/drm/amd/amdgpu/cik_ih.c index 41f4705bdbbd..876a3256dba4 100644 --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c @@ -156,6 +156,9 @@ static int cik_ih_irq_init(struct amdgpu_device *adev) /* enable irqs */ cik_ih_enable_interrupts(adev); + if (adev->irq.ih_soft.ring_size) + adev->irq.ih_soft.enabled = true; + return 0; } @@ -192,6 +195,9 @@ static u32 cik_ih_get_wptr(struct amdgpu_device *adev, wptr = le32_to_cpu(*ih->wptr_cpu); + if (ih == &adev->irq.ih_soft) + goto out; + if (wptr & IH_RB_WPTR__RB_OVERFLOW_MASK) { wptr &= ~IH_RB_WPTR__RB_OVERFLOW_MASK; /* When a ring buffer overflow happen start parsing interrupt @@ -211,6 +217,8 @@ static u32 cik_ih_get_wptr(struct amdgpu_device *adev, tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK; WREG32(mmIH_RB_CNTL, tmp); } + +out: return (wptr & ih->ptr_mask); } @@ -306,6 +314,10 @@ static int cik_ih_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; + r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, IH_SW_RING_SIZE, true); + if (r) + return r; + r = amdgpu_irq_init(adev); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c index 2f891fb846d5..bc7a2e06ab5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c @@ -157,6 +157,9 @@ static int cz_ih_irq_init(struct amdgpu_device *adev) /* enable interrupts */ cz_ih_enable_interrupts(adev); + if (adev->irq.ih_soft.ring_size) + adev->irq.ih_soft.enabled = true; + return 0; } @@ -194,6 +197,9 @@ static u32 cz_ih_get_wptr(struct amdgpu_device *adev, wptr = le32_to_cpu(*ih->wptr_cpu); + if (ih == &adev->irq.ih_soft) + goto out; + if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW)) goto out; @@ -297,6 +303,10 @@ static int cz_ih_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; + r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, IH_SW_RING_SIZE, true); + if (r) + return r; + r = amdgpu_irq_init(adev); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 02d7cfae22bd..8a2ee2de390f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -5874,9 +5874,9 @@ static void gfx_v11_0_ring_emit_ib_gfx(struct amdgpu_ring *ring, if (flags & AMDGPU_IB_PREEMPTED) control |= INDIRECT_BUFFER_PRE_RESUME(1); - if (vmid) + if (vmid && !ring->adev->gfx.rs64_enable) gfx_v11_0_ring_emit_de_meta(ring, - (!amdgpu_sriov_vf(ring->adev) && flags & AMDGPU_IB_PREEMPTED) ? true : false); + !amdgpu_sriov_vf(ring->adev) && (flags & AMDGPU_IB_PREEMPTED)); } amdgpu_ring_write(ring, header); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index d7499be8c4bf..ce6e04242c52 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -103,8 +103,10 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev, uint32_t vmhub_index = entry->client_id == SOC15_IH_CLIENTID_VMC ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0); struct amdgpu_vmhub *hub = &adev->vmhub[vmhub_index]; - bool retry_fault = !!(entry->src_data[1] & 0x80); - bool write_fault = !!(entry->src_data[1] & 0x20); + bool retry_fault = !!(entry->src_data[1] & + AMDGPU_GMC9_FAULT_SOURCE_DATA_RETRY); + bool write_fault = !!(entry->src_data[1] & + AMDGPU_GMC9_FAULT_SOURCE_DATA_WRITE); struct amdgpu_task_info *task_info; uint32_t status = 0; u64 addr; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index 7bc389d9f5c4..ba59ee8e398a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -103,12 +103,41 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev, uint32_t vmhub_index = entry->client_id == SOC21_IH_CLIENTID_VMC ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0); struct amdgpu_vmhub *hub = &adev->vmhub[vmhub_index]; + bool retry_fault = !!(entry->src_data[1] & + AMDGPU_GMC9_FAULT_SOURCE_DATA_RETRY); + bool write_fault = !!(entry->src_data[1] & + AMDGPU_GMC9_FAULT_SOURCE_DATA_WRITE); uint32_t status = 0; u64 addr; addr = (u64)entry->src_data[0] << 12; addr |= ((u64)entry->src_data[1] & 0xf) << 44; + if (retry_fault) { + /* Returning 1 here also prevents sending the IV to the KFD */ + + /* Process it only if it's the first fault for this address */ + if (entry->ih != &adev->irq.ih_soft && + amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid, + entry->timestamp)) + return 1; + + /* Delegate it to a different ring if the hardware hasn't + * already done it. + */ + if (entry->ih == &adev->irq.ih) { + amdgpu_irq_delegate(adev, entry, 8); + return 1; + } + + /* Try to handle the recoverable page faults by filling page + * tables + */ + if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, + entry->timestamp, write_fault)) + return 1; + } + if (!amdgpu_sriov_vf(adev)) { /* * Issue a dummy read to wait for the status register to diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c index cad2d19105c4..7a9d6894e321 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c @@ -91,6 +91,10 @@ static int gmc_v12_0_process_interrupt(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry) { struct amdgpu_vmhub *hub; + bool retry_fault = !!(entry->src_data[1] & + AMDGPU_GMC9_FAULT_SOURCE_DATA_RETRY); + bool write_fault = !!(entry->src_data[1] & + AMDGPU_GMC9_FAULT_SOURCE_DATA_WRITE); uint32_t status = 0; u64 addr; @@ -102,6 +106,31 @@ static int gmc_v12_0_process_interrupt(struct amdgpu_device *adev, else hub = &adev->vmhub[AMDGPU_GFXHUB(0)]; + if (retry_fault) { + /* Returning 1 here also prevents sending the IV to the KFD */ + + /* Process it only if it's the first fault for this address */ + if (entry->ih != &adev->irq.ih_soft && + amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid, + entry->timestamp)) + return 1; + + /* Delegate it to a different ring if the hardware hasn't + * already done it. + */ + if (entry->ih == &adev->irq.ih) { + amdgpu_irq_delegate(adev, entry, 8); + return 1; + } + + /* Try to handle the recoverable page faults by filling page + * tables + */ + if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, + entry->timestamp, write_fault)) + return 1; + } + if (!amdgpu_sriov_vf(adev)) { /* * Issue a dummy read to wait for the status register to diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c index 499dfd78092d..a8ec95f42926 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c @@ -610,23 +610,21 @@ static void gmc_v6_0_gart_disable(struct amdgpu_device *adev) } static void gmc_v6_0_vm_decode_fault(struct amdgpu_device *adev, - u32 status, u32 addr, u32 mc_client) + u32 status, u32 addr) { u32 mc_id; u32 vmid = REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS, VMID); u32 protections = REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS, PROTECTIONS); - char block[5] = { mc_client >> 24, (mc_client >> 16) & 0xff, - (mc_client >> 8) & 0xff, mc_client & 0xff, 0 }; mc_id = REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS, MEMORY_CLIENT_ID); - dev_err(adev->dev, "VM fault (0x%02x, vmid %d) at page %u, %s from '%s' (0x%08x) (%d)\n", + dev_err(adev->dev, "VM fault (0x%02x, vmid %d) at page %u, %s from %d\n", protections, vmid, addr, REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS, MEMORY_CLIENT_RW) ? - "write" : "read", block, mc_client, mc_id); + "write" : "read", mc_id); } static const u32 mc_cg_registers[] = { @@ -1072,6 +1070,12 @@ static int gmc_v6_0_process_interrupt(struct amdgpu_device *adev, { u32 addr, status; + /* Delegate to the soft IRQ handler ring */ + if (adev->irq.ih_soft.enabled && entry->ih != &adev->irq.ih_soft) { + amdgpu_irq_delegate(adev, entry, 4); + return 1; + } + addr = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_ADDR); status = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_STATUS); WREG32_P(mmVM_CONTEXT1_CNTL2, 1, ~1); @@ -1079,6 +1083,10 @@ static int gmc_v6_0_process_interrupt(struct amdgpu_device *adev, if (!addr && !status) return 0; + amdgpu_vm_update_fault_cache(adev, entry->pasid, + ((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, + status, AMDGPU_GFXHUB(0)); + if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST) gmc_v6_0_set_fault_enable_default(adev, false); @@ -1089,7 +1097,7 @@ static int gmc_v6_0_process_interrupt(struct amdgpu_device *adev, addr); dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_STATUS 0x%08X\n", status); - gmc_v6_0_vm_decode_fault(adev, status, addr, 0); + gmc_v6_0_vm_decode_fault(adev, status, addr); } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c index 0e5e54d0a9a5..fbd0bf147f50 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c @@ -1261,6 +1261,12 @@ static int gmc_v7_0_process_interrupt(struct amdgpu_device *adev, { u32 addr, status, mc_client, vmid; + /* Delegate to the soft IRQ handler ring */ + if (adev->irq.ih_soft.enabled && entry->ih != &adev->irq.ih_soft) { + amdgpu_irq_delegate(adev, entry, 4); + return 1; + } + addr = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_ADDR); status = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_STATUS); mc_client = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_MCCLIENT); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c index e1509480dfc2..6551b60f2584 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c @@ -1439,6 +1439,12 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev, return 0; } + /* Delegate to the soft IRQ handler ring */ + if (adev->irq.ih_soft.enabled && entry->ih != &adev->irq.ih_soft) { + amdgpu_irq_delegate(adev, entry, 4); + return 1; + } + addr = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_ADDR); status = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_STATUS); mc_client = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_MCCLIENT); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index e716097dfde4..8ad7519f7b58 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -544,8 +544,10 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) { - bool retry_fault = !!(entry->src_data[1] & 0x80); - bool write_fault = !!(entry->src_data[1] & 0x20); + bool retry_fault = !!(entry->src_data[1] & + AMDGPU_GMC9_FAULT_SOURCE_DATA_RETRY); + bool write_fault = !!(entry->src_data[1] & + AMDGPU_GMC9_FAULT_SOURCE_DATA_WRITE); uint32_t status = 0, cid = 0, rw = 0, fed = 0; struct amdgpu_task_info *task_info; struct amdgpu_vmhub *hub; diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c index 1317ede131b6..01cadf898c00 100644 --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c @@ -157,6 +157,9 @@ static int iceland_ih_irq_init(struct amdgpu_device *adev) /* enable interrupts */ iceland_ih_enable_interrupts(adev); + if (adev->irq.ih_soft.ring_size) + adev->irq.ih_soft.enabled = true; + return 0; } @@ -194,6 +197,9 @@ static u32 iceland_ih_get_wptr(struct amdgpu_device *adev, wptr = le32_to_cpu(*ih->wptr_cpu); + if (ih == &adev->irq.ih_soft) + goto out; + if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW)) goto out; @@ -296,6 +302,10 @@ static int iceland_ih_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; + r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, IH_SW_RING_SIZE, true); + if (r) + return r; + r = amdgpu_irq_init(adev); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c index 0ceeb19df2e5..217040044987 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c @@ -1390,7 +1390,7 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block *ip_block) adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs; break; case IP_VERSION(6, 0, 3): - if ((adev->sdma.instance[0].fw_version >= 27) && !adev->sdma.disable_uq) + if (adev->sdma.instance[0].fw_version >= 29 && !adev->sdma.disable_uq) adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs; break; case IP_VERSION(6, 1, 0): diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c b/drivers/gpu/drm/amd/amdgpu/si_ih.c index 1df00f8a2406..66f650f87243 100644 --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c @@ -96,6 +96,9 @@ static int si_ih_irq_init(struct amdgpu_device *adev) pci_set_master(adev->pdev); si_ih_enable_interrupts(adev); + if (adev->irq.ih_soft.ring_size) + adev->irq.ih_soft.enabled = true; + return 0; } @@ -112,6 +115,9 @@ static u32 si_ih_get_wptr(struct amdgpu_device *adev, wptr = le32_to_cpu(*ih->wptr_cpu); + if (ih == &adev->irq.ih_soft) + goto out; + if (wptr & IH_RB_WPTR__RB_OVERFLOW_MASK) { wptr &= ~IH_RB_WPTR__RB_OVERFLOW_MASK; dev_warn(adev->dev, "IH ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n", @@ -127,6 +133,8 @@ static u32 si_ih_get_wptr(struct amdgpu_device *adev, tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK; WREG32(IH_RB_CNTL, tmp); } + +out: return (wptr & ih->ptr_mask); } @@ -175,6 +183,10 @@ static int si_ih_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; + r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, IH_SW_RING_SIZE, true); + if (r) + return r; + return amdgpu_irq_init(adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 9785fada4fa7..42f5d9c0e3af 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -853,10 +853,6 @@ static bool soc15_need_reset_on_init(struct amdgpu_device *adev) { u32 sol_reg; - /* CP hangs in IGT reloading test on RN, reset to WA */ - if (adev->asic_type == CHIP_RENOIR) - return true; - if (amdgpu_gmc_need_reset_on_init(adev)) return true; if (amdgpu_psp_tos_reload_needed(adev)) diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c index 7d17ae56f901..ee8038df17e3 100644 --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c @@ -159,6 +159,9 @@ static int tonga_ih_irq_init(struct amdgpu_device *adev) /* enable interrupts */ tonga_ih_enable_interrupts(adev); + if (adev->irq.ih_soft.ring_size) + adev->irq.ih_soft.enabled = true; + return 0; } @@ -196,6 +199,9 @@ static u32 tonga_ih_get_wptr(struct amdgpu_device *adev, wptr = le32_to_cpu(*ih->wptr_cpu); + if (ih == &adev->irq.ih_soft) + goto out; + if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW)) goto out; @@ -306,6 +312,10 @@ static int tonga_ih_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; + r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, IH_SW_RING_SIZE, true); + if (r) + return r; + adev->irq.ih.use_doorbell = true; adev->irq.ih.doorbell_index = adev->doorbell_index.ih; diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c index eacf4e93ba2f..cb7123ec1a5d 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c @@ -141,7 +141,7 @@ static int vcn_v4_0_3_late_init(struct amdgpu_ip_block *ip_block) adev->vcn.supported_reset = amdgpu_get_soft_full_reset_mask(&adev->vcn.inst[0].ring_enc[0]); - if (amdgpu_dpm_reset_vcn_is_supported(adev)) + if (amdgpu_dpm_reset_vcn_is_supported(adev) && !amdgpu_sriov_vf(adev)) adev->vcn.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c index 714350cabf2f..8bd457dea4cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c @@ -122,7 +122,9 @@ static int vcn_v5_0_1_late_init(struct amdgpu_ip_block *ip_block) switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { case IP_VERSION(13, 0, 12): - if ((adev->psp.sos.fw_version >= 0x00450025) && amdgpu_dpm_reset_vcn_is_supported(adev)) + if ((adev->psp.sos.fw_version >= 0x00450025) && + amdgpu_dpm_reset_vcn_is_supported(adev) && + !amdgpu_sriov_vf(adev)) adev->vcn.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; break; default: |