| author | Paolo Bonzini <pbonzini@redhat.com> | 2025-12-02 18:36:26 +0100 |
|---|---|---|
| committer | Paolo Bonzini <pbonzini@redhat.com> | 2025-12-02 18:36:26 +0100 |
| commit | f58e70cc31e3109b4f81688c74146702b05199c7 | |
| tree | c4e76f037cb584842b7f8a64fe48f44a671ebff0 | |
| parent | 63a9b0bc65d5d3ea96a57e7985ea22a8582fbbe5 | |
| parent | 3eef0c83c3f3e58933e98e678ddf4e95457d4d14 | |
Merge tag 'kvmarm-6.19' of https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD
KVM/arm64 updates for 6.19
- Support for userspace handling of synchronous external aborts (SEAs),
  allowing the VMM to potentially handle the abort in a non-fatal
  manner (a brief VMM-side sketch follows this list).
- Large rework of the VGIC's list register handling with the goal of
  supporting more active/pending IRQs than there are list registers in
  hardware. In addition, the VGIC now supports EOImode==1 style
  deactivations for IRQs, which may occur on a different vCPU from the
  one that acked the IRQ.
- Support for FEAT_XNX (separate user/privileged execute permissions)
  and FEAT_HAF (hardware update to the Access Flag) in the software page
  table walkers and shadow MMU.
- Allow page table destruction to reschedule, fixing long need_resched
latencies observed when destroying a large VM.
- Minor fixes to KVM and selftests.
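As a rough illustration of the first item, here is a minimal, hedged VMM-side sketch of enabling the new capability and decoding the resulting exit. The names it relies on (``KVM_CAP_ARM_SEA_TO_USER``, ``KVM_EXIT_ARM_SEA``, ``KVM_EXIT_ARM_SEA_FLAG_GPA_VALID`` and the ``arm_sea`` member of ``struct kvm_run``) come from the Documentation/virt/kvm/api.rst hunk in this merge; it assumes a ``<linux/kvm.h>`` built from a tree containing this series, and the FnV test uses the architectural ESR_ELx bit position (bit 10). This is a sketch, not a complete VMM.

```c
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

static int enable_sea_exits(int vm_fd)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_ARM_SEA_TO_USER };

	/* VM-scoped capability; fails with EINVAL when the kernel lacks support. */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

static int handle_sea_exit(struct kvm_run *run)
{
	if (run->exit_reason != KVM_EXIT_ARM_SEA)
		return -1;		/* not ours to handle */

	/* esr is a sanitized ESR_EL2 value (see the api.rst hunk below). */
	fprintf(stderr, "SEA: esr=0x%llx", (unsigned long long)run->arm_sea.esr);

	/* gva is only meaningful when ESR_ELx.FnV (bit 10) is clear. */
	if (!(run->arm_sea.esr & (1ULL << 10)))
		fprintf(stderr, " gva=0x%llx", (unsigned long long)run->arm_sea.gva);

	/* gpa is only meaningful when the GPA_VALID flag is set. */
	if (run->arm_sea.flags & KVM_EXIT_ARM_SEA_FLAG_GPA_VALID)
		fprintf(stderr, " gpa=0x%llx", (unsigned long long)run->arm_sea.gpa);

	fprintf(stderr, "\n");

	/*
	 * A real VMM could now emulate the abort for the guest (for example
	 * by injecting a virtual external abort) instead of treating it as
	 * fatal.
	 */
	return 0;
}
```

Per the arch/arm64/kvm/mmu.c hunk below, if the host's APEI claims the abort the guest is simply resumed, and if the capability is not enabled (or KVM itself owns the abort) KVM keeps the existing behaviour of injecting an SError; the exit above is only taken for the remaining guest-owned, unhandled SEAs.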
55 files changed, 2575 insertions, 531 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 79cdc1b177f1..f38a0cfcbec5 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7286,6 +7286,41 @@ exit, even without calls to ``KVM_ENABLE_CAP`` or similar. In this case, it will enter with output fields already valid; in the common case, the ``unknown.ret`` field of the union will be ``TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED``. Userspace need not do anything if it does not wish to support a TDVMCALL. + +:: + + /* KVM_EXIT_ARM_SEA */ + struct { + #define KVM_EXIT_ARM_SEA_FLAG_GPA_VALID (1ULL << 0) + __u64 flags; + __u64 esr; + __u64 gva; + __u64 gpa; + } arm_sea; + +Used on arm64 systems. When the VM capability ``KVM_CAP_ARM_SEA_TO_USER`` is +enabled, a KVM exits to userspace if a guest access causes a synchronous +external abort (SEA) and the host APEI fails to handle the SEA. + +``esr`` is set to a sanitized value of ESR_EL2 from the exception taken to KVM, +consisting of the following fields: + + - ``ESR_EL2.EC`` + - ``ESR_EL2.IL`` + - ``ESR_EL2.FnV`` + - ``ESR_EL2.EA`` + - ``ESR_EL2.CM`` + - ``ESR_EL2.WNR`` + - ``ESR_EL2.FSC`` + - ``ESR_EL2.SET`` (when FEAT_RAS is implemented for the VM) + +``gva`` is set to the value of FAR_EL2 from the exception taken to KVM when +``ESR_EL2.FnV == 0``. Otherwise, the value of ``gva`` is unknown. + +``gpa`` is set to the faulting IPA from the exception taken to KVM when +the ``KVM_EXIT_ARM_SEA_FLAG_GPA_VALID`` flag is set. Otherwise, the value of +``gpa`` is unknown. + :: /* Fix the size of the union. */ @@ -8703,6 +8738,18 @@ This capability indicate to the userspace whether a PFNMAP memory region can be safely mapped as cacheable. This relies on the presence of force write back (FWB) feature support on the hardware. +7.45 KVM_CAP_ARM_SEA_TO_USER +---------------------------- + +:Architecture: arm64 +:Target: VM +:Parameters: none +:Returns: 0 on success, -EINVAL if unsupported. + +When this capability is enabled, KVM may exit to userspace for SEAs taken to +EL2 resulting from a guest access. See ``KVM_EXIT_ARM_SEA`` for more +information. + 8. Other capabilities. 
====================== diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 1da290aeedce..e500600e4b9b 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -111,6 +111,7 @@ #define TCR_EL2_DS (1UL << 32) #define TCR_EL2_RES1 ((1U << 31) | (1 << 23)) #define TCR_EL2_HPD (1 << 24) +#define TCR_EL2_HA (1 << 21) #define TCR_EL2_TBI (1 << 20) #define TCR_EL2_PS_SHIFT 16 #define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 9da54d4ee49e..4b17e822afec 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -79,7 +79,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_range, __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context, __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, - __KVM_HOST_SMCCC_FUNC___vgic_v3_save_vmcr_aprs, + __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs, __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs, __KVM_HOST_SMCCC_FUNC___pkvm_reserve_vm, __KVM_HOST_SMCCC_FUNC___pkvm_unreserve_vm, @@ -246,9 +246,9 @@ extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu); extern int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding); extern void __kvm_timer_set_cntvoff(u64 cntvoff); -extern void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); -extern void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); -extern void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); +extern int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); +extern int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); +extern int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 64302c438355..ac7f970c7883 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -54,6 +54,7 @@ #define KVM_REQ_NESTED_S2_UNMAP KVM_ARCH_REQ(8) #define KVM_REQ_GUEST_HYP_IRQ_PENDING KVM_ARCH_REQ(9) #define KVM_REQ_MAP_L1_VNCR_EL2 KVM_ARCH_REQ(10) +#define KVM_REQ_VGIC_PROCESS_UPDATE KVM_ARCH_REQ(11) #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) @@ -350,6 +351,8 @@ struct kvm_arch { #define KVM_ARCH_FLAG_GUEST_HAS_SVE 9 /* MIDR_EL1, REVIDR_EL1, and AIDR_EL1 are writable from userspace */ #define KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS 10 + /* Unhandled SEAs are taken to userspace */ +#define KVM_ARCH_FLAG_EXIT_SEA 11 unsigned long flags; /* VM-wide vCPU feature set */ diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index e6be1f5d0967..76ce2b94bd97 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -77,12 +77,13 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu); u64 __gic_v3_get_lr(unsigned int lr); +void __gic_v3_set_lr(u64 val, int lr); void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if); -void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if); int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu); diff --git 
a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index f7c06a840963..905c658057a4 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -120,9 +120,42 @@ static inline bool kvm_s2_trans_writable(struct kvm_s2_trans *trans) return trans->writable; } -static inline bool kvm_s2_trans_executable(struct kvm_s2_trans *trans) +static inline bool kvm_has_xnx(struct kvm *kvm) { - return !(trans->desc & BIT(54)); + return cpus_have_final_cap(ARM64_HAS_XNX) && + kvm_has_feat(kvm, ID_AA64MMFR1_EL1, XNX, IMP); +} + +static inline bool kvm_s2_trans_exec_el0(struct kvm *kvm, struct kvm_s2_trans *trans) +{ + u8 xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, trans->desc); + + if (!kvm_has_xnx(kvm)) + xn &= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, 0b10); + + switch (xn) { + case 0b00: + case 0b01: + return true; + default: + return false; + } +} + +static inline bool kvm_s2_trans_exec_el1(struct kvm *kvm, struct kvm_s2_trans *trans) +{ + u8 xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, trans->desc); + + if (!kvm_has_xnx(kvm)) + xn &= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, 0b10); + + switch (xn) { + case 0b00: + case 0b11: + return true; + default: + return false; + } } extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, @@ -320,6 +353,7 @@ struct s1_walk_info { bool be; bool s2; bool pa52bit; + bool ha; }; struct s1_walk_result { @@ -370,4 +404,6 @@ void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val); (FIX_VNCR - __c); \ }) +int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new); + #endif /* __ARM64_KVM_NESTED_H */ diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 2888b5d03757..fc02de43c68d 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -89,7 +89,7 @@ typedef u64 kvm_pte_t; #define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) -#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) +#define KVM_PTE_LEAF_ATTR_HI_S2_XN GENMASK(54, 53) #define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) @@ -240,7 +240,9 @@ enum kvm_pgtable_stage2_flags { /** * enum kvm_pgtable_prot - Page-table permissions and attributes. - * @KVM_PGTABLE_PROT_X: Execute permission. + * @KVM_PGTABLE_PROT_UX: Unprivileged execute permission. + * @KVM_PGTABLE_PROT_PX: Privileged execute permission. + * @KVM_PGTABLE_PROT_X: Privileged and unprivileged execute permission. * @KVM_PGTABLE_PROT_W: Write permission. * @KVM_PGTABLE_PROT_R: Read permission. * @KVM_PGTABLE_PROT_DEVICE: Device attributes. @@ -251,12 +253,15 @@ enum kvm_pgtable_stage2_flags { * @KVM_PGTABLE_PROT_SW3: Software bit 3. 
*/ enum kvm_pgtable_prot { - KVM_PGTABLE_PROT_X = BIT(0), - KVM_PGTABLE_PROT_W = BIT(1), - KVM_PGTABLE_PROT_R = BIT(2), + KVM_PGTABLE_PROT_PX = BIT(0), + KVM_PGTABLE_PROT_UX = BIT(1), + KVM_PGTABLE_PROT_X = KVM_PGTABLE_PROT_PX | + KVM_PGTABLE_PROT_UX, + KVM_PGTABLE_PROT_W = BIT(2), + KVM_PGTABLE_PROT_R = BIT(3), - KVM_PGTABLE_PROT_DEVICE = BIT(3), - KVM_PGTABLE_PROT_NORMAL_NC = BIT(4), + KVM_PGTABLE_PROT_DEVICE = BIT(4), + KVM_PGTABLE_PROT_NORMAL_NC = BIT(5), KVM_PGTABLE_PROT_SW0 = BIT(55), KVM_PGTABLE_PROT_SW1 = BIT(56), @@ -355,6 +360,11 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke return pteref; } +static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref) +{ + return pteref; +} + static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) { /* @@ -384,6 +394,11 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED)); } +static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref) +{ + return rcu_dereference_raw(pteref); +} + static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) { if (walker->flags & KVM_PGTABLE_WALK_SHARED) @@ -552,6 +567,26 @@ static inline int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2 void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); /** + * kvm_pgtable_stage2_destroy_range() - Destroy the unlinked range of addresses. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). + * @addr: Intermediate physical address at which to place the mapping. + * @size: Size of the mapping. + * + * The page-table is assumed to be unreachable by any hardware walkers prior + * to freeing and therefore no TLB invalidation is performed. + */ +void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, + u64 addr, u64 size); + +/** + * kvm_pgtable_stage2_destroy_pgd() - Destroy the PGD of guest stage-2 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). + * + * It is assumed that the rest of the page-table is freed before this operation. + */ +void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt); + +/** * kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure. * @mm_ops: Memory management callbacks. * @pgtable: Unlinked stage-2 paging structure to be freed. 
diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 08be89c95466..0aecd4ac5f45 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -180,7 +180,9 @@ struct pkvm_mapping { int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops); -void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); +void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, + u64 addr, u64 size); +void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt); int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot, void *mc, enum kvm_pgtable_walk_flags flags); diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index aa280f356b96..8eb63d329497 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -40,8 +40,13 @@ */ #define HVC_FINALISE_EL2 3 +/* + * HVC_GET_ICH_VTR_EL2 - Retrieve the ICH_VTR_EL2 value + */ +#define HVC_GET_ICH_VTR_EL2 4 + /* Max number of HYP stub hypercalls */ -#define HVC_STUB_HCALL_NR 4 +#define HVC_STUB_HCALL_NR 5 /* Error returned when an invalid stub number is passed into x0 */ #define HVC_STUB_ERR 0xbadca11 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index e25b0f84a22d..79e30cd5334b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2304,6 +2304,49 @@ static bool has_gic_prio_relaxed_sync(const struct arm64_cpu_capabilities *entry } #endif +static bool can_trap_icv_dir_el1(const struct arm64_cpu_capabilities *entry, + int scope) +{ + static const struct midr_range has_vgic_v3[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), + {}, + }; + struct arm_smccc_res res = {}; + + BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV3_CPUIF); + BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV5_LEGACY); + if (!this_cpu_has_cap(ARM64_HAS_GICV3_CPUIF) && + !is_midr_in_range_list(has_vgic_v3)) + return false; + + if (!is_hyp_mode_available()) + return false; + + if (this_cpu_has_cap(ARM64_HAS_GICV5_LEGACY)) + return true; + + if (is_kernel_in_hyp_mode()) + res.a1 = read_sysreg_s(SYS_ICH_VTR_EL2); + else + arm_smccc_1_1_hvc(HVC_GET_ICH_VTR_EL2, &res); + + if (res.a0 == HVC_STUB_ERR) + return false; + + return res.a1 & ICH_VTR_EL2_TDS; +} + #ifdef CONFIG_ARM64_BTI static void bti_enable(const struct arm64_cpu_capabilities *__unused) { @@ -2815,6 +2858,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_gic_prio_relaxed_sync, }, #endif + { + /* + * Depends on having GICv3 + */ + .desc = "ICV_DIR_EL1 trapping", + .capability = ARM64_HAS_ICH_HCR_EL2_TDIR, + .type = ARM64_CPUCAP_EARLY_LOCAL_CPU_FEATURE, + .matches = can_trap_icv_dir_el1, + }, #ifdef CONFIG_ARM64_E0PD { .desc = "E0PD", @@ -3089,6 +3141,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_GICV5_LEGACY, .matches = test_has_gicv5_legacy, 
}, + { + .desc = "XNX", + .capability = ARM64_HAS_XNX, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, XNX, IMP) + }, {}, }; diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 36e2d26b54f5..085bc9972f6b 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -54,6 +54,11 @@ SYM_CODE_START_LOCAL(elx_sync) 1: cmp x0, #HVC_FINALISE_EL2 b.eq __finalise_el2 + cmp x0, #HVC_GET_ICH_VTR_EL2 + b.ne 2f + mrs_s x1, SYS_ICH_VTR_EL2 + b 9f + 2: cmp x0, #HVC_SOFT_RESTART b.ne 3f mov x0, x2 diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 5369763606e7..85bc629270bd 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -91,6 +91,7 @@ KVM_NVHE_ALIAS(spectre_bhb_patch_loop_mitigation_enable); KVM_NVHE_ALIAS(spectre_bhb_patch_wa3); KVM_NVHE_ALIAS(spectre_bhb_patch_clearbhb); KVM_NVHE_ALIAS(alt_cb_patch_nops); +KVM_NVHE_ALIAS(kvm_compute_ich_hcr_trap_bits); /* Global kernel state accessed by nVHE hyp code. */ KVM_NVHE_ALIAS(kvm_vgic_global_state); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e7140c19771b..97627638e802 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -132,6 +132,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, } mutex_unlock(&kvm->lock); break; + case KVM_CAP_ARM_SEA_TO_USER: + r = 0; + set_bit(KVM_ARCH_FLAG_EXIT_SEA, &kvm->arch.flags); + break; default: break; } @@ -327,6 +331,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_COUNTER_OFFSET: case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS: + case KVM_CAP_ARM_SEA_TO_USER: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -440,7 +445,7 @@ struct kvm *kvm_arch_alloc_vm(void) if (!has_vhe()) return kzalloc(sz, GFP_KERNEL_ACCOUNT); - return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO); + return kvzalloc(sz, GFP_KERNEL_ACCOUNT); } int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) @@ -659,8 +664,7 @@ nommu: void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { if (is_protected_kvm_enabled()) { - kvm_call_hyp(__vgic_v3_save_vmcr_aprs, - &vcpu->arch.vgic_cpu.vgic_v3); + kvm_call_hyp(__vgic_v3_save_aprs, &vcpu->arch.vgic_cpu.vgic_v3); kvm_call_hyp_nvhe(__pkvm_vcpu_put); } @@ -1042,6 +1046,10 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu) */ kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu); + /* Process interrupts deactivated through a trap */ + if (kvm_check_request(KVM_REQ_VGIC_PROCESS_UPDATE, vcpu)) + kvm_vgic_process_async_update(vcpu); + if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu)) kvm_update_stolen_time(vcpu); diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index be26d5aa668c..53bf70126f81 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -346,6 +346,11 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); + wi->ha = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, AF); + wi->ha &= (wi->regime == TR_EL2 ? 
+ FIELD_GET(TCR_EL2_HA, tcr) : + FIELD_GET(TCR_HA, tcr)); + return 0; addrsz: @@ -362,10 +367,42 @@ transfault: return -EFAULT; } +static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc, + struct s1_walk_info *wi) +{ + u64 val; + int r; + + r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val)); + if (r) + return r; + + if (wi->be) + *desc = be64_to_cpu((__force __be64)val); + else + *desc = le64_to_cpu((__force __le64)val); + + return 0; +} + +static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new, + struct s1_walk_info *wi) +{ + if (wi->be) { + old = (__force u64)cpu_to_be64(old); + new = (__force u64)cpu_to_be64(new); + } else { + old = (__force u64)cpu_to_le64(old); + new = (__force u64)cpu_to_le64(new); + } + + return __kvm_at_swap_desc(vcpu->kvm, pa, old, new); +} + static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va) { - u64 va_top, va_bottom, baddr, desc; + u64 va_top, va_bottom, baddr, desc, new_desc, ipa; int level, stride, ret; level = wi->sl; @@ -375,7 +412,7 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, va_top = get_ia_size(wi) - 1; while (1) { - u64 index, ipa; + u64 index; va_bottom = (3 - level) * stride + wi->pgshift; index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3); @@ -414,16 +451,13 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, return ret; } - ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); + ret = kvm_read_s1_desc(vcpu, ipa, &desc, wi); if (ret) { fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false); return ret; } - if (wi->be) - desc = be64_to_cpu((__force __be64)desc); - else - desc = le64_to_cpu((__force __le64)desc); + new_desc = desc; /* Invalid descriptor */ if (!(desc & BIT(0))) @@ -477,6 +511,17 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, if (check_output_size(baddr & GENMASK(52, va_bottom), wi)) goto addrsz; + if (wi->ha) + new_desc |= PTE_AF; + + if (new_desc != desc) { + ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi); + if (ret) + return ret; + + desc = new_desc; + } + if (!(desc & PTE_AF)) { fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false); return -EACCES; @@ -1221,7 +1266,7 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu, wr->pr &= !pan; } -static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par) { struct s1_walk_result wr = {}; struct s1_walk_info wi = {}; @@ -1246,6 +1291,11 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) srcu_read_unlock(&vcpu->kvm->srcu, idx); + /* + * Race to update a descriptor -- restart the walk. 
+ */ + if (ret == -EAGAIN) + return ret; if (ret) goto compute_par; @@ -1279,7 +1329,8 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false); compute_par: - return compute_par_s1(vcpu, &wi, &wr); + *par = compute_par_s1(vcpu, &wi, &wr); + return 0; } /* @@ -1407,9 +1458,10 @@ static bool par_check_s1_access_fault(u64 par) !(par & SYS_PAR_EL1_S)); } -void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr); + int ret; /* * If PAR_EL1 reports that AT failed on a S1 permission or access @@ -1421,15 +1473,20 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) */ if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par) && - !par_check_s1_access_fault(par)) - par = handle_at_slow(vcpu, op, vaddr); + !par_check_s1_access_fault(par)) { + ret = handle_at_slow(vcpu, op, vaddr, &par); + if (ret) + return ret; + } vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; } -void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { u64 par; + int ret; /* * We've trapped, so everything is live on the CPU. As we will be @@ -1476,13 +1533,17 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) } /* We failed the translation, let's replay it in slow motion */ - if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) - par = handle_at_slow(vcpu, op, vaddr); + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) { + ret = handle_at_slow(vcpu, op, vaddr, &par); + if (ret) + return ret; + } vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; } -void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { struct kvm_s2_trans out = {}; u64 ipa, par; @@ -1509,13 +1570,13 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) break; default: WARN_ON_ONCE(1); - return; + return 0; } __kvm_at_s1e01(vcpu, op, vaddr); par = vcpu_read_sys_reg(vcpu, PAR_EL1); if (par & SYS_PAR_EL1_F) - return; + return 0; /* * If we only have a single stage of translation (EL2&0), exit @@ -1523,14 +1584,14 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) */ if (compute_translation_regime(vcpu, op) == TR_EL20 || !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC))) - return; + return 0; /* Do the stage-2 translation */ ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0)); out.esr = 0; ret = kvm_walk_nested_s2(vcpu, ipa, &out); if (ret < 0) - return; + return ret; /* Check the access permission */ if (!out.esr && @@ -1539,6 +1600,7 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) par = compute_par_s12(vcpu, par, &out); vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; } /* @@ -1637,3 +1699,97 @@ int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level) return ret; } } + +#ifdef CONFIG_ARM64_LSE_ATOMICS +static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + u64 tmp = old; + int ret = 0; + + uaccess_enable_privileged(); + + asm volatile(__LSE_PREAMBLE + "1: cas %[old], %[new], %[addr]\n" + "2:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret]) + : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret) + : [new] "r" (new) + : "memory"); + + uaccess_disable_privileged(); + + if (ret) + return ret; + if (tmp != old) + return -EAGAIN; + + return ret; +} +#else +static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 
new) +{ + return -EINVAL; +} +#endif + +static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + int ret = 1; + u64 tmp; + + uaccess_enable_privileged(); + + asm volatile("prfm pstl1strm, %[addr]\n" + "1: ldxr %[tmp], %[addr]\n" + "sub %[tmp], %[tmp], %[old]\n" + "cbnz %[tmp], 3f\n" + "2: stlxr %w[ret], %[new], %[addr]\n" + "3:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret]) + _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret]) + : [ret] "+r" (ret), [addr] "+Q" (*ptep), [tmp] "=&r" (tmp) + : [old] "r" (old), [new] "r" (new) + : "memory"); + + uaccess_disable_privileged(); + + /* STLXR didn't update the descriptor, or the compare failed */ + if (ret == 1) + return -EAGAIN; + + return ret; +} + +int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new) +{ + struct kvm_memory_slot *slot; + unsigned long hva; + u64 __user *ptep; + bool writable; + int offset; + gfn_t gfn; + int r; + + lockdep_assert(srcu_read_lock_held(&kvm->srcu)); + + gfn = ipa >> PAGE_SHIFT; + offset = offset_in_page(ipa); + slot = gfn_to_memslot(kvm, gfn); + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + if (kvm_is_error_hva(hva)) + return -EINVAL; + if (!writable) + return -EPERM; + + ptep = (u64 __user *)hva + offset; + if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS)) + r = __lse_swap_desc(ptep, old, new); + else + r = __llsc_swap_desc(ptep, old, new); + + if (r < 0) + return r; + + mark_page_dirty_in_slot(kvm, slot, gfn); + return 0; +} diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 29430c031095..a7c689152f68 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -157,6 +157,7 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags; host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr; + host_cpu_if->vgic_vmcr = hyp_cpu_if->vgic_vmcr; for (i = 0; i < hyp_cpu_if->used_lrs; ++i) host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i]; } @@ -464,11 +465,11 @@ static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt) __vgic_v3_init_lrs(); } -static void handle___vgic_v3_save_vmcr_aprs(struct kvm_cpu_context *host_ctxt) +static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); - __vgic_v3_save_vmcr_aprs(kern_hyp_va(cpu_if)); + __vgic_v3_save_aprs(kern_hyp_va(cpu_if)); } static void handle___vgic_v3_restore_vmcr_aprs(struct kvm_cpu_context *host_ctxt) @@ -616,7 +617,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_tlb_flush_vmid_range), HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), - HANDLE_FUNC(__vgic_v3_save_vmcr_aprs), + HANDLE_FUNC(__vgic_v3_save_aprs), HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), HANDLE_FUNC(__pkvm_reserve_vm), HANDLE_FUNC(__pkvm_unreserve_vm), diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 43bde061b65d..8911338961c5 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -337,6 +337,9 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc /* CTR_EL0 is always under host control, even for protected VMs. 
*/ hyp_vm->kvm.arch.ctr_el0 = host_kvm->arch.ctr_el0; + /* Preserve the vgic model so that GICv3 emulation works */ + hyp_vm->kvm.arch.vgic.vgic_model = host_kvm->arch.vgic.vgic_model; + if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags)) set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 82da9b03692d..3108b5185c20 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -444,6 +444,8 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Scalable Vector Registers are restricted. */ + HOST_HANDLED(SYS_ICC_PMR_EL1), + RAZ_WI(SYS_ERRIDR_EL1), RAZ_WI(SYS_ERRSELR_EL1), RAZ_WI(SYS_ERXFR_EL1), @@ -457,9 +459,12 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Limited Ordering Regions Registers are restricted. */ + HOST_HANDLED(SYS_ICC_DIR_EL1), + HOST_HANDLED(SYS_ICC_RPR_EL1), HOST_HANDLED(SYS_ICC_SGI1R_EL1), HOST_HANDLED(SYS_ICC_ASGI1R_EL1), HOST_HANDLED(SYS_ICC_SGI0R_EL1), + HOST_HANDLED(SYS_ICC_CTLR_EL1), { SYS_DESC(SYS_ICC_SRE_EL1), .access = pvm_gic_read_sre, }, HOST_HANDLED(SYS_CCSIDR_EL1), diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index c351b4abd5db..947ac1a951a5 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -661,11 +661,37 @@ void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, #define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt)) +static int stage2_set_xn_attr(enum kvm_pgtable_prot prot, kvm_pte_t *attr) +{ + bool px, ux; + u8 xn; + + px = prot & KVM_PGTABLE_PROT_PX; + ux = prot & KVM_PGTABLE_PROT_UX; + + if (!cpus_have_final_cap(ARM64_HAS_XNX) && px != ux) + return -EINVAL; + + if (px && ux) + xn = 0b00; + else if (!px && ux) + xn = 0b01; + else if (!px && !ux) + xn = 0b10; + else + xn = 0b11; + + *attr &= ~KVM_PTE_LEAF_ATTR_HI_S2_XN; + *attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, xn); + return 0; +} + static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot, kvm_pte_t *ptep) { kvm_pte_t attr; u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; + int r; switch (prot & (KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC)) { @@ -685,8 +711,9 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p attr = KVM_S2_MEMATTR(pgt, NORMAL); } - if (!(prot & KVM_PGTABLE_PROT_X)) - attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; + r = stage2_set_xn_attr(prot, &attr); + if (r) + return r; if (prot & KVM_PGTABLE_PROT_R) attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; @@ -715,8 +742,20 @@ enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte) prot |= KVM_PGTABLE_PROT_R; if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W) prot |= KVM_PGTABLE_PROT_W; - if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN)) - prot |= KVM_PGTABLE_PROT_X; + + switch (FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, pte)) { + case 0b00: + prot |= KVM_PGTABLE_PROT_PX | KVM_PGTABLE_PROT_UX; + break; + case 0b01: + prot |= KVM_PGTABLE_PROT_UX; + break; + case 0b11: + prot |= KVM_PGTABLE_PROT_PX; + break; + default: + break; + } return prot; } @@ -1290,9 +1329,9 @@ bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags) { - int ret; + kvm_pte_t xn = 0, set = 0, clr = 0; s8 level; - kvm_pte_t set = 0, clr = 0; + int ret; if (prot & KVM_PTE_LEAF_ATTR_HI_SW) return -EINVAL; @@ -1303,8 +1342,12 @@ int 
kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, if (prot & KVM_PGTABLE_PROT_W) set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; - if (prot & KVM_PGTABLE_PROT_X) - clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; + ret = stage2_set_xn_attr(prot, &xn); + if (ret) + return ret; + + set |= xn & KVM_PTE_LEAF_ATTR_HI_S2_XN; + clr |= ~xn & KVM_PTE_LEAF_ATTR_HI_S2_XN; ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags); if (!ret || ret == -EAGAIN) @@ -1535,37 +1578,80 @@ size_t kvm_pgtable_stage2_pgd_size(u64 vtcr) return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; } -static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, - enum kvm_pgtable_walk_flags visit) +static int stage2_free_leaf(const struct kvm_pgtable_visit_ctx *ctx) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; - if (!stage2_pte_is_counted(ctx->old)) + mm_ops->put_page(ctx->ptep); + return 0; +} + +static int stage2_free_table_post(const struct kvm_pgtable_visit_ctx *ctx) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops); + + if (mm_ops->page_count(childp) != 1) return 0; + /* + * Drop references and clear the now stale PTE to avoid rewalking the + * freed page table. + */ mm_ops->put_page(ctx->ptep); + mm_ops->put_page(childp); + kvm_clear_pte(ctx->ptep); + return 0; +} - if (kvm_pte_table(ctx->old, ctx->level)) - mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops)); +static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + if (!stage2_pte_is_counted(ctx->old)) + return 0; - return 0; + switch (visit) { + case KVM_PGTABLE_WALK_LEAF: + return stage2_free_leaf(ctx); + case KVM_PGTABLE_WALK_TABLE_POST: + return stage2_free_table_post(ctx); + default: + return -EINVAL; + } } -void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, + u64 addr, u64 size) { - size_t pgd_sz; struct kvm_pgtable_walker walker = { .cb = stage2_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; - WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); + WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker)); +} + +void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt) +{ + size_t pgd_sz; + pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; - pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz); + + /* + * Since the pgtable is unlinked at this point, and not shared with + * other walkers, safely deference pgd with kvm_dereference_pteref_raw() + */ + pgt->mm_ops->free_pages_exact(kvm_dereference_pteref_raw(pgt->pgd), pgd_sz); pgt->pgd = NULL; } +void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +{ + kvm_pgtable_stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits)); + kvm_pgtable_stage2_destroy_pgd(pgt); +} + void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) { kvm_pteref_t ptep = (kvm_pteref_t)pgtable; diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c index 78579b31a420..5fd99763b54d 100644 --- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c +++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c @@ -63,6 +63,10 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) return -1; } + /* Handle deactivation as a normal exit */ + if ((fault_ipa - vgic->vgic_cpu_base) >= GIC_CPU_DEACTIVATE) + return 0; + rd = kvm_vcpu_dabt_get_rd(vcpu); addr = kvm_vgic_global_state.vcpu_hyp_va; addr += 
fault_ipa - vgic->vgic_cpu_base; diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index acd909b7f225..0b670a033fd8 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -14,6 +14,8 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> +#include "../../vgic/vgic.h" + #define vtr_to_max_lr_idx(v) ((v) & 0xf) #define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1) #define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5)) @@ -58,7 +60,7 @@ u64 __gic_v3_get_lr(unsigned int lr) unreachable(); } -static void __gic_v3_set_lr(u64 val, int lr) +void __gic_v3_set_lr(u64 val, int lr) { switch (lr & 0xf) { case 0: @@ -196,6 +198,11 @@ static u32 __vgic_v3_read_ap1rn(int n) return val; } +static u64 compute_ich_hcr(struct vgic_v3_cpu_if *cpu_if) +{ + return cpu_if->vgic_hcr | vgic_ich_hcr_trap_bits(); +} + void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) { u64 used_lrs = cpu_if->used_lrs; @@ -212,14 +219,12 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) } } - if (used_lrs || cpu_if->its_vpe.its_vm) { + if (used_lrs) { int i; u32 elrsr; elrsr = read_gicreg(ICH_ELRSR_EL2); - write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EL2_En, ICH_HCR_EL2); - for (i = 0; i < used_lrs; i++) { if (elrsr & (1 << i)) cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; @@ -229,6 +234,23 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) __gic_v3_set_lr(0, i); } } + + cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); + + if (cpu_if->vgic_hcr & ICH_HCR_EL2_LRENPIE) { + u64 val = read_gicreg(ICH_HCR_EL2); + cpu_if->vgic_hcr &= ~ICH_HCR_EL2_EOIcount; + cpu_if->vgic_hcr |= val & ICH_HCR_EL2_EOIcount; + } + + write_gicreg(0, ICH_HCR_EL2); + + /* + * Hack alert: On NV, this results in a trap so that the above write + * actually takes effect... No synchronisation is necessary, as we + * only care about the effects when this traps. + */ + read_gicreg(ICH_MISR_EL2); } void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) @@ -236,12 +258,10 @@ void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) u64 used_lrs = cpu_if->used_lrs; int i; - if (used_lrs || cpu_if->its_vpe.its_vm) { - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); + write_gicreg(compute_ich_hcr(cpu_if), ICH_HCR_EL2); - for (i = 0; i < used_lrs; i++) - __gic_v3_set_lr(cpu_if->vgic_lr[i], i); - } + for (i = 0; i < used_lrs; i++) + __gic_v3_set_lr(cpu_if->vgic_lr[i], i); /* * Ensure that writes to the LRs, and on non-VHE systems ensure that @@ -307,24 +327,20 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) } /* - * If we need to trap system registers, we must write - * ICH_HCR_EL2 anyway, even if no interrupts are being - * injected. Note that this also applies if we don't expect - * any system register access (no vgic at all). + * If we need to trap system registers, we must write ICH_HCR_EL2 + * anyway, even if no interrupts are being injected. Note that this + * also applies if we don't expect any system register access (no + * vgic at all). In any case, no need to provide MI configuration. 
*/ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); + write_gicreg(vgic_ich_hcr_trap_bits() | ICH_HCR_EL2_En, ICH_HCR_EL2); } void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) { u64 val; - if (!cpu_if->vgic_sre) { - cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); - } - /* Only restore SRE if the host implements the GICv2 interface */ if (static_branch_unlikely(&vgic_v3_has_v2_compat)) { val = read_gicreg(ICC_SRE_EL2); @@ -346,7 +362,7 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) write_gicreg(0, ICH_HCR_EL2); } -static void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) +void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) { u64 val; u32 nr_pre_bits; @@ -507,13 +523,6 @@ static void __vgic_v3_write_vmcr(u32 vmcr) write_gicreg(vmcr, ICH_VMCR_EL2); } -void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) -{ - __vgic_v3_save_aprs(cpu_if); - if (cpu_if->vgic_sre) - cpu_if->vgic_vmcr = __vgic_v3_read_vmcr(); -} - void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) { __vgic_v3_compat_mode_enable(); @@ -790,7 +799,7 @@ static void __vgic_v3_bump_eoicount(void) write_gicreg(hcr, ICH_HCR_EL2); } -static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +static int ___vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 vid = vcpu_get_reg(vcpu, rt); u64 lr_val; @@ -798,19 +807,25 @@ static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) /* EOImode == 0, nothing to be done here */ if (!(vmcr & ICH_VMCR_EOIM_MASK)) - return; + return 1; /* No deactivate to be performed on an LPI */ if (vid >= VGIC_MIN_LPI) - return; + return 1; lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); - if (lr == -1) { - __vgic_v3_bump_eoicount(); - return; + if (lr != -1) { + __vgic_v3_clear_active_lr(lr, lr_val); + return 1; } - __vgic_v3_clear_active_lr(lr, lr_val); + return 0; +} + +static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + if (!___vgic_v3_write_dir(vcpu, vmcr, rt)) + __vgic_v3_bump_eoicount(); } static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) @@ -1245,6 +1260,21 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) case SYS_ICC_DIR_EL1: if (unlikely(is_read)) return 0; + /* + * Full exit if required to handle overflow deactivation, + * unless we can emulate it in the LRs (likely the majority + * of the cases). + */ + if (vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr & ICH_HCR_EL2_TDIR) { + int ret; + + ret = ___vgic_v3_write_dir(vcpu, __vgic_v3_read_vmcr(), + kvm_vcpu_sys_get_rt(vcpu)); + if (ret) + __kvm_skip_instr(vcpu); + + return ret; + } fn = __vgic_v3_write_dir; break; case SYS_ICC_RPR_EL1: diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 7cc964af8d30..5ab0cfa08343 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -904,6 +904,38 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) return 0; } +/* + * Assume that @pgt is valid and unlinked from the KVM MMU to free the + * page-table without taking the kvm_mmu_lock and without performing any + * TLB invalidations. + * + * Also, the range of addresses can be large enough to cause need_resched + * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke + * cond_resched() periodically to prevent hogging the CPU for a long time + * and schedule something else, if required. 
+ */ +static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr, + phys_addr_t end) +{ + u64 next; + + do { + next = stage2_range_addr_end(addr, end); + KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr, + next - addr); + if (next != end) + cond_resched(); + } while (addr = next, addr != end); +} + +static void kvm_stage2_destroy(struct kvm_pgtable *pgt) +{ + unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr); + + stage2_destroy_range(pgt, 0, BIT(ia_bits)); + KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt); +} + /** * kvm_init_stage2_mmu - Initialise a S2 MMU structure * @kvm: The pointer to the KVM structure @@ -980,7 +1012,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t return 0; out_destroy_pgtable: - KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); + kvm_stage2_destroy(pgt); out_free_pgtable: kfree(pgt); return err; @@ -1081,7 +1113,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) write_unlock(&kvm->mmu_lock); if (pgt) { - KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); + kvm_stage2_destroy(pgt); kfree(pgt); } } @@ -1521,6 +1553,16 @@ static void adjust_nested_fault_perms(struct kvm_s2_trans *nested, *prot |= kvm_encode_nested_level(nested); } +static void adjust_nested_exec_perms(struct kvm *kvm, + struct kvm_s2_trans *nested, + enum kvm_pgtable_prot *prot) +{ + if (!kvm_s2_trans_exec_el0(kvm, nested)) + *prot &= ~KVM_PGTABLE_PROT_UX; + if (!kvm_s2_trans_exec_el1(kvm, nested)) + *prot &= ~KVM_PGTABLE_PROT_PX; +} + #define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED) static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, @@ -1572,11 +1614,12 @@ static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (writable) prot |= KVM_PGTABLE_PROT_W; - if (exec_fault || - (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) && - (!nested || kvm_s2_trans_executable(nested)))) + if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) prot |= KVM_PGTABLE_PROT_X; + if (nested) + adjust_nested_exec_perms(kvm, nested, &prot); + kvm_fault_lock(kvm); if (mmu_invalidate_retry(kvm, mmu_seq)) { ret = -EAGAIN; @@ -1851,11 +1894,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, prot |= KVM_PGTABLE_PROT_NORMAL_NC; else prot |= KVM_PGTABLE_PROT_DEVICE; - } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) && - (!nested || kvm_s2_trans_executable(nested))) { + } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) { prot |= KVM_PGTABLE_PROT_X; } + if (nested) + adjust_nested_exec_perms(kvm, nested, &prot); + /* * Under the premise of getting a FSC_PERM fault, we just need to relax * permissions only if vma_pagesize equals fault_granule. Otherwise, @@ -1899,8 +1944,48 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) read_unlock(&vcpu->kvm->mmu_lock); } +/* + * Returns true if the SEA should be handled locally within KVM if the abort + * is caused by a kernel memory allocation (e.g. stage-2 table memory). + */ +static bool host_owns_sea(struct kvm_vcpu *vcpu, u64 esr) +{ + /* + * Without FEAT_RAS HCR_EL2.TEA is RES0, meaning any external abort + * taken from a guest EL to EL2 is due to a host-imposed access (e.g. + * stage-2 PTW). + */ + if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) + return true; + + /* KVM owns the VNCR when the vCPU isn't in a nested context. 
*/ + if (is_hyp_ctxt(vcpu) && !kvm_vcpu_trap_is_iabt(vcpu) && (esr & ESR_ELx_VNCR)) + return true; + + /* + * Determining if an external abort during a table walk happened at + * stage-2 is only possible with S1PTW is set. Otherwise, since KVM + * sets HCR_EL2.TEA, SEAs due to a stage-1 walk (i.e. accessing the + * PA of the stage-1 descriptor) can reach here and are reported + * with a TTW ESR value. + */ + return (esr_fsc_is_sea_ttw(esr) && (esr & ESR_ELx_S1PTW)); +} + int kvm_handle_guest_sea(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; + struct kvm_run *run = vcpu->run; + u64 esr = kvm_vcpu_get_esr(vcpu); + u64 esr_mask = ESR_ELx_EC_MASK | + ESR_ELx_IL | + ESR_ELx_FnV | + ESR_ELx_EA | + ESR_ELx_CM | + ESR_ELx_WNR | + ESR_ELx_FSC; + u64 ipa; + /* * Give APEI the opportunity to claim the abort before handling it * within KVM. apei_claim_sea() expects to be called with IRQs enabled. @@ -1909,7 +1994,33 @@ int kvm_handle_guest_sea(struct kvm_vcpu *vcpu) if (apei_claim_sea(NULL) == 0) return 1; - return kvm_inject_serror(vcpu); + if (host_owns_sea(vcpu, esr) || + !test_bit(KVM_ARCH_FLAG_EXIT_SEA, &vcpu->kvm->arch.flags)) + return kvm_inject_serror(vcpu); + + /* ESR_ELx.SET is RES0 when FEAT_RAS isn't implemented. */ + if (kvm_has_ras(kvm)) + esr_mask |= ESR_ELx_SET_MASK; + + /* + * Exit to userspace, and provide faulting guest virtual and physical + * addresses in case userspace wants to emulate SEA to guest by + * writing to FAR_ELx and HPFAR_ELx registers. + */ + memset(&run->arm_sea, 0, sizeof(run->arm_sea)); + run->exit_reason = KVM_EXIT_ARM_SEA; + run->arm_sea.esr = esr & esr_mask; + + if (!(esr & ESR_ELx_FnV)) + run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu); + + ipa = kvm_vcpu_get_fault_ipa(vcpu); + if (ipa != INVALID_GPA) { + run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID; + run->arm_sea.gpa = ipa; + } + + return 0; } /** @@ -1999,6 +2110,11 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) u32 esr; ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans); + if (ret == -EAGAIN) { + ret = 1; + goto out_unlock; + } + if (ret) { esr = kvm_s2_trans_esr(&nested_trans); kvm_inject_s2_fault(vcpu, esr); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index f04cda40545b..911fc99ed99d 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -124,14 +124,13 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) } struct s2_walk_info { - int (*read_desc)(phys_addr_t pa, u64 *desc, void *data); - void *data; - u64 baddr; - unsigned int max_oa_bits; - unsigned int pgshift; - unsigned int sl; - unsigned int t0sz; - bool be; + u64 baddr; + unsigned int max_oa_bits; + unsigned int pgshift; + unsigned int sl; + unsigned int t0sz; + bool be; + bool ha; }; static u32 compute_fsc(int level, u32 fsc) @@ -199,6 +198,42 @@ static int check_output_size(struct s2_walk_info *wi, phys_addr_t output) return 0; } +static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc, + struct s2_walk_info *wi) +{ + u64 val; + int r; + + r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val)); + if (r) + return r; + + /* + * Handle reversedescriptors if endianness differs between the + * host and the guest hypervisor. 
+ */ + if (wi->be) + *desc = be64_to_cpu((__force __be64)val); + else + *desc = le64_to_cpu((__force __le64)val); + + return 0; +} + +static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u64 new, + struct s2_walk_info *wi) +{ + if (wi->be) { + old = (__force u64)cpu_to_be64(old); + new = (__force u64)cpu_to_be64(new); + } else { + old = (__force u64)cpu_to_le64(old); + new = (__force u64)cpu_to_le64(new); + } + + return __kvm_at_swap_desc(vcpu->kvm, pa, old, new); +} + /* * This is essentially a C-version of the pseudo code from the ARM ARM * AArch64.TranslationTableWalk function. I strongly recommend looking at @@ -206,13 +241,13 @@ static int check_output_size(struct s2_walk_info *wi, phys_addr_t output) * * Must be called with the kvm->srcu read lock held */ -static int walk_nested_s2_pgd(phys_addr_t ipa, +static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, struct s2_walk_info *wi, struct kvm_s2_trans *out) { int first_block_level, level, stride, input_size, base_lower_bound; phys_addr_t base_addr; unsigned int addr_top, addr_bottom; - u64 desc; /* page table entry */ + u64 desc, new_desc; /* page table entry */ int ret; phys_addr_t paddr; @@ -257,28 +292,30 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, >> (addr_bottom - 3); paddr = base_addr | index; - ret = wi->read_desc(paddr, &desc, wi->data); + ret = read_guest_s2_desc(vcpu, paddr, &desc, wi); if (ret < 0) return ret; - /* - * Handle reversedescriptors if endianness differs between the - * host and the guest hypervisor. - */ - if (wi->be) - desc = be64_to_cpu((__force __be64)desc); - else - desc = le64_to_cpu((__force __le64)desc); + new_desc = desc; /* Check for valid descriptor at this point */ - if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) { + if (!(desc & KVM_PTE_VALID)) { out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); out->desc = desc; return 1; } - /* We're at the final level or block translation level */ - if ((desc & 3) == 1 || level == 3) + if (FIELD_GET(KVM_PTE_TYPE, desc) == KVM_PTE_TYPE_BLOCK) { + if (level < 3) + break; + + out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); + out->desc = desc; + return 1; + } + + /* We're at the final level */ + if (level == 3) break; if (check_output_size(wi, desc)) { @@ -305,7 +342,18 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, return 1; } - if (!(desc & BIT(10))) { + if (wi->ha) + new_desc |= KVM_PTE_LEAF_ATTR_LO_S2_AF; + + if (new_desc != desc) { + ret = swap_guest_s2_desc(vcpu, paddr, desc, new_desc, wi); + if (ret) + return ret; + + desc = new_desc; + } + + if (!(desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) { out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS); out->desc = desc; return 1; @@ -318,20 +366,13 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, (ipa & GENMASK_ULL(addr_bottom - 1, 0)); out->output = paddr; out->block_size = 1UL << ((3 - level) * stride + wi->pgshift); - out->readable = desc & (0b01 << 6); - out->writable = desc & (0b10 << 6); + out->readable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; + out->writable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; out->level = level; out->desc = desc; return 0; } -static int read_guest_s2_desc(phys_addr_t pa, u64 *desc, void *data) -{ - struct kvm_vcpu *vcpu = data; - - return kvm_read_guest(vcpu->kvm, pa, desc, sizeof(*desc)); -} - static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi) { wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK; @@ -350,6 +391,8 @@ static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi) /* Global limit for now, should eventually be per-VM */ 
wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false)); + + wi->ha = vtcr & VTCR_EL2_HA; } int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, @@ -364,15 +407,13 @@ int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, if (!vcpu_has_nv(vcpu)) return 0; - wi.read_desc = read_guest_s2_desc; - wi.data = vcpu; wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); vtcr_to_walk_info(vtcr, &wi); wi.be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE; - ret = walk_nested_s2_pgd(gipa, &wi, result); + ret = walk_nested_s2_pgd(vcpu, gipa, &wi, result); if (ret) result->esr |= (kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC); @@ -788,7 +829,10 @@ int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans) return 0; if (kvm_vcpu_trap_is_iabt(vcpu)) { - forward_fault = !kvm_s2_trans_executable(trans); + if (vcpu_mode_priv(vcpu)) + forward_fault = !kvm_s2_trans_exec_el1(vcpu->kvm, trans); + else + forward_fault = !kvm_s2_trans_exec_el0(vcpu->kvm, trans); } else { bool write_fault = kvm_is_write_fault(vcpu); @@ -1555,12 +1599,13 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) case SYS_ID_AA64MMFR1_EL1: val &= ~(ID_AA64MMFR1_EL1_CMOW | ID_AA64MMFR1_EL1_nTLBPA | - ID_AA64MMFR1_EL1_ETS | - ID_AA64MMFR1_EL1_XNX | - ID_AA64MMFR1_EL1_HAFDBS); + ID_AA64MMFR1_EL1_ETS); + /* FEAT_E2H0 implies no VHE */ if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) val &= ~ID_AA64MMFR1_EL1_VH; + + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, AF); break; case SYS_ID_AA64MMFR2_EL1: diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 24f0f8a8c943..d7a0f69a9982 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -344,9 +344,16 @@ static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 e return 0; } -void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, + u64 addr, u64 size) { - __pkvm_pgtable_stage2_unmap(pgt, 0, ~(0ULL)); + __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); +} + +void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt) +{ + /* Expected to be called after all pKVM mappings have been released. 
*/ + WARN_ON_ONCE(!RB_EMPTY_ROOT(&pgt->pkvm_mappings.rb_root)); } int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c index dc5acfb00af9..6cbe018fd6fd 100644 --- a/arch/arm64/kvm/ptdump.c +++ b/arch/arm64/kvm/ptdump.c @@ -31,27 +31,46 @@ static const struct ptdump_prot_bits stage2_pte_bits[] = { .val = PTE_VALID, .set = " ", .clear = "F", - }, { + }, + { .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R, .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R, .set = "R", .clear = " ", - }, { + }, + { .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, .set = "W", .clear = " ", - }, { + }, + { .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, - .val = KVM_PTE_LEAF_ATTR_HI_S2_XN, - .set = "NX", - .clear = "x ", - }, { + .val = 0b00UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "px ux ", + }, + { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = 0b01UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "PXNux ", + }, + { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = 0b10UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "PXNUXN", + }, + { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = 0b11UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "px UXN", + }, + { .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF, .val = KVM_PTE_LEAF_ATTR_LO_S2_AF, .set = "AF", .clear = " ", - }, { + }, + { .mask = PMD_TYPE_MASK, .val = PMD_TYPE_SECT, .set = "BLK", diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index ec3fbe0b8d52..c8fd7c6a12a1 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -666,6 +666,21 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu, return true; } +static bool access_gic_dir(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); + + if (!p->is_write) + return undef_access(vcpu, p, r); + + vgic_v3_deactivate(vcpu, p->regval); + + return true; +} + static bool trap_raz_wi(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -3373,7 +3388,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, - { SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, + { SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir }, { SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi }, @@ -3770,7 +3785,8 @@ static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); - __kvm_at_s1e01(vcpu, op, p->regval); + if (__kvm_at_s1e01(vcpu, op, p->regval)) + return false; return true; } @@ -3787,7 +3803,8 @@ static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return false; } - __kvm_at_s1e2(vcpu, op, p->regval); + if (__kvm_at_s1e2(vcpu, op, p->regval)) + return false; return true; } @@ -3797,7 +3814,8 @@ static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); - __kvm_at_s12(vcpu, op, p->regval); + if (__kvm_at_s12(vcpu, op, p->regval)) + return false; return true; } @@ -4498,7 +4516,7 @@ static const struct sys_reg_desc cp15_regs[] = { { CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, - { 
CP15_SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir }, { CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index da62edbc1205..dc9f9db31026 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -198,6 +198,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); int i; + dist->active_spis = (atomic_t)ATOMIC_INIT(0); dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT); if (!dist->spis) return -ENOMEM; @@ -363,12 +364,12 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) return ret; } -static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu) +static void kvm_vgic_vcpu_reset(struct kvm_vcpu *vcpu) { if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_enable(vcpu); + vgic_v2_reset(vcpu); else - vgic_v3_enable(vcpu); + vgic_v3_reset(vcpu); } /* @@ -415,7 +416,7 @@ int vgic_init(struct kvm *kvm) } kvm_for_each_vcpu(idx, vcpu, kvm) - kvm_vgic_vcpu_enable(vcpu); + kvm_vgic_vcpu_reset(vcpu); ret = kvm_vgic_setup_default_irq_routing(kvm); if (ret) diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c index f25fccb1f8e6..406845b3117c 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c @@ -359,6 +359,16 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu, vgic_set_vmcr(vcpu, &vmcr); } +static void vgic_mmio_write_dir(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_deactivate(vcpu, val); + else + vgic_v3_deactivate(vcpu, val); +} + static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { @@ -482,6 +492,10 @@ static const struct vgic_register_region vgic_v2_cpu_registers[] = { REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT, vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_CPU_DEACTIVATE, + vgic_mmio_read_raz, vgic_mmio_write_dir, + vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, + 4, VGIC_ACCESS_32bit), }; unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev) @@ -494,6 +508,16 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev) return SZ_4K; } +unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev) +{ + dev->regions = vgic_v2_cpu_registers; + dev->nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers); + + kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); + + return KVM_VGIC_V2_CPU_SIZE; +} + int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) { const struct vgic_register_region *region; diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h index 5b490a4dfa5e..50dc80220b0f 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.h +++ b/arch/arm64/kvm/vgic/vgic-mmio.h @@ -213,6 +213,7 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, const u32 val); unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); +unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev); unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev); diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 381673f03c39..585491fbda80 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c 
@@ -9,6 +9,7 @@ #include <kvm/arm_vgic.h> #include <asm/kvm_mmu.h> +#include "vgic-mmio.h" #include "vgic.h" static inline void vgic_v2_write_lr(int lr, u32 val) @@ -26,11 +27,24 @@ void vgic_v2_init_lrs(void) vgic_v2_write_lr(i, 0); } -void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) +void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, + struct ap_list_summary *als) { struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; - cpuif->vgic_hcr |= GICH_HCR_UIE; + cpuif->vgic_hcr = GICH_HCR_EN; + + if (irqs_pending_outside_lrs(als)) + cpuif->vgic_hcr |= GICH_HCR_NPIE; + if (irqs_active_outside_lrs(als)) + cpuif->vgic_hcr |= GICH_HCR_LRENPIE; + if (irqs_outside_lrs(als)) + cpuif->vgic_hcr |= GICH_HCR_UIE; + + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & GICH_VMCR_ENABLE_GRP0_MASK) ? + GICH_HCR_VGrp0DIE : GICH_HCR_VGrp0EIE; + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & GICH_VMCR_ENABLE_GRP1_MASK) ? + GICH_HCR_VGrp1DIE : GICH_HCR_VGrp1EIE; } static bool lr_signals_eoi_mi(u32 lr_val) @@ -39,43 +53,23 @@ static bool lr_signals_eoi_mi(u32 lr_val) !(lr_val & GICH_LR_HW); } -/* - * transfer the content of the LRs back into the corresponding ap_list: - * - active bit is transferred as is - * - pending bit is - * - transferred as is in case of edge sensitive IRQs - * - set to the line-level (resample time) for level sensitive IRQs - */ -void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) +static void vgic_v2_fold_lr(struct kvm_vcpu *vcpu, u32 val) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; - int lr; - - DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - - cpuif->vgic_hcr &= ~GICH_HCR_UIE; - - for (lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) { - u32 val = cpuif->vgic_lr[lr]; - u32 cpuid, intid = val & GICH_LR_VIRTUALID; - struct vgic_irq *irq; - bool deactivated; - - /* Extract the source vCPU id from the LR */ - cpuid = val & GICH_LR_PHYSID_CPUID; - cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; - cpuid &= 7; + u32 cpuid, intid = val & GICH_LR_VIRTUALID; + struct vgic_irq *irq; + bool deactivated; - /* Notify fds when the guest EOI'ed a level-triggered SPI */ - if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); + /* Extract the source vCPU id from the LR */ + cpuid = FIELD_GET(GICH_LR_PHYSID_CPUID, val) & 7; - irq = vgic_get_vcpu_irq(vcpu, intid); + /* Notify fds when the guest EOI'ed a level-triggered SPI */ + if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); - raw_spin_lock(&irq->irq_lock); + irq = vgic_get_vcpu_irq(vcpu, intid); + scoped_guard(raw_spinlock, &irq->irq_lock) { /* Always preserve the active bit, note deactivation */ deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT); irq->active = !!(val & GICH_LR_ACTIVE_BIT); @@ -101,29 +95,139 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) /* Handle resampling for mapped interrupts if required */ vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT); - raw_spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); + irq->on_lr = false; } - cpuif->used_lrs = 0; + vgic_put_irq(vcpu->kvm, irq); } +static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq); + /* - * Populates the particular LR with the state of a given IRQ: - * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq - * - for a level sensitive IRQ the pending state value is unchanged; - * it is dictated directly by 
the input level - * - * If @irq describes an SGI with multiple sources, we choose the - * lowest-numbered source VCPU and clear that bit in the source bitmap. - * - * The irq_lock must be held by the caller. + * transfer the content of the LRs back into the corresponding ap_list: + * - active bit is transferred as is + * - pending bit is + * - transferred as is in case of edge sensitive IRQs + * - set to the line-level (resample time) for level sensitive IRQs */ -void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; + u32 eoicount = FIELD_GET(GICH_HCR_EOICOUNT, cpuif->vgic_hcr); + struct vgic_irq *irq; + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + + for (int lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) + vgic_v2_fold_lr(vcpu, cpuif->vgic_lr[lr]); + + /* See the GICv3 equivalent for the EOIcount handling rationale */ + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + u32 lr; + + if (!eoicount) { + break; + } else { + guard(raw_spinlock)(&irq->irq_lock); + + if (!(likely(vgic_target_oracle(irq) == vcpu) && + irq->active)) + continue; + + lr = vgic_v2_compute_lr(vcpu, irq) & ~GICH_LR_ACTIVE_BIT; + } + + if (lr & GICH_LR_HW) + writel_relaxed(FIELD_GET(GICH_LR_PHYSID_CPUID, lr), + kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE); + vgic_v2_fold_lr(vcpu, lr); + eoicount--; + } + + cpuif->used_lrs = 0; +} + +void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; + struct kvm_vcpu *target_vcpu = NULL; + bool mmio = false; + struct vgic_irq *irq; + unsigned long flags; + u64 lr = 0; + u8 cpuid; + + /* Snapshot CPUID, and remove it from the INTID */ + cpuid = FIELD_GET(GENMASK_ULL(12, 10), val); + val &= ~GENMASK_ULL(12, 10); + + /* We only deal with DIR when EOIMode==1 */ + if (!(cpuif->vgic_vmcr & GICH_VMCR_EOI_MODE_MASK)) + return; + + /* Make sure we're in the same context as LR handling */ + local_irq_save(flags); + + irq = vgic_get_vcpu_irq(vcpu, val); + if (WARN_ON_ONCE(!irq)) + goto out; + + /* See the corresponding v3 code for the rationale */ + scoped_guard(raw_spinlock, &irq->irq_lock) { + target_vcpu = irq->vcpu; + + /* Not on any ap_list? */ + if (!target_vcpu) + goto put; + + /* + * Urgh. We're deactivating something that we cannot + * observe yet... Big hammer time. + */ + if (irq->on_lr) { + mmio = true; + goto put; + } + + /* SGI: check that the cpuid matches */ + if (val < VGIC_NR_SGIS && irq->active_source != cpuid) { + target_vcpu = NULL; + goto put; + } + + /* (with a Dalek voice) DEACTIVATE!!!! 
*/ + lr = vgic_v2_compute_lr(vcpu, irq) & ~GICH_LR_ACTIVE_BIT; + } + + if (lr & GICH_LR_HW) + writel_relaxed(FIELD_GET(GICH_LR_PHYSID_CPUID, lr), + kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE); + + vgic_v2_fold_lr(vcpu, lr); + +put: + vgic_put_irq(vcpu->kvm, irq); + +out: + local_irq_restore(flags); + + if (mmio) + vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32)); + + /* Force the ap_list to be pruned */ + if (target_vcpu) + kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu); +} + +static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { u32 val = irq->intid; bool allow_pending = true; + WARN_ON(irq->on_lr); + if (irq->active) { val |= GICH_LR_ACTIVE_BIT; if (vgic_irq_is_sgi(irq->intid)) @@ -163,22 +267,52 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) if (allow_pending && irq_is_pending(irq)) { val |= GICH_LR_PENDING_BIT; - if (irq->config == VGIC_CONFIG_EDGE) - irq->pending_latch = false; - if (vgic_irq_is_sgi(irq->intid)) { u32 src = ffs(irq->source); if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", irq->intid)) - return; + return 0; val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; - irq->source &= ~(1 << (src - 1)); - if (irq->source) { - irq->pending_latch = true; + if (irq->source & ~BIT(src - 1)) val |= GICH_LR_EOI; - } + } + } + + /* The GICv2 LR only holds five bits of priority. */ + val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; + + return val; +} + +/* + * Populates the particular LR with the state of a given IRQ: + * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq + * - for a level sensitive IRQ the pending state value is unchanged; + * it is dictated directly by the input level + * + * If @irq describes an SGI with multiple sources, we choose the + * lowest-numbered source VCPU and clear that bit in the source bitmap. + * + * The irq_lock must be held by the caller. + */ +void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +{ + u32 val = vgic_v2_compute_lr(vcpu, irq); + + vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val; + + if (val & GICH_LR_PENDING_BIT) { + if (irq->config == VGIC_CONFIG_EDGE) + irq->pending_latch = false; + + if (vgic_irq_is_sgi(irq->intid)) { + u32 src = ffs(irq->source); + + irq->source &= ~BIT(src - 1); + if (irq->source) + irq->pending_latch = true; } } @@ -194,7 +328,7 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) /* The GICv2 LR only holds five bits of priority. */ val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; - vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val; + irq->on_lr = true; } void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr) @@ -257,7 +391,7 @@ void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT; } -void vgic_v2_enable(struct kvm_vcpu *vcpu) +void vgic_v2_reset(struct kvm_vcpu *vcpu) { /* * By forcing VMCR to zero, the GIC will restore the binary @@ -265,9 +399,6 @@ void vgic_v2_enable(struct kvm_vcpu *vcpu) * anyway. */ vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; - - /* Get the show on the road... 
*/ - vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; } /* check for overlapping regions and for regions crossing the end of memory */ @@ -289,6 +420,7 @@ static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base) int vgic_v2_map_resources(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; + unsigned int len; int ret = 0; if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || @@ -312,10 +444,20 @@ int vgic_v2_map_resources(struct kvm *kvm) return ret; } + len = vgic_v2_init_cpuif_iodev(&dist->cpuif_iodev); + dist->cpuif_iodev.base_addr = dist->vgic_cpu_base; + dist->cpuif_iodev.iodev_type = IODEV_CPUIF; + dist->cpuif_iodev.redist_vcpu = NULL; + + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist->vgic_cpu_base, + len, &dist->cpuif_iodev.dev); + if (ret) + return ret; + if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) { ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base, kvm_vgic_global_state.vcpu_base, - KVM_VGIC_V2_CPU_SIZE, true); + KVM_VGIC_V2_CPU_SIZE - SZ_4K, true); if (ret) { kvm_err("Unable to remap VGIC CPU to VCPU\n"); return ret; @@ -385,6 +527,7 @@ int vgic_v2_probe(const struct gic_kvm_info *info) kvm_vgic_global_state.can_emulate_gicv2 = true; kvm_vgic_global_state.vcpu_base = info->vcpu.start; + kvm_vgic_global_state.gicc_base = info->gicc_base; kvm_vgic_global_state.type = VGIC_V2; kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS; @@ -423,16 +566,26 @@ static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) void vgic_v2_save_state(struct kvm_vcpu *vcpu) { + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; void __iomem *base = kvm_vgic_global_state.vctrl_base; u64 used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs; if (!base) return; - if (used_lrs) { + cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); + + if (used_lrs) save_lrs(vcpu, base); - writel_relaxed(0, base + GICH_HCR); + + if (cpu_if->vgic_hcr & GICH_HCR_LRENPIE) { + u32 val = readl_relaxed(base + GICH_HCR); + + cpu_if->vgic_hcr &= ~GICH_HCR_EOICOUNT; + cpu_if->vgic_hcr |= val & GICH_HCR_EOICOUNT; } + + writel_relaxed(0, base + GICH_HCR); } void vgic_v2_restore_state(struct kvm_vcpu *vcpu) @@ -445,13 +598,10 @@ void vgic_v2_restore_state(struct kvm_vcpu *vcpu) if (!base) return; - if (used_lrs) { - writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR); - for (i = 0; i < used_lrs; i++) { - writel_relaxed(cpu_if->vgic_lr[i], - base + GICH_LR0 + (i * 4)); - } - } + writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR); + + for (i = 0; i < used_lrs; i++) + writel_relaxed(cpu_if->vgic_lr[i], base + GICH_LR0 + (i * 4)); } void vgic_v2_load(struct kvm_vcpu *vcpu) @@ -468,6 +618,5 @@ void vgic_v2_put(struct kvm_vcpu *vcpu) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR); } diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index 7f1259b49c50..61b44f3f2bf1 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -70,13 +70,14 @@ static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx) * - on L2 put: perform the inverse transformation, so that the result of L2 * running becomes visible to L1 in the VNCR-accessible registers. * - * - there is nothing to do on L2 entry, as everything will have happened - * on load. 
However, this is the point where we detect that an interrupt - * targeting L1 and prepare the grand switcheroo. + * - there is nothing to do on L2 entry apart from enabling the vgic, as + * everything will have happened on load. However, this is the point where + * we detect that an interrupt targeting L1 and prepare the grand + * switcheroo. * - * - on L2 exit: emulate the HW bit, and deactivate corresponding the L1 - * interrupt. The L0 active state will be cleared by the HW if the L1 - * interrupt was itself backed by a HW interrupt. + * - on L2 exit: resync the LRs and VMCR, emulate the HW bit, and deactivate + * corresponding the L1 interrupt. The L0 active state will be cleared by + * the HW if the L1 interrupt was itself backed by a HW interrupt. * * Maintenance Interrupt (MI) management: * @@ -93,8 +94,10 @@ static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx) * * - because most of the ICH_*_EL2 registers live in the VNCR page, the * quality of emulation is poor: L1 can setup the vgic so that an MI would - * immediately fire, and not observe anything until the next exit. Trying - * to read ICH_MISR_EL2 would do the trick, for example. + * immediately fire, and not observe anything until the next exit. + * Similarly, a pending MI is not immediately disabled by clearing + * ICH_HCR_EL2.En. Trying to read ICH_MISR_EL2 would do the trick, for + * example. * * System register emulation: * @@ -265,16 +268,37 @@ static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu, s_cpu_if->used_lrs = hweight16(shadow_if->lr_map); } +void vgic_v3_flush_nested(struct kvm_vcpu *vcpu) +{ + u64 val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + + write_sysreg_s(val | vgic_ich_hcr_trap_bits(), SYS_ICH_HCR_EL2); +} + void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) { struct shadow_if *shadow_if = get_shadow_if(); int i; for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { - u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); - struct vgic_irq *irq; + u64 val, host_lr, lr; + + host_lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i)); + + /* Propagate the new LR state */ + lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + val = lr & ~ICH_LR_STATE; + val |= host_lr & ICH_LR_STATE; + __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val); - if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE)) + /* + * Deactivation of a HW interrupt: the LR must have the HW + * bit set, have been in a non-invalid state before the run, + * and now be in an invalid state. If any of that doesn't + * hold, we're done with this LR. + */ + if (!((lr & ICH_LR_HW) && (lr & ICH_LR_STATE) && + !(host_lr & ICH_LR_STATE))) continue; /* @@ -282,35 +306,27 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) * need to emulate the HW effect between the guest hypervisor * and the nested guest. 
*/ - irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); - if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */ - continue; + vgic_v3_deactivate(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + } - lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i)); - if (!(lr & ICH_LR_STATE)) - irq->active = false; + /* We need these to be synchronised to generate the MI */ + __vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, read_sysreg_s(SYS_ICH_VMCR_EL2)); + __vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, &=, ~ICH_HCR_EL2_EOIcount); + __vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, |=, read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_EOIcount); - vgic_put_irq(vcpu->kvm, irq); - } + write_sysreg_s(0, SYS_ICH_HCR_EL2); + isb(); + + vgic_v3_nested_update_mi(vcpu); } static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu, struct vgic_v3_cpu_if *s_cpu_if) { struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 val = 0; int i; - /* - * If we're on a system with a broken vgic that requires - * trapping, propagate the trapping requirements. - * - * Ah, the smell of rotten fruits... - */ - if (static_branch_unlikely(&vgic_v3_cpuif_trap)) - val = host_if->vgic_hcr & (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 | - ICH_HCR_EL2_TC | ICH_HCR_EL2_TDIR); - s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) | val; + s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2); s_cpu_if->vgic_sre = host_if->vgic_sre; @@ -334,7 +350,8 @@ void vgic_v3_load_nested(struct kvm_vcpu *vcpu) __vgic_v3_restore_vmcr_aprs(cpu_if); __vgic_v3_activate_traps(cpu_if); - __vgic_v3_restore_state(cpu_if); + for (int i = 0; i < cpu_if->used_lrs; i++) + __gic_v3_set_lr(cpu_if->vgic_lr[i], i); /* * Propagate the number of used LRs for the benefit of the HYP @@ -347,36 +364,19 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) { struct shadow_if *shadow_if = get_shadow_if(); struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif; - u64 val; int i; - __vgic_v3_save_vmcr_aprs(s_cpu_if); - __vgic_v3_deactivate_traps(s_cpu_if); - __vgic_v3_save_state(s_cpu_if); - - /* - * Translate the shadow state HW fields back to the virtual ones - * before copying the shadow struct back to the nested one. 
- */ - val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); - val &= ~ICH_HCR_EL2_EOIcount_MASK; - val |= (s_cpu_if->vgic_hcr & ICH_HCR_EL2_EOIcount_MASK); - __vcpu_assign_sys_reg(vcpu, ICH_HCR_EL2, val); - __vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, s_cpu_if->vgic_vmcr); + __vgic_v3_save_aprs(s_cpu_if); for (i = 0; i < 4; i++) { __vcpu_assign_sys_reg(vcpu, ICH_AP0RN(i), s_cpu_if->vgic_ap0r[i]); __vcpu_assign_sys_reg(vcpu, ICH_AP1RN(i), s_cpu_if->vgic_ap1r[i]); } - for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { - val = __vcpu_sys_reg(vcpu, ICH_LRN(i)); - - val &= ~ICH_LR_STATE; - val |= s_cpu_if->vgic_lr[lr_map_idx_to_shadow_idx(shadow_if, i)] & ICH_LR_STATE; + for (i = 0; i < s_cpu_if->used_lrs; i++) + __gic_v3_set_lr(0, i); - __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val); - } + __vgic_v3_deactivate_traps(s_cpu_if); vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0; } diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 2f75ef14d339..1d6dd1b545bd 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -12,6 +12,7 @@ #include <asm/kvm_mmu.h> #include <asm/kvm_asm.h> +#include "vgic-mmio.h" #include "vgic.h" static bool group0_trap; @@ -20,11 +21,48 @@ static bool common_trap; static bool dir_trap; static bool gicv4_enable; -void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) +void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, + struct ap_list_summary *als) { struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; - cpuif->vgic_hcr |= ICH_HCR_EL2_UIE; + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + cpuif->vgic_hcr = ICH_HCR_EL2_En; + + if (irqs_pending_outside_lrs(als)) + cpuif->vgic_hcr |= ICH_HCR_EL2_NPIE; + if (irqs_active_outside_lrs(als)) + cpuif->vgic_hcr |= ICH_HCR_EL2_LRENPIE; + if (irqs_outside_lrs(als)) + cpuif->vgic_hcr |= ICH_HCR_EL2_UIE; + + if (!als->nr_sgi) + cpuif->vgic_hcr |= ICH_HCR_EL2_vSGIEOICount; + + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG0_MASK) ? + ICH_HCR_EL2_VGrp0DIE : ICH_HCR_EL2_VGrp0EIE; + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG1_MASK) ? + ICH_HCR_EL2_VGrp1DIE : ICH_HCR_EL2_VGrp1EIE; + + /* + * Dealing with EOImode=1 is a massive source of headache. Not + * only do we need to track that we have active interrupts + * outside of the LRs and force DIR to be trapped, we also + * need to deal with SPIs that can be deactivated on another + * CPU. + * + * On systems that do not implement TDIR, force the bit in the + * shadow state anyway to avoid IPI-ing on these poor sods. + * + * Note that we set the trap irrespective of EOIMode, as that + * can change behind our back without any warning... + */ + if (!cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR) || + irqs_active_outside_lrs(als) || + atomic_read(&vcpu->kvm->arch.vgic.active_spis)) + cpuif->vgic_hcr |= ICH_HCR_EL2_TDIR; } static bool lr_signals_eoi_mi(u64 lr_val) @@ -33,84 +71,238 @@ static bool lr_signals_eoi_mi(u64 lr_val) !(lr_val & ICH_LR_HW); } +static void vgic_v3_fold_lr(struct kvm_vcpu *vcpu, u64 val) +{ + struct vgic_irq *irq; + bool is_v2_sgi = false; + bool deactivated; + u32 intid; + + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + intid = val & ICH_LR_VIRTUAL_ID_MASK; + } else { + intid = val & GICH_LR_VIRTUALID; + is_v2_sgi = vgic_irq_is_sgi(intid); + } + + irq = vgic_get_vcpu_irq(vcpu, intid); + if (!irq) /* An LPI could have been unmapped. 
*/ + return; + + scoped_guard(raw_spinlock, &irq->irq_lock) { + /* Always preserve the active bit for !LPIs, note deactivation */ + if (irq->intid >= VGIC_MIN_LPI) + val &= ~ICH_LR_ACTIVE_BIT; + deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT); + irq->active = !!(val & ICH_LR_ACTIVE_BIT); + + /* Edge is the only case where we preserve the pending bit */ + if (irq->config == VGIC_CONFIG_EDGE && + (val & ICH_LR_PENDING_BIT)) + irq->pending_latch = true; + + /* + * Clear soft pending state when level irqs have been acked. + */ + if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) + irq->pending_latch = false; + + if (is_v2_sgi) { + u8 cpuid = FIELD_GET(GICH_LR_PHYSID_CPUID, val); + + if (irq->active) + irq->active_source = cpuid; + + if (val & ICH_LR_PENDING_BIT) + irq->source |= BIT(cpuid); + } + + /* Handle resampling for mapped interrupts if required */ + vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT); + + irq->on_lr = false; + } + + /* Notify fds when the guest EOI'ed a level-triggered SPI, and drop the refcount */ + if (deactivated && lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) { + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + atomic_dec_if_positive(&vcpu->kvm->arch.vgic.active_spis); + } + + vgic_put_irq(vcpu->kvm, irq); +} + +static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq); + +static void vgic_v3_deactivate_phys(u32 intid) +{ + if (cpus_have_final_cap(ARM64_HAS_GICV5_LEGACY)) + gic_insn(intid | FIELD_PREP(GICV5_GIC_CDDI_TYPE_MASK, 1), CDDI); + else + gic_write_dir(intid); +} + void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; - u32 model = vcpu->kvm->arch.vgic.vgic_model; - int lr; + u32 eoicount = FIELD_GET(ICH_HCR_EL2_EOIcount, cpuif->vgic_hcr); + struct vgic_irq *irq; DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - cpuif->vgic_hcr &= ~ICH_HCR_EL2_UIE; - - for (lr = 0; lr < cpuif->used_lrs; lr++) { - u64 val = cpuif->vgic_lr[lr]; - u32 intid, cpuid; - struct vgic_irq *irq; - bool is_v2_sgi = false; - bool deactivated; + for (int lr = 0; lr < cpuif->used_lrs; lr++) + vgic_v3_fold_lr(vcpu, cpuif->vgic_lr[lr]); - cpuid = val & GICH_LR_PHYSID_CPUID; - cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; + /* + * EOIMode=0: use EOIcount to emulate deactivation. We are + * guaranteed to deactivate in reverse order of the activation, so + * just pick one active interrupt after the other in the ap_list, + * and replay the deactivation as if the CPU was doing it. We also + * rely on priority drop to have taken place, and the list to be + * sorted by priority. + */ + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + u64 lr; - if (model == KVM_DEV_TYPE_ARM_VGIC_V3) { - intid = val & ICH_LR_VIRTUAL_ID_MASK; + /* + * I would have loved to write this using a scoped_guard(), + * but using 'continue' here is a total train wreck. 
+ */ + if (!eoicount) { + break; + } else { - intid = val & GICH_LR_VIRTUALID; - is_v2_sgi = vgic_irq_is_sgi(intid); + guard(raw_spinlock)(&irq->irq_lock); + + if (!(likely(vgic_target_oracle(irq) == vcpu) && + irq->active)) + continue; + + lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT; } - /* Notify fds when the guest EOI'ed a level-triggered IRQ */ - if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); + if (lr & ICH_LR_HW) + vgic_v3_deactivate_phys(FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); - irq = vgic_get_vcpu_irq(vcpu, intid); - if (!irq) /* An LPI could have been unmapped. */ - continue; + vgic_v3_fold_lr(vcpu, lr); + eoicount--; + } - raw_spin_lock(&irq->irq_lock); + cpuif->used_lrs = 0; +} - /* Always preserve the active bit, note deactivation */ - deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT); - irq->active = !!(val & ICH_LR_ACTIVE_BIT); +void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; + u32 model = vcpu->kvm->arch.vgic.vgic_model; + struct kvm_vcpu *target_vcpu = NULL; + bool mmio = false, is_v2_sgi; + struct vgic_irq *irq; + unsigned long flags; + u64 lr = 0; + u8 cpuid; - if (irq->active && is_v2_sgi) - irq->active_source = cpuid; + /* Snapshot CPUID, and remove it from the INTID */ + cpuid = FIELD_GET(GENMASK_ULL(12, 10), val); + val &= ~GENMASK_ULL(12, 10); - /* Edge is the only case where we preserve the pending bit */ - if (irq->config == VGIC_CONFIG_EDGE && - (val & ICH_LR_PENDING_BIT)) { - irq->pending_latch = true; + is_v2_sgi = (model == KVM_DEV_TYPE_ARM_VGIC_V2 && + val < VGIC_NR_SGIS); - if (is_v2_sgi) - irq->source |= (1 << cpuid); - } + /* + * We only deal with DIR when EOIMode==1, and only for SGI, + * PPI or SPI. + */ + if (!(cpuif->vgic_vmcr & ICH_VMCR_EOIM_MASK) || + val >= vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS) + return; + + /* Make sure we're in the same context as LR handling */ + local_irq_save(flags); + + irq = vgic_get_vcpu_irq(vcpu, val); + if (WARN_ON_ONCE(!irq)) + goto out; + + /* + * EOIMode=1: we must rely on traps to handle deactivation of + * overflowing interrupts, as there is no ordering guarantee and + * EOIcount isn't being incremented. Priority drop will have taken + * place, as ICV_EOIxR_EL1 only affects the APRs and not the LRs. + * + * Three possibilities: + * + * - The irq is not queued on any CPU, and there is nothing to + * do, + * + * - Or the irq is in an LR, meaning that its state is not + * directly observable. Treat it bluntly by making it as if + * this was a write to GICD_ICACTIVER, which will force an + * exit on all vcpus. If it hurts, don't do that. + * + * - Or the irq is active, but not in an LR, and we can + * directly deactivate it by building a pseudo-LR, fold it, + * and queue a request to prune the resulting ap_list, + * + * Special care must be taken to match the source CPUID when + * deactivating a GICv2 SGI. + */ + scoped_guard(raw_spinlock, &irq->irq_lock) { + target_vcpu = irq->vcpu; + + /* Not on any ap_list? */ + if (!target_vcpu) + goto put; /* - * Clear soft pending state when level irqs have been acked. + * Urgh. We're deactivating something that we cannot + * observe yet... Big hammer time. 
*/ - if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) - irq->pending_latch = false; + if (irq->on_lr) { + mmio = true; + goto put; + } - /* Handle resampling for mapped interrupts if required */ - vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT); + /* GICv2 SGI: check that the cpuid matches */ + if (is_v2_sgi && irq->active_source != cpuid) { + target_vcpu = NULL; + goto put; + } - raw_spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); + /* (with a Dalek voice) DEACTIVATE!!!! */ + lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT; } - cpuif->used_lrs = 0; + if (lr & ICH_LR_HW) + vgic_v3_deactivate_phys(FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + + vgic_v3_fold_lr(vcpu, lr); + +put: + vgic_put_irq(vcpu->kvm, irq); + +out: + local_irq_restore(flags); + + if (mmio) + vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32)); + + /* Force the ap_list to be pruned */ + if (target_vcpu) + kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu); } /* Requires the irq to be locked already */ -void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { u32 model = vcpu->kvm->arch.vgic.vgic_model; u64 val = irq->intid; bool allow_pending = true, is_v2_sgi; + WARN_ON(irq->on_lr); + is_v2_sgi = (vgic_irq_is_sgi(irq->intid) && model == KVM_DEV_TYPE_ARM_VGIC_V2); @@ -150,6 +342,35 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) if (allow_pending && irq_is_pending(irq)) { val |= ICH_LR_PENDING_BIT; + if (is_v2_sgi) { + u32 src = ffs(irq->source); + + if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", + irq->intid)) + return 0; + + val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; + if (irq->source & ~BIT(src - 1)) + val |= ICH_LR_EOI; + } + } + + if (irq->group) + val |= ICH_LR_GROUP; + + val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; + + return val; +} + +void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +{ + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u64 val = vgic_v3_compute_lr(vcpu, irq); + + vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; + + if (val & ICH_LR_PENDING_BIT) { if (irq->config == VGIC_CONFIG_EDGE) irq->pending_latch = false; @@ -157,16 +378,9 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) model == KVM_DEV_TYPE_ARM_VGIC_V2) { u32 src = ffs(irq->source); - if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", - irq->intid)) - return; - - val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; - irq->source &= ~(1 << (src - 1)); - if (irq->source) { + irq->source &= ~BIT(src - 1); + if (irq->source) irq->pending_latch = true; - val |= ICH_LR_EOI; - } } } @@ -179,12 +393,7 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) irq->line_level = false; - if (irq->group) - val |= ICH_LR_GROUP; - - val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; - - vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; + irq->on_lr = true; } void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) @@ -258,7 +467,7 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \ GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)) -void vgic_v3_enable(struct kvm_vcpu *vcpu) +void vgic_v3_reset(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; @@ -288,9 +497,6 @@ void 
vgic_v3_enable(struct kvm_vcpu *vcpu) kvm_vgic_global_state.ich_vtr_el2); vcpu->arch.vgic_cpu.num_pri_bits = FIELD_GET(ICH_VTR_EL2_PRIbits, kvm_vgic_global_state.ich_vtr_el2) + 1; - - /* Get the show on the road... */ - vgic_v3->vgic_hcr = ICH_HCR_EL2_En; } void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) @@ -302,20 +508,9 @@ void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) /* Hide GICv3 sysreg if necessary */ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2 || - !irqchip_in_kernel(vcpu->kvm)) { + !irqchip_in_kernel(vcpu->kvm)) vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 | ICH_HCR_EL2_TC); - return; - } - - if (group0_trap) - vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL0; - if (group1_trap) - vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL1; - if (common_trap) - vgic_v3->vgic_hcr |= ICH_HCR_EL2_TC; - if (dir_trap) - vgic_v3->vgic_hcr |= ICH_HCR_EL2_TDIR; } int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) @@ -636,8 +831,53 @@ static const struct midr_range broken_seis[] = { static bool vgic_v3_broken_seis(void) { - return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_SEIS) && - is_midr_in_range_list(broken_seis)); + return (is_kernel_in_hyp_mode() && + is_midr_in_range_list(broken_seis) && + (read_sysreg_s(SYS_ICH_VTR_EL2) & ICH_VTR_EL2_SEIS)); +} + +void noinstr kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, + int nr_inst) +{ + u32 insn, oinsn, rd; + u64 hcr = 0; + + if (cpus_have_cap(ARM64_WORKAROUND_CAVIUM_30115)) { + group0_trap = true; + group1_trap = true; + } + + if (vgic_v3_broken_seis()) { + /* We know that these machines have ICH_HCR_EL2.TDIR */ + group0_trap = true; + group1_trap = true; + dir_trap = true; + } + + if (!cpus_have_cap(ARM64_HAS_ICH_HCR_EL2_TDIR)) + common_trap = true; + + if (group0_trap) + hcr |= ICH_HCR_EL2_TALL0; + if (group1_trap) + hcr |= ICH_HCR_EL2_TALL1; + if (common_trap) + hcr |= ICH_HCR_EL2_TC; + if (dir_trap) + hcr |= ICH_HCR_EL2_TDIR; + + /* Compute target register */ + oinsn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, oinsn); + + /* movz rd, #(val & 0xffff) */ + insn = aarch64_insn_gen_movewide(rd, + (u16)hcr, + 0, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_ZERO); + *updptr = cpu_to_le32(insn); } /** @@ -651,6 +891,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info) { u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config); bool has_v2; + u64 traps; int ret; has_v2 = ich_vtr_el2 >> 63; @@ -709,29 +950,18 @@ int vgic_v3_probe(const struct gic_kvm_info *info) if (has_v2) static_branch_enable(&vgic_v3_has_v2_compat); - if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_30115)) { - group0_trap = true; - group1_trap = true; - } - if (vgic_v3_broken_seis()) { kvm_info("GICv3 with broken locally generated SEI\n"); - kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_EL2_SEIS; - group0_trap = true; - group1_trap = true; - if (ich_vtr_el2 & ICH_VTR_EL2_TDS) - dir_trap = true; - else - common_trap = true; } - if (group0_trap || group1_trap || common_trap | dir_trap) { + traps = vgic_ich_hcr_trap_bits(); + if (traps) { kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n", - group0_trap ? "G0" : "", - group1_trap ? "G1" : "", - common_trap ? "C" : "", - dir_trap ? "D" : ""); + (traps & ICH_HCR_EL2_TALL0) ? "G0" : "", + (traps & ICH_HCR_EL2_TALL1) ? "G1" : "", + (traps & ICH_HCR_EL2_TC) ? "C" : "", + (traps & ICH_HCR_EL2_TDIR) ? 
"D" : ""); static_branch_enable(&vgic_v3_cpuif_trap); } @@ -771,7 +1001,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu) } if (likely(!is_protected_kvm_enabled())) - kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if); + kvm_call_hyp(__vgic_v3_save_aprs, cpu_if); WARN_ON(vgic_v4_put(vcpu)); if (has_vhe()) diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index 548aec9d5a72..09c3e9eb23f8 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -163,6 +163,7 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu) struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i); struct irq_desc *desc; unsigned long flags; + bool pending; int ret; raw_spin_lock_irqsave(&irq->irq_lock, flags); @@ -173,9 +174,11 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu) irq->hw = false; ret = irq_get_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, - &irq->pending_latch); + &pending); WARN_ON(ret); + irq->pending_latch = pending; + desc = irq_to_desc(irq->host_irq); irq_domain_deactivate_irq(irq_desc_get_irq_data(desc)); unlock: diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 8d20c53faef0..430aa98888fd 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -244,7 +244,7 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active) * * Requires the IRQ lock to be held. */ -static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) +struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) { lockdep_assert_held(&irq->irq_lock); @@ -272,17 +272,20 @@ static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) return NULL; } +struct vgic_sort_info { + struct kvm_vcpu *vcpu; + struct vgic_vmcr vmcr; +}; + /* * The order of items in the ap_lists defines how we'll pack things in LRs as * well, the first items in the list being the first things populated in the * LRs. * - * A hard rule is that active interrupts can never be pushed out of the LRs - * (and therefore take priority) since we cannot reliably trap on deactivation - * of IRQs and therefore they have to be present in the LRs. - * + * Pending, non-active interrupts must be placed at the head of the list. * Otherwise things should be sorted by the priority field and the GIC * hardware support will take care of preemption of priority groups etc. + * Interrupts that are not deliverable should be at the end of the list. * * Return negative if "a" sorts before "b", 0 to preserve order, and positive * to sort "b" before "a". @@ -292,6 +295,8 @@ static int vgic_irq_cmp(void *priv, const struct list_head *a, { struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list); struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list); + struct vgic_sort_info *info = priv; + struct kvm_vcpu *vcpu = info->vcpu; bool penda, pendb; int ret; @@ -305,21 +310,32 @@ static int vgic_irq_cmp(void *priv, const struct list_head *a, raw_spin_lock(&irqa->irq_lock); raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING); - if (irqa->active || irqb->active) { - ret = (int)irqb->active - (int)irqa->active; + /* Undeliverable interrupts should be last */ + ret = (int)(vgic_target_oracle(irqb) == vcpu) - (int)(vgic_target_oracle(irqa) == vcpu); + if (ret) + goto out; + + /* Same thing for interrupts targeting a disabled group */ + ret = (int)(irqb->group ? info->vmcr.grpen1 : info->vmcr.grpen0); + ret -= (int)(irqa->group ? 
info->vmcr.grpen1 : info->vmcr.grpen0); + if (ret) goto out; - } - penda = irqa->enabled && irq_is_pending(irqa); - pendb = irqb->enabled && irq_is_pending(irqb); + penda = irqa->enabled && irq_is_pending(irqa) && !irqa->active; + pendb = irqb->enabled && irq_is_pending(irqb) && !irqb->active; - if (!penda || !pendb) { - ret = (int)pendb - (int)penda; + ret = (int)pendb - (int)penda; + if (ret) goto out; - } - /* Both pending and enabled, sort by priority */ - ret = irqa->priority - irqb->priority; + /* Both pending and enabled, sort by priority (lower number first) */ + ret = (int)irqa->priority - (int)irqb->priority; + if (ret) + goto out; + + /* Finally, HW bit active interrupts have priority over non-HW ones */ + ret = (int)irqb->hw - (int)irqa->hw; + out: raw_spin_unlock(&irqb->irq_lock); raw_spin_unlock(&irqa->irq_lock); @@ -330,10 +346,12 @@ out: static void vgic_sort_ap_list(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_sort_info info = { .vcpu = vcpu, }; lockdep_assert_held(&vgic_cpu->ap_list_lock); - list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp); + vgic_get_vmcr(vcpu, &info.vmcr); + list_sort(&info, &vgic_cpu->ap_list_head, vgic_irq_cmp); } /* @@ -356,6 +374,20 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owne return false; } +static bool vgic_model_needs_bcst_kick(struct kvm *kvm) +{ + /* + * A GICv3 (or GICv3-like) system exposing a GICv3 to the guest + * needs a broadcast kick to set TDIR globally. + * + * For systems that do not have TDIR (ARM's own v8.0 CPUs), the + * shadow TDIR bit is always set, and so is the register's TC bit, + * so no need to kick the CPUs. + */ + return (cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR) && + kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3); +} + /* * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list. * Do the queuing if necessary, taking the right locks in the right order. @@ -368,6 +400,7 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, unsigned long flags) __releases(&irq->irq_lock) { struct kvm_vcpu *vcpu; + bool bcast; lockdep_assert_held(&irq->irq_lock); @@ -442,11 +475,20 @@ retry: list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); irq->vcpu = vcpu; + /* A new SPI may result in deactivation trapping on all vcpus */ + bcast = (vgic_model_needs_bcst_kick(vcpu->kvm) && + vgic_valid_spi(vcpu->kvm, irq->intid) && + atomic_fetch_inc(&vcpu->kvm->arch.vgic.active_spis) == 0); + raw_spin_unlock(&irq->irq_lock); raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags); - kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); - kvm_vcpu_kick(vcpu); + if (!bcast) { + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + } else { + kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_IRQ_PENDING); + } return true; } @@ -798,98 +840,148 @@ static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr) vgic_v3_clear_lr(vcpu, lr); } -static inline void vgic_set_underflow(struct kvm_vcpu *vcpu) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_set_underflow(vcpu); - else - vgic_v3_set_underflow(vcpu); -} - -/* Requires the ap_list_lock to be held. 
*/ -static int compute_ap_list_depth(struct kvm_vcpu *vcpu, - bool *multi_sgi) +static void summarize_ap_list(struct kvm_vcpu *vcpu, + struct ap_list_summary *als) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq; - int count = 0; - - *multi_sgi = false; lockdep_assert_held(&vgic_cpu->ap_list_lock); + *als = (typeof(*als)){}; + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - int w; + guard(raw_spinlock)(&irq->irq_lock); - raw_spin_lock(&irq->irq_lock); - /* GICv2 SGIs can count for more than one... */ - w = vgic_irq_get_lr_count(irq); - raw_spin_unlock(&irq->irq_lock); + if (unlikely(vgic_target_oracle(irq) != vcpu)) + continue; + + if (!irq->active) + als->nr_pend++; + else + als->nr_act++; - count += w; - *multi_sgi |= (w > 1); + if (irq->intid < VGIC_NR_SGIS) + als->nr_sgi++; } - return count; } -/* Requires the VCPU's ap_list_lock to be held. */ +/* + * Dealing with LR overflow is close to black magic -- dress accordingly. + * + * We have to present an almost infinite number of interrupts through a very + * limited number of registers. Therefore crucial decisions must be made to + * ensure we feed the most relevant interrupts into the LRs, and yet have + * some facilities to let the guest interact with those that are not there. + * + * All considerations below are in the context of interrupts targeting a + * single vcpu with non-idle state (either pending, active, or both), + * colloquially called the ap_list: + * + * - Pending interrupts must have priority over active interrupts. This also + * excludes pending+active interrupts. This ensures that a guest can + * perform priority drops on any number of interrupts, and yet be + * presented the next pending one. + * + * - Deactivation of interrupts outside of the LRs must be tracked by using + * the EOIcount-driven maintenance interrupt, and sometimes by + * trapping the DIR register. + * + * - For EOImode=0, a non-zero EOIcount means walking the ap_list past the + * point that made it into the LRs, and deactivating interrupts that would + * have made it onto the LRs if we had the space. + * + * - The MI-generation bits must be used to try and force an exit when the + * guest has done enough changes to the LRs that we want to reevaluate the + * situation: + * + * - if the total number of pending interrupts exceeds the number of + * LRs, NPIE must be set in order to exit once no pending interrupts + * are present in the LRs, allowing us to populate the next batch. + * + * - if there are active interrupts outside of the LRs, then LRENPIE + * must be set so that we exit on deactivation of one of these, and + * work out which one is to be deactivated. Note that this is not + * enough to deal with EOImode=1, see below. + * + * - if the overall number of interrupts exceeds the number of LRs, + * then UIE must be set to allow refilling of the LRs once the + * majority of them has been processed. + * + * - as usual, MI triggers are only an optimisation, since we cannot + * rely on the MI being delivered in a timely manner... + * + * - EOImode=1 creates some additional problems: + * + * - deactivation can happen in any order, and we cannot rely on + * EOImode=0's coupling of priority-drop and deactivation which + * imposes strict reverse Ack order. This means that DIR must + * trap if we have active interrupts outside of the LRs. + * + * - deactivation of SPIs can occur on any CPU, while the SPI is only + * present in the ap_list of the CPU that actually ack-ed it. 
In that + * case, EOIcount doesn't provide enough information, and we must + * resort to trapping DIR even if we don't overflow the LRs. Bonus + * point for not trapping DIR when no SPIs are pending or active in + * the whole VM. + * + * - LPIs do not suffer the same problem as SPIs on deactivation, as we + * have to essentially discard the active state, see below. + * + * - Virtual LPIs have an active state (surprise!), which gets removed on + * priority drop (EOI). However, EOIcount doesn't get bumped when the LPI + * is not present in the LR (surprise again!). Special care must therefore + * be taken to remove the active state from any activated LPI when exiting + * from the guest. This is in a way no different from what happens on the + * physical side. We still rely on the running priority to have been + * removed from the APRs, irrespective of the LPI being present in the LRs + * or not. + * + * - Virtual SGIs directly injected via GICv4.1 must not affect EOIcount, as + * they are not managed in SW and don't have a true active state. So only + * set vSGIEOICount when no SGIs are in the ap_list. + * + * - GICv2 SGIs with multiple sources are injected one source at a time, as + * if they were made pending sequentially. This may mean that we don't + * always present the HPPI if other interrupts with lower priority are + * pending in the LRs. Big deal. + */ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct ap_list_summary als; struct vgic_irq *irq; - int count; - bool multi_sgi; - u8 prio = 0xff; - int i = 0; + int count = 0; lockdep_assert_held(&vgic_cpu->ap_list_lock); - count = compute_ap_list_depth(vcpu, &multi_sgi); - if (count > kvm_vgic_global_state.nr_lr || multi_sgi) - vgic_sort_ap_list(vcpu); + summarize_ap_list(vcpu, &als); - count = 0; + if (irqs_outside_lrs(&als)) + vgic_sort_ap_list(vcpu); list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - raw_spin_lock(&irq->irq_lock); - - /* - * If we have multi-SGIs in the pipeline, we need to - * guarantee that they are all seen before any IRQ of - * lower priority. In that case, we need to filter out - * these interrupts by exiting early. This is easy as - * the AP list has been sorted already. 
- */ - if (multi_sgi && irq->priority > prio) { - raw_spin_unlock(&irq->irq_lock); - break; - } - - if (likely(vgic_target_oracle(irq) == vcpu)) { - vgic_populate_lr(vcpu, irq, count++); - - if (irq->source) - prio = irq->priority; + scoped_guard(raw_spinlock, &irq->irq_lock) { + if (likely(vgic_target_oracle(irq) == vcpu)) { + vgic_populate_lr(vcpu, irq, count++); + } } - raw_spin_unlock(&irq->irq_lock); - - if (count == kvm_vgic_global_state.nr_lr) { - if (!list_is_last(&irq->ap_list, - &vgic_cpu->ap_list_head)) - vgic_set_underflow(vcpu); + if (count == kvm_vgic_global_state.nr_lr) break; - } } /* Nuke remaining LRs */ - for (i = count ; i < kvm_vgic_global_state.nr_lr; i++) + for (int i = count ; i < kvm_vgic_global_state.nr_lr; i++) vgic_clear_lr(vcpu, i); - if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { vcpu->arch.vgic_cpu.vgic_v2.used_lrs = count; - else + vgic_v2_configure_hcr(vcpu, &als); + } else { vcpu->arch.vgic_cpu.vgic_v3.used_lrs = count; + vgic_v3_configure_hcr(vcpu, &als); + } } static inline bool can_access_vgic_from_kernel(void) @@ -913,8 +1005,6 @@ static inline void vgic_save_state(struct kvm_vcpu *vcpu) /* Sync back the hardware VGIC state into our emulation after a guest's run. */ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { - int used_lrs; - /* If nesting, emulate the HW effect from L0 to L1 */ if (vgic_state_is_nested(vcpu)) { vgic_v3_sync_nested(vcpu); @@ -924,21 +1014,22 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) if (vcpu_has_nv(vcpu)) vgic_v3_nested_update_mi(vcpu); - /* An empty ap_list_head implies used_lrs == 0 */ - if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) - return; - if (can_access_vgic_from_kernel()) vgic_save_state(vcpu); - if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) - used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs; - else - used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; + vgic_fold_lr_state(vcpu); + vgic_prune_ap_list(vcpu); +} + +/* Sync interrupts that were deactivated through a DIR trap */ +void kvm_vgic_process_async_update(struct kvm_vcpu *vcpu) +{ + unsigned long flags; - if (used_lrs) - vgic_fold_lr_state(vcpu); + /* Make sure we're in the same context as LR handling */ + local_irq_save(flags); vgic_prune_ap_list(vcpu); + local_irq_restore(flags); } static inline void vgic_restore_state(struct kvm_vcpu *vcpu) @@ -965,8 +1056,9 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) * abort the entry procedure and inject the exception at the * beginning of the run loop. * - * - Otherwise, do exactly *NOTHING*. The guest state is - * already loaded, and we can carry on with running it. + * - Otherwise, do exactly *NOTHING* apart from enabling the virtual + * CPU interface. The guest state is already loaded, and we can + * carry on with running it. * * If we have NV, but are not in a nested state, compute the * maintenance interrupt state, as it may fire. @@ -975,35 +1067,17 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) if (kvm_vgic_vcpu_pending_irq(vcpu)) kvm_make_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu); + vgic_v3_flush_nested(vcpu); return; } if (vcpu_has_nv(vcpu)) vgic_v3_nested_update_mi(vcpu); - /* - * If there are no virtual interrupts active or pending for this - * VCPU, then there is no work to do and we can bail out without - * taking any lock. 
There is a potential race with someone injecting - * interrupts to the VCPU, but it is a benign race as the VCPU will - * either observe the new interrupt before or after doing this check, - * and introducing additional synchronization mechanism doesn't change - * this. - * - * Note that we still need to go through the whole thing if anything - * can be directly injected (GICv4). - */ - if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) && - !vgic_supports_direct_irqs(vcpu->kvm)) - return; - DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) { - raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); + scoped_guard(raw_spinlock, &vcpu->arch.vgic_cpu.ap_list_lock) vgic_flush_lr_state(vcpu); - raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); - } if (can_access_vgic_from_kernel()) vgic_restore_state(vcpu); diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index ac5f9c5d2b98..5f0fc96b4dc2 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -164,6 +164,22 @@ static inline int vgic_write_guest_lock(struct kvm *kvm, gpa_t gpa, return ret; } +void kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst); + +static inline u64 vgic_ich_hcr_trap_bits(void) +{ + u64 hcr; + + /* All the traps are in the bottom 16bits */ + asm volatile(ALTERNATIVE_CB("movz %0, #0\n", + ARM64_ALWAYS_SYSTEM, + kvm_compute_ich_hcr_trap_bits) + : "=r" (hcr)); + + return hcr; +} + /* * This struct provides an intermediate representation of the fields contained * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC @@ -220,6 +236,21 @@ struct its_ite { u32 event_id; }; +struct ap_list_summary { + unsigned int nr_pend; /* purely pending, not active */ + unsigned int nr_act; /* active, or active+pending */ + unsigned int nr_sgi; /* any SGI */ +}; + +#define irqs_outside_lrs(s) \ + (((s)->nr_pend + (s)->nr_act) > kvm_vgic_global_state.nr_lr) + +#define irqs_pending_outside_lrs(s) \ + ((s)->nr_pend > kvm_vgic_global_state.nr_lr) + +#define irqs_active_outside_lrs(s) \ + ((s)->nr_act && irqs_outside_lrs(s)) + int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, struct vgic_reg_attr *reg_attr); int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, @@ -230,6 +261,7 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid); struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid); void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); +struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq); bool vgic_get_phys_line_level(struct vgic_irq *irq); void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); @@ -245,8 +277,9 @@ int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); +void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val); void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); -void vgic_v2_set_underflow(struct kvm_vcpu *vcpu); +void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als); int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); @@ -254,7 +287,7 @@ int vgic_v2_cpuif_uaccess(struct kvm_vcpu 
*vcpu, bool is_write, int offset, u32 *val); void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v2_enable(struct kvm_vcpu *vcpu); +void vgic_v2_reset(struct kvm_vcpu *vcpu); int vgic_v2_probe(const struct gic_kvm_info *info); int vgic_v2_map_resources(struct kvm *kvm); int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, @@ -286,10 +319,11 @@ static inline void vgic_get_irq_ref(struct vgic_irq *irq) void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); -void vgic_v3_set_underflow(struct kvm_vcpu *vcpu); +void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val); +void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als); void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v3_enable(struct kvm_vcpu *vcpu); +void vgic_v3_reset(struct kvm_vcpu *vcpu); int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq); @@ -412,6 +446,7 @@ static inline bool kvm_has_gicv3(struct kvm *kvm) return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP); } +void vgic_v3_flush_nested(struct kvm_vcpu *vcpu); void vgic_v3_sync_nested(struct kvm_vcpu *vcpu); void vgic_v3_load_nested(struct kvm_vcpu *vcpu); void vgic_v3_put_nested(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 1b32c1232d28..0fac75f01534 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -40,6 +40,7 @@ HAS_GICV5_CPUIF HAS_GICV5_LEGACY HAS_GIC_PRIO_MASKING HAS_GIC_PRIO_RELAXED_SYNC +HAS_ICH_HCR_EL2_TDIR HAS_HCR_NV1 HAS_HCX HAS_LDAPR @@ -64,6 +65,7 @@ HAS_TLB_RANGE HAS_VA52 HAS_VIRT_HOST_EXTN HAS_WFXT +HAS_XNX HAFT HW_DBM KVM_HVHE diff --git a/drivers/irqchip/irq-apple-aic.c b/drivers/irqchip/irq-apple-aic.c index 032d66dceb8e..4607f4943b19 100644 --- a/drivers/irqchip/irq-apple-aic.c +++ b/drivers/irqchip/irq-apple-aic.c @@ -411,12 +411,15 @@ static void __exception_irq_entry aic_handle_irq(struct pt_regs *regs) if (is_kernel_in_hyp_mode() && (read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_En) && read_sysreg_s(SYS_ICH_MISR_EL2) != 0) { + u64 val; + generic_handle_domain_irq(aic_irqc->hw_domain, AIC_FIQ_HWIRQ(AIC_VGIC_MI)); if (unlikely((read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_En) && - read_sysreg_s(SYS_ICH_MISR_EL2))) { - pr_err_ratelimited("vGIC IRQ fired and not handled by KVM, disabling.\n"); + (val = read_sysreg_s(SYS_ICH_MISR_EL2)))) { + pr_err_ratelimited("vGIC IRQ fired and not handled by KVM (MISR=%llx), disabling.\n", + val); sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0); } } diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 1269ab8eb726..ec70c84e9f91 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -1459,6 +1459,8 @@ static void __init gic_of_setup_kvm_info(struct device_node *node) if (ret) return; + gic_v2_kvm_info.gicc_base = gic_data[0].cpu_base.common_base; + if (static_branch_likely(&supports_deactivate_key)) vgic_set_kvm_info(&gic_v2_kvm_info); } @@ -1620,6 +1622,7 @@ static void __init gic_acpi_setup_kvm_info(void) return; gic_v2_kvm_info.maint_irq = irq; + gic_v2_kvm_info.gicc_base = gic_data[0].cpu_base.common_base; 
vgic_set_kvm_info(&gic_v2_kvm_info); } diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 7a0b972eb1b1..b261fb3968d0 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -59,6 +59,9 @@ struct vgic_global { /* virtual control interface mapping, HYP VA */ void __iomem *vctrl_hyp; + /* Physical CPU interface, kernel VA */ + void __iomem *gicc_base; + /* Number of implemented list registers */ int nr_lr; @@ -120,6 +123,7 @@ struct irq_ops { struct vgic_irq { raw_spinlock_t irq_lock; /* Protects the content of the struct */ + u32 intid; /* Guest visible INTID */ struct rcu_head rcu; struct list_head ap_list; @@ -134,17 +138,18 @@ struct vgic_irq { * affinity reg (v3). */ - u32 intid; /* Guest visible INTID */ - bool line_level; /* Level only */ - bool pending_latch; /* The pending latch state used to calculate - * the pending state for both level - * and edge triggered IRQs. */ - bool active; - bool pending_release; /* Used for LPIs only, unreferenced IRQ + bool pending_release:1; /* Used for LPIs only, unreferenced IRQ * pending a release */ - bool enabled; - bool hw; /* Tied to HW IRQ */ + bool pending_latch:1; /* The pending latch state used to calculate + * the pending state for both level + * and edge triggered IRQs. */ + enum vgic_irq_config config:1; /* Level or edge */ + bool line_level:1; /* Level only */ + bool enabled:1; + bool active:1; + bool hw:1; /* Tied to HW IRQ */ + bool on_lr:1; /* Present in a CPU LR */ refcount_t refcount; /* Used for LPIs */ u32 hwintid; /* HW INTID number */ unsigned int host_irq; /* linux irq corresponding to hwintid */ @@ -156,7 +161,6 @@ struct vgic_irq { u8 active_source; /* GICv2 SGIs only */ u8 priority; u8 group; /* 0 == group 0, 1 == group 1 */ - enum vgic_irq_config config; /* Level or edge */ struct irq_ops *ops; @@ -259,6 +263,9 @@ struct vgic_dist { /* The GIC maintenance IRQ for nested hypervisors. 
*/ u32 mi_intid; + /* Track the number of in-flight active SPIs */ + atomic_t active_spis; + /* base addresses in guest physical address space: */ gpa_t vgic_dist_base; /* distributor */ union { @@ -280,6 +287,7 @@ struct vgic_dist { struct vgic_irq *spis; struct vgic_io_device dist_iodev; + struct vgic_io_device cpuif_iodev; bool has_its; bool table_write_in_progress; @@ -417,6 +425,7 @@ bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu); void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid); +void kvm_vgic_process_async_update(struct kvm_vcpu *vcpu); void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1); diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 2223f95079ce..d45fa19f9e47 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -86,7 +86,13 @@ #define GICH_HCR_EN (1 << 0) #define GICH_HCR_UIE (1 << 1) +#define GICH_HCR_LRENPIE (1 << 2) #define GICH_HCR_NPIE (1 << 3) +#define GICH_HCR_VGrp0EIE (1 << 4) +#define GICH_HCR_VGrp0DIE (1 << 5) +#define GICH_HCR_VGrp1EIE (1 << 6) +#define GICH_HCR_VGrp1DIE (1 << 7) +#define GICH_HCR_EOICOUNT GENMASK(31, 27) #define GICH_LR_VIRTUALID (0x3ff << 0) #define GICH_LR_PHYSID_CPUID_SHIFT (10) diff --git a/include/linux/irqchip/arm-vgic-info.h b/include/linux/irqchip/arm-vgic-info.h index a470a73a805a..67d9d960273b 100644 --- a/include/linux/irqchip/arm-vgic-info.h +++ b/include/linux/irqchip/arm-vgic-info.h @@ -24,6 +24,8 @@ struct gic_kvm_info { enum gic_type type; /* Virtual CPU interface */ struct resource vcpu; + /* GICv2 GICC VA */ + void __iomem *gicc_base; /* Interrupt number */ unsigned int maint_irq; /* No interrupt mask, no need to use the above field */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 52f6000ab020..1e541193e98d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -179,6 +179,7 @@ struct kvm_xen_exit { #define KVM_EXIT_LOONGARCH_IOCSR 38 #define KVM_EXIT_MEMORY_FAULT 39 #define KVM_EXIT_TDX 40 +#define KVM_EXIT_ARM_SEA 41 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -473,6 +474,14 @@ struct kvm_run { } setup_event_notify; }; } tdx; + /* KVM_EXIT_ARM_SEA */ + struct { +#define KVM_EXIT_ARM_SEA_FLAG_GPA_VALID (1ULL << 0) + __u64 flags; + __u64 esr; + __u64 gva; + __u64 gpa; + } arm_sea; /* Fix the size of the union. 
*/ char padding[256]; }; @@ -963,6 +972,7 @@ struct kvm_enable_cap { #define KVM_CAP_RISCV_MP_STATE_RESET 242 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 #define KVM_CAP_GUEST_MEMFD_FLAGS 244 +#define KVM_CAP_ARM_SEA_TO_USER 245 struct kvm_irq_routing_irqchip { __u32 irqchip; diff --git a/tools/arch/arm64/include/asm/esr.h b/tools/arch/arm64/include/asm/esr.h index bd592ca81571..0fa17b3af1f7 100644 --- a/tools/arch/arm64/include/asm/esr.h +++ b/tools/arch/arm64/include/asm/esr.h @@ -141,6 +141,8 @@ #define ESR_ELx_SF (UL(1) << ESR_ELx_SF_SHIFT) #define ESR_ELx_AR_SHIFT (14) #define ESR_ELx_AR (UL(1) << ESR_ELx_AR_SHIFT) +#define ESR_ELx_VNCR_SHIFT (13) +#define ESR_ELx_VNCR (UL(1) << ESR_ELx_VNCR_SHIFT) #define ESR_ELx_CM_SHIFT (8) #define ESR_ELx_CM (UL(1) << ESR_ELx_CM_SHIFT) diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 1b5b6668c803..1f92cc28d58b 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -158,6 +158,7 @@ TEST_GEN_PROGS_EXTENDED_x86 += x86/nx_huge_pages_test TEST_GEN_PROGS_arm64 = $(TEST_GEN_PROGS_COMMON) TEST_GEN_PROGS_arm64 += arm64/aarch32_id_regs TEST_GEN_PROGS_arm64 += arm64/arch_timer_edge_cases +TEST_GEN_PROGS_arm64 += arm64/at TEST_GEN_PROGS_arm64 += arm64/debug-exceptions TEST_GEN_PROGS_arm64 += arm64/hello_el2 TEST_GEN_PROGS_arm64 += arm64/host_sve @@ -165,6 +166,7 @@ TEST_GEN_PROGS_arm64 += arm64/hypercalls TEST_GEN_PROGS_arm64 += arm64/external_aborts TEST_GEN_PROGS_arm64 += arm64/page_fault_test TEST_GEN_PROGS_arm64 += arm64/psci_test +TEST_GEN_PROGS_arm64 += arm64/sea_to_user TEST_GEN_PROGS_arm64 += arm64/set_id_regs TEST_GEN_PROGS_arm64 += arm64/smccc_filter TEST_GEN_PROGS_arm64 += arm64/vcpu_width_config diff --git a/tools/testing/selftests/kvm/arm64/at.c b/tools/testing/selftests/kvm/arm64/at.c new file mode 100644 index 000000000000..c8ee6f520734 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/at.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * at - Test for KVM's AT emulation in the EL2&0 and EL1&0 translation regimes. + */ +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" +#include "ucall.h" + +#include <asm/sysreg.h> + +#define TEST_ADDR 0x80000000 + +enum { + CLEAR_ACCESS_FLAG, + TEST_ACCESS_FLAG, +}; + +static u64 *ptep_hva; + +#define copy_el2_to_el1(reg) \ + write_sysreg_s(read_sysreg_s(SYS_##reg##_EL1), SYS_##reg##_EL12) + +/* Yes, this is an ugly hack */ +#define __at(op, addr) write_sysreg_s(addr, op) + +#define test_at_insn(op, expect_fault) \ +do { \ + u64 par, fsc; \ + bool fault; \ + \ + GUEST_SYNC(CLEAR_ACCESS_FLAG); \ + \ + __at(OP_AT_##op, TEST_ADDR); \ + isb(); \ + par = read_sysreg(par_el1); \ + \ + fault = par & SYS_PAR_EL1_F; \ + fsc = FIELD_GET(SYS_PAR_EL1_FST, par); \ + \ + __GUEST_ASSERT((expect_fault) == fault, \ + "AT "#op": %sexpected fault (par: %lx)1", \ + (expect_fault) ? 
"" : "un", par); \ + if ((expect_fault)) { \ + __GUEST_ASSERT(fsc == ESR_ELx_FSC_ACCESS_L(3), \ + "AT "#op": expected access flag fault (par: %lx)", \ + par); \ + } else { \ + GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_ATTR, par), MAIR_ATTR_NORMAL); \ + GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_SH, par), PTE_SHARED >> 8); \ + GUEST_ASSERT_EQ(par & SYS_PAR_EL1_PA, TEST_ADDR); \ + GUEST_SYNC(TEST_ACCESS_FLAG); \ + } \ +} while (0) + +static void test_at(bool expect_fault) +{ + test_at_insn(S1E2R, expect_fault); + test_at_insn(S1E2W, expect_fault); + + /* Reuse the stage-1 MMU context from EL2 at EL1 */ + copy_el2_to_el1(SCTLR); + copy_el2_to_el1(MAIR); + copy_el2_to_el1(TCR); + copy_el2_to_el1(TTBR0); + copy_el2_to_el1(TTBR1); + + /* Disable stage-2 translation and enter a non-host context */ + write_sysreg(0, vtcr_el2); + write_sysreg(0, vttbr_el2); + sysreg_clear_set(hcr_el2, HCR_EL2_TGE | HCR_EL2_VM, 0); + isb(); + + test_at_insn(S1E1R, expect_fault); + test_at_insn(S1E1W, expect_fault); +} + +static void guest_code(void) +{ + sysreg_clear_set(tcr_el1, TCR_HA, 0); + isb(); + + test_at(true); + + if (!SYS_FIELD_GET(ID_AA64MMFR1_EL1, HAFDBS, read_sysreg(id_aa64mmfr1_el1))) + GUEST_DONE(); + + /* + * KVM's software PTW makes the implementation choice that the AT + * instruction sets the access flag. + */ + sysreg_clear_set(tcr_el1, 0, TCR_HA); + isb(); + test_at(false); + + GUEST_DONE(); +} + +static void handle_sync(struct kvm_vcpu *vcpu, struct ucall *uc) +{ + switch (uc->args[1]) { + case CLEAR_ACCESS_FLAG: + /* + * Delete + reinstall the memslot to invalidate stage-2 + * mappings of the stage-1 page tables, forcing KVM to + * use the 'slow' AT emulation path. + * + * This and clearing the access flag from host userspace + * ensures that the access flag cannot be set speculatively + * and is reliably cleared at the time of the AT instruction. + */ + clear_bit(__ffs(PTE_AF), ptep_hva); + vm_mem_region_reload(vcpu->vm, vcpu->vm->memslots[MEM_REGION_PT]); + break; + case TEST_ACCESS_FLAG: + TEST_ASSERT(test_bit(__ffs(PTE_AF), ptep_hva), + "Expected access flag to be set (desc: %lu)", *ptep_hva); + break; + default: + TEST_FAIL("Unexpected SYNC arg: %lu", uc->args[1]); + } +} + +static void run_test(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + while (true) { + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_DONE: + return; + case UCALL_SYNC: + handle_sync(vcpu, &uc); + continue; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + return; + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } + } +} + +int main(void) +{ + struct kvm_vcpu_init init; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_check_cap(KVM_CAP_ARM_EL2)); + + vm = vm_create(1); + + kvm_get_default_vcpu_target(vm, &init); + init.features[0] |= BIT(KVM_ARM_VCPU_HAS_EL2); + vcpu = aarch64_vcpu_add(vm, 0, &init, guest_code); + kvm_arch_vm_finalize_vcpus(vm); + + virt_map(vm, TEST_ADDR, TEST_ADDR, 1); + ptep_hva = virt_get_pte_hva_at_level(vm, TEST_ADDR, 3); + run_test(vcpu); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/sea_to_user.c b/tools/testing/selftests/kvm/arm64/sea_to_user.c new file mode 100644 index 000000000000..573dd790aeb8 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/sea_to_user.c @@ -0,0 +1,331 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test KVM returns to userspace with KVM_EXIT_ARM_SEA if host APEI fails + * to handle SEA and userspace has opt-ed in KVM_CAP_ARM_SEA_TO_USER. 
+ * + * After reaching userspace with expected arm_sea info, also test userspace + * injecting a synchronous external data abort into the guest. + * + * This test utilizes EINJ to generate a REAL synchronous external data + * abort by consuming a recoverable uncorrectable memory error. Therefore + * the device under test must support EINJ in both firmware and host kernel, + * including the notrigger feature. Otherwise the test will be skipped. + * The under-test platform's APEI should be unable to claim SEA. Otherwise + * the test will also be skipped. + */ + +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "guest_modes.h" + +#define PAGE_PRESENT (1ULL << 63) +#define PAGE_PHYSICAL 0x007fffffffffffffULL +#define PAGE_ADDR_MASK (~(0xfffULL)) + +/* Group ISV and ISS[23:14]. */ +#define ESR_ELx_INST_SYNDROME ((ESR_ELx_ISV) | (ESR_ELx_SAS) | \ + (ESR_ELx_SSE) | (ESR_ELx_SRT_MASK) | \ + (ESR_ELx_SF) | (ESR_ELx_AR)) + +#define EINJ_ETYPE "/sys/kernel/debug/apei/einj/error_type" +#define EINJ_ADDR "/sys/kernel/debug/apei/einj/param1" +#define EINJ_MASK "/sys/kernel/debug/apei/einj/param2" +#define EINJ_FLAGS "/sys/kernel/debug/apei/einj/flags" +#define EINJ_NOTRIGGER "/sys/kernel/debug/apei/einj/notrigger" +#define EINJ_DOIT "/sys/kernel/debug/apei/einj/error_inject" +/* Memory Uncorrectable non-fatal. */ +#define ERROR_TYPE_MEMORY_UER 0x10 +/* Memory address and mask valid (param1 and param2). */ +#define MASK_MEMORY_UER 0b10 + +/* Guest virtual address region = [2G, 3G). */ +#define START_GVA 0x80000000UL +#define VM_MEM_SIZE 0x40000000UL +/* Note: EINJ_OFFSET must < VM_MEM_SIZE. */ +#define EINJ_OFFSET 0x01234badUL +#define EINJ_GVA ((START_GVA) + (EINJ_OFFSET)) + +static vm_paddr_t einj_gpa; +static void *einj_hva; +static uint64_t einj_hpa; +static bool far_invalid; + +static uint64_t translate_to_host_paddr(unsigned long vaddr) +{ + uint64_t pinfo; + int64_t offset = vaddr / getpagesize() * sizeof(pinfo); + int fd; + uint64_t page_addr; + uint64_t paddr; + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) + ksft_exit_fail_perror("Failed to open /proc/self/pagemap"); + if (pread(fd, &pinfo, sizeof(pinfo), offset) != sizeof(pinfo)) { + close(fd); + ksft_exit_fail_perror("Failed to read /proc/self/pagemap"); + } + + close(fd); + + if ((pinfo & PAGE_PRESENT) == 0) + ksft_exit_fail_perror("Page not present"); + + page_addr = (pinfo & PAGE_PHYSICAL) << MIN_PAGE_SHIFT; + paddr = page_addr + (vaddr & (getpagesize() - 1)); + return paddr; +} + +static void write_einj_entry(const char *einj_path, uint64_t val) +{ + char cmd[256] = {0}; + FILE *cmdfile = NULL; + + sprintf(cmd, "echo %#lx > %s", val, einj_path); + cmdfile = popen(cmd, "r"); + + if (pclose(cmdfile) == 0) + ksft_print_msg("echo %#lx > %s - done\n", val, einj_path); + else + ksft_exit_fail_perror("Failed to write EINJ entry"); +} + +static void inject_uer(uint64_t paddr) +{ + if (access("/sys/firmware/acpi/tables/EINJ", R_OK) == -1) + ksft_test_result_skip("EINJ table no available in firmware"); + + if (access(EINJ_ETYPE, R_OK | W_OK) == -1) + ksft_test_result_skip("EINJ module probably not loaded?"); + + write_einj_entry(EINJ_ETYPE, ERROR_TYPE_MEMORY_UER); + write_einj_entry(EINJ_FLAGS, MASK_MEMORY_UER); + write_einj_entry(EINJ_ADDR, paddr); + write_einj_entry(EINJ_MASK, ~0x0UL); + write_einj_entry(EINJ_NOTRIGGER, 1); + write_einj_entry(EINJ_DOIT, 1); +} + +/* + * When host APEI successfully claims the SEA caused 
by guest_code, kernel + * will send SIGBUS signal with BUS_MCEERR_AR to test thread. + * + * We set up this SIGBUS handler to skip the test for that case. + */ +static void sigbus_signal_handler(int sig, siginfo_t *si, void *v) +{ + ksft_print_msg("SIGBUS (%d) received, dumping siginfo...\n", sig); + ksft_print_msg("si_signo=%d, si_errno=%d, si_code=%d, si_addr=%p\n", + si->si_signo, si->si_errno, si->si_code, si->si_addr); + if (si->si_code == BUS_MCEERR_AR) + ksft_test_result_skip("SEA is claimed by host APEI\n"); + else + ksft_test_result_fail("Exit with signal unhandled\n"); + + exit(0); +} + +static void setup_sigbus_handler(void) +{ + struct sigaction act; + + memset(&act, 0, sizeof(act)); + sigemptyset(&act.sa_mask); + act.sa_sigaction = sigbus_signal_handler; + act.sa_flags = SA_SIGINFO; + TEST_ASSERT(sigaction(SIGBUS, &act, NULL) == 0, + "Failed to setup SIGBUS handler"); +} + +static void guest_code(void) +{ + uint64_t guest_data; + + /* Consumes error will cause a SEA. */ + guest_data = *(uint64_t *)EINJ_GVA; + + GUEST_FAIL("Poison not protected by SEA: gva=%#lx, guest_data=%#lx\n", + EINJ_GVA, guest_data); +} + +static void expect_sea_handler(struct ex_regs *regs) +{ + u64 esr = read_sysreg(esr_el1); + u64 far = read_sysreg(far_el1); + bool expect_far_invalid = far_invalid; + + GUEST_PRINTF("Handling Guest SEA\n"); + GUEST_PRINTF("ESR_EL1=%#lx, FAR_EL1=%#lx\n", esr, far); + + GUEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_CUR); + GUEST_ASSERT_EQ(esr & ESR_ELx_FSC_TYPE, ESR_ELx_FSC_EXTABT); + + if (expect_far_invalid) { + GUEST_ASSERT_EQ(esr & ESR_ELx_FnV, ESR_ELx_FnV); + GUEST_PRINTF("Guest observed garbage value in FAR\n"); + } else { + GUEST_ASSERT_EQ(esr & ESR_ELx_FnV, 0); + GUEST_ASSERT_EQ(far, EINJ_GVA); + } + + GUEST_DONE(); +} + +static void vcpu_inject_sea(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_events events = {}; + + events.exception.ext_dabt_pending = true; + vcpu_events_set(vcpu, &events); +} + +static void run_vm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +{ + struct ucall uc; + bool guest_done = false; + struct kvm_run *run = vcpu->run; + u64 esr; + + /* Resume the vCPU after error injection to consume the error. */ + vcpu_run(vcpu); + + ksft_print_msg("Dump kvm_run info about KVM_EXIT_%s\n", + exit_reason_str(run->exit_reason)); + ksft_print_msg("kvm_run.arm_sea: esr=%#llx, flags=%#llx\n", + run->arm_sea.esr, run->arm_sea.flags); + ksft_print_msg("kvm_run.arm_sea: gva=%#llx, gpa=%#llx\n", + run->arm_sea.gva, run->arm_sea.gpa); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_ARM_SEA); + + esr = run->arm_sea.esr; + TEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_LOW); + TEST_ASSERT_EQ(esr & ESR_ELx_FSC_TYPE, ESR_ELx_FSC_EXTABT); + TEST_ASSERT_EQ(ESR_ELx_ISS2(esr), 0); + TEST_ASSERT_EQ((esr & ESR_ELx_INST_SYNDROME), 0); + TEST_ASSERT_EQ(esr & ESR_ELx_VNCR, 0); + + if (!(esr & ESR_ELx_FnV)) { + ksft_print_msg("Expect gva to match given FnV bit is 0\n"); + TEST_ASSERT_EQ(run->arm_sea.gva, EINJ_GVA); + } + + if (run->arm_sea.flags & KVM_EXIT_ARM_SEA_FLAG_GPA_VALID) { + ksft_print_msg("Expect gpa to match given KVM_EXIT_ARM_SEA_FLAG_GPA_VALID is set\n"); + TEST_ASSERT_EQ(run->arm_sea.gpa, einj_gpa & PAGE_ADDR_MASK); + } + + far_invalid = esr & ESR_ELx_FnV; + + /* Inject a SEA into guest and expect handled in SEA handler. */ + vcpu_inject_sea(vcpu); + + /* Expect the guest to reach GUEST_DONE gracefully. 
*/ + do { + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_PRINTF: + ksft_print_msg("From guest: %s", uc.buffer); + break; + case UCALL_DONE: + ksft_print_msg("Guest done gracefully!\n"); + guest_done = 1; + break; + case UCALL_ABORT: + ksft_print_msg("Guest aborted!\n"); + guest_done = 1; + REPORT_GUEST_ASSERT(uc); + break; + default: + TEST_FAIL("Unexpected ucall: %lu\n", uc.cmd); + } + } while (!guest_done); +} + +static struct kvm_vm *vm_create_with_sea_handler(struct kvm_vcpu **vcpu) +{ + size_t backing_page_size; + size_t guest_page_size; + size_t alignment; + uint64_t num_guest_pages; + vm_paddr_t start_gpa; + enum vm_mem_backing_src_type src_type = VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB; + struct kvm_vm *vm; + + backing_page_size = get_backing_src_pagesz(src_type); + guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size; + alignment = max(backing_page_size, guest_page_size); + num_guest_pages = VM_MEM_SIZE / guest_page_size; + + vm = __vm_create_with_one_vcpu(vcpu, num_guest_pages, guest_code); + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(*vcpu); + + vm_install_sync_handler(vm, + /*vector=*/VECTOR_SYNC_CURRENT, + /*ec=*/ESR_ELx_EC_DABT_CUR, + /*handler=*/expect_sea_handler); + + start_gpa = (vm->max_gfn - num_guest_pages) * guest_page_size; + start_gpa = align_down(start_gpa, alignment); + + vm_userspace_mem_region_add( + /*vm=*/vm, + /*src_type=*/src_type, + /*guest_paddr=*/start_gpa, + /*slot=*/1, + /*npages=*/num_guest_pages, + /*flags=*/0); + + virt_map(vm, START_GVA, start_gpa, num_guest_pages); + + ksft_print_msg("Mapped %#lx pages: gva=%#lx to gpa=%#lx\n", + num_guest_pages, START_GVA, start_gpa); + return vm; +} + +static void vm_inject_memory_uer(struct kvm_vm *vm) +{ + uint64_t guest_data; + + einj_gpa = addr_gva2gpa(vm, EINJ_GVA); + einj_hva = addr_gva2hva(vm, EINJ_GVA); + + /* Populate certain data before injecting UER. */ + *(uint64_t *)einj_hva = 0xBAADCAFE; + guest_data = *(uint64_t *)einj_hva; + ksft_print_msg("Before EINJect: data=%#lx\n", + guest_data); + + einj_hpa = translate_to_host_paddr((unsigned long)einj_hva); + + ksft_print_msg("EINJ_GVA=%#lx, einj_gpa=%#lx, einj_hva=%p, einj_hpa=%#lx\n", + EINJ_GVA, einj_gpa, einj_hva, einj_hpa); + + inject_uer(einj_hpa); + ksft_print_msg("Memory UER EINJected\n"); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SEA_TO_USER)); + + setup_sigbus_handler(); + + vm = vm_create_with_sea_handler(&vcpu); + vm_enable_cap(vm, KVM_CAP_ARM_SEA_TO_USER, 0); + vm_inject_memory_uer(vm); + run_vm(vm, vcpu); + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c index 8d7758f12280..2fb2c7939fe9 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_irq.c +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -29,6 +29,7 @@ struct test_args { bool level_sensitive; /* 1 is level, 0 is edge */ int kvm_max_routes; /* output of KVM_CAP_IRQ_ROUTING */ bool kvm_supports_irqfd; /* output of KVM_CAP_IRQFD */ + uint32_t shared_data; }; /* @@ -205,7 +206,7 @@ static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid, do { \ uint32_t _intid; \ _intid = gic_get_and_ack_irq(); \ - GUEST_ASSERT(_intid == 0 || _intid == IAR_SPURIOUS); \ + GUEST_ASSERT(_intid == IAR_SPURIOUS); \ } while (0) #define CAT_HELPER(a, b) a ## b @@ -359,8 +360,9 @@ static uint32_t wait_for_and_activate_irq(void) * interrupts for the whole test. 
*/ static void test_inject_preemption(struct test_args *args, - uint32_t first_intid, int num, - kvm_inject_cmd cmd) + uint32_t first_intid, int num, + const unsigned long *exclude, + kvm_inject_cmd cmd) { uint32_t intid, prio, step = KVM_PRIO_STEPS; int i; @@ -379,6 +381,10 @@ static void test_inject_preemption(struct test_args *args, for (i = 0; i < num; i++) { uint32_t tmp; intid = i + first_intid; + + if (exclude && test_bit(i, exclude)) + continue; + KVM_INJECT(cmd, intid); /* Each successive IRQ will preempt the previous one. */ tmp = wait_for_and_activate_irq(); @@ -390,15 +396,33 @@ static void test_inject_preemption(struct test_args *args, /* finish handling the IRQs starting with the highest priority one. */ for (i = 0; i < num; i++) { intid = num - i - 1 + first_intid; + + if (exclude && test_bit(intid - first_intid, exclude)) + continue; + gic_set_eoi(intid); - if (args->eoi_split) - gic_set_dir(intid); + } + + if (args->eoi_split) { + for (i = 0; i < num; i++) { + intid = i + first_intid; + + if (exclude && test_bit(i, exclude)) + continue; + + if (args->eoi_split) + gic_set_dir(intid); + } } local_irq_enable(); - for (i = 0; i < num; i++) + for (i = 0; i < num; i++) { + if (exclude && test_bit(i, exclude)) + continue; + GUEST_ASSERT(!gic_irq_get_active(i + first_intid)); + } GUEST_ASSERT_EQ(gic_read_ap1r0(), 0); GUEST_ASSERT_IAR_EMPTY(); @@ -436,33 +460,32 @@ static void test_injection_failure(struct test_args *args, static void test_preemption(struct test_args *args, struct kvm_inject_desc *f) { - /* - * Test up to 4 levels of preemption. The reason is that KVM doesn't - * currently implement the ability to have more than the number-of-LRs - * number of concurrently active IRQs. The number of LRs implemented is - * IMPLEMENTATION DEFINED, however, it seems that most implement 4. - */ + /* Timer PPIs cannot be injected from userspace */ + static const unsigned long ppi_exclude = (BIT(27 - MIN_PPI) | + BIT(30 - MIN_PPI) | + BIT(28 - MIN_PPI) | + BIT(26 - MIN_PPI)); + if (f->sgi) - test_inject_preemption(args, MIN_SGI, 4, f->cmd); + test_inject_preemption(args, MIN_SGI, 16, NULL, f->cmd); if (f->ppi) - test_inject_preemption(args, MIN_PPI, 4, f->cmd); + test_inject_preemption(args, MIN_PPI, 16, &ppi_exclude, f->cmd); if (f->spi) - test_inject_preemption(args, MIN_SPI, 4, f->cmd); + test_inject_preemption(args, MIN_SPI, 31, NULL, f->cmd); } static void test_restore_active(struct test_args *args, struct kvm_inject_desc *f) { - /* Test up to 4 active IRQs. Same reason as in test_preemption. 
*/ if (f->sgi) - guest_restore_active(args, MIN_SGI, 4, f->cmd); + guest_restore_active(args, MIN_SGI, 16, f->cmd); if (f->ppi) - guest_restore_active(args, MIN_PPI, 4, f->cmd); + guest_restore_active(args, MIN_PPI, 16, f->cmd); if (f->spi) - guest_restore_active(args, MIN_SPI, 4, f->cmd); + guest_restore_active(args, MIN_SPI, 31, f->cmd); } static void guest_code(struct test_args *args) @@ -473,12 +496,12 @@ static void guest_code(struct test_args *args) gic_init(GIC_V3, 1); - for (i = 0; i < nr_irqs; i++) - gic_irq_enable(i); - for (i = MIN_SPI; i < nr_irqs; i++) gic_irq_set_config(i, !level_sensitive); + for (i = 0; i < nr_irqs; i++) + gic_irq_enable(i); + gic_set_eoi_split(args->eoi_split); reset_priorities(args); @@ -779,6 +802,221 @@ done: kvm_vm_free(vm); } +static void guest_code_asym_dir(struct test_args *args, int cpuid) +{ + gic_init(GIC_V3, 2); + + gic_set_eoi_split(1); + gic_set_priority_mask(CPU_PRIO_MASK); + + if (cpuid == 0) { + uint32_t intid; + + local_irq_disable(); + + gic_set_priority(MIN_PPI, IRQ_DEFAULT_PRIO); + gic_irq_enable(MIN_SPI); + gic_irq_set_pending(MIN_SPI); + + intid = wait_for_and_activate_irq(); + GUEST_ASSERT_EQ(intid, MIN_SPI); + + gic_set_eoi(intid); + isb(); + + WRITE_ONCE(args->shared_data, MIN_SPI); + dsb(ishst); + + do { + dsb(ishld); + } while (READ_ONCE(args->shared_data) == MIN_SPI); + GUEST_ASSERT(!gic_irq_get_active(MIN_SPI)); + } else { + do { + dsb(ishld); + } while (READ_ONCE(args->shared_data) != MIN_SPI); + + gic_set_dir(MIN_SPI); + isb(); + + WRITE_ONCE(args->shared_data, 0); + dsb(ishst); + } + + GUEST_DONE(); +} + +static void guest_code_group_en(struct test_args *args, int cpuid) +{ + uint32_t intid; + + gic_init(GIC_V3, 2); + + gic_set_eoi_split(0); + gic_set_priority_mask(CPU_PRIO_MASK); + /* SGI0 is G0, which is disabled */ + gic_irq_set_group(0, 0); + + /* Configure all SGIs with decreasing priority */ + for (intid = 0; intid < MIN_PPI; intid++) { + gic_set_priority(intid, (intid + 1) * 8); + gic_irq_enable(intid); + gic_irq_set_pending(intid); + } + + /* Ack and EOI all G1 interrupts */ + for (int i = 1; i < MIN_PPI; i++) { + intid = wait_for_and_activate_irq(); + + GUEST_ASSERT(intid < MIN_PPI); + gic_set_eoi(intid); + isb(); + } + + /* + * Check that SGI0 is still pending, inactive, and that we cannot + * ack anything. 
+ */ + GUEST_ASSERT(gic_irq_get_pending(0)); + GUEST_ASSERT(!gic_irq_get_active(0)); + GUEST_ASSERT_IAR_EMPTY(); + GUEST_ASSERT(read_sysreg_s(SYS_ICC_IAR0_EL1) == IAR_SPURIOUS); + + /* Open the G0 gates, and verify we can ack SGI0 */ + write_sysreg_s(1, SYS_ICC_IGRPEN0_EL1); + isb(); + + do { + intid = read_sysreg_s(SYS_ICC_IAR0_EL1); + } while (intid == IAR_SPURIOUS); + + GUEST_ASSERT(intid == 0); + GUEST_DONE(); +} + +static void guest_code_timer_spi(struct test_args *args, int cpuid) +{ + uint32_t intid; + u64 val; + + gic_init(GIC_V3, 2); + + gic_set_eoi_split(1); + gic_set_priority_mask(CPU_PRIO_MASK); + + /* Add a pending SPI so that KVM starts trapping DIR */ + gic_set_priority(MIN_SPI + cpuid, IRQ_DEFAULT_PRIO); + gic_irq_set_pending(MIN_SPI + cpuid); + + /* Configure the timer with a higher priority, make it pending */ + gic_set_priority(27, IRQ_DEFAULT_PRIO - 8); + + isb(); + val = read_sysreg(cntvct_el0); + write_sysreg(val, cntv_cval_el0); + write_sysreg(1, cntv_ctl_el0); + isb(); + + GUEST_ASSERT(gic_irq_get_pending(27)); + + /* Enable both interrupts */ + gic_irq_enable(MIN_SPI + cpuid); + gic_irq_enable(27); + + /* The timer must fire */ + intid = wait_for_and_activate_irq(); + GUEST_ASSERT(intid == 27); + + /* Check that we can deassert it */ + write_sysreg(0, cntv_ctl_el0); + isb(); + + GUEST_ASSERT(!gic_irq_get_pending(27)); + + /* + * Priority drop, deactivation -- we expect that the host + * deactivation will have been effective + */ + gic_set_eoi(27); + gic_set_dir(27); + + GUEST_ASSERT(!gic_irq_get_active(27)); + + /* Do it one more time */ + isb(); + val = read_sysreg(cntvct_el0); + write_sysreg(val, cntv_cval_el0); + write_sysreg(1, cntv_ctl_el0); + isb(); + + GUEST_ASSERT(gic_irq_get_pending(27)); + + /* The timer must fire again */ + intid = wait_for_and_activate_irq(); + GUEST_ASSERT(intid == 27); + + GUEST_DONE(); +} + +static void *test_vcpu_run(void *arg) +{ + struct kvm_vcpu *vcpu = arg; + struct ucall uc; + + while (1) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + return NULL; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + + return NULL; +} + +static void test_vgic_two_cpus(void *gcode) +{ + pthread_t thr[2]; + struct kvm_vcpu *vcpus[2]; + struct test_args args = {}; + struct kvm_vm *vm; + vm_vaddr_t args_gva; + int gic_fd, ret; + + vm = vm_create_with_vcpus(2, gcode, vcpus); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpus[0]); + vcpu_init_descriptor_tables(vcpus[1]); + + /* Setup the guest args page (so it gets the args). 
*/ + args_gva = vm_vaddr_alloc_page(vm); + memcpy(addr_gva2hva(vm, args_gva), &args, sizeof(args)); + vcpu_args_set(vcpus[0], 2, args_gva, 0); + vcpu_args_set(vcpus[1], 2, args_gva, 1); + + gic_fd = vgic_v3_setup(vm, 2, 64); + + ret = pthread_create(&thr[0], NULL, test_vcpu_run, vcpus[0]); + if (ret) + TEST_FAIL("Can't create thread for vcpu 0 (%d)\n", ret); + ret = pthread_create(&thr[1], NULL, test_vcpu_run, vcpus[1]); + if (ret) + TEST_FAIL("Can't create thread for vcpu 1 (%d)\n", ret); + + pthread_join(thr[0], NULL); + pthread_join(thr[1], NULL); + + close(gic_fd); + kvm_vm_free(vm); +} + static void help(const char *name) { printf( @@ -835,6 +1073,9 @@ int main(int argc, char **argv) test_vgic(nr_irqs, false /* level */, true /* eoi_split */); test_vgic(nr_irqs, true /* level */, false /* eoi_split */); test_vgic(nr_irqs, true /* level */, true /* eoi_split */); + test_vgic_two_cpus(guest_code_asym_dir); + test_vgic_two_cpus(guest_code_group_en); + test_vgic_two_cpus(guest_code_timer_spi); } else { test_vgic(nr_irqs, level_sensitive, eoi_split); } diff --git a/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c index 687d04463983..e857a605f577 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c +++ b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c @@ -118,6 +118,10 @@ static void guest_setup_gic(void) guest_setup_its_mappings(); guest_invalidate_all_rdists(); + + /* SYNC to ensure ITS setup is complete */ + for (cpuid = 0; cpuid < test_data.nr_cpus; cpuid++) + its_send_sync_cmd(test_data.cmdq_base_va, cpuid); } static void guest_code(size_t nr_lpis) diff --git a/tools/testing/selftests/kvm/include/arm64/gic.h b/tools/testing/selftests/kvm/include/arm64/gic.h index baeb3c859389..cc7a7f34ed37 100644 --- a/tools/testing/selftests/kvm/include/arm64/gic.h +++ b/tools/testing/selftests/kvm/include/arm64/gic.h @@ -57,6 +57,7 @@ void gic_irq_set_pending(unsigned int intid); void gic_irq_clear_pending(unsigned int intid); bool gic_irq_get_pending(unsigned int intid); void gic_irq_set_config(unsigned int intid, bool is_edge); +void gic_irq_set_group(unsigned int intid, bool group); void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, vm_paddr_t pend_table); diff --git a/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h b/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h index 3722ed9c8f96..58feef3eb386 100644 --- a/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h +++ b/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h @@ -15,5 +15,6 @@ void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool val void its_send_mapti_cmd(void *cmdq_base, u32 device_id, u32 event_id, u32 collection_id, u32 intid); void its_send_invall_cmd(void *cmdq_base, u32 collection_id); +void its_send_sync_cmd(void *cmdq_base, u32 vcpu_id); #endif // __SELFTESTS_GIC_V3_ITS_H__ diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 94c219c3877c..81f4355ff28a 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -688,6 +688,7 @@ static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) #endif void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); +void vm_mem_region_reload(struct kvm_vm *vm, uint32_t slot); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); 
struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); diff --git a/tools/testing/selftests/kvm/lib/arm64/gic.c b/tools/testing/selftests/kvm/lib/arm64/gic.c index 7abbf8866512..b023868fe0b8 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic.c @@ -155,3 +155,9 @@ void gic_irq_set_config(unsigned int intid, bool is_edge) GUEST_ASSERT(gic_common_ops); gic_common_ops->gic_irq_set_config(intid, is_edge); } + +void gic_irq_set_group(unsigned int intid, bool group) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_set_group(intid, group); +} diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_private.h b/tools/testing/selftests/kvm/lib/arm64/gic_private.h index d24e9ecc96c6..b6a7e30c3eb1 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_private.h +++ b/tools/testing/selftests/kvm/lib/arm64/gic_private.h @@ -25,6 +25,7 @@ struct gic_common_ops { void (*gic_irq_clear_pending)(uint32_t intid); bool (*gic_irq_get_pending)(uint32_t intid); void (*gic_irq_set_config)(uint32_t intid, bool is_edge); + void (*gic_irq_set_group)(uint32_t intid, bool group); }; extern const struct gic_common_ops gicv3_ops; diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c index 66d05506f78b..50754a27f493 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c @@ -293,17 +293,36 @@ static void gicv3_enable_redist(volatile void *redist_base) } } +static void gicv3_set_group(uint32_t intid, bool grp) +{ + uint32_t cpu_or_dist; + uint32_t val; + + cpu_or_dist = (get_intid_range(intid) == SPI_RANGE) ? DIST_BIT : guest_get_vcpuid(); + val = gicv3_reg_readl(cpu_or_dist, GICD_IGROUPR + (intid / 32) * 4); + if (grp) + val |= BIT(intid % 32); + else + val &= ~BIT(intid % 32); + gicv3_reg_writel(cpu_or_dist, GICD_IGROUPR + (intid / 32) * 4, val); +} + static void gicv3_cpu_init(unsigned int cpu) { volatile void *sgi_base; unsigned int i; volatile void *redist_base_cpu; + u64 typer; GUEST_ASSERT(cpu < gicv3_data.nr_cpus); redist_base_cpu = gicr_base_cpu(cpu); sgi_base = sgi_base_from_redist(redist_base_cpu); + /* Verify assumption that GICR_TYPER.Processor_number == cpu */ + typer = readq_relaxed(redist_base_cpu + GICR_TYPER); + GUEST_ASSERT_EQ(GICR_TYPER_CPU_NUMBER(typer), cpu); + gicv3_enable_redist(redist_base_cpu); /* @@ -328,6 +347,8 @@ static void gicv3_cpu_init(unsigned int cpu) /* Set a default priority threshold */ write_sysreg_s(ICC_PMR_DEF_PRIO, SYS_ICC_PMR_EL1); + /* Disable Group-0 interrupts */ + write_sysreg_s(ICC_IGRPEN0_EL1_MASK, SYS_ICC_IGRPEN1_EL1); /* Enable non-secure Group-1 interrupts */ write_sysreg_s(ICC_IGRPEN1_EL1_MASK, SYS_ICC_IGRPEN1_EL1); } @@ -400,6 +421,7 @@ const struct gic_common_ops gicv3_ops = { .gic_irq_clear_pending = gicv3_irq_clear_pending, .gic_irq_get_pending = gicv3_irq_get_pending, .gic_irq_set_config = gicv3_irq_set_config, + .gic_irq_set_group = gicv3_set_group, }; void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c index 0e2f8ed90f30..7f9fdcf42ae6 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c @@ -253,3 +253,13 @@ void its_send_invall_cmd(void *cmdq_base, u32 collection_id) its_send_cmd(cmdq_base, &cmd); } + +void its_send_sync_cmd(void *cmdq_base, u32 vcpu_id) +{ + struct 
its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_SYNC); + its_encode_target(&cmd, procnum_to_rdbase(vcpu_id)); + + its_send_cmd(cmdq_base, &cmd); +} diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index b81b2df0512e..8279b6ced8d2 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1184,6 +1184,16 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) ret, errno, slot, flags); } +void vm_mem_region_reload(struct kvm_vm *vm, uint32_t slot) +{ + struct userspace_mem_region *region = memslot2region(vm, slot); + struct kvm_userspace_memory_region2 tmp = region->region; + + tmp.memory_size = 0; + vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &tmp); + vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region); +} + /* * VM Memory Region Move * @@ -2005,6 +2015,7 @@ static struct exit_reason { KVM_EXIT_STRING(NOTIFY), KVM_EXIT_STRING(LOONGARCH_IOCSR), KVM_EXIT_STRING(MEMORY_FAULT), + KVM_EXIT_STRING(ARM_SEA), };
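
As a companion to the KVM_EXIT_ARM_SEA exit and KVM_CAP_ARM_SEA_TO_USER capability added above, a minimal userspace sketch of one way a VMM could consume the exit follows. It assumes UAPI headers that already contain this series; vm_fd, vcpu_fd and run are placeholders for the VMM's existing VM/vCPU file descriptors and mmap'ed kvm_run, and error handling is reduced to perror(). Reflecting the abort back into the guest with KVM_SET_VCPU_EVENTS mirrors what the sea_to_user selftest does; a real VMM might instead log the error, emulate the access, or terminate the guest.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* Opt in on the VM file descriptor (the capability is VM-scoped). */
void enable_sea_to_user(int vm_fd)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_ARM_SEA_TO_USER };

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
		perror("KVM_ENABLE_CAP(KVM_CAP_ARM_SEA_TO_USER)");
}

/* Called from the run loop when run->exit_reason == KVM_EXIT_ARM_SEA. */
void handle_arm_sea(int vcpu_fd, struct kvm_run *run)
{
	struct kvm_vcpu_events events = { 0 };

	/* gva is only meaningful when ESR.FnV is clear; gpa only when flagged. */
	fprintf(stderr, "SEA: esr=%#llx flags=%#llx gva=%#llx\n",
		run->arm_sea.esr, run->arm_sea.flags, run->arm_sea.gva);
	if (run->arm_sea.flags & KVM_EXIT_ARM_SEA_FLAG_GPA_VALID)
		fprintf(stderr, "     gpa=%#llx\n", run->arm_sea.gpa);

	/*
	 * If the access cannot be recovered, reflect a synchronous external
	 * data abort back into the guest before the next KVM_RUN.
	 */
	events.exception.ext_dabt_pending = 1;
	if (ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events))
		perror("KVM_SET_VCPU_EVENTS");
}

For a guest data access, the sanitized esr value is expected to carry EC == 0x24 (data abort from a lower exception level) and the external-abort FSC 0x10, which is what the selftest asserts before re-injecting the abort.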
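
The list-register overflow conditions introduced in arch/arm64/kvm/vgic/vgic.h above are easy to misread, so here is a stand-alone sketch that mirrors them in plain C to make them concrete. The value of nr_lr is an assumption (the kernel reads it from kvm_vgic_global_state.nr_lr, commonly 4), and the summary is filled by hand rather than by walking a vCPU's AP list.

#include <stdbool.h>
#include <stdio.h>

/* Mirrors struct ap_list_summary from arch/arm64/kvm/vgic/vgic.h */
struct ap_list_summary {
	unsigned int nr_pend;	/* purely pending, not active */
	unsigned int nr_act;	/* active, or active+pending */
	unsigned int nr_sgi;	/* any SGI */
};

/* Assumed LR count; the kernel uses kvm_vgic_global_state.nr_lr */
static const unsigned int nr_lr = 4;

static bool irqs_outside_lrs(const struct ap_list_summary *s)
{
	return (s->nr_pend + s->nr_act) > nr_lr;
}

static bool irqs_pending_outside_lrs(const struct ap_list_summary *s)
{
	return s->nr_pend > nr_lr;
}

static bool irqs_active_outside_lrs(const struct ap_list_summary *s)
{
	return s->nr_act && irqs_outside_lrs(s);
}

int main(void)
{
	/* 3 purely pending + 2 active interrupts competing for 4 LRs */
	struct ap_list_summary als = { .nr_pend = 3, .nr_act = 2, .nr_sgi = 0 };

	printf("outside=%d pending_outside=%d active_outside=%d\n",
	       irqs_outside_lrs(&als),
	       irqs_pending_outside_lrs(&als),
	       irqs_active_outside_lrs(&als));
	/* prints: outside=1 pending_outside=0 active_outside=1 */
	return 0;
}

With three purely pending and two active interrupts competing for four list registers, everything pending can still be presented, but only because at least one active interrupt has to stay out of the LRs; this is the distinction the new vgic_v2_configure_hcr()/vgic_v3_configure_hcr() helpers appear to rely on when choosing which GICH_HCR/ICH_HCR_EL2 maintenance traps to enable.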