Diffstat (limited to 'arch')
-rw-r--r--  arch/arm64/include/asm/kvm_arm.h | 1
-rw-r--r--  arch/arm64/include/asm/kvm_asm.h | 8
-rw-r--r--  arch/arm64/include/asm/kvm_host.h | 3
-rw-r--r--  arch/arm64/include/asm/kvm_hyp.h | 3
-rw-r--r--  arch/arm64/include/asm/kvm_nested.h | 40
-rw-r--r--  arch/arm64/include/asm/kvm_pgtable.h | 49
-rw-r--r--  arch/arm64/include/asm/kvm_pkvm.h | 4
-rw-r--r--  arch/arm64/include/asm/virt.h | 7
-rw-r--r--  arch/arm64/kernel/cpufeature.c | 59
-rw-r--r--  arch/arm64/kernel/hyp-stub.S | 5
-rw-r--r--  arch/arm64/kernel/image-vars.h | 1
-rw-r--r--  arch/arm64/kvm/arm.c | 20
-rw-r--r--  arch/arm64/kvm/at.c | 196
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp-main.c | 7
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/pkvm.c | 3
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/sys_regs.c | 5
-rw-r--r--  arch/arm64/kvm/hyp/pgtable.c | 122
-rw-r--r--  arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c | 4
-rw-r--r--  arch/arm64/kvm/hyp/vgic-v3-sr.c | 96
-rw-r--r--  arch/arm64/kvm/mmu.c | 132
-rw-r--r--  arch/arm64/kvm/nested.c | 123
-rw-r--r--  arch/arm64/kvm/pkvm.c | 11
-rw-r--r--  arch/arm64/kvm/ptdump.c | 35
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 28
-rw-r--r--  arch/arm64/kvm/vgic/vgic-init.c | 9
-rw-r--r--  arch/arm64/kvm/vgic/vgic-mmio-v2.c | 24
-rw-r--r--  arch/arm64/kvm/vgic/vgic-mmio.h | 1
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v2.c | 291
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v3-nested.c | 104
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v3.c | 426
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v4.c | 5
-rw-r--r--  arch/arm64/kvm/vgic/vgic.c | 298
-rw-r--r--  arch/arm64/kvm/vgic/vgic.h | 43
-rw-r--r--  arch/arm64/tools/cpucaps | 2
-rw-r--r--  arch/loongarch/include/asm/kvm_eiointc.h | 55
-rw-r--r--  arch/loongarch/include/asm/kvm_host.h | 8
-rw-r--r--  arch/loongarch/include/asm/kvm_vcpu.h | 1
-rw-r--r--  arch/loongarch/include/asm/loongarch.h | 2
-rw-r--r--  arch/loongarch/include/uapi/asm/kvm.h | 1
-rw-r--r--  arch/loongarch/kvm/Kconfig | 1
-rw-r--r--  arch/loongarch/kvm/intc/eiointc.c | 80
-rw-r--r--  arch/loongarch/kvm/interrupt.c | 15
-rw-r--r--  arch/loongarch/kvm/vcpu.c | 23
-rw-r--r--  arch/loongarch/kvm/vm.c | 40
-rw-r--r--  arch/mips/kvm/Kconfig | 1
-rw-r--r--  arch/mips/kvm/mips.c | 4
-rw-r--r--  arch/powerpc/kvm/Kconfig | 1
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 4
-rw-r--r--  arch/riscv/include/asm/kvm_host.h | 6
-rw-r--r--  arch/riscv/include/asm/kvm_tlb.h | 1
-rw-r--r--  arch/riscv/include/asm/kvm_vcpu_sbi.h | 5
-rw-r--r--  arch/riscv/include/asm/kvm_vmid.h | 1
-rw-r--r--  arch/riscv/include/uapi/asm/kvm.h | 3
-rw-r--r--  arch/riscv/kvm/Kconfig | 1
-rw-r--r--  arch/riscv/kvm/Makefile | 1
-rw-r--r--  arch/riscv/kvm/aia_imsic.c | 2
-rw-r--r--  arch/riscv/kvm/main.c | 14
-rw-r--r--  arch/riscv/kvm/mmu.c | 5
-rw-r--r--  arch/riscv/kvm/tlb.c | 30
-rw-r--r--  arch/riscv/kvm/vcpu.c | 6
-rw-r--r--  arch/riscv/kvm/vcpu_insn.c | 22
-rw-r--r--  arch/riscv/kvm/vcpu_sbi.c | 10
-rw-r--r--  arch/riscv/kvm/vcpu_sbi_base.c | 28
-rw-r--r--  arch/riscv/kvm/vcpu_sbi_forward.c | 34
-rw-r--r--  arch/riscv/kvm/vcpu_sbi_replace.c | 32
-rw-r--r--  arch/riscv/kvm/vcpu_sbi_system.c | 4
-rw-r--r--  arch/riscv/kvm/vcpu_sbi_v01.c | 3
-rw-r--r--  arch/riscv/kvm/vmid.c | 23
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 8
-rw-r--r--  arch/s390/include/asm/stacktrace.h | 1
-rw-r--r--  arch/s390/kernel/asm-offsets.c | 1
-rw-r--r--  arch/s390/kernel/entry.S | 2
-rw-r--r--  arch/s390/kvm/Kconfig | 2
-rw-r--r--  arch/s390/kvm/gaccess.c | 27
-rw-r--r--  arch/s390/kvm/intercept.c | 3
-rw-r--r--  arch/s390/kvm/interrupt.c | 80
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 233
-rw-r--r--  arch/s390/kvm/kvm-s390.h | 9
-rw-r--r--  arch/s390/kvm/vsie.c | 20
-rw-r--r--  arch/x86/include/asm/cpufeatures.h | 7
-rw-r--r--  arch/x86/include/asm/hardirq.h | 4
-rw-r--r--  arch/x86/include/asm/kvm-x86-ops.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 23
-rw-r--r--  arch/x86/include/asm/nospec-branch.h | 30
-rw-r--r--  arch/x86/include/asm/svm.h | 5
-rw-r--r--  arch/x86/include/uapi/asm/kvm.h | 1
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 22
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 1
-rw-r--r--  arch/x86/kvm/cpuid.c | 1
-rw-r--r--  arch/x86/kvm/emulate.c | 319
-rw-r--r--  arch/x86/kvm/fpu.h | 66
-rw-r--r--  arch/x86/kvm/hyperv.c | 2
-rw-r--r--  arch/x86/kvm/kvm_emulate.h | 20
-rw-r--r--  arch/x86/kvm/lapic.c | 44
-rw-r--r--  arch/x86/kvm/mmu.h | 5
-rw-r--r--  arch/x86/kvm/mmu/mmu.c | 94
-rw-r--r--  arch/x86/kvm/mmu/mmu_internal.h | 10
-rw-r--r--  arch/x86/kvm/mmu/paging_tmpl.h | 2
-rw-r--r--  arch/x86/kvm/mmu/spte.c | 2
-rw-r--r--  arch/x86/kvm/mmu/spte.h | 10
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c | 50
-rw-r--r--  arch/x86/kvm/svm/avic.c | 86
-rw-r--r--  arch/x86/kvm/svm/nested.c | 12
-rw-r--r--  arch/x86/kvm/svm/sev.c | 47
-rw-r--r--  arch/x86/kvm/svm/svm.c | 103
-rw-r--r--  arch/x86/kvm/svm/svm.h | 4
-rw-r--r--  arch/x86/kvm/svm/vmenter.S | 53
-rw-r--r--  arch/x86/kvm/vmx/main.c | 9
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 173
-rw-r--r--  arch/x86/kvm/vmx/run_flags.h | 10
-rw-r--r--  arch/x86/kvm/vmx/tdx.c | 805
-rw-r--r--  arch/x86/kvm/vmx/tdx.h | 9
-rw-r--r--  arch/x86/kvm/vmx/vmenter.S | 51
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 323
-rw-r--r--  arch/x86/kvm/vmx/vmx.h | 2
-rw-r--r--  arch/x86/kvm/vmx/x86_ops.h | 2
-rw-r--r--  arch/x86/kvm/x86.c | 285
-rw-r--r--  arch/x86/kvm/x86.h | 16
118 files changed, 3583 insertions, 2144 deletions
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 1da290aeedce..e500600e4b9b 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -111,6 +111,7 @@
#define TCR_EL2_DS (1UL << 32)
#define TCR_EL2_RES1 ((1U << 31) | (1 << 23))
#define TCR_EL2_HPD (1 << 24)
+#define TCR_EL2_HA (1 << 21)
#define TCR_EL2_TBI (1 << 20)
#define TCR_EL2_PS_SHIFT 16
#define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT)
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 4b34f7b7ed2f..a1ad12c72ebf 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -79,7 +79,7 @@ enum __kvm_host_smccc_func {
__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_range,
__KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context,
__KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff,
- __KVM_HOST_SMCCC_FUNC___vgic_v3_save_vmcr_aprs,
+ __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs,
__KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs,
__KVM_HOST_SMCCC_FUNC___pkvm_reserve_vm,
__KVM_HOST_SMCCC_FUNC___pkvm_unreserve_vm,
@@ -246,9 +246,9 @@ extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu);
extern int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding);
extern void __kvm_timer_set_cntvoff(u64 cntvoff);
-extern void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr);
-extern void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr);
-extern void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr);
+extern int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr);
+extern int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr);
+extern int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr);
extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 64302c438355..ac7f970c7883 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -54,6 +54,7 @@
#define KVM_REQ_NESTED_S2_UNMAP KVM_ARCH_REQ(8)
#define KVM_REQ_GUEST_HYP_IRQ_PENDING KVM_ARCH_REQ(9)
#define KVM_REQ_MAP_L1_VNCR_EL2 KVM_ARCH_REQ(10)
+#define KVM_REQ_VGIC_PROCESS_UPDATE KVM_ARCH_REQ(11)
#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
KVM_DIRTY_LOG_INITIALLY_SET)
@@ -350,6 +351,8 @@ struct kvm_arch {
#define KVM_ARCH_FLAG_GUEST_HAS_SVE 9
/* MIDR_EL1, REVIDR_EL1, and AIDR_EL1 are writable from userspace */
#define KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS 10
+ /* Unhandled SEAs are taken to userspace */
+#define KVM_ARCH_FLAG_EXIT_SEA 11
unsigned long flags;
/* VM-wide vCPU feature set */
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index e6be1f5d0967..76ce2b94bd97 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -77,12 +77,13 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu);
u64 __gic_v3_get_lr(unsigned int lr);
+void __gic_v3_set_lr(u64 val, int lr);
void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if);
void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if);
void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if);
void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if);
-void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if);
+void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if);
void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if);
int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index f7c06a840963..905c658057a4 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -120,9 +120,42 @@ static inline bool kvm_s2_trans_writable(struct kvm_s2_trans *trans)
return trans->writable;
}
-static inline bool kvm_s2_trans_executable(struct kvm_s2_trans *trans)
+static inline bool kvm_has_xnx(struct kvm *kvm)
{
- return !(trans->desc & BIT(54));
+ return cpus_have_final_cap(ARM64_HAS_XNX) &&
+ kvm_has_feat(kvm, ID_AA64MMFR1_EL1, XNX, IMP);
+}
+
+static inline bool kvm_s2_trans_exec_el0(struct kvm *kvm, struct kvm_s2_trans *trans)
+{
+ u8 xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, trans->desc);
+
+ if (!kvm_has_xnx(kvm))
+ xn &= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, 0b10);
+
+ switch (xn) {
+ case 0b00:
+ case 0b01:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool kvm_s2_trans_exec_el1(struct kvm *kvm, struct kvm_s2_trans *trans)
+{
+ u8 xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, trans->desc);
+
+ if (!kvm_has_xnx(kvm))
+ xn &= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, 0b10);
+
+ switch (xn) {
+ case 0b00:
+ case 0b11:
+ return true;
+ default:
+ return false;
+ }
}
extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
@@ -320,6 +353,7 @@ struct s1_walk_info {
bool be;
bool s2;
bool pa52bit;
+ bool ha;
};
struct s1_walk_result {
@@ -370,4 +404,6 @@ void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val);
(FIX_VNCR - __c); \
})
+int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new);
+
#endif /* __ARM64_KVM_NESTED_H */
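Note on the hunk above: the new helpers decode the 2-bit stage-2 XN field rather than a single execute-never bit, so execute permission can differ between EL1 and EL0 once FEAT_XNX is present. A minimal standalone sketch of that decode (illustration only, not part of the patch; decode_s2_xn() is a made-up name, and without FEAT_XNX only XN[1] is honoured):

/* Illustration only: mirrors kvm_s2_trans_exec_el0()/kvm_s2_trans_exec_el1(). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static void decode_s2_xn(uint8_t xn, bool has_xnx, bool *el1_exec, bool *el0_exec)
{
	*el1_exec = *el0_exec = false;

	if (!has_xnx)
		xn &= 0b10;	/* XN[0] is only defined with FEAT_XNX */

	switch (xn) {
	case 0b00:	/* executable at EL1 and EL0 */
		*el1_exec = *el0_exec = true;
		break;
	case 0b01:	/* EL0-only execute (FEAT_XNX) */
		*el0_exec = true;
		break;
	case 0b10:	/* execute-never at both exception levels */
		break;
	case 0b11:	/* EL1-only execute (FEAT_XNX) */
		*el1_exec = true;
		break;
	}
}

int main(void)
{
	for (uint8_t xn = 0; xn < 4; xn++) {
		bool el1, el0;

		decode_s2_xn(xn, true, &el1, &el0);
		printf("XN=0b%d%d: EL1 exec=%d, EL0 exec=%d\n",
		       xn >> 1, xn & 1, el1, el0);
	}
	return 0;
}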
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 2888b5d03757..fc02de43c68d 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -89,7 +89,7 @@ typedef u64 kvm_pte_t;
#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
-#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
+#define KVM_PTE_LEAF_ATTR_HI_S2_XN GENMASK(54, 53)
#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50)
@@ -240,7 +240,9 @@ enum kvm_pgtable_stage2_flags {
/**
* enum kvm_pgtable_prot - Page-table permissions and attributes.
- * @KVM_PGTABLE_PROT_X: Execute permission.
+ * @KVM_PGTABLE_PROT_UX: Unprivileged execute permission.
+ * @KVM_PGTABLE_PROT_PX: Privileged execute permission.
+ * @KVM_PGTABLE_PROT_X: Privileged and unprivileged execute permission.
* @KVM_PGTABLE_PROT_W: Write permission.
* @KVM_PGTABLE_PROT_R: Read permission.
* @KVM_PGTABLE_PROT_DEVICE: Device attributes.
@@ -251,12 +253,15 @@ enum kvm_pgtable_stage2_flags {
* @KVM_PGTABLE_PROT_SW3: Software bit 3.
*/
enum kvm_pgtable_prot {
- KVM_PGTABLE_PROT_X = BIT(0),
- KVM_PGTABLE_PROT_W = BIT(1),
- KVM_PGTABLE_PROT_R = BIT(2),
+ KVM_PGTABLE_PROT_PX = BIT(0),
+ KVM_PGTABLE_PROT_UX = BIT(1),
+ KVM_PGTABLE_PROT_X = KVM_PGTABLE_PROT_PX |
+ KVM_PGTABLE_PROT_UX,
+ KVM_PGTABLE_PROT_W = BIT(2),
+ KVM_PGTABLE_PROT_R = BIT(3),
- KVM_PGTABLE_PROT_DEVICE = BIT(3),
- KVM_PGTABLE_PROT_NORMAL_NC = BIT(4),
+ KVM_PGTABLE_PROT_DEVICE = BIT(4),
+ KVM_PGTABLE_PROT_NORMAL_NC = BIT(5),
KVM_PGTABLE_PROT_SW0 = BIT(55),
KVM_PGTABLE_PROT_SW1 = BIT(56),
@@ -355,6 +360,11 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke
return pteref;
}
+static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref)
+{
+ return pteref;
+}
+
static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
{
/*
@@ -384,6 +394,11 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke
return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED));
}
+static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref)
+{
+ return rcu_dereference_raw(pteref);
+}
+
static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
{
if (walker->flags & KVM_PGTABLE_WALK_SHARED)
@@ -552,6 +567,26 @@ static inline int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2
void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
/**
+ * kvm_pgtable_stage2_destroy_range() - Destroy the unlinked range of addresses.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
+ * @addr: Intermediate physical address at which to place the mapping.
+ * @size: Size of the mapping.
+ *
+ * The page-table is assumed to be unreachable by any hardware walkers prior
+ * to freeing and therefore no TLB invalidation is performed.
+ */
+void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
+ u64 addr, u64 size);
+
+/**
+ * kvm_pgtable_stage2_destroy_pgd() - Destroy the PGD of guest stage-2 page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
+ *
+ * It is assumed that the rest of the page-table is freed before this operation.
+ */
+void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt);
+
+/**
* kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure.
* @mm_ops: Memory management callbacks.
* @pgtable: Unlinked stage-2 paging structure to be freed.
diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h
index 08be89c95466..0aecd4ac5f45 100644
--- a/arch/arm64/include/asm/kvm_pkvm.h
+++ b/arch/arm64/include/asm/kvm_pkvm.h
@@ -180,7 +180,9 @@ struct pkvm_mapping {
int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
struct kvm_pgtable_mm_ops *mm_ops);
-void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
+void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
+ u64 addr, u64 size);
+void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt);
int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
enum kvm_pgtable_prot prot, void *mc,
enum kvm_pgtable_walk_flags flags);
diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
index 530af9620fdb..b51ab6840f9c 100644
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -40,8 +40,13 @@
*/
#define HVC_FINALISE_EL2 3
+/*
+ * HVC_GET_ICH_VTR_EL2 - Retrieve the ICH_VTR_EL2 value
+ */
+#define HVC_GET_ICH_VTR_EL2 4
+
/* Max number of HYP stub hypercalls */
-#define HVC_STUB_HCALL_NR 4
+#define HVC_STUB_HCALL_NR 5
/* Error returned when an invalid stub number is passed into x0 */
#define HVC_STUB_ERR 0xbadca11
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 42b182cfa404..c840a93b9ef9 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2304,6 +2304,49 @@ static bool has_gic_prio_relaxed_sync(const struct arm64_cpu_capabilities *entry
}
#endif
+static bool can_trap_icv_dir_el1(const struct arm64_cpu_capabilities *entry,
+ int scope)
+{
+ static const struct midr_range has_vgic_v3[] = {
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX),
+ {},
+ };
+ struct arm_smccc_res res = {};
+
+ BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV3_CPUIF);
+ BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV5_LEGACY);
+ if (!this_cpu_has_cap(ARM64_HAS_GICV3_CPUIF) &&
+ !is_midr_in_range_list(has_vgic_v3))
+ return false;
+
+ if (!is_hyp_mode_available())
+ return false;
+
+ if (this_cpu_has_cap(ARM64_HAS_GICV5_LEGACY))
+ return true;
+
+ if (is_kernel_in_hyp_mode())
+ res.a1 = read_sysreg_s(SYS_ICH_VTR_EL2);
+ else
+ arm_smccc_1_1_hvc(HVC_GET_ICH_VTR_EL2, &res);
+
+ if (res.a0 == HVC_STUB_ERR)
+ return false;
+
+ return res.a1 & ICH_VTR_EL2_TDS;
+}
+
#ifdef CONFIG_ARM64_BTI
static void bti_enable(const struct arm64_cpu_capabilities *__unused)
{
@@ -2815,6 +2858,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.matches = has_gic_prio_relaxed_sync,
},
#endif
+ {
+ /*
+ * Depends on having GICv3
+ */
+ .desc = "ICV_DIR_EL1 trapping",
+ .capability = ARM64_HAS_ICH_HCR_EL2_TDIR,
+ .type = ARM64_CPUCAP_EARLY_LOCAL_CPU_FEATURE,
+ .matches = can_trap_icv_dir_el1,
+ },
#ifdef CONFIG_ARM64_E0PD
{
.desc = "E0PD",
@@ -3089,6 +3141,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.capability = ARM64_HAS_GICV5_LEGACY,
.matches = test_has_gicv5_legacy,
},
+ {
+ .desc = "XNX",
+ .capability = ARM64_HAS_XNX,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .matches = has_cpuid_feature,
+ ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, XNX, IMP)
+ },
{},
};
diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S
index 36e2d26b54f5..085bc9972f6b 100644
--- a/arch/arm64/kernel/hyp-stub.S
+++ b/arch/arm64/kernel/hyp-stub.S
@@ -54,6 +54,11 @@ SYM_CODE_START_LOCAL(elx_sync)
1: cmp x0, #HVC_FINALISE_EL2
b.eq __finalise_el2
+ cmp x0, #HVC_GET_ICH_VTR_EL2
+ b.ne 2f
+ mrs_s x1, SYS_ICH_VTR_EL2
+ b 9f
+
2: cmp x0, #HVC_SOFT_RESTART
b.ne 3f
mov x0, x2
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 5369763606e7..85bc629270bd 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -91,6 +91,7 @@ KVM_NVHE_ALIAS(spectre_bhb_patch_loop_mitigation_enable);
KVM_NVHE_ALIAS(spectre_bhb_patch_wa3);
KVM_NVHE_ALIAS(spectre_bhb_patch_clearbhb);
KVM_NVHE_ALIAS(alt_cb_patch_nops);
+KVM_NVHE_ALIAS(kvm_compute_ich_hcr_trap_bits);
/* Global kernel state accessed by nVHE hyp code. */
KVM_NVHE_ALIAS(kvm_vgic_global_state);
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 12acc024f7a8..4f80da0c0d1d 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -132,6 +132,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
}
mutex_unlock(&kvm->lock);
break;
+ case KVM_CAP_ARM_SEA_TO_USER:
+ r = 0;
+ set_bit(KVM_ARCH_FLAG_EXIT_SEA, &kvm->arch.flags);
+ break;
default:
break;
}
@@ -327,6 +331,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_IRQFD_RESAMPLE:
case KVM_CAP_COUNTER_OFFSET:
case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
+ case KVM_CAP_ARM_SEA_TO_USER:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:
@@ -440,7 +445,7 @@ struct kvm *kvm_arch_alloc_vm(void)
if (!has_vhe())
return kzalloc(sz, GFP_KERNEL_ACCOUNT);
- return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO);
+ return kvzalloc(sz, GFP_KERNEL_ACCOUNT);
}
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
@@ -659,8 +664,7 @@ nommu:
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
if (is_protected_kvm_enabled()) {
- kvm_call_hyp(__vgic_v3_save_vmcr_aprs,
- &vcpu->arch.vgic_cpu.vgic_v3);
+ kvm_call_hyp(__vgic_v3_save_aprs, &vcpu->arch.vgic_cpu.vgic_v3);
kvm_call_hyp_nvhe(__pkvm_vcpu_put);
}
@@ -1042,6 +1046,10 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu)
*/
kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
+ /* Process interrupts deactivated through a trap */
+ if (kvm_check_request(KVM_REQ_VGIC_PROCESS_UPDATE, vcpu))
+ kvm_vgic_process_async_update(vcpu);
+
if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
kvm_update_stolen_time(vcpu);
@@ -1835,6 +1843,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
return r;
}
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ return -ENOIOCTLCMD;
+}
+
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index be26d5aa668c..53bf70126f81 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -346,6 +346,11 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
+ wi->ha = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, AF);
+ wi->ha &= (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_HA, tcr) :
+ FIELD_GET(TCR_HA, tcr));
+
return 0;
addrsz:
@@ -362,10 +367,42 @@ transfault:
return -EFAULT;
}
+static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc,
+ struct s1_walk_info *wi)
+{
+ u64 val;
+ int r;
+
+ r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val));
+ if (r)
+ return r;
+
+ if (wi->be)
+ *desc = be64_to_cpu((__force __be64)val);
+ else
+ *desc = le64_to_cpu((__force __le64)val);
+
+ return 0;
+}
+
+static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new,
+ struct s1_walk_info *wi)
+{
+ if (wi->be) {
+ old = (__force u64)cpu_to_be64(old);
+ new = (__force u64)cpu_to_be64(new);
+ } else {
+ old = (__force u64)cpu_to_le64(old);
+ new = (__force u64)cpu_to_le64(new);
+ }
+
+ return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
+}
+
static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
struct s1_walk_result *wr, u64 va)
{
- u64 va_top, va_bottom, baddr, desc;
+ u64 va_top, va_bottom, baddr, desc, new_desc, ipa;
int level, stride, ret;
level = wi->sl;
@@ -375,7 +412,7 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
va_top = get_ia_size(wi) - 1;
while (1) {
- u64 index, ipa;
+ u64 index;
va_bottom = (3 - level) * stride + wi->pgshift;
index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
@@ -414,16 +451,13 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
return ret;
}
- ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
+ ret = kvm_read_s1_desc(vcpu, ipa, &desc, wi);
if (ret) {
fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false);
return ret;
}
- if (wi->be)
- desc = be64_to_cpu((__force __be64)desc);
- else
- desc = le64_to_cpu((__force __le64)desc);
+ new_desc = desc;
/* Invalid descriptor */
if (!(desc & BIT(0)))
@@ -477,6 +511,17 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
if (check_output_size(baddr & GENMASK(52, va_bottom), wi))
goto addrsz;
+ if (wi->ha)
+ new_desc |= PTE_AF;
+
+ if (new_desc != desc) {
+ ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi);
+ if (ret)
+ return ret;
+
+ desc = new_desc;
+ }
+
if (!(desc & PTE_AF)) {
fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false);
return -EACCES;
@@ -1221,7 +1266,7 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu,
wr->pr &= !pan;
}
-static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par)
{
struct s1_walk_result wr = {};
struct s1_walk_info wi = {};
@@ -1246,6 +1291,11 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ /*
+ * Race to update a descriptor -- restart the walk.
+ */
+ if (ret == -EAGAIN)
+ return ret;
if (ret)
goto compute_par;
@@ -1279,7 +1329,8 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false);
compute_par:
- return compute_par_s1(vcpu, &wi, &wr);
+ *par = compute_par_s1(vcpu, &wi, &wr);
+ return 0;
}
/*
@@ -1407,9 +1458,10 @@ static bool par_check_s1_access_fault(u64 par)
!(par & SYS_PAR_EL1_S));
}
-void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
+ int ret;
/*
* If PAR_EL1 reports that AT failed on a S1 permission or access
@@ -1421,15 +1473,20 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
*/
if ((par & SYS_PAR_EL1_F) &&
!par_check_s1_perm_fault(par) &&
- !par_check_s1_access_fault(par))
- par = handle_at_slow(vcpu, op, vaddr);
+ !par_check_s1_access_fault(par)) {
+ ret = handle_at_slow(vcpu, op, vaddr, &par);
+ if (ret)
+ return ret;
+ }
vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+ return 0;
}
-void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
u64 par;
+ int ret;
/*
* We've trapped, so everything is live on the CPU. As we will be
@@ -1476,13 +1533,17 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
}
/* We failed the translation, let's replay it in slow motion */
- if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
- par = handle_at_slow(vcpu, op, vaddr);
+ if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) {
+ ret = handle_at_slow(vcpu, op, vaddr, &par);
+ if (ret)
+ return ret;
+ }
vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+ return 0;
}
-void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
struct kvm_s2_trans out = {};
u64 ipa, par;
@@ -1509,13 +1570,13 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
break;
default:
WARN_ON_ONCE(1);
- return;
+ return 0;
}
__kvm_at_s1e01(vcpu, op, vaddr);
par = vcpu_read_sys_reg(vcpu, PAR_EL1);
if (par & SYS_PAR_EL1_F)
- return;
+ return 0;
/*
* If we only have a single stage of translation (EL2&0), exit
@@ -1523,14 +1584,14 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
*/
if (compute_translation_regime(vcpu, op) == TR_EL20 ||
!(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
- return;
+ return 0;
/* Do the stage-2 translation */
ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
out.esr = 0;
ret = kvm_walk_nested_s2(vcpu, ipa, &out);
if (ret < 0)
- return;
+ return ret;
/* Check the access permission */
if (!out.esr &&
@@ -1539,6 +1600,7 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
par = compute_par_s12(vcpu, par, &out);
vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+ return 0;
}
/*
@@ -1637,3 +1699,97 @@ int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
return ret;
}
}
+
+#ifdef CONFIG_ARM64_LSE_ATOMICS
+static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
+{
+ u64 tmp = old;
+ int ret = 0;
+
+ uaccess_enable_privileged();
+
+ asm volatile(__LSE_PREAMBLE
+ "1: cas %[old], %[new], %[addr]\n"
+ "2:\n"
+ _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])
+ : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret)
+ : [new] "r" (new)
+ : "memory");
+
+ uaccess_disable_privileged();
+
+ if (ret)
+ return ret;
+ if (tmp != old)
+ return -EAGAIN;
+
+ return ret;
+}
+#else
+static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
+{
+ return -EINVAL;
+}
+#endif
+
+static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new)
+{
+ int ret = 1;
+ u64 tmp;
+
+ uaccess_enable_privileged();
+
+ asm volatile("prfm pstl1strm, %[addr]\n"
+ "1: ldxr %[tmp], %[addr]\n"
+ "sub %[tmp], %[tmp], %[old]\n"
+ "cbnz %[tmp], 3f\n"
+ "2: stlxr %w[ret], %[new], %[addr]\n"
+ "3:\n"
+ _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret])
+ _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret])
+ : [ret] "+r" (ret), [addr] "+Q" (*ptep), [tmp] "=&r" (tmp)
+ : [old] "r" (old), [new] "r" (new)
+ : "memory");
+
+ uaccess_disable_privileged();
+
+ /* STLXR didn't update the descriptor, or the compare failed */
+ if (ret == 1)
+ return -EAGAIN;
+
+ return ret;
+}
+
+int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new)
+{
+ struct kvm_memory_slot *slot;
+ unsigned long hva;
+ u64 __user *ptep;
+ bool writable;
+ int offset;
+ gfn_t gfn;
+ int r;
+
+ lockdep_assert(srcu_read_lock_held(&kvm->srcu));
+
+ gfn = ipa >> PAGE_SHIFT;
+ offset = offset_in_page(ipa);
+ slot = gfn_to_memslot(kvm, gfn);
+ hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
+ if (kvm_is_error_hva(hva))
+ return -EINVAL;
+ if (!writable)
+ return -EPERM;
+
+ ptep = (u64 __user *)hva + offset;
+ if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS))
+ r = __lse_swap_desc(ptep, old, new);
+ else
+ r = __llsc_swap_desc(ptep, old, new);
+
+ if (r < 0)
+ return r;
+
+ mark_page_dirty_in_slot(kvm, slot, gfn);
+ return 0;
+}
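The __kvm_at_swap_desc() machinery above boils down to a compare-and-swap on the guest descriptor, so the walker can set the Access Flag without losing a racing hardware or guest update (hence the -EAGAIN plumbing). A hedged user-space sketch of the same semantics using a compiler builtin; swap_desc() is a made-up name, and the kernel code additionally needs the uaccess enable/disable and exception-table fixups shown above:

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

/* Illustration only: 0 on success, -EAGAIN if another writer raced us. */
static int swap_desc(uint64_t *ptep, uint64_t old, uint64_t new)
{
	if (__atomic_compare_exchange_n(ptep, &old, new, false,
					__ATOMIC_RELEASE, __ATOMIC_RELAXED))
		return 0;

	return -EAGAIN;
}

int main(void)
{
	uint64_t desc = 0x0040000000000f43ULL;	/* arbitrary example descriptor */
	uint64_t with_af = desc | (1ULL << 10);	/* PTE_AF is bit 10 */

	return swap_desc(&desc, 0x0040000000000f43ULL, with_af);	/* returns 0 */
}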
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 29430c031095..a7c689152f68 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -157,6 +157,7 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags;
host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr;
+ host_cpu_if->vgic_vmcr = hyp_cpu_if->vgic_vmcr;
for (i = 0; i < hyp_cpu_if->used_lrs; ++i)
host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i];
}
@@ -464,11 +465,11 @@ static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt)
__vgic_v3_init_lrs();
}
-static void handle___vgic_v3_save_vmcr_aprs(struct kvm_cpu_context *host_ctxt)
+static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1);
- __vgic_v3_save_vmcr_aprs(kern_hyp_va(cpu_if));
+ __vgic_v3_save_aprs(kern_hyp_va(cpu_if));
}
static void handle___vgic_v3_restore_vmcr_aprs(struct kvm_cpu_context *host_ctxt)
@@ -616,7 +617,7 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_tlb_flush_vmid_range),
HANDLE_FUNC(__kvm_flush_cpu_context),
HANDLE_FUNC(__kvm_timer_set_cntvoff),
- HANDLE_FUNC(__vgic_v3_save_vmcr_aprs),
+ HANDLE_FUNC(__vgic_v3_save_aprs),
HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs),
HANDLE_FUNC(__pkvm_reserve_vm),
HANDLE_FUNC(__pkvm_unreserve_vm),
diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c
index 43bde061b65d..8911338961c5 100644
--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
+++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -337,6 +337,9 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc
/* CTR_EL0 is always under host control, even for protected VMs. */
hyp_vm->kvm.arch.ctr_el0 = host_kvm->arch.ctr_el0;
+ /* Preserve the vgic model so that GICv3 emulation works */
+ hyp_vm->kvm.arch.vgic.vgic_model = host_kvm->arch.vgic.vgic_model;
+
if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags))
set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
index 82da9b03692d..3108b5185c20 100644
--- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c
+++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
@@ -444,6 +444,8 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
/* Scalable Vector Registers are restricted. */
+ HOST_HANDLED(SYS_ICC_PMR_EL1),
+
RAZ_WI(SYS_ERRIDR_EL1),
RAZ_WI(SYS_ERRSELR_EL1),
RAZ_WI(SYS_ERXFR_EL1),
@@ -457,9 +459,12 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
/* Limited Ordering Regions Registers are restricted. */
+ HOST_HANDLED(SYS_ICC_DIR_EL1),
+ HOST_HANDLED(SYS_ICC_RPR_EL1),
HOST_HANDLED(SYS_ICC_SGI1R_EL1),
HOST_HANDLED(SYS_ICC_ASGI1R_EL1),
HOST_HANDLED(SYS_ICC_SGI0R_EL1),
+ HOST_HANDLED(SYS_ICC_CTLR_EL1),
{ SYS_DESC(SYS_ICC_SRE_EL1), .access = pvm_gic_read_sre, },
HOST_HANDLED(SYS_CCSIDR_EL1),
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index c351b4abd5db..947ac1a951a5 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -661,11 +661,37 @@ void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))
+static int stage2_set_xn_attr(enum kvm_pgtable_prot prot, kvm_pte_t *attr)
+{
+ bool px, ux;
+ u8 xn;
+
+ px = prot & KVM_PGTABLE_PROT_PX;
+ ux = prot & KVM_PGTABLE_PROT_UX;
+
+ if (!cpus_have_final_cap(ARM64_HAS_XNX) && px != ux)
+ return -EINVAL;
+
+ if (px && ux)
+ xn = 0b00;
+ else if (!px && ux)
+ xn = 0b01;
+ else if (!px && !ux)
+ xn = 0b10;
+ else
+ xn = 0b11;
+
+ *attr &= ~KVM_PTE_LEAF_ATTR_HI_S2_XN;
+ *attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, xn);
+ return 0;
+}
+
static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
kvm_pte_t *ptep)
{
kvm_pte_t attr;
u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
+ int r;
switch (prot & (KVM_PGTABLE_PROT_DEVICE |
KVM_PGTABLE_PROT_NORMAL_NC)) {
@@ -685,8 +711,9 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p
attr = KVM_S2_MEMATTR(pgt, NORMAL);
}
- if (!(prot & KVM_PGTABLE_PROT_X))
- attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+ r = stage2_set_xn_attr(prot, &attr);
+ if (r)
+ return r;
if (prot & KVM_PGTABLE_PROT_R)
attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
@@ -715,8 +742,20 @@ enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
prot |= KVM_PGTABLE_PROT_R;
if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
prot |= KVM_PGTABLE_PROT_W;
- if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN))
- prot |= KVM_PGTABLE_PROT_X;
+
+ switch (FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, pte)) {
+ case 0b00:
+ prot |= KVM_PGTABLE_PROT_PX | KVM_PGTABLE_PROT_UX;
+ break;
+ case 0b01:
+ prot |= KVM_PGTABLE_PROT_UX;
+ break;
+ case 0b11:
+ prot |= KVM_PGTABLE_PROT_PX;
+ break;
+ default:
+ break;
+ }
return prot;
}
@@ -1290,9 +1329,9 @@ bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags)
{
- int ret;
+ kvm_pte_t xn = 0, set = 0, clr = 0;
s8 level;
- kvm_pte_t set = 0, clr = 0;
+ int ret;
if (prot & KVM_PTE_LEAF_ATTR_HI_SW)
return -EINVAL;
@@ -1303,8 +1342,12 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
if (prot & KVM_PGTABLE_PROT_W)
set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
- if (prot & KVM_PGTABLE_PROT_X)
- clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+ ret = stage2_set_xn_attr(prot, &xn);
+ if (ret)
+ return ret;
+
+ set |= xn & KVM_PTE_LEAF_ATTR_HI_S2_XN;
+ clr |= ~xn & KVM_PTE_LEAF_ATTR_HI_S2_XN;
ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags);
if (!ret || ret == -EAGAIN)
@@ -1535,37 +1578,80 @@ size_t kvm_pgtable_stage2_pgd_size(u64 vtcr)
return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
}
-static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
- enum kvm_pgtable_walk_flags visit)
+static int stage2_free_leaf(const struct kvm_pgtable_visit_ctx *ctx)
{
struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
- if (!stage2_pte_is_counted(ctx->old))
+ mm_ops->put_page(ctx->ptep);
+ return 0;
+}
+
+static int stage2_free_table_post(const struct kvm_pgtable_visit_ctx *ctx)
+{
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+ kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops);
+
+ if (mm_ops->page_count(childp) != 1)
return 0;
+ /*
+ * Drop references and clear the now stale PTE to avoid rewalking the
+ * freed page table.
+ */
mm_ops->put_page(ctx->ptep);
+ mm_ops->put_page(childp);
+ kvm_clear_pte(ctx->ptep);
+ return 0;
+}
- if (kvm_pte_table(ctx->old, ctx->level))
- mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
+static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ if (!stage2_pte_is_counted(ctx->old))
+ return 0;
- return 0;
+ switch (visit) {
+ case KVM_PGTABLE_WALK_LEAF:
+ return stage2_free_leaf(ctx);
+ case KVM_PGTABLE_WALK_TABLE_POST:
+ return stage2_free_table_post(ctx);
+ default:
+ return -EINVAL;
+ }
}
-void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
+ u64 addr, u64 size)
{
- size_t pgd_sz;
struct kvm_pgtable_walker walker = {
.cb = stage2_free_walker,
.flags = KVM_PGTABLE_WALK_LEAF |
KVM_PGTABLE_WALK_TABLE_POST,
};
- WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
+ WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
+}
+
+void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
+{
+ size_t pgd_sz;
+
pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
- pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz);
+
+ /*
+ * Since the pgtable is unlinked at this point, and not shared with
+ * other walkers, safely dereference pgd with kvm_dereference_pteref_raw()
+ */
+ pgt->mm_ops->free_pages_exact(kvm_dereference_pteref_raw(pgt->pgd), pgd_sz);
pgt->pgd = NULL;
}
+void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+{
+ kvm_pgtable_stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits));
+ kvm_pgtable_stage2_destroy_pgd(pgt);
+}
+
void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
{
kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
index 78579b31a420..5fd99763b54d 100644
--- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
+++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
@@ -63,6 +63,10 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu)
return -1;
}
+ /* Handle deactivation as a normal exit */
+ if ((fault_ipa - vgic->vgic_cpu_base) >= GIC_CPU_DEACTIVATE)
+ return 0;
+
rd = kvm_vcpu_dabt_get_rd(vcpu);
addr = kvm_vgic_global_state.vcpu_hyp_va;
addr += fault_ipa - vgic->vgic_cpu_base;
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index acd909b7f225..0b670a033fd8 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -14,6 +14,8 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
+#include "../../vgic/vgic.h"
+
#define vtr_to_max_lr_idx(v) ((v) & 0xf)
#define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1)
#define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5))
@@ -58,7 +60,7 @@ u64 __gic_v3_get_lr(unsigned int lr)
unreachable();
}
-static void __gic_v3_set_lr(u64 val, int lr)
+void __gic_v3_set_lr(u64 val, int lr)
{
switch (lr & 0xf) {
case 0:
@@ -196,6 +198,11 @@ static u32 __vgic_v3_read_ap1rn(int n)
return val;
}
+static u64 compute_ich_hcr(struct vgic_v3_cpu_if *cpu_if)
+{
+ return cpu_if->vgic_hcr | vgic_ich_hcr_trap_bits();
+}
+
void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if)
{
u64 used_lrs = cpu_if->used_lrs;
@@ -212,14 +219,12 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if)
}
}
- if (used_lrs || cpu_if->its_vpe.its_vm) {
+ if (used_lrs) {
int i;
u32 elrsr;
elrsr = read_gicreg(ICH_ELRSR_EL2);
- write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EL2_En, ICH_HCR_EL2);
-
for (i = 0; i < used_lrs; i++) {
if (elrsr & (1 << i))
cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
@@ -229,6 +234,23 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if)
__gic_v3_set_lr(0, i);
}
}
+
+ cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
+
+ if (cpu_if->vgic_hcr & ICH_HCR_EL2_LRENPIE) {
+ u64 val = read_gicreg(ICH_HCR_EL2);
+ cpu_if->vgic_hcr &= ~ICH_HCR_EL2_EOIcount;
+ cpu_if->vgic_hcr |= val & ICH_HCR_EL2_EOIcount;
+ }
+
+ write_gicreg(0, ICH_HCR_EL2);
+
+ /*
+ * Hack alert: On NV, this results in a trap so that the above write
+ * actually takes effect... No synchronisation is necessary, as we
+ * only care about the effects when this traps.
+ */
+ read_gicreg(ICH_MISR_EL2);
}
void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if)
@@ -236,12 +258,10 @@ void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if)
u64 used_lrs = cpu_if->used_lrs;
int i;
- if (used_lrs || cpu_if->its_vpe.its_vm) {
- write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+ write_gicreg(compute_ich_hcr(cpu_if), ICH_HCR_EL2);
- for (i = 0; i < used_lrs; i++)
- __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
- }
+ for (i = 0; i < used_lrs; i++)
+ __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
/*
* Ensure that writes to the LRs, and on non-VHE systems ensure that
@@ -307,24 +327,20 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
}
/*
- * If we need to trap system registers, we must write
- * ICH_HCR_EL2 anyway, even if no interrupts are being
- * injected. Note that this also applies if we don't expect
- * any system register access (no vgic at all).
+ * If we need to trap system registers, we must write ICH_HCR_EL2
+ * anyway, even if no interrupts are being injected. Note that this
+ * also applies if we don't expect any system register access (no
+ * vgic at all). In any case, no need to provide MI configuration.
*/
if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre)
- write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+ write_gicreg(vgic_ich_hcr_trap_bits() | ICH_HCR_EL2_En, ICH_HCR_EL2);
}
void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if)
{
u64 val;
- if (!cpu_if->vgic_sre) {
- cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
- }
-
/* Only restore SRE if the host implements the GICv2 interface */
if (static_branch_unlikely(&vgic_v3_has_v2_compat)) {
val = read_gicreg(ICC_SRE_EL2);
@@ -346,7 +362,7 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if)
write_gicreg(0, ICH_HCR_EL2);
}
-static void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if)
+void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if)
{
u64 val;
u32 nr_pre_bits;
@@ -507,13 +523,6 @@ static void __vgic_v3_write_vmcr(u32 vmcr)
write_gicreg(vmcr, ICH_VMCR_EL2);
}
-void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if)
-{
- __vgic_v3_save_aprs(cpu_if);
- if (cpu_if->vgic_sre)
- cpu_if->vgic_vmcr = __vgic_v3_read_vmcr();
-}
-
void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if)
{
__vgic_v3_compat_mode_enable();
@@ -790,7 +799,7 @@ static void __vgic_v3_bump_eoicount(void)
write_gicreg(hcr, ICH_HCR_EL2);
}
-static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+static int ___vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
u32 vid = vcpu_get_reg(vcpu, rt);
u64 lr_val;
@@ -798,19 +807,25 @@ static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
/* EOImode == 0, nothing to be done here */
if (!(vmcr & ICH_VMCR_EOIM_MASK))
- return;
+ return 1;
/* No deactivate to be performed on an LPI */
if (vid >= VGIC_MIN_LPI)
- return;
+ return 1;
lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
- if (lr == -1) {
- __vgic_v3_bump_eoicount();
- return;
+ if (lr != -1) {
+ __vgic_v3_clear_active_lr(lr, lr_val);
+ return 1;
}
- __vgic_v3_clear_active_lr(lr, lr_val);
+ return 0;
+}
+
+static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+ if (!___vgic_v3_write_dir(vcpu, vmcr, rt))
+ __vgic_v3_bump_eoicount();
}
static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
@@ -1245,6 +1260,21 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
case SYS_ICC_DIR_EL1:
if (unlikely(is_read))
return 0;
+ /*
+ * Full exit if required to handle overflow deactivation,
+ * unless we can emulate it in the LRs (likely the majority
+ * of the cases).
+ */
+ if (vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr & ICH_HCR_EL2_TDIR) {
+ int ret;
+
+ ret = ___vgic_v3_write_dir(vcpu, __vgic_v3_read_vmcr(),
+ kvm_vcpu_sys_get_rt(vcpu));
+ if (ret)
+ __kvm_skip_instr(vcpu);
+
+ return ret;
+ }
fn = __vgic_v3_write_dir;
break;
case SYS_ICC_RPR_EL1:
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 9a6a80c3fbe5..48d7c372a4cd 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -904,6 +904,38 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
return 0;
}
+/*
+ * Assume that @pgt is valid and unlinked from the KVM MMU to free the
+ * page-table without taking the kvm_mmu_lock and without performing any
+ * TLB invalidations.
+ *
+ * Also, the range of addresses can be large enough to cause need_resched
+ * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke
+ * cond_resched() periodically to prevent hogging the CPU for a long time
+ * and schedule something else, if required.
+ */
+static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr,
+ phys_addr_t end)
+{
+ u64 next;
+
+ do {
+ next = stage2_range_addr_end(addr, end);
+ KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr,
+ next - addr);
+ if (next != end)
+ cond_resched();
+ } while (addr = next, addr != end);
+}
+
+static void kvm_stage2_destroy(struct kvm_pgtable *pgt)
+{
+ unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr);
+
+ stage2_destroy_range(pgt, 0, BIT(ia_bits));
+ KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt);
+}
+
/**
* kvm_init_stage2_mmu - Initialise a S2 MMU structure
* @kvm: The pointer to the KVM structure
@@ -980,7 +1012,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
return 0;
out_destroy_pgtable:
- KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
+ kvm_stage2_destroy(pgt);
out_free_pgtable:
kfree(pgt);
return err;
@@ -1081,7 +1113,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
write_unlock(&kvm->mmu_lock);
if (pgt) {
- KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
+ kvm_stage2_destroy(pgt);
kfree(pgt);
}
}
@@ -1521,6 +1553,16 @@ static void adjust_nested_fault_perms(struct kvm_s2_trans *nested,
*prot |= kvm_encode_nested_level(nested);
}
+static void adjust_nested_exec_perms(struct kvm *kvm,
+ struct kvm_s2_trans *nested,
+ enum kvm_pgtable_prot *prot)
+{
+ if (!kvm_s2_trans_exec_el0(kvm, nested))
+ *prot &= ~KVM_PGTABLE_PROT_UX;
+ if (!kvm_s2_trans_exec_el1(kvm, nested))
+ *prot &= ~KVM_PGTABLE_PROT_PX;
+}
+
#define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED)
static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
@@ -1572,11 +1614,12 @@ static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (writable)
prot |= KVM_PGTABLE_PROT_W;
- if (exec_fault ||
- (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
- (!nested || kvm_s2_trans_executable(nested))))
+ if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
prot |= KVM_PGTABLE_PROT_X;
+ if (nested)
+ adjust_nested_exec_perms(kvm, nested, &prot);
+
kvm_fault_lock(kvm);
if (mmu_invalidate_retry(kvm, mmu_seq)) {
ret = -EAGAIN;
@@ -1851,11 +1894,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
prot |= KVM_PGTABLE_PROT_NORMAL_NC;
else
prot |= KVM_PGTABLE_PROT_DEVICE;
- } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
- (!nested || kvm_s2_trans_executable(nested))) {
+ } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
prot |= KVM_PGTABLE_PROT_X;
}
+ if (nested)
+ adjust_nested_exec_perms(kvm, nested, &prot);
+
/*
* Under the premise of getting a FSC_PERM fault, we just need to relax
* permissions only if vma_pagesize equals fault_granule. Otherwise,
@@ -1899,8 +1944,48 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
read_unlock(&vcpu->kvm->mmu_lock);
}
+/*
+ * Returns true if the SEA should be handled locally within KVM if the abort
+ * is caused by a kernel memory allocation (e.g. stage-2 table memory).
+ */
+static bool host_owns_sea(struct kvm_vcpu *vcpu, u64 esr)
+{
+ /*
+ * Without FEAT_RAS HCR_EL2.TEA is RES0, meaning any external abort
+ * taken from a guest EL to EL2 is due to a host-imposed access (e.g.
+ * stage-2 PTW).
+ */
+ if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+ return true;
+
+ /* KVM owns the VNCR when the vCPU isn't in a nested context. */
+ if (is_hyp_ctxt(vcpu) && !kvm_vcpu_trap_is_iabt(vcpu) && (esr & ESR_ELx_VNCR))
+ return true;
+
+ /*
+ * Determining if an external abort during a table walk happened at
+ * stage-2 is only possible when S1PTW is set. Otherwise, since KVM
+ * sets HCR_EL2.TEA, SEAs due to a stage-1 walk (i.e. accessing the
+ * PA of the stage-1 descriptor) can reach here and are reported
+ * with a TTW ESR value.
+ */
+ return (esr_fsc_is_sea_ttw(esr) && (esr & ESR_ELx_S1PTW));
+}
+
int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_run *run = vcpu->run;
+ u64 esr = kvm_vcpu_get_esr(vcpu);
+ u64 esr_mask = ESR_ELx_EC_MASK |
+ ESR_ELx_IL |
+ ESR_ELx_FnV |
+ ESR_ELx_EA |
+ ESR_ELx_CM |
+ ESR_ELx_WNR |
+ ESR_ELx_FSC;
+ u64 ipa;
+
/*
* Give APEI the opportunity to claim the abort before handling it
* within KVM. apei_claim_sea() expects to be called with IRQs enabled.
@@ -1909,7 +1994,33 @@ int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
if (apei_claim_sea(NULL) == 0)
return 1;
- return kvm_inject_serror(vcpu);
+ if (host_owns_sea(vcpu, esr) ||
+ !test_bit(KVM_ARCH_FLAG_EXIT_SEA, &vcpu->kvm->arch.flags))
+ return kvm_inject_serror(vcpu);
+
+ /* ESR_ELx.SET is RES0 when FEAT_RAS isn't implemented. */
+ if (kvm_has_ras(kvm))
+ esr_mask |= ESR_ELx_SET_MASK;
+
+ /*
+ * Exit to userspace, and provide faulting guest virtual and physical
+ * addresses in case userspace wants to emulate SEA to guest by
+ * writing to FAR_ELx and HPFAR_ELx registers.
+ */
+ memset(&run->arm_sea, 0, sizeof(run->arm_sea));
+ run->exit_reason = KVM_EXIT_ARM_SEA;
+ run->arm_sea.esr = esr & esr_mask;
+
+ if (!(esr & ESR_ELx_FnV))
+ run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu);
+
+ ipa = kvm_vcpu_get_fault_ipa(vcpu);
+ if (ipa != INVALID_GPA) {
+ run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID;
+ run->arm_sea.gpa = ipa;
+ }
+
+ return 0;
}
/**
@@ -1999,6 +2110,11 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
u32 esr;
ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
+ if (ret == -EAGAIN) {
+ ret = 1;
+ goto out_unlock;
+ }
+
if (ret) {
esr = kvm_s2_trans_esr(&nested_trans);
kvm_inject_s2_fault(vcpu, esr);
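For the KVM_EXIT_ARM_SEA path added above, the VMM opts in via KVM_CAP_ARM_SEA_TO_USER and then reads the sanitised ESR plus the faulting GVA/GPA out of the run structure. A hedged sketch of what that consumer side might look like; the struct and flag value below are local mocks mirroring the fields populated in the hunk, since the real definitions live in the uapi headers this series updates:

/* Illustration only: local mock of the KVM_EXIT_ARM_SEA payload. */
#include <stdint.h>
#include <stdio.h>

#define MOCK_SEA_FLAG_GPA_VALID	(1ULL << 0)	/* assumed flag value */

struct mock_arm_sea {
	uint64_t esr;
	uint64_t flags;
	uint64_t gva;
	uint64_t gpa;
};

static void report_sea(const struct mock_arm_sea *sea)
{
	printf("SEA: esr=0x%llx gva=0x%llx",
	       (unsigned long long)sea->esr, (unsigned long long)sea->gva);
	if (sea->flags & MOCK_SEA_FLAG_GPA_VALID)
		printf(" gpa=0x%llx", (unsigned long long)sea->gpa);
	printf("\n");
}

int main(void)
{
	struct mock_arm_sea sea = {
		.esr = 0x92000410,			/* example value only */
		.flags = MOCK_SEA_FLAG_GPA_VALID,
		.gva = 0xffff000012345000ULL,
		.gpa = 0x88840000ULL,
	};

	report_sea(&sea);
	return 0;
}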
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index be6bbd167770..cdeeb8f09e72 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -124,14 +124,13 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
}
struct s2_walk_info {
- int (*read_desc)(phys_addr_t pa, u64 *desc, void *data);
- void *data;
- u64 baddr;
- unsigned int max_oa_bits;
- unsigned int pgshift;
- unsigned int sl;
- unsigned int t0sz;
- bool be;
+ u64 baddr;
+ unsigned int max_oa_bits;
+ unsigned int pgshift;
+ unsigned int sl;
+ unsigned int t0sz;
+ bool be;
+ bool ha;
};
static u32 compute_fsc(int level, u32 fsc)
@@ -199,6 +198,42 @@ static int check_output_size(struct s2_walk_info *wi, phys_addr_t output)
return 0;
}
+static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc,
+ struct s2_walk_info *wi)
+{
+ u64 val;
+ int r;
+
+ r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val));
+ if (r)
+ return r;
+
+ /*
+ * Handle reversed descriptors if endianness differs between the
+ * host and the guest hypervisor.
+ */
+ if (wi->be)
+ *desc = be64_to_cpu((__force __be64)val);
+ else
+ *desc = le64_to_cpu((__force __le64)val);
+
+ return 0;
+}
+
+static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u64 new,
+ struct s2_walk_info *wi)
+{
+ if (wi->be) {
+ old = (__force u64)cpu_to_be64(old);
+ new = (__force u64)cpu_to_be64(new);
+ } else {
+ old = (__force u64)cpu_to_le64(old);
+ new = (__force u64)cpu_to_le64(new);
+ }
+
+ return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
+}
+
/*
* This is essentially a C-version of the pseudo code from the ARM ARM
* AArch64.TranslationTableWalk function. I strongly recommend looking at
@@ -206,13 +241,13 @@ static int check_output_size(struct s2_walk_info *wi, phys_addr_t output)
*
* Must be called with the kvm->srcu read lock held
*/
-static int walk_nested_s2_pgd(phys_addr_t ipa,
+static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa,
struct s2_walk_info *wi, struct kvm_s2_trans *out)
{
int first_block_level, level, stride, input_size, base_lower_bound;
phys_addr_t base_addr;
unsigned int addr_top, addr_bottom;
- u64 desc; /* page table entry */
+ u64 desc, new_desc; /* page table entry */
int ret;
phys_addr_t paddr;
@@ -257,28 +292,30 @@ static int walk_nested_s2_pgd(phys_addr_t ipa,
>> (addr_bottom - 3);
paddr = base_addr | index;
- ret = wi->read_desc(paddr, &desc, wi->data);
+ ret = read_guest_s2_desc(vcpu, paddr, &desc, wi);
if (ret < 0)
return ret;
- /*
- * Handle reversedescriptors if endianness differs between the
- * host and the guest hypervisor.
- */
- if (wi->be)
- desc = be64_to_cpu((__force __be64)desc);
- else
- desc = le64_to_cpu((__force __le64)desc);
+ new_desc = desc;
/* Check for valid descriptor at this point */
- if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) {
+ if (!(desc & KVM_PTE_VALID)) {
out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
out->desc = desc;
return 1;
}
- /* We're at the final level or block translation level */
- if ((desc & 3) == 1 || level == 3)
+ if (FIELD_GET(KVM_PTE_TYPE, desc) == KVM_PTE_TYPE_BLOCK) {
+ if (level < 3)
+ break;
+
+ out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
+ out->desc = desc;
+ return 1;
+ }
+
+ /* We're at the final level */
+ if (level == 3)
break;
if (check_output_size(wi, desc)) {
@@ -305,7 +342,18 @@ static int walk_nested_s2_pgd(phys_addr_t ipa,
return 1;
}
- if (!(desc & BIT(10))) {
+ if (wi->ha)
+ new_desc |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
+
+ if (new_desc != desc) {
+ ret = swap_guest_s2_desc(vcpu, paddr, desc, new_desc, wi);
+ if (ret)
+ return ret;
+
+ desc = new_desc;
+ }
+
+ if (!(desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) {
out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS);
out->desc = desc;
return 1;
@@ -318,20 +366,13 @@ static int walk_nested_s2_pgd(phys_addr_t ipa,
(ipa & GENMASK_ULL(addr_bottom - 1, 0));
out->output = paddr;
out->block_size = 1UL << ((3 - level) * stride + wi->pgshift);
- out->readable = desc & (0b01 << 6);
- out->writable = desc & (0b10 << 6);
+ out->readable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
+ out->writable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
out->level = level;
out->desc = desc;
return 0;
}
-static int read_guest_s2_desc(phys_addr_t pa, u64 *desc, void *data)
-{
- struct kvm_vcpu *vcpu = data;
-
- return kvm_read_guest(vcpu->kvm, pa, desc, sizeof(*desc));
-}
-
static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi)
{
wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK;
@@ -350,6 +391,8 @@ static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi)
/* Global limit for now, should eventually be per-VM */
wi->max_oa_bits = min(get_kvm_ipa_limit(),
ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false));
+
+ wi->ha = vtcr & VTCR_EL2_HA;
}
int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
@@ -364,15 +407,13 @@ int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
if (!vcpu_has_nv(vcpu))
return 0;
- wi.read_desc = read_guest_s2_desc;
- wi.data = vcpu;
wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
vtcr_to_walk_info(vtcr, &wi);
wi.be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE;
- ret = walk_nested_s2_pgd(gipa, &wi, result);
+ ret = walk_nested_s2_pgd(vcpu, gipa, &wi, result);
if (ret)
result->esr |= (kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC);
@@ -788,7 +829,10 @@ int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans)
return 0;
if (kvm_vcpu_trap_is_iabt(vcpu)) {
- forward_fault = !kvm_s2_trans_executable(trans);
+ if (vcpu_mode_priv(vcpu))
+ forward_fault = !kvm_s2_trans_exec_el1(vcpu->kvm, trans);
+ else
+ forward_fault = !kvm_s2_trans_exec_el0(vcpu->kvm, trans);
} else {
bool write_fault = kvm_is_write_fault(vcpu);
@@ -1555,12 +1599,13 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
case SYS_ID_AA64MMFR1_EL1:
val &= ~(ID_AA64MMFR1_EL1_CMOW |
ID_AA64MMFR1_EL1_nTLBPA |
- ID_AA64MMFR1_EL1_ETS |
- ID_AA64MMFR1_EL1_XNX |
- ID_AA64MMFR1_EL1_HAFDBS);
+ ID_AA64MMFR1_EL1_ETS);
+
/* FEAT_E2H0 implies no VHE */
if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features))
val &= ~ID_AA64MMFR1_EL1_VH;
+
+ val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, AF);
break;
case SYS_ID_AA64MMFR2_EL1:
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index 24f0f8a8c943..d7a0f69a9982 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -344,9 +344,16 @@ static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 e
return 0;
}
-void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
+ u64 addr, u64 size)
{
- __pkvm_pgtable_stage2_unmap(pgt, 0, ~(0ULL));
+ __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
+}
+
+void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
+{
+ /* Expected to be called after all pKVM mappings have been released. */
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&pgt->pkvm_mappings.rb_root));
}
int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
index dc5acfb00af9..6cbe018fd6fd 100644
--- a/arch/arm64/kvm/ptdump.c
+++ b/arch/arm64/kvm/ptdump.c
@@ -31,27 +31,46 @@ static const struct ptdump_prot_bits stage2_pte_bits[] = {
.val = PTE_VALID,
.set = " ",
.clear = "F",
- }, {
+ },
+ {
.mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R,
.val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R,
.set = "R",
.clear = " ",
- }, {
+ },
+ {
.mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
.val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
.set = "W",
.clear = " ",
- }, {
+ },
+ {
.mask = KVM_PTE_LEAF_ATTR_HI_S2_XN,
- .val = KVM_PTE_LEAF_ATTR_HI_S2_XN,
- .set = "NX",
- .clear = "x ",
- }, {
+ .val = 0b00UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN),
+ .set = "px ux ",
+ },
+ {
+ .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN,
+ .val = 0b01UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN),
+ .set = "PXNux ",
+ },
+ {
+ .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN,
+ .val = 0b10UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN),
+ .set = "PXNUXN",
+ },
+ {
+ .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN,
+ .val = 0b11UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN),
+ .set = "px UXN",
+ },
+ {
.mask = KVM_PTE_LEAF_ATTR_LO_S2_AF,
.val = KVM_PTE_LEAF_ATTR_LO_S2_AF,
.set = "AF",
.clear = " ",
- }, {
+ },
+ {
.mask = PMD_TYPE_MASK,
.val = PMD_TYPE_SECT,
.set = "BLK",
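[Editor's illustrative aside, not part of the patch: the new ptdump entries above decode the two-bit stage-2 XN field into four execute-permission labels instead of a single NX/x flag. The sketch below is a minimal, stand-alone model of that decode; the field position is an assumption for illustration (the kernel derives it from KVM_PTE_LEAF_ATTR_HI_S2_XN), and the label semantics simply mirror the table above.]

#include <stdint.h>
#include <stdio.h>

#define S2_XN_SHIFT	53u	/* assumed position of the stage-2 XN field */
#define S2_XN_MASK	(UINT64_C(3) << S2_XN_SHIFT)

static const char *s2_xn_label(uint64_t pte)
{
	static const char *labels[4] = {
		"px ux ",	/* 0b00: executable at EL1 and EL0 */
		"PXNux ",	/* 0b01: not executable at EL1 only */
		"PXNUXN",	/* 0b10: not executable at either EL */
		"px UXN",	/* 0b11: not executable at EL0 only */
	};

	return labels[(pte & S2_XN_MASK) >> S2_XN_SHIFT];
}

int main(void)
{
	for (uint64_t v = 0; v < 4; v++)
		printf("XN=%llu -> %s\n", (unsigned long long)v,
		       s2_xn_label(v << S2_XN_SHIFT));
	return 0;
}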
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index ec3fbe0b8d52..c8fd7c6a12a1 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -666,6 +666,21 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu,
return true;
}
+static bool access_gic_dir(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (!kvm_has_gicv3(vcpu->kvm))
+ return undef_access(vcpu, p, r);
+
+ if (!p->is_write)
+ return undef_access(vcpu, p, r);
+
+ vgic_v3_deactivate(vcpu, p->regval);
+
+ return true;
+}
+
static bool trap_raz_wi(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
@@ -3373,7 +3388,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access },
{ SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access },
{ SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access },
- { SYS_DESC(SYS_ICC_DIR_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir },
{ SYS_DESC(SYS_ICC_RPR_EL1), undef_access },
{ SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi },
{ SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi },
@@ -3770,7 +3785,8 @@ static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
{
u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
- __kvm_at_s1e01(vcpu, op, p->regval);
+ if (__kvm_at_s1e01(vcpu, op, p->regval))
+ return false;
return true;
}
@@ -3787,7 +3803,8 @@ static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
return false;
}
- __kvm_at_s1e2(vcpu, op, p->regval);
+ if (__kvm_at_s1e2(vcpu, op, p->regval))
+ return false;
return true;
}
@@ -3797,7 +3814,8 @@ static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
{
u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
- __kvm_at_s12(vcpu, op, p->regval);
+ if (__kvm_at_s12(vcpu, op, p->regval))
+ return false;
return true;
}
@@ -4498,7 +4516,7 @@ static const struct sys_reg_desc cp15_regs[] = {
{ CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access },
{ CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access },
{ CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access },
- { CP15_SYS_DESC(SYS_ICC_DIR_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir },
{ CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access },
{ CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access },
{ CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access },
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index da62edbc1205..dc9f9db31026 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -198,6 +198,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
int i;
+ dist->active_spis = (atomic_t)ATOMIC_INIT(0);
dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT);
if (!dist->spis)
return -ENOMEM;
@@ -363,12 +364,12 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
return ret;
}
-static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu)
+static void kvm_vgic_vcpu_reset(struct kvm_vcpu *vcpu)
{
if (kvm_vgic_global_state.type == VGIC_V2)
- vgic_v2_enable(vcpu);
+ vgic_v2_reset(vcpu);
else
- vgic_v3_enable(vcpu);
+ vgic_v3_reset(vcpu);
}
/*
@@ -415,7 +416,7 @@ int vgic_init(struct kvm *kvm)
}
kvm_for_each_vcpu(idx, vcpu, kvm)
- kvm_vgic_vcpu_enable(vcpu);
+ kvm_vgic_vcpu_reset(vcpu);
ret = kvm_vgic_setup_default_irq_routing(kvm);
if (ret)
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
index f25fccb1f8e6..406845b3117c 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
@@ -359,6 +359,16 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
vgic_set_vmcr(vcpu, &vmcr);
}
+static void vgic_mmio_write_dir(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ if (kvm_vgic_global_state.type == VGIC_V2)
+ vgic_v2_deactivate(vcpu, val);
+ else
+ vgic_v3_deactivate(vcpu, val);
+}
+
static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len)
{
@@ -482,6 +492,10 @@ static const struct vgic_register_region vgic_v2_cpu_registers[] = {
REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT,
vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_CPU_DEACTIVATE,
+ vgic_mmio_read_raz, vgic_mmio_write_dir,
+ vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi,
+ 4, VGIC_ACCESS_32bit),
};
unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev)
@@ -494,6 +508,16 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev)
return SZ_4K;
}
+unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev)
+{
+ dev->regions = vgic_v2_cpu_registers;
+ dev->nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers);
+
+ kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
+
+ return KVM_VGIC_V2_CPU_SIZE;
+}
+
int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
{
const struct vgic_register_region *region;
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h
index 5b490a4dfa5e..50dc80220b0f 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.h
+++ b/arch/arm64/kvm/vgic/vgic-mmio.h
@@ -213,6 +213,7 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
const u32 val);
unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
+unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev);
unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 381673f03c39..585491fbda80 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -9,6 +9,7 @@
#include <kvm/arm_vgic.h>
#include <asm/kvm_mmu.h>
+#include "vgic-mmio.h"
#include "vgic.h"
static inline void vgic_v2_write_lr(int lr, u32 val)
@@ -26,11 +27,24 @@ void vgic_v2_init_lrs(void)
vgic_v2_write_lr(i, 0);
}
-void vgic_v2_set_underflow(struct kvm_vcpu *vcpu)
+void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu,
+ struct ap_list_summary *als)
{
struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
- cpuif->vgic_hcr |= GICH_HCR_UIE;
+ cpuif->vgic_hcr = GICH_HCR_EN;
+
+ if (irqs_pending_outside_lrs(als))
+ cpuif->vgic_hcr |= GICH_HCR_NPIE;
+ if (irqs_active_outside_lrs(als))
+ cpuif->vgic_hcr |= GICH_HCR_LRENPIE;
+ if (irqs_outside_lrs(als))
+ cpuif->vgic_hcr |= GICH_HCR_UIE;
+
+ cpuif->vgic_hcr |= (cpuif->vgic_vmcr & GICH_VMCR_ENABLE_GRP0_MASK) ?
+ GICH_HCR_VGrp0DIE : GICH_HCR_VGrp0EIE;
+ cpuif->vgic_hcr |= (cpuif->vgic_vmcr & GICH_VMCR_ENABLE_GRP1_MASK) ?
+ GICH_HCR_VGrp1DIE : GICH_HCR_VGrp1EIE;
}
static bool lr_signals_eoi_mi(u32 lr_val)
@@ -39,43 +53,23 @@ static bool lr_signals_eoi_mi(u32 lr_val)
!(lr_val & GICH_LR_HW);
}
-/*
- * transfer the content of the LRs back into the corresponding ap_list:
- * - active bit is transferred as is
- * - pending bit is
- * - transferred as is in case of edge sensitive IRQs
- * - set to the line-level (resample time) for level sensitive IRQs
- */
-void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
+static void vgic_v2_fold_lr(struct kvm_vcpu *vcpu, u32 val)
{
- struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
- struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
- int lr;
-
- DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
-
- cpuif->vgic_hcr &= ~GICH_HCR_UIE;
-
- for (lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) {
- u32 val = cpuif->vgic_lr[lr];
- u32 cpuid, intid = val & GICH_LR_VIRTUALID;
- struct vgic_irq *irq;
- bool deactivated;
-
- /* Extract the source vCPU id from the LR */
- cpuid = val & GICH_LR_PHYSID_CPUID;
- cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
- cpuid &= 7;
+ u32 cpuid, intid = val & GICH_LR_VIRTUALID;
+ struct vgic_irq *irq;
+ bool deactivated;
- /* Notify fds when the guest EOI'ed a level-triggered SPI */
- if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
- kvm_notify_acked_irq(vcpu->kvm, 0,
- intid - VGIC_NR_PRIVATE_IRQS);
+ /* Extract the source vCPU id from the LR */
+ cpuid = FIELD_GET(GICH_LR_PHYSID_CPUID, val) & 7;
- irq = vgic_get_vcpu_irq(vcpu, intid);
+ /* Notify fds when the guest EOI'ed a level-triggered SPI */
+ if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
+ kvm_notify_acked_irq(vcpu->kvm, 0,
+ intid - VGIC_NR_PRIVATE_IRQS);
- raw_spin_lock(&irq->irq_lock);
+ irq = vgic_get_vcpu_irq(vcpu, intid);
+ scoped_guard(raw_spinlock, &irq->irq_lock) {
/* Always preserve the active bit, note deactivation */
deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT);
irq->active = !!(val & GICH_LR_ACTIVE_BIT);
@@ -101,29 +95,139 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
/* Handle resampling for mapped interrupts if required */
vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT);
- raw_spin_unlock(&irq->irq_lock);
- vgic_put_irq(vcpu->kvm, irq);
+ irq->on_lr = false;
}
- cpuif->used_lrs = 0;
+ vgic_put_irq(vcpu->kvm, irq);
}
+static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq);
+
/*
- * Populates the particular LR with the state of a given IRQ:
- * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq
- * - for a level sensitive IRQ the pending state value is unchanged;
- * it is dictated directly by the input level
- *
- * If @irq describes an SGI with multiple sources, we choose the
- * lowest-numbered source VCPU and clear that bit in the source bitmap.
- *
- * The irq_lock must be held by the caller.
+ * transfer the content of the LRs back into the corresponding ap_list:
+ * - active bit is transferred as is
+ * - pending bit is
+ * - transferred as is in case of edge sensitive IRQs
+ * - set to the line-level (resample time) for level sensitive IRQs
*/
-void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
+ u32 eoicount = FIELD_GET(GICH_HCR_EOICOUNT, cpuif->vgic_hcr);
+ struct vgic_irq *irq;
+
+ DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+ for (int lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++)
+ vgic_v2_fold_lr(vcpu, cpuif->vgic_lr[lr]);
+
+ /* See the GICv3 equivalent for the EOIcount handling rationale */
+ list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+ u32 lr;
+
+ if (!eoicount) {
+ break;
+ } else {
+ guard(raw_spinlock)(&irq->irq_lock);
+
+ if (!(likely(vgic_target_oracle(irq) == vcpu) &&
+ irq->active))
+ continue;
+
+ lr = vgic_v2_compute_lr(vcpu, irq) & ~GICH_LR_ACTIVE_BIT;
+ }
+
+ if (lr & GICH_LR_HW)
+ writel_relaxed(FIELD_GET(GICH_LR_PHYSID_CPUID, lr),
+ kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE);
+ vgic_v2_fold_lr(vcpu, lr);
+ eoicount--;
+ }
+
+ cpuif->used_lrs = 0;
+}
+
+void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
+ struct kvm_vcpu *target_vcpu = NULL;
+ bool mmio = false;
+ struct vgic_irq *irq;
+ unsigned long flags;
+ u64 lr = 0;
+ u8 cpuid;
+
+ /* Snapshot CPUID, and remove it from the INTID */
+ cpuid = FIELD_GET(GENMASK_ULL(12, 10), val);
+ val &= ~GENMASK_ULL(12, 10);
+
+ /* We only deal with DIR when EOIMode==1 */
+ if (!(cpuif->vgic_vmcr & GICH_VMCR_EOI_MODE_MASK))
+ return;
+
+ /* Make sure we're in the same context as LR handling */
+ local_irq_save(flags);
+
+ irq = vgic_get_vcpu_irq(vcpu, val);
+ if (WARN_ON_ONCE(!irq))
+ goto out;
+
+ /* See the corresponding v3 code for the rationale */
+ scoped_guard(raw_spinlock, &irq->irq_lock) {
+ target_vcpu = irq->vcpu;
+
+ /* Not on any ap_list? */
+ if (!target_vcpu)
+ goto put;
+
+ /*
+ * Urgh. We're deactivating something that we cannot
+ * observe yet... Big hammer time.
+ */
+ if (irq->on_lr) {
+ mmio = true;
+ goto put;
+ }
+
+ /* SGI: check that the cpuid matches */
+ if (val < VGIC_NR_SGIS && irq->active_source != cpuid) {
+ target_vcpu = NULL;
+ goto put;
+ }
+
+ /* (with a Dalek voice) DEACTIVATE!!!! */
+ lr = vgic_v2_compute_lr(vcpu, irq) & ~GICH_LR_ACTIVE_BIT;
+ }
+
+ if (lr & GICH_LR_HW)
+ writel_relaxed(FIELD_GET(GICH_LR_PHYSID_CPUID, lr),
+ kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE);
+
+ vgic_v2_fold_lr(vcpu, lr);
+
+put:
+ vgic_put_irq(vcpu->kvm, irq);
+
+out:
+ local_irq_restore(flags);
+
+ if (mmio)
+ vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32));
+
+ /* Force the ap_list to be pruned */
+ if (target_vcpu)
+ kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu);
+}
+
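[Editor's illustrative aside, not part of the patch: vgic_v2_deactivate above (and its v3 counterpart later in the series) starts by splitting the value written to the deactivate register into an interrupt ID and, for GICv2 SGIs, a source CPU ID held in bits [12:10]. The sketch below is a minimal, stand-alone model of that decode.]

#include <stdint.h>
#include <stdio.h>

#define DIR_CPUID_SHIFT	10u
#define DIR_CPUID_MASK	(UINT64_C(7) << DIR_CPUID_SHIFT)

struct dir_write {
	uint32_t intid;
	uint8_t cpuid;
};

/* Split a deactivate-register write into INTID and (SGI) source CPUID. */
static struct dir_write decode_dir(uint64_t val)
{
	struct dir_write d = {
		.cpuid = (val & DIR_CPUID_MASK) >> DIR_CPUID_SHIFT,
		.intid = val & ~DIR_CPUID_MASK,
	};

	return d;
}

int main(void)
{
	struct dir_write d = decode_dir(0x5 | (2u << DIR_CPUID_SHIFT));

	printf("intid=%u cpuid=%u\n", (unsigned)d.intid, (unsigned)d.cpuid);	/* intid=5 cpuid=2 */
	return 0;
}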
+static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
{
u32 val = irq->intid;
bool allow_pending = true;
+ WARN_ON(irq->on_lr);
+
if (irq->active) {
val |= GICH_LR_ACTIVE_BIT;
if (vgic_irq_is_sgi(irq->intid))
@@ -163,22 +267,52 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
if (allow_pending && irq_is_pending(irq)) {
val |= GICH_LR_PENDING_BIT;
- if (irq->config == VGIC_CONFIG_EDGE)
- irq->pending_latch = false;
-
if (vgic_irq_is_sgi(irq->intid)) {
u32 src = ffs(irq->source);
if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
irq->intid))
- return;
+ return 0;
val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
- irq->source &= ~(1 << (src - 1));
- if (irq->source) {
- irq->pending_latch = true;
+ if (irq->source & ~BIT(src - 1))
val |= GICH_LR_EOI;
- }
+ }
+ }
+
+ /* The GICv2 LR only holds five bits of priority. */
+ val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
+
+ return val;
+}
+
+/*
+ * Populates the particular LR with the state of a given IRQ:
+ * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq
+ * - for a level sensitive IRQ the pending state value is unchanged;
+ * it is dictated directly by the input level
+ *
+ * If @irq describes an SGI with multiple sources, we choose the
+ * lowest-numbered source VCPU and clear that bit in the source bitmap.
+ *
+ * The irq_lock must be held by the caller.
+ */
+void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+ u32 val = vgic_v2_compute_lr(vcpu, irq);
+
+ vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val;
+
+ if (val & GICH_LR_PENDING_BIT) {
+ if (irq->config == VGIC_CONFIG_EDGE)
+ irq->pending_latch = false;
+
+ if (vgic_irq_is_sgi(irq->intid)) {
+ u32 src = ffs(irq->source);
+
+ irq->source &= ~BIT(src - 1);
+ if (irq->source)
+ irq->pending_latch = true;
}
}
@@ -194,7 +328,7 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
/* The GICv2 LR only holds five bits of priority. */
val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
- vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val;
+ irq->on_lr = true;
}
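[Editor's illustrative aside, not part of the patch: as the comment above notes, the GICv2 LR only holds five bits of priority, so the 8-bit software priority is shifted right by three before being placed in the LR. The sketch below is a minimal model of that compression; the shift position of the LR priority field is an assumption for illustration.]

#include <stdint.h>
#include <stdio.h>

#define LR_PRIORITY_SHIFT	23u	/* assumed GICH_LR priority field position */

/* Compress an 8-bit priority into the 5-bit GICv2 LR priority field. */
static uint32_t lr_priority_bits(uint8_t prio)
{
	return (uint32_t)(prio >> 3) << LR_PRIORITY_SHIFT;
}

int main(void)
{
	/* 0xa0 and 0xa7 collapse onto the same 5-bit value. */
	printf("%#x %#x\n", lr_priority_bits(0xa0), lr_priority_bits(0xa7));
	return 0;
}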
void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr)
@@ -257,7 +391,7 @@ void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT;
}
-void vgic_v2_enable(struct kvm_vcpu *vcpu)
+void vgic_v2_reset(struct kvm_vcpu *vcpu)
{
/*
* By forcing VMCR to zero, the GIC will restore the binary
@@ -265,9 +399,6 @@ void vgic_v2_enable(struct kvm_vcpu *vcpu)
* anyway.
*/
vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
-
- /* Get the show on the road... */
- vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
}
/* check for overlapping regions and for regions crossing the end of memory */
@@ -289,6 +420,7 @@ static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base)
int vgic_v2_map_resources(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned int len;
int ret = 0;
if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
@@ -312,10 +444,20 @@ int vgic_v2_map_resources(struct kvm *kvm)
return ret;
}
+ len = vgic_v2_init_cpuif_iodev(&dist->cpuif_iodev);
+ dist->cpuif_iodev.base_addr = dist->vgic_cpu_base;
+ dist->cpuif_iodev.iodev_type = IODEV_CPUIF;
+ dist->cpuif_iodev.redist_vcpu = NULL;
+
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist->vgic_cpu_base,
+ len, &dist->cpuif_iodev.dev);
+ if (ret)
+ return ret;
+
if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) {
ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
kvm_vgic_global_state.vcpu_base,
- KVM_VGIC_V2_CPU_SIZE, true);
+ KVM_VGIC_V2_CPU_SIZE - SZ_4K, true);
if (ret) {
kvm_err("Unable to remap VGIC CPU to VCPU\n");
return ret;
@@ -385,6 +527,7 @@ int vgic_v2_probe(const struct gic_kvm_info *info)
kvm_vgic_global_state.can_emulate_gicv2 = true;
kvm_vgic_global_state.vcpu_base = info->vcpu.start;
+ kvm_vgic_global_state.gicc_base = info->gicc_base;
kvm_vgic_global_state.type = VGIC_V2;
kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS;
@@ -423,16 +566,26 @@ static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
void vgic_v2_save_state(struct kvm_vcpu *vcpu)
{
+ struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
void __iomem *base = kvm_vgic_global_state.vctrl_base;
u64 used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs;
if (!base)
return;
- if (used_lrs) {
+ cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
+
+ if (used_lrs)
save_lrs(vcpu, base);
- writel_relaxed(0, base + GICH_HCR);
+
+ if (cpu_if->vgic_hcr & GICH_HCR_LRENPIE) {
+ u32 val = readl_relaxed(base + GICH_HCR);
+
+ cpu_if->vgic_hcr &= ~GICH_HCR_EOICOUNT;
+ cpu_if->vgic_hcr |= val & GICH_HCR_EOICOUNT;
}
+
+ writel_relaxed(0, base + GICH_HCR);
}
void vgic_v2_restore_state(struct kvm_vcpu *vcpu)
@@ -445,13 +598,10 @@ void vgic_v2_restore_state(struct kvm_vcpu *vcpu)
if (!base)
return;
- if (used_lrs) {
- writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
- for (i = 0; i < used_lrs; i++) {
- writel_relaxed(cpu_if->vgic_lr[i],
- base + GICH_LR0 + (i * 4));
- }
- }
+ writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
+
+ for (i = 0; i < used_lrs; i++)
+ writel_relaxed(cpu_if->vgic_lr[i], base + GICH_LR0 + (i * 4));
}
void vgic_v2_load(struct kvm_vcpu *vcpu)
@@ -468,6 +618,5 @@ void vgic_v2_put(struct kvm_vcpu *vcpu)
{
struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
- cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR);
}
diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c
index 7f1259b49c50..61b44f3f2bf1 100644
--- a/arch/arm64/kvm/vgic/vgic-v3-nested.c
+++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c
@@ -70,13 +70,14 @@ static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx)
* - on L2 put: perform the inverse transformation, so that the result of L2
* running becomes visible to L1 in the VNCR-accessible registers.
*
- * - there is nothing to do on L2 entry, as everything will have happened
- * on load. However, this is the point where we detect that an interrupt
- * targeting L1 and prepare the grand switcheroo.
+ * - there is nothing to do on L2 entry apart from enabling the vgic, as
+ * everything will have happened on load. However, this is the point where
+ * we detect an interrupt targeting L1 and prepare the grand
+ * switcheroo.
*
- * - on L2 exit: emulate the HW bit, and deactivate corresponding the L1
- * interrupt. The L0 active state will be cleared by the HW if the L1
- * interrupt was itself backed by a HW interrupt.
+ * - on L2 exit: resync the LRs and VMCR, emulate the HW bit, and deactivate
+ * the corresponding L1 interrupt. The L0 active state will be cleared by
+ * the HW if the L1 interrupt was itself backed by a HW interrupt.
*
* Maintenance Interrupt (MI) management:
*
@@ -93,8 +94,10 @@ static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx)
*
* - because most of the ICH_*_EL2 registers live in the VNCR page, the
* quality of emulation is poor: L1 can setup the vgic so that an MI would
- * immediately fire, and not observe anything until the next exit. Trying
- * to read ICH_MISR_EL2 would do the trick, for example.
+ * immediately fire, and not observe anything until the next exit.
+ * Similarly, a pending MI is not immediately disabled by clearing
+ * ICH_HCR_EL2.En. Trying to read ICH_MISR_EL2 would do the trick, for
+ * example.
*
* System register emulation:
*
@@ -265,16 +268,37 @@ static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
s_cpu_if->used_lrs = hweight16(shadow_if->lr_map);
}
+void vgic_v3_flush_nested(struct kvm_vcpu *vcpu)
+{
+ u64 val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
+
+ write_sysreg_s(val | vgic_ich_hcr_trap_bits(), SYS_ICH_HCR_EL2);
+}
+
void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
{
struct shadow_if *shadow_if = get_shadow_if();
int i;
for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
- u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
- struct vgic_irq *irq;
+ u64 val, host_lr, lr;
+
+ host_lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i));
+
+ /* Propagate the new LR state */
+ lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
+ val = lr & ~ICH_LR_STATE;
+ val |= host_lr & ICH_LR_STATE;
+ __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val);
- if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE))
+ /*
+ * Deactivation of a HW interrupt: the LR must have the HW
+ * bit set, have been in a non-invalid state before the run,
+ * and now be in an invalid state. If any of that doesn't
+ * hold, we're done with this LR.
+ */
+ if (!((lr & ICH_LR_HW) && (lr & ICH_LR_STATE) &&
+ !(host_lr & ICH_LR_STATE)))
continue;
/*
@@ -282,35 +306,27 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
* need to emulate the HW effect between the guest hypervisor
* and the nested guest.
*/
- irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
- if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */
- continue;
+ vgic_v3_deactivate(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
+ }
- lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i));
- if (!(lr & ICH_LR_STATE))
- irq->active = false;
+ /* We need these to be synchronised to generate the MI */
+ __vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, read_sysreg_s(SYS_ICH_VMCR_EL2));
+ __vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, &=, ~ICH_HCR_EL2_EOIcount);
+ __vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, |=, read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_EOIcount);
- vgic_put_irq(vcpu->kvm, irq);
- }
+ write_sysreg_s(0, SYS_ICH_HCR_EL2);
+ isb();
+
+ vgic_v3_nested_update_mi(vcpu);
}
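[Editor's illustrative aside, not part of the patch: the loop above detects deactivation of a HW-backed interrupt by comparing the L1 view of an LR before the run with the hardware LR after it. The sketch below is a minimal, stand-alone model of that predicate; the bit positions are stated for illustration and stand in for ICH_LR_HW and ICH_LR_STATE.]

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LR_HW		(UINT64_C(1) << 61)
#define LR_STATE	(UINT64_C(3) << 62)

/*
 * A HW-backed interrupt was deactivated by L2 iff the L1 LR had the HW bit
 * and a non-invalid state before the run, and the hardware LR is now in
 * the invalid state.
 */
static bool hw_deactivated(uint64_t l1_lr_before, uint64_t host_lr_after)
{
	return (l1_lr_before & LR_HW) &&
	       (l1_lr_before & LR_STATE) &&
	       !(host_lr_after & LR_STATE);
}

int main(void)
{
	printf("%d %d\n",
	       hw_deactivated(LR_HW | (UINT64_C(2) << 62), 0),	/* 1: was active, now invalid */
	       hw_deactivated(LR_HW, 0));			/* 0: was already invalid */
	return 0;
}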
static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu,
struct vgic_v3_cpu_if *s_cpu_if)
{
struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3;
- u64 val = 0;
int i;
- /*
- * If we're on a system with a broken vgic that requires
- * trapping, propagate the trapping requirements.
- *
- * Ah, the smell of rotten fruits...
- */
- if (static_branch_unlikely(&vgic_v3_cpuif_trap))
- val = host_if->vgic_hcr & (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 |
- ICH_HCR_EL2_TC | ICH_HCR_EL2_TDIR);
- s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) | val;
+ s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);
s_cpu_if->vgic_sre = host_if->vgic_sre;
@@ -334,7 +350,8 @@ void vgic_v3_load_nested(struct kvm_vcpu *vcpu)
__vgic_v3_restore_vmcr_aprs(cpu_if);
__vgic_v3_activate_traps(cpu_if);
- __vgic_v3_restore_state(cpu_if);
+ for (int i = 0; i < cpu_if->used_lrs; i++)
+ __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
/*
* Propagate the number of used LRs for the benefit of the HYP
@@ -347,36 +364,19 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
{
struct shadow_if *shadow_if = get_shadow_if();
struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif;
- u64 val;
int i;
- __vgic_v3_save_vmcr_aprs(s_cpu_if);
- __vgic_v3_deactivate_traps(s_cpu_if);
- __vgic_v3_save_state(s_cpu_if);
-
- /*
- * Translate the shadow state HW fields back to the virtual ones
- * before copying the shadow struct back to the nested one.
- */
- val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
- val &= ~ICH_HCR_EL2_EOIcount_MASK;
- val |= (s_cpu_if->vgic_hcr & ICH_HCR_EL2_EOIcount_MASK);
- __vcpu_assign_sys_reg(vcpu, ICH_HCR_EL2, val);
- __vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, s_cpu_if->vgic_vmcr);
+ __vgic_v3_save_aprs(s_cpu_if);
for (i = 0; i < 4; i++) {
__vcpu_assign_sys_reg(vcpu, ICH_AP0RN(i), s_cpu_if->vgic_ap0r[i]);
__vcpu_assign_sys_reg(vcpu, ICH_AP1RN(i), s_cpu_if->vgic_ap1r[i]);
}
- for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
- val = __vcpu_sys_reg(vcpu, ICH_LRN(i));
-
- val &= ~ICH_LR_STATE;
- val |= s_cpu_if->vgic_lr[lr_map_idx_to_shadow_idx(shadow_if, i)] & ICH_LR_STATE;
+ for (i = 0; i < s_cpu_if->used_lrs; i++)
+ __gic_v3_set_lr(0, i);
- __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val);
- }
+ __vgic_v3_deactivate_traps(s_cpu_if);
vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0;
}
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 2f75ef14d339..1d6dd1b545bd 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -12,6 +12,7 @@
#include <asm/kvm_mmu.h>
#include <asm/kvm_asm.h>
+#include "vgic-mmio.h"
#include "vgic.h"
static bool group0_trap;
@@ -20,11 +21,48 @@ static bool common_trap;
static bool dir_trap;
static bool gicv4_enable;
-void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
+void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu,
+ struct ap_list_summary *als)
{
struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
- cpuif->vgic_hcr |= ICH_HCR_EL2_UIE;
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return;
+
+ cpuif->vgic_hcr = ICH_HCR_EL2_En;
+
+ if (irqs_pending_outside_lrs(als))
+ cpuif->vgic_hcr |= ICH_HCR_EL2_NPIE;
+ if (irqs_active_outside_lrs(als))
+ cpuif->vgic_hcr |= ICH_HCR_EL2_LRENPIE;
+ if (irqs_outside_lrs(als))
+ cpuif->vgic_hcr |= ICH_HCR_EL2_UIE;
+
+ if (!als->nr_sgi)
+ cpuif->vgic_hcr |= ICH_HCR_EL2_vSGIEOICount;
+
+ cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG0_MASK) ?
+ ICH_HCR_EL2_VGrp0DIE : ICH_HCR_EL2_VGrp0EIE;
+ cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG1_MASK) ?
+ ICH_HCR_EL2_VGrp1DIE : ICH_HCR_EL2_VGrp1EIE;
+
+ /*
+ * Dealing with EOImode=1 is a massive source of headache. Not
+ * only do we need to track that we have active interrupts
+ * outside of the LRs and force DIR to be trapped, we also
+ * need to deal with SPIs that can be deactivated on another
+ * CPU.
+ *
+ * On systems that do not implement TDIR, force the bit in the
+ * shadow state anyway to avoid IPI-ing on these poor sods.
+ *
+ * Note that we set the trap irrespective of EOIMode, as that
+ * can change behind our back without any warning...
+ */
+ if (!cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR) ||
+ irqs_active_outside_lrs(als) ||
+ atomic_read(&vcpu->kvm->arch.vgic.active_spis))
+ cpuif->vgic_hcr |= ICH_HCR_EL2_TDIR;
}
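[Editor's illustrative aside, not part of the patch: vgic_v3_configure_hcr above derives the maintenance-interrupt and DIR-trap bits purely from the ap_list summary. The sketch below is a minimal, stand-alone model of a subset of that decision logic (it omits the group DIE/EIE and vSGIEOICount handling); the bit values are placeholders and nr_lr stands in for kvm_vgic_global_state.nr_lr.]

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HCR_EN		(1u << 0)
#define HCR_NPIE	(1u << 1)
#define HCR_LRENPIE	(1u << 2)
#define HCR_UIE		(1u << 3)
#define HCR_TDIR	(1u << 4)

struct summary {
	unsigned int nr_pend;	/* purely pending, not active */
	unsigned int nr_act;	/* active, or active+pending */
};

static uint32_t configure_hcr(const struct summary *s, unsigned int nr_lr,
			      bool has_tdir, bool vm_has_active_spis)
{
	bool overflow = (s->nr_pend + s->nr_act) > nr_lr;
	uint32_t hcr = HCR_EN;

	if (s->nr_pend > nr_lr)
		hcr |= HCR_NPIE;	/* pending IRQs left outside the LRs */
	if (s->nr_act && overflow)
		hcr |= HCR_LRENPIE;	/* active IRQs may be deactivated outside the LRs */
	if (overflow)
		hcr |= HCR_UIE;		/* ask for an exit once most LRs are empty */
	if (!has_tdir || (s->nr_act && overflow) || vm_has_active_spis)
		hcr |= HCR_TDIR;	/* must observe DIR writes directly */

	return hcr;
}

int main(void)
{
	struct summary s = { .nr_pend = 6, .nr_act = 2 };

	printf("hcr=%#x\n", configure_hcr(&s, 4, true, false));
	return 0;
}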
static bool lr_signals_eoi_mi(u64 lr_val)
@@ -33,84 +71,238 @@ static bool lr_signals_eoi_mi(u64 lr_val)
!(lr_val & ICH_LR_HW);
}
+static void vgic_v3_fold_lr(struct kvm_vcpu *vcpu, u64 val)
+{
+ struct vgic_irq *irq;
+ bool is_v2_sgi = false;
+ bool deactivated;
+ u32 intid;
+
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+ intid = val & ICH_LR_VIRTUAL_ID_MASK;
+ } else {
+ intid = val & GICH_LR_VIRTUALID;
+ is_v2_sgi = vgic_irq_is_sgi(intid);
+ }
+
+ irq = vgic_get_vcpu_irq(vcpu, intid);
+ if (!irq) /* An LPI could have been unmapped. */
+ return;
+
+ scoped_guard(raw_spinlock, &irq->irq_lock) {
+ /* Always preserve the active bit for !LPIs, note deactivation */
+ if (irq->intid >= VGIC_MIN_LPI)
+ val &= ~ICH_LR_ACTIVE_BIT;
+ deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
+ irq->active = !!(val & ICH_LR_ACTIVE_BIT);
+
+ /* Edge is the only case where we preserve the pending bit */
+ if (irq->config == VGIC_CONFIG_EDGE &&
+ (val & ICH_LR_PENDING_BIT))
+ irq->pending_latch = true;
+
+ /*
+ * Clear soft pending state when level irqs have been acked.
+ */
+ if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
+ irq->pending_latch = false;
+
+ if (is_v2_sgi) {
+ u8 cpuid = FIELD_GET(GICH_LR_PHYSID_CPUID, val);
+
+ if (irq->active)
+ irq->active_source = cpuid;
+
+ if (val & ICH_LR_PENDING_BIT)
+ irq->source |= BIT(cpuid);
+ }
+
+ /* Handle resampling for mapped interrupts if required */
+ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);
+
+ irq->on_lr = false;
+ }
+
+ /* Notify fds when the guest EOI'ed a level-triggered SPI, and drop the refcount */
+ if (deactivated && lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) {
+ kvm_notify_acked_irq(vcpu->kvm, 0,
+ intid - VGIC_NR_PRIVATE_IRQS);
+ atomic_dec_if_positive(&vcpu->kvm->arch.vgic.active_spis);
+ }
+
+ vgic_put_irq(vcpu->kvm, irq);
+}
+
+static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq);
+
+static void vgic_v3_deactivate_phys(u32 intid)
+{
+ if (cpus_have_final_cap(ARM64_HAS_GICV5_LEGACY))
+ gic_insn(intid | FIELD_PREP(GICV5_GIC_CDDI_TYPE_MASK, 1), CDDI);
+ else
+ gic_write_dir(intid);
+}
+
void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
- u32 model = vcpu->kvm->arch.vgic.vgic_model;
- int lr;
+ u32 eoicount = FIELD_GET(ICH_HCR_EL2_EOIcount, cpuif->vgic_hcr);
+ struct vgic_irq *irq;
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
- cpuif->vgic_hcr &= ~ICH_HCR_EL2_UIE;
-
- for (lr = 0; lr < cpuif->used_lrs; lr++) {
- u64 val = cpuif->vgic_lr[lr];
- u32 intid, cpuid;
- struct vgic_irq *irq;
- bool is_v2_sgi = false;
- bool deactivated;
+ for (int lr = 0; lr < cpuif->used_lrs; lr++)
+ vgic_v3_fold_lr(vcpu, cpuif->vgic_lr[lr]);
- cpuid = val & GICH_LR_PHYSID_CPUID;
- cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
+ /*
+ * EOIMode=0: use EOIcount to emulate deactivation. We are
+ * guaranteed to deactivate in reverse order of the activation, so
+ * just pick one active interrupt after the other in the ap_list,
+ * and replay the deactivation as if the CPU was doing it. We also
+ * rely on priority drop to have taken place, and the list to be
+ * sorted by priority.
+ */
+ list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+ u64 lr;
- if (model == KVM_DEV_TYPE_ARM_VGIC_V3) {
- intid = val & ICH_LR_VIRTUAL_ID_MASK;
+ /*
+ * I would have loved to write this using a scoped_guard(),
+ * but using 'continue' here is a total train wreck.
+ */
+ if (!eoicount) {
+ break;
} else {
- intid = val & GICH_LR_VIRTUALID;
- is_v2_sgi = vgic_irq_is_sgi(intid);
+ guard(raw_spinlock)(&irq->irq_lock);
+
+ if (!(likely(vgic_target_oracle(irq) == vcpu) &&
+ irq->active))
+ continue;
+
+ lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT;
}
- /* Notify fds when the guest EOI'ed a level-triggered IRQ */
- if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
- kvm_notify_acked_irq(vcpu->kvm, 0,
- intid - VGIC_NR_PRIVATE_IRQS);
+ if (lr & ICH_LR_HW)
+ vgic_v3_deactivate_phys(FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
- irq = vgic_get_vcpu_irq(vcpu, intid);
- if (!irq) /* An LPI could have been unmapped. */
- continue;
+ vgic_v3_fold_lr(vcpu, lr);
+ eoicount--;
+ }
- raw_spin_lock(&irq->irq_lock);
+ cpuif->used_lrs = 0;
+}
- /* Always preserve the active bit, note deactivation */
- deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
- irq->active = !!(val & ICH_LR_ACTIVE_BIT);
+void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
+ u32 model = vcpu->kvm->arch.vgic.vgic_model;
+ struct kvm_vcpu *target_vcpu = NULL;
+ bool mmio = false, is_v2_sgi;
+ struct vgic_irq *irq;
+ unsigned long flags;
+ u64 lr = 0;
+ u8 cpuid;
- if (irq->active && is_v2_sgi)
- irq->active_source = cpuid;
+ /* Snapshot CPUID, and remove it from the INTID */
+ cpuid = FIELD_GET(GENMASK_ULL(12, 10), val);
+ val &= ~GENMASK_ULL(12, 10);
- /* Edge is the only case where we preserve the pending bit */
- if (irq->config == VGIC_CONFIG_EDGE &&
- (val & ICH_LR_PENDING_BIT)) {
- irq->pending_latch = true;
+ is_v2_sgi = (model == KVM_DEV_TYPE_ARM_VGIC_V2 &&
+ val < VGIC_NR_SGIS);
- if (is_v2_sgi)
- irq->source |= (1 << cpuid);
- }
+ /*
+ * We only deal with DIR when EOIMode==1, and only for SGI,
+ * PPI or SPI.
+ */
+ if (!(cpuif->vgic_vmcr & ICH_VMCR_EOIM_MASK) ||
+ val >= vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)
+ return;
+
+ /* Make sure we're in the same context as LR handling */
+ local_irq_save(flags);
+
+ irq = vgic_get_vcpu_irq(vcpu, val);
+ if (WARN_ON_ONCE(!irq))
+ goto out;
+
+ /*
+ * EOIMode=1: we must rely on traps to handle deactivation of
+ * overflowing interrupts, as there is no ordering guarantee and
+ * EOIcount isn't being incremented. Priority drop will have taken
+ * place, as ICV_EOIxR_EL1 only affects the APRs and not the LRs.
+ *
+ * Three possibilities:
+ *
+ * - The irq is not queued on any CPU, and there is nothing to
+ * do,
+ *
+ * - Or the irq is in an LR, meaning that its state is not
+ * directly observable. Treat it bluntly by handling it as if
+ * this were a write to GICD_ICACTIVER, which will force an
+ * exit on all vcpus. If it hurts, don't do that.
+ *
+ * - Or the irq is active, but not in an LR, and we can
+ * directly deactivate it by building a pseudo-LR, folding it,
+ * and queuing a request to prune the resulting ap_list,
+ *
+ * Special care must be taken to match the source CPUID when
+ * deactivating a GICv2 SGI.
+ */
+ scoped_guard(raw_spinlock, &irq->irq_lock) {
+ target_vcpu = irq->vcpu;
+
+ /* Not on any ap_list? */
+ if (!target_vcpu)
+ goto put;
/*
- * Clear soft pending state when level irqs have been acked.
+ * Urgh. We're deactivating something that we cannot
+ * observe yet... Big hammer time.
*/
- if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
- irq->pending_latch = false;
+ if (irq->on_lr) {
+ mmio = true;
+ goto put;
+ }
- /* Handle resampling for mapped interrupts if required */
- vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);
+ /* GICv2 SGI: check that the cpuid matches */
+ if (is_v2_sgi && irq->active_source != cpuid) {
+ target_vcpu = NULL;
+ goto put;
+ }
- raw_spin_unlock(&irq->irq_lock);
- vgic_put_irq(vcpu->kvm, irq);
+ /* (with a Dalek voice) DEACTIVATE!!!! */
+ lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT;
}
- cpuif->used_lrs = 0;
+ if (lr & ICH_LR_HW)
+ vgic_v3_deactivate_phys(FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
+
+ vgic_v3_fold_lr(vcpu, lr);
+
+put:
+ vgic_put_irq(vcpu->kvm, irq);
+
+out:
+ local_irq_restore(flags);
+
+ if (mmio)
+ vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32));
+
+ /* Force the ap_list to be pruned */
+ if (target_vcpu)
+ kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu);
}
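[Editor's illustrative aside, not part of the patch: when the interrupt being deactivated is still sitting in an LR ("big hammer time" above), the code falls back to emulating a GICD_ICACTIVER write using (intid / 32) * 4 as the byte offset and BIT(intid % 32) as the bit. The sketch below is a minimal, stand-alone model of that offset/bit computation.]

#include <stdint.h>
#include <stdio.h>

/* Map an INTID to the GICD_ICACTIVERn byte offset and bit used above. */
static void icactiver_coords(uint32_t intid, uint32_t *offset, uint32_t *bit)
{
	*offset = (intid / 32) * 4;	/* one 32-bit register per 32 INTIDs */
	*bit = UINT32_C(1) << (intid % 32);
}

int main(void)
{
	uint32_t offset, bit;

	icactiver_coords(75, &offset, &bit);
	printf("ICACTIVER offset=%u bit=%#x\n", offset, bit);	/* offset=8 bit=0x800 */
	return 0;
}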
/* Requires the irq to be locked already */
-void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
{
u32 model = vcpu->kvm->arch.vgic.vgic_model;
u64 val = irq->intid;
bool allow_pending = true, is_v2_sgi;
+ WARN_ON(irq->on_lr);
+
is_v2_sgi = (vgic_irq_is_sgi(irq->intid) &&
model == KVM_DEV_TYPE_ARM_VGIC_V2);
@@ -150,6 +342,35 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
if (allow_pending && irq_is_pending(irq)) {
val |= ICH_LR_PENDING_BIT;
+ if (is_v2_sgi) {
+ u32 src = ffs(irq->source);
+
+ if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
+ irq->intid))
+ return 0;
+
+ val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
+ if (irq->source & ~BIT(src - 1))
+ val |= ICH_LR_EOI;
+ }
+ }
+
+ if (irq->group)
+ val |= ICH_LR_GROUP;
+
+ val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;
+
+ return val;
+}
+
+void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+ u32 model = vcpu->kvm->arch.vgic.vgic_model;
+ u64 val = vgic_v3_compute_lr(vcpu, irq);
+
+ vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val;
+
+ if (val & ICH_LR_PENDING_BIT) {
if (irq->config == VGIC_CONFIG_EDGE)
irq->pending_latch = false;
@@ -157,16 +378,9 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
model == KVM_DEV_TYPE_ARM_VGIC_V2) {
u32 src = ffs(irq->source);
- if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
- irq->intid))
- return;
-
- val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
- irq->source &= ~(1 << (src - 1));
- if (irq->source) {
+ irq->source &= ~BIT(src - 1);
+ if (irq->source)
irq->pending_latch = true;
- val |= ICH_LR_EOI;
- }
}
}
@@ -179,12 +393,7 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT))
irq->line_level = false;
- if (irq->group)
- val |= ICH_LR_GROUP;
-
- val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;
-
- vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val;
+ irq->on_lr = true;
}
void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
@@ -258,7 +467,7 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \
GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable))
-void vgic_v3_enable(struct kvm_vcpu *vcpu)
+void vgic_v3_reset(struct kvm_vcpu *vcpu)
{
struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
@@ -288,9 +497,6 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu)
kvm_vgic_global_state.ich_vtr_el2);
vcpu->arch.vgic_cpu.num_pri_bits = FIELD_GET(ICH_VTR_EL2_PRIbits,
kvm_vgic_global_state.ich_vtr_el2) + 1;
-
- /* Get the show on the road... */
- vgic_v3->vgic_hcr = ICH_HCR_EL2_En;
}
void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu)
@@ -302,20 +508,9 @@ void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu)
/* Hide GICv3 sysreg if necessary */
if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2 ||
- !irqchip_in_kernel(vcpu->kvm)) {
+ !irqchip_in_kernel(vcpu->kvm))
vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 |
ICH_HCR_EL2_TC);
- return;
- }
-
- if (group0_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL0;
- if (group1_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL1;
- if (common_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_EL2_TC;
- if (dir_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_EL2_TDIR;
}
int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
@@ -636,8 +831,53 @@ static const struct midr_range broken_seis[] = {
static bool vgic_v3_broken_seis(void)
{
- return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_SEIS) &&
- is_midr_in_range_list(broken_seis));
+ return (is_kernel_in_hyp_mode() &&
+ is_midr_in_range_list(broken_seis) &&
+ (read_sysreg_s(SYS_ICH_VTR_EL2) & ICH_VTR_EL2_SEIS));
+}
+
+void noinstr kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt,
+ __le32 *origptr, __le32 *updptr,
+ int nr_inst)
+{
+ u32 insn, oinsn, rd;
+ u64 hcr = 0;
+
+ if (cpus_have_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
+ group0_trap = true;
+ group1_trap = true;
+ }
+
+ if (vgic_v3_broken_seis()) {
+ /* We know that these machines have ICH_HCR_EL2.TDIR */
+ group0_trap = true;
+ group1_trap = true;
+ dir_trap = true;
+ }
+
+ if (!cpus_have_cap(ARM64_HAS_ICH_HCR_EL2_TDIR))
+ common_trap = true;
+
+ if (group0_trap)
+ hcr |= ICH_HCR_EL2_TALL0;
+ if (group1_trap)
+ hcr |= ICH_HCR_EL2_TALL1;
+ if (common_trap)
+ hcr |= ICH_HCR_EL2_TC;
+ if (dir_trap)
+ hcr |= ICH_HCR_EL2_TDIR;
+
+ /* Compute target register */
+ oinsn = le32_to_cpu(*origptr);
+ rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, oinsn);
+
+ /* movz rd, #(val & 0xffff) */
+ insn = aarch64_insn_gen_movewide(rd,
+ (u16)hcr,
+ 0,
+ AARCH64_INSN_VARIANT_64BIT,
+ AARCH64_INSN_MOVEWIDE_ZERO);
+ *updptr = cpu_to_le32(insn);
}
/**
@@ -651,6 +891,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
{
u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config);
bool has_v2;
+ u64 traps;
int ret;
has_v2 = ich_vtr_el2 >> 63;
@@ -709,29 +950,18 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
if (has_v2)
static_branch_enable(&vgic_v3_has_v2_compat);
- if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
- group0_trap = true;
- group1_trap = true;
- }
-
if (vgic_v3_broken_seis()) {
kvm_info("GICv3 with broken locally generated SEI\n");
-
kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_EL2_SEIS;
- group0_trap = true;
- group1_trap = true;
- if (ich_vtr_el2 & ICH_VTR_EL2_TDS)
- dir_trap = true;
- else
- common_trap = true;
}
- if (group0_trap || group1_trap || common_trap | dir_trap) {
+ traps = vgic_ich_hcr_trap_bits();
+ if (traps) {
kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n",
- group0_trap ? "G0" : "",
- group1_trap ? "G1" : "",
- common_trap ? "C" : "",
- dir_trap ? "D" : "");
+ (traps & ICH_HCR_EL2_TALL0) ? "G0" : "",
+ (traps & ICH_HCR_EL2_TALL1) ? "G1" : "",
+ (traps & ICH_HCR_EL2_TC) ? "C" : "",
+ (traps & ICH_HCR_EL2_TDIR) ? "D" : "");
static_branch_enable(&vgic_v3_cpuif_trap);
}
@@ -771,7 +1001,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu)
}
if (likely(!is_protected_kvm_enabled()))
- kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if);
+ kvm_call_hyp(__vgic_v3_save_aprs, cpu_if);
WARN_ON(vgic_v4_put(vcpu));
if (has_vhe())
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index 548aec9d5a72..09c3e9eb23f8 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -163,6 +163,7 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu)
struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i);
struct irq_desc *desc;
unsigned long flags;
+ bool pending;
int ret;
raw_spin_lock_irqsave(&irq->irq_lock, flags);
@@ -173,9 +174,11 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu)
irq->hw = false;
ret = irq_get_irqchip_state(irq->host_irq,
IRQCHIP_STATE_PENDING,
- &irq->pending_latch);
+ &pending);
WARN_ON(ret);
+ irq->pending_latch = pending;
+
desc = irq_to_desc(irq->host_irq);
irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
unlock:
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 8d20c53faef0..430aa98888fd 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -244,7 +244,7 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active)
*
* Requires the IRQ lock to be held.
*/
-static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
+struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
{
lockdep_assert_held(&irq->irq_lock);
@@ -272,17 +272,20 @@ static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
return NULL;
}
+struct vgic_sort_info {
+ struct kvm_vcpu *vcpu;
+ struct vgic_vmcr vmcr;
+};
+
/*
* The order of items in the ap_lists defines how we'll pack things in LRs as
* well, the first items in the list being the first things populated in the
* LRs.
*
- * A hard rule is that active interrupts can never be pushed out of the LRs
- * (and therefore take priority) since we cannot reliably trap on deactivation
- * of IRQs and therefore they have to be present in the LRs.
- *
+ * Pending, non-active interrupts must be placed at the head of the list.
* Otherwise things should be sorted by the priority field and the GIC
* hardware support will take care of preemption of priority groups etc.
+ * Interrupts that are not deliverable should be at the end of the list.
*
* Return negative if "a" sorts before "b", 0 to preserve order, and positive
* to sort "b" before "a".
@@ -292,6 +295,8 @@ static int vgic_irq_cmp(void *priv, const struct list_head *a,
{
struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list);
struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list);
+ struct vgic_sort_info *info = priv;
+ struct kvm_vcpu *vcpu = info->vcpu;
bool penda, pendb;
int ret;
@@ -305,21 +310,32 @@ static int vgic_irq_cmp(void *priv, const struct list_head *a,
raw_spin_lock(&irqa->irq_lock);
raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING);
- if (irqa->active || irqb->active) {
- ret = (int)irqb->active - (int)irqa->active;
+ /* Undeliverable interrupts should be last */
+ ret = (int)(vgic_target_oracle(irqb) == vcpu) - (int)(vgic_target_oracle(irqa) == vcpu);
+ if (ret)
+ goto out;
+
+ /* Same thing for interrupts targeting a disabled group */
+ ret = (int)(irqb->group ? info->vmcr.grpen1 : info->vmcr.grpen0);
+ ret -= (int)(irqa->group ? info->vmcr.grpen1 : info->vmcr.grpen0);
+ if (ret)
goto out;
- }
- penda = irqa->enabled && irq_is_pending(irqa);
- pendb = irqb->enabled && irq_is_pending(irqb);
+ penda = irqa->enabled && irq_is_pending(irqa) && !irqa->active;
+ pendb = irqb->enabled && irq_is_pending(irqb) && !irqb->active;
- if (!penda || !pendb) {
- ret = (int)pendb - (int)penda;
+ ret = (int)pendb - (int)penda;
+ if (ret)
goto out;
- }
- /* Both pending and enabled, sort by priority */
- ret = irqa->priority - irqb->priority;
+ /* Both pending and enabled, sort by priority (lower number first) */
+ ret = (int)irqa->priority - (int)irqb->priority;
+ if (ret)
+ goto out;
+
+ /* Finally, interrupts with the HW bit have priority over non-HW ones */
+ ret = (int)irqb->hw - (int)irqa->hw;
+
out:
raw_spin_unlock(&irqb->irq_lock);
raw_spin_unlock(&irqa->irq_lock);
@@ -330,10 +346,12 @@ out:
static void vgic_sort_ap_list(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_sort_info info = { .vcpu = vcpu, };
lockdep_assert_held(&vgic_cpu->ap_list_lock);
- list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp);
+ vgic_get_vmcr(vcpu, &info.vmcr);
+ list_sort(&info, &vgic_cpu->ap_list_head, vgic_irq_cmp);
}
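[Editor's illustrative aside, not part of the patch: the reworked comparator above orders the ap_list so that deliverable, enabled-group, purely-pending interrupts come first, then by ascending priority, with HW-backed interrupts winning ties. The sketch below is a compact, stand-alone model of that ordering using qsort; the field names are simplifications of struct vgic_irq and the oracle/group checks are folded into booleans.]

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct model_irq {
	bool deliverable;	/* vgic_target_oracle(irq) == vcpu */
	bool group_enabled;	/* the IRQ's group is enabled in the VMCR */
	bool pending;		/* enabled and pending, but not active */
	bool hw;
	unsigned char priority;	/* lower number == higher priority */
	int intid;
};

/* Negative return sorts a before b, mirroring the list_sort convention. */
static int irq_cmp(const void *pa, const void *pb)
{
	const struct model_irq *a = pa, *b = pb;
	int ret;

	ret = (int)b->deliverable - (int)a->deliverable;
	if (ret)
		return ret;
	ret = (int)b->group_enabled - (int)a->group_enabled;
	if (ret)
		return ret;
	ret = (int)b->pending - (int)a->pending;
	if (ret)
		return ret;
	ret = (int)a->priority - (int)b->priority;
	if (ret)
		return ret;
	return (int)b->hw - (int)a->hw;
}

int main(void)
{
	struct model_irq irqs[] = {
		{ true, true, false, false, 0x00, 27 },	/* active only, very high prio */
		{ true, true, true,  false, 0xa0, 42 },	/* pending, lower prio */
		{ true, true, true,  true,  0xa0, 43 },	/* pending, same prio, HW-backed */
	};

	qsort(irqs, 3, sizeof(irqs[0]), irq_cmp);
	for (int i = 0; i < 3; i++)
		printf("%d ", irqs[i].intid);		/* 43 42 27 */
	printf("\n");
	return 0;
}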
/*
@@ -356,6 +374,20 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owne
return false;
}
+static bool vgic_model_needs_bcst_kick(struct kvm *kvm)
+{
+ /*
+ * A GICv3 (or GICv3-like) system exposing a GICv3 to the guest
+ * needs a broadcast kick to set TDIR globally.
+ *
+ * For systems that do not have TDIR (ARM's own v8.0 CPUs), the
+ * shadow TDIR bit is always set, and so is the register's TC bit,
+ * so no need to kick the CPUs.
+ */
+ return (cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR) &&
+ kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3);
+}
+
/*
* Check whether an IRQ needs to (and can) be queued to a VCPU's ap list.
* Do the queuing if necessary, taking the right locks in the right order.
@@ -368,6 +400,7 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
unsigned long flags) __releases(&irq->irq_lock)
{
struct kvm_vcpu *vcpu;
+ bool bcast;
lockdep_assert_held(&irq->irq_lock);
@@ -442,11 +475,20 @@ retry:
list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
irq->vcpu = vcpu;
+ /* A new SPI may result in deactivation trapping on all vcpus */
+ bcast = (vgic_model_needs_bcst_kick(vcpu->kvm) &&
+ vgic_valid_spi(vcpu->kvm, irq->intid) &&
+ atomic_fetch_inc(&vcpu->kvm->arch.vgic.active_spis) == 0);
+
raw_spin_unlock(&irq->irq_lock);
raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
- kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
- kvm_vcpu_kick(vcpu);
+ if (!bcast) {
+ kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+ kvm_vcpu_kick(vcpu);
+ } else {
+ kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_IRQ_PENDING);
+ }
return true;
}
@@ -798,98 +840,148 @@ static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr)
vgic_v3_clear_lr(vcpu, lr);
}
-static inline void vgic_set_underflow(struct kvm_vcpu *vcpu)
-{
- if (kvm_vgic_global_state.type == VGIC_V2)
- vgic_v2_set_underflow(vcpu);
- else
- vgic_v3_set_underflow(vcpu);
-}
-
-/* Requires the ap_list_lock to be held. */
-static int compute_ap_list_depth(struct kvm_vcpu *vcpu,
- bool *multi_sgi)
+static void summarize_ap_list(struct kvm_vcpu *vcpu,
+ struct ap_list_summary *als)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_irq *irq;
- int count = 0;
-
- *multi_sgi = false;
lockdep_assert_held(&vgic_cpu->ap_list_lock);
+ *als = (typeof(*als)){};
+
list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
- int w;
+ guard(raw_spinlock)(&irq->irq_lock);
- raw_spin_lock(&irq->irq_lock);
- /* GICv2 SGIs can count for more than one... */
- w = vgic_irq_get_lr_count(irq);
- raw_spin_unlock(&irq->irq_lock);
+ if (unlikely(vgic_target_oracle(irq) != vcpu))
+ continue;
+
+ if (!irq->active)
+ als->nr_pend++;
+ else
+ als->nr_act++;
- count += w;
- *multi_sgi |= (w > 1);
+ if (irq->intid < VGIC_NR_SGIS)
+ als->nr_sgi++;
}
- return count;
}
-/* Requires the VCPU's ap_list_lock to be held. */
+/*
+ * Dealing with LR overflow is close to black magic -- dress accordingly.
+ *
+ * We have to present an almost infinite number of interrupts through a very
+ * limited number of registers. Therefore crucial decisions must be made to
+ * ensure we feed the most relevant interrupts into the LRs, and yet have
+ * some facilities to let the guest interact with those that are not there.
+ *
+ * All considerations below are in the context of interrupts targeting a
+ * single vcpu with non-idle state (either pending, active, or both),
+ * colloquially called the ap_list:
+ *
+ * - Pending interrupts must have priority over active interrupts. This also
+ * excludes pending+active interrupts. This ensures that a guest can
+ * perform priority drops on any number of interrupts, and yet be
+ * presented the next pending one.
+ *
+ * - Deactivation of interrupts outside of the LRs must be tracked using
+ * the EOIcount-driven maintenance interrupt, and sometimes by
+ * trapping the DIR register.
+ *
+ * - For EOImode=0, a non-zero EOIcount means walking the ap_list past the
+ * point that made it into the LRs, and deactivating interrupts that would
+ * have made it onto the LRs if we had the space.
+ *
+ * - The MI-generation bits must be used to try and force an exit when the
+ * guest has done enough changes to the LRs that we want to reevaluate the
+ * situation:
+ *
+ * - if the total number of pending interrupts exceeds the number of
+ * LRs, NPIE must be set in order to exit once no pending interrupts
+ * are present in the LRs, allowing us to populate the next batch.
+ *
+ * - if there are active interrupts outside of the LRs, then LRENPIE
+ * must be set so that we exit on deactivation of one of these, and
+ * work out which one is to be deactivated. Note that this is not
+ * enough to deal with EOImode=1, see below.
+ *
+ * - if the overall number of interrupts exceeds the number of LRs,
+ * then UIE must be set to allow refilling of the LRs once the
+ * majority of them has been processed.
+ *
+ * - as usual, MI triggers are only an optimisation, since we cannot
+ * rely on the MI being delivered in a timely manner...
+ *
+ * - EOImode=1 creates some additional problems:
+ *
+ * - deactivation can happen in any order, and we cannot rely on
+ * EOImode=0's coupling of priority-drop and deactivation which
+ * imposes strict reverse Ack order. This means that DIR must
+ * trap if we have active interrupts outside of the LRs.
+ *
+ * - deactivation of SPIs can occur on any CPU, while the SPI is only
+ * present in the ap_list of the CPU that actually ack-ed it. In that
+ * case, EOIcount doesn't provide enough information, and we must
+ * resort to trapping DIR even if we don't overflow the LRs. Bonus
+ * point for not trapping DIR when no SPIs are pending or active in
+ * the whole VM.
+ *
+ * - LPIs do not suffer the same problem as SPIs on deactivation, as we
+ * have to essentially discard the active state, see below.
+ *
+ * - Virtual LPIs have an active state (surprise!), which gets removed on
+ * priority drop (EOI). However, EOIcount doesn't get bumped when the LPI
+ * is not present in the LR (surprise again!). Special care must therefore
+ * be taken to remove the active state from any activated LPI when exiting
+ * from the guest. This is in a way no different from what happens on the
+ * physical side. We still rely on the running priority to have been
+ * removed from the APRs, irrespective of the LPI being present in the LRs
+ * or not.
+ *
+ * - Virtual SGIs directly injected via GICv4.1 must not affect EOIcount, as
+ * they are not managed in SW and don't have a true active state. So only
+ * set vSGIEOICount when no SGIs are in the ap_list.
+ *
+ * - GICv2 SGIs with multiple sources are injected one source at a time, as
+ * if they were made pending sequentially. This may mean that we don't
+ * always present the HPPI if other interrupts with lower priority are
+ * pending in the LRs. Big deal.
+ */
static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct ap_list_summary als;
struct vgic_irq *irq;
- int count;
- bool multi_sgi;
- u8 prio = 0xff;
- int i = 0;
+ int count = 0;
lockdep_assert_held(&vgic_cpu->ap_list_lock);
- count = compute_ap_list_depth(vcpu, &multi_sgi);
- if (count > kvm_vgic_global_state.nr_lr || multi_sgi)
- vgic_sort_ap_list(vcpu);
+ summarize_ap_list(vcpu, &als);
- count = 0;
+ if (irqs_outside_lrs(&als))
+ vgic_sort_ap_list(vcpu);
list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
- raw_spin_lock(&irq->irq_lock);
-
- /*
- * If we have multi-SGIs in the pipeline, we need to
- * guarantee that they are all seen before any IRQ of
- * lower priority. In that case, we need to filter out
- * these interrupts by exiting early. This is easy as
- * the AP list has been sorted already.
- */
- if (multi_sgi && irq->priority > prio) {
- raw_spin_unlock(&irq->irq_lock);
- break;
- }
-
- if (likely(vgic_target_oracle(irq) == vcpu)) {
- vgic_populate_lr(vcpu, irq, count++);
-
- if (irq->source)
- prio = irq->priority;
+ scoped_guard(raw_spinlock, &irq->irq_lock) {
+ if (likely(vgic_target_oracle(irq) == vcpu)) {
+ vgic_populate_lr(vcpu, irq, count++);
+ }
}
- raw_spin_unlock(&irq->irq_lock);
-
- if (count == kvm_vgic_global_state.nr_lr) {
- if (!list_is_last(&irq->ap_list,
- &vgic_cpu->ap_list_head))
- vgic_set_underflow(vcpu);
+ if (count == kvm_vgic_global_state.nr_lr)
break;
- }
}
/* Nuke remaining LRs */
- for (i = count ; i < kvm_vgic_global_state.nr_lr; i++)
+ for (int i = count ; i < kvm_vgic_global_state.nr_lr; i++)
vgic_clear_lr(vcpu, i);
- if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
vcpu->arch.vgic_cpu.vgic_v2.used_lrs = count;
- else
+ vgic_v2_configure_hcr(vcpu, &als);
+ } else {
vcpu->arch.vgic_cpu.vgic_v3.used_lrs = count;
+ vgic_v3_configure_hcr(vcpu, &als);
+ }
}
static inline bool can_access_vgic_from_kernel(void)
@@ -913,8 +1005,6 @@ static inline void vgic_save_state(struct kvm_vcpu *vcpu)
/* Sync back the hardware VGIC state into our emulation after a guest's run. */
void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
{
- int used_lrs;
-
/* If nesting, emulate the HW effect from L0 to L1 */
if (vgic_state_is_nested(vcpu)) {
vgic_v3_sync_nested(vcpu);
@@ -924,21 +1014,22 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
if (vcpu_has_nv(vcpu))
vgic_v3_nested_update_mi(vcpu);
- /* An empty ap_list_head implies used_lrs == 0 */
- if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
- return;
-
if (can_access_vgic_from_kernel())
vgic_save_state(vcpu);
- if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
- used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs;
- else
- used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs;
+ vgic_fold_lr_state(vcpu);
+ vgic_prune_ap_list(vcpu);
+}
+
+/* Sync interrupts that were deactivated through a DIR trap */
+void kvm_vgic_process_async_update(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
- if (used_lrs)
- vgic_fold_lr_state(vcpu);
+ /* Make sure we're in the same context as LR handling */
+ local_irq_save(flags);
vgic_prune_ap_list(vcpu);
+ local_irq_restore(flags);
}
static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
@@ -965,8 +1056,9 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
* abort the entry procedure and inject the exception at the
* beginning of the run loop.
*
- * - Otherwise, do exactly *NOTHING*. The guest state is
- * already loaded, and we can carry on with running it.
+ * - Otherwise, do exactly *NOTHING* apart from enabling the virtual
+ * CPU interface. The guest state is already loaded, and we can
+ * carry on with running it.
*
* If we have NV, but are not in a nested state, compute the
* maintenance interrupt state, as it may fire.
@@ -975,35 +1067,17 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
if (kvm_vgic_vcpu_pending_irq(vcpu))
kvm_make_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu);
+ vgic_v3_flush_nested(vcpu);
return;
}
if (vcpu_has_nv(vcpu))
vgic_v3_nested_update_mi(vcpu);
- /*
- * If there are no virtual interrupts active or pending for this
- * VCPU, then there is no work to do and we can bail out without
- * taking any lock. There is a potential race with someone injecting
- * interrupts to the VCPU, but it is a benign race as the VCPU will
- * either observe the new interrupt before or after doing this check,
- * and introducing additional synchronization mechanism doesn't change
- * this.
- *
- * Note that we still need to go through the whole thing if anything
- * can be directly injected (GICv4).
- */
- if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) &&
- !vgic_supports_direct_irqs(vcpu->kvm))
- return;
-
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
- if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) {
- raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
+ scoped_guard(raw_spinlock, &vcpu->arch.vgic_cpu.ap_list_lock)
vgic_flush_lr_state(vcpu);
- raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
- }
if (can_access_vgic_from_kernel())
vgic_restore_state(vcpu);
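
A minimal user-space sketch of the scoped locking idea used in the hunks above, where explicit raw_spin_lock()/raw_spin_unlock() pairs are replaced by scoped_guard(). This is not the kernel macro itself, only the same cleanup-attribute technique; the pthread mutex and names are stand-ins:

#include <pthread.h>
#include <stdio.h>

/*
 * Sketch only: emulate the "lock released when the scope ends" behaviour of
 * scoped_guard() with the GCC/Clang cleanup attribute. The kernel macro is
 * more elaborate; this just shows the idea.
 */
static void unlock_cleanup(pthread_mutex_t **m)
{
	pthread_mutex_unlock(*m);
}

static pthread_mutex_t ap_list_lock = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
	{
		pthread_mutex_lock(&ap_list_lock);
		pthread_mutex_t *guard __attribute__((cleanup(unlock_cleanup))) = &ap_list_lock;

		/* critical section: e.g. walk the ap_list and fill the LRs */
		printf("lock held\n");
	}	/* guard goes out of scope here and the mutex is released */

	return 0;
}

Because the unlock is tied to scope exit, an early break or return inside the guarded block cannot leak the lock, which is what makes the conversion above safe around the nr_lr break.
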
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index ac5f9c5d2b98..5f0fc96b4dc2 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -164,6 +164,22 @@ static inline int vgic_write_guest_lock(struct kvm *kvm, gpa_t gpa,
return ret;
}
+void kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt,
+ __le32 *origptr, __le32 *updptr, int nr_inst);
+
+static inline u64 vgic_ich_hcr_trap_bits(void)
+{
+ u64 hcr;
+
+	/* All the traps are in the bottom 16 bits */
+ asm volatile(ALTERNATIVE_CB("movz %0, #0\n",
+ ARM64_ALWAYS_SYSTEM,
+ kvm_compute_ich_hcr_trap_bits)
+ : "=r" (hcr));
+
+ return hcr;
+}
+
/*
* This struct provides an intermediate representation of the fields contained
* in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC
@@ -220,6 +236,21 @@ struct its_ite {
u32 event_id;
};
+struct ap_list_summary {
+ unsigned int nr_pend; /* purely pending, not active */
+ unsigned int nr_act; /* active, or active+pending */
+ unsigned int nr_sgi; /* any SGI */
+};
+
+#define irqs_outside_lrs(s) \
+ (((s)->nr_pend + (s)->nr_act) > kvm_vgic_global_state.nr_lr)
+
+#define irqs_pending_outside_lrs(s) \
+ ((s)->nr_pend > kvm_vgic_global_state.nr_lr)
+
+#define irqs_active_outside_lrs(s) \
+ ((s)->nr_act && irqs_outside_lrs(s))
+
int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
struct vgic_reg_attr *reg_attr);
int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
@@ -230,6 +261,7 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid);
struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid);
void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
+struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq);
bool vgic_get_phys_line_level(struct vgic_irq *irq);
void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending);
void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
@@ -245,8 +277,9 @@ int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr,
void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val);
void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
-void vgic_v2_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als);
int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
int offset, u32 *val);
@@ -254,7 +287,7 @@ int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
int offset, u32 *val);
void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v2_enable(struct kvm_vcpu *vcpu);
+void vgic_v2_reset(struct kvm_vcpu *vcpu);
int vgic_v2_probe(const struct gic_kvm_info *info);
int vgic_v2_map_resources(struct kvm *kvm);
int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
@@ -286,10 +319,11 @@ static inline void vgic_get_irq_ref(struct vgic_irq *irq)
void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr);
-void vgic_v3_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val);
+void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als);
void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v3_enable(struct kvm_vcpu *vcpu);
+void vgic_v3_reset(struct kvm_vcpu *vcpu);
int vgic_v3_probe(const struct gic_kvm_info *info);
int vgic_v3_map_resources(struct kvm *kvm);
int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq);
@@ -412,6 +446,7 @@ static inline bool kvm_has_gicv3(struct kvm *kvm)
return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP);
}
+void vgic_v3_flush_nested(struct kvm_vcpu *vcpu);
void vgic_v3_sync_nested(struct kvm_vcpu *vcpu);
void vgic_v3_load_nested(struct kvm_vcpu *vcpu);
void vgic_v3_put_nested(struct kvm_vcpu *vcpu);
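
A small standalone sketch of how the new ap_list_summary accounting added to vgic.h above is meant to be read. The LR count and the numbers below are made up; only the comparisons mirror the irqs_*_outside_lrs() macros:

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the structures above, for illustration only. */
struct ap_list_summary {
	unsigned int nr_pend;	/* purely pending, not active */
	unsigned int nr_act;	/* active, or active+pending */
	unsigned int nr_sgi;	/* any SGI */
};

static const unsigned int nr_lr = 4;	/* hypothetical number of List Registers */

static bool irqs_outside_lrs(const struct ap_list_summary *s)
{
	return (s->nr_pend + s->nr_act) > nr_lr;
}

static bool irqs_active_outside_lrs(const struct ap_list_summary *s)
{
	return s->nr_act && irqs_outside_lrs(s);
}

int main(void)
{
	struct ap_list_summary als = { .nr_pend = 3, .nr_act = 2, .nr_sgi = 1 };

	/* 3 pending + 2 active = 5 > 4 LRs, and at least one is active */
	printf("overflow=%d active_overflow=%d\n",
	       irqs_outside_lrs(&als), irqs_active_outside_lrs(&als));
	return 0;
}
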
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 1b32c1232d28..0fac75f01534 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -40,6 +40,7 @@ HAS_GICV5_CPUIF
HAS_GICV5_LEGACY
HAS_GIC_PRIO_MASKING
HAS_GIC_PRIO_RELAXED_SYNC
+HAS_ICH_HCR_EL2_TDIR
HAS_HCR_NV1
HAS_HCX
HAS_LDAPR
@@ -64,6 +65,7 @@ HAS_TLB_RANGE
HAS_VA52
HAS_VIRT_HOST_EXTN
HAS_WFXT
+HAS_XNX
HAFT
HW_DBM
KVM_HVHE
diff --git a/arch/loongarch/include/asm/kvm_eiointc.h b/arch/loongarch/include/asm/kvm_eiointc.h
index a3a40aba8acf..8b7a2fa3f7f8 100644
--- a/arch/loongarch/include/asm/kvm_eiointc.h
+++ b/arch/loongarch/include/asm/kvm_eiointc.h
@@ -10,10 +10,7 @@
#define EIOINTC_IRQS 256
#define EIOINTC_ROUTE_MAX_VCPUS 256
-#define EIOINTC_IRQS_U8_NUMS (EIOINTC_IRQS / 8)
-#define EIOINTC_IRQS_U16_NUMS (EIOINTC_IRQS_U8_NUMS / 2)
-#define EIOINTC_IRQS_U32_NUMS (EIOINTC_IRQS_U8_NUMS / 4)
-#define EIOINTC_IRQS_U64_NUMS (EIOINTC_IRQS_U8_NUMS / 8)
+#define EIOINTC_IRQS_U64_NUMS (EIOINTC_IRQS / 64)
/* map to ipnum per 32 irqs */
#define EIOINTC_IRQS_NODETYPE_COUNT 16
@@ -64,54 +61,18 @@ struct loongarch_eiointc {
uint32_t status;
/* hardware state */
- union nodetype {
- u64 reg_u64[EIOINTC_IRQS_NODETYPE_COUNT / 4];
- u32 reg_u32[EIOINTC_IRQS_NODETYPE_COUNT / 2];
- u16 reg_u16[EIOINTC_IRQS_NODETYPE_COUNT];
- u8 reg_u8[EIOINTC_IRQS_NODETYPE_COUNT * 2];
- } nodetype;
+ u64 nodetype[EIOINTC_IRQS_NODETYPE_COUNT / 4];
/* one bit shows the state of one irq */
- union bounce {
- u64 reg_u64[EIOINTC_IRQS_U64_NUMS];
- u32 reg_u32[EIOINTC_IRQS_U32_NUMS];
- u16 reg_u16[EIOINTC_IRQS_U16_NUMS];
- u8 reg_u8[EIOINTC_IRQS_U8_NUMS];
- } bounce;
-
- union isr {
- u64 reg_u64[EIOINTC_IRQS_U64_NUMS];
- u32 reg_u32[EIOINTC_IRQS_U32_NUMS];
- u16 reg_u16[EIOINTC_IRQS_U16_NUMS];
- u8 reg_u8[EIOINTC_IRQS_U8_NUMS];
- } isr;
- union coreisr {
- u64 reg_u64[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U64_NUMS];
- u32 reg_u32[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U32_NUMS];
- u16 reg_u16[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U16_NUMS];
- u8 reg_u8[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U8_NUMS];
- } coreisr;
- union enable {
- u64 reg_u64[EIOINTC_IRQS_U64_NUMS];
- u32 reg_u32[EIOINTC_IRQS_U32_NUMS];
- u16 reg_u16[EIOINTC_IRQS_U16_NUMS];
- u8 reg_u8[EIOINTC_IRQS_U8_NUMS];
- } enable;
+ u64 bounce[EIOINTC_IRQS_U64_NUMS];
+ u64 isr[EIOINTC_IRQS_U64_NUMS];
+ u64 coreisr[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U64_NUMS];
+ u64 enable[EIOINTC_IRQS_U64_NUMS];
/* use one byte to config ipmap for 32 irqs at once */
- union ipmap {
- u64 reg_u64;
- u32 reg_u32[EIOINTC_IRQS_U32_NUMS / 4];
- u16 reg_u16[EIOINTC_IRQS_U16_NUMS / 4];
- u8 reg_u8[EIOINTC_IRQS_U8_NUMS / 4];
- } ipmap;
+ u64 ipmap;
/* use one byte to config coremap for one irq */
- union coremap {
- u64 reg_u64[EIOINTC_IRQS / 8];
- u32 reg_u32[EIOINTC_IRQS / 4];
- u16 reg_u16[EIOINTC_IRQS / 2];
- u8 reg_u8[EIOINTC_IRQS];
- } coremap;
+ u64 coremap[EIOINTC_IRQS / 8];
DECLARE_BITMAP(sw_coreisr[EIOINTC_ROUTE_MAX_VCPUS][LOONGSON_IP_NUM], EIOINTC_IRQS);
uint8_t sw_coremap[EIOINTC_IRQS];
diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h
index 0cecbd038bb3..e4fe5b8e8149 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -126,6 +126,8 @@ struct kvm_arch {
struct kvm_phyid_map *phyid_map;
/* Enabled PV features */
unsigned long pv_features;
+ /* Supported KVM features */
+ unsigned long kvm_features;
s64 time_offset;
struct kvm_context __percpu *vmcs;
@@ -293,6 +295,12 @@ static inline int kvm_get_pmu_num(struct kvm_vcpu_arch *arch)
return (arch->cpucfg[6] & CPUCFG6_PMNUM) >> CPUCFG6_PMNUM_SHIFT;
}
+/* Check whether KVM supports this feature (the VMM may disable it) */
+static inline bool kvm_vm_support(struct kvm_arch *arch, int feature)
+{
+ return !!(arch->kvm_features & BIT_ULL(feature));
+}
+
bool kvm_arch_pmi_in_guest(struct kvm_vcpu *vcpu);
/* Debug: dump vcpu state */
diff --git a/arch/loongarch/include/asm/kvm_vcpu.h b/arch/loongarch/include/asm/kvm_vcpu.h
index f1efd7cfbc20..3784ab4ccdb5 100644
--- a/arch/loongarch/include/asm/kvm_vcpu.h
+++ b/arch/loongarch/include/asm/kvm_vcpu.h
@@ -15,6 +15,7 @@
#define CPU_PMU (_ULCAST_(1) << 10)
#define CPU_TIMER (_ULCAST_(1) << 11)
#define CPU_IPI (_ULCAST_(1) << 12)
+#define CPU_AVEC (_ULCAST_(1) << 14)
/* Controlled by 0x52 guest exception VIP aligned to estat bit 5~12 */
#define CPU_IP0 (_ULCAST_(1))
diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h
index 3de03cb864b2..58a4a3b6b035 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -511,6 +511,8 @@
#define CSR_GCFG_GPERF_SHIFT 24
#define CSR_GCFG_GPERF_WIDTH 3
#define CSR_GCFG_GPERF (_ULCAST_(0x7) << CSR_GCFG_GPERF_SHIFT)
+#define CSR_GCFG_GPMP_SHIFT 23
+#define CSR_GCFG_GPMP (_ULCAST_(0x1) << CSR_GCFG_GPMP_SHIFT)
#define CSR_GCFG_GCI_SHIFT 20
#define CSR_GCFG_GCI_WIDTH 2
#define CSR_GCFG_GCI (_ULCAST_(0x3) << CSR_GCFG_GCI_SHIFT)
diff --git a/arch/loongarch/include/uapi/asm/kvm.h b/arch/loongarch/include/uapi/asm/kvm.h
index 57ba1a563bb1..de6c3f18e40a 100644
--- a/arch/loongarch/include/uapi/asm/kvm.h
+++ b/arch/loongarch/include/uapi/asm/kvm.h
@@ -104,6 +104,7 @@ struct kvm_fpu {
#define KVM_LOONGARCH_VM_FEAT_PV_IPI 6
#define KVM_LOONGARCH_VM_FEAT_PV_STEALTIME 7
#define KVM_LOONGARCH_VM_FEAT_PTW 8
+#define KVM_LOONGARCH_VM_FEAT_MSGINT 9
/* Device Control API on vcpu fd */
#define KVM_LOONGARCH_VCPU_CPUCFG 0
diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig
index ae64bbdf83a7..ed4f724db774 100644
--- a/arch/loongarch/kvm/Kconfig
+++ b/arch/loongarch/kvm/Kconfig
@@ -25,7 +25,6 @@ config KVM
select HAVE_KVM_IRQCHIP
select HAVE_KVM_MSI
select HAVE_KVM_READONLY_MEM
- select HAVE_KVM_VCPU_ASYNC_IOCTL
select KVM_COMMON
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select KVM_GENERIC_HARDWARE_ENABLING
diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c
index a1cc116b4dac..29886876143f 100644
--- a/arch/loongarch/kvm/intc/eiointc.c
+++ b/arch/loongarch/kvm/intc/eiointc.c
@@ -13,19 +13,19 @@ static void eiointc_set_sw_coreisr(struct loongarch_eiointc *s)
struct kvm_vcpu *vcpu;
for (irq = 0; irq < EIOINTC_IRQS; irq++) {
- ipnum = s->ipmap.reg_u8[irq / 32];
+ ipnum = (s->ipmap >> (irq / 32 * 8)) & 0xff;
if (!(s->status & BIT(EIOINTC_ENABLE_INT_ENCODE))) {
ipnum = count_trailing_zeros(ipnum);
ipnum = (ipnum >= 0 && ipnum < 4) ? ipnum : 0;
}
- cpuid = s->coremap.reg_u8[irq];
+ cpuid = ((u8 *)s->coremap)[irq];
vcpu = kvm_get_vcpu_by_cpuid(s->kvm, cpuid);
if (!vcpu)
continue;
cpu = vcpu->vcpu_id;
- if (test_bit(irq, (unsigned long *)s->coreisr.reg_u32[cpu]))
+ if (test_bit(irq, (unsigned long *)s->coreisr[cpu]))
__set_bit(irq, s->sw_coreisr[cpu][ipnum]);
else
__clear_bit(irq, s->sw_coreisr[cpu][ipnum]);
@@ -38,7 +38,7 @@ static void eiointc_update_irq(struct loongarch_eiointc *s, int irq, int level)
struct kvm_vcpu *vcpu;
struct kvm_interrupt vcpu_irq;
- ipnum = s->ipmap.reg_u8[irq / 32];
+ ipnum = (s->ipmap >> (irq / 32 * 8)) & 0xff;
if (!(s->status & BIT(EIOINTC_ENABLE_INT_ENCODE))) {
ipnum = count_trailing_zeros(ipnum);
ipnum = (ipnum >= 0 && ipnum < 4) ? ipnum : 0;
@@ -53,13 +53,13 @@ static void eiointc_update_irq(struct loongarch_eiointc *s, int irq, int level)
if (level) {
/* if not enable return false */
- if (!test_bit(irq, (unsigned long *)s->enable.reg_u32))
+ if (!test_bit(irq, (unsigned long *)s->enable))
return;
- __set_bit(irq, (unsigned long *)s->coreisr.reg_u32[cpu]);
+ __set_bit(irq, (unsigned long *)s->coreisr[cpu]);
found = find_first_bit(s->sw_coreisr[cpu][ipnum], EIOINTC_IRQS);
__set_bit(irq, s->sw_coreisr[cpu][ipnum]);
} else {
- __clear_bit(irq, (unsigned long *)s->coreisr.reg_u32[cpu]);
+ __clear_bit(irq, (unsigned long *)s->coreisr[cpu]);
__clear_bit(irq, s->sw_coreisr[cpu][ipnum]);
found = find_first_bit(s->sw_coreisr[cpu][ipnum], EIOINTC_IRQS);
}
@@ -94,7 +94,7 @@ static inline void eiointc_update_sw_coremap(struct loongarch_eiointc *s,
if (s->sw_coremap[irq + i] == cpu)
continue;
- if (notify && test_bit(irq + i, (unsigned long *)s->isr.reg_u8)) {
+ if (notify && test_bit(irq + i, (unsigned long *)s->isr)) {
/* lower irq at old cpu and raise irq at new cpu */
eiointc_update_irq(s, irq + i, 0);
s->sw_coremap[irq + i] = cpu;
@@ -108,7 +108,7 @@ static inline void eiointc_update_sw_coremap(struct loongarch_eiointc *s,
void eiointc_set_irq(struct loongarch_eiointc *s, int irq, int level)
{
unsigned long flags;
- unsigned long *isr = (unsigned long *)s->isr.reg_u8;
+ unsigned long *isr = (unsigned long *)s->isr;
spin_lock_irqsave(&s->lock, flags);
level ? __set_bit(irq, isr) : __clear_bit(irq, isr);
@@ -127,27 +127,27 @@ static int loongarch_eiointc_read(struct kvm_vcpu *vcpu, struct loongarch_eioint
switch (offset) {
case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
index = (offset - EIOINTC_NODETYPE_START) >> 3;
- data = s->nodetype.reg_u64[index];
+ data = s->nodetype[index];
break;
case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END:
index = (offset - EIOINTC_IPMAP_START) >> 3;
- data = s->ipmap.reg_u64;
+ data = s->ipmap;
break;
case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END:
index = (offset - EIOINTC_ENABLE_START) >> 3;
- data = s->enable.reg_u64[index];
+ data = s->enable[index];
break;
case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
index = (offset - EIOINTC_BOUNCE_START) >> 3;
- data = s->bounce.reg_u64[index];
+ data = s->bounce[index];
break;
case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
index = (offset - EIOINTC_COREISR_START) >> 3;
- data = s->coreisr.reg_u64[vcpu->vcpu_id][index];
+ data = s->coreisr[vcpu->vcpu_id][index];
break;
case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
index = (offset - EIOINTC_COREMAP_START) >> 3;
- data = s->coremap.reg_u64[index];
+ data = s->coremap[index];
break;
default:
ret = -EINVAL;
@@ -223,26 +223,26 @@ static int loongarch_eiointc_write(struct kvm_vcpu *vcpu,
switch (offset) {
case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
index = (offset - EIOINTC_NODETYPE_START) >> 3;
- old = s->nodetype.reg_u64[index];
- s->nodetype.reg_u64[index] = (old & ~mask) | data;
+ old = s->nodetype[index];
+ s->nodetype[index] = (old & ~mask) | data;
break;
case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END:
/*
* ipmap cannot be set at runtime, can be set only at the beginning
* of irqchip driver, need not update upper irq level
*/
- old = s->ipmap.reg_u64;
- s->ipmap.reg_u64 = (old & ~mask) | data;
+ old = s->ipmap;
+ s->ipmap = (old & ~mask) | data;
break;
case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END:
index = (offset - EIOINTC_ENABLE_START) >> 3;
- old = s->enable.reg_u64[index];
- s->enable.reg_u64[index] = (old & ~mask) | data;
+ old = s->enable[index];
+ s->enable[index] = (old & ~mask) | data;
/*
* 1: enable irq.
* update irq when isr is set.
*/
- data = s->enable.reg_u64[index] & ~old & s->isr.reg_u64[index];
+ data = s->enable[index] & ~old & s->isr[index];
while (data) {
irq = __ffs(data);
eiointc_update_irq(s, irq + index * 64, 1);
@@ -252,7 +252,7 @@ static int loongarch_eiointc_write(struct kvm_vcpu *vcpu,
* 0: disable irq.
* update irq when isr is set.
*/
- data = ~s->enable.reg_u64[index] & old & s->isr.reg_u64[index];
+ data = ~s->enable[index] & old & s->isr[index];
while (data) {
irq = __ffs(data);
eiointc_update_irq(s, irq + index * 64, 0);
@@ -262,16 +262,16 @@ static int loongarch_eiointc_write(struct kvm_vcpu *vcpu,
case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
/* do not emulate hw bounced irq routing */
index = (offset - EIOINTC_BOUNCE_START) >> 3;
- old = s->bounce.reg_u64[index];
- s->bounce.reg_u64[index] = (old & ~mask) | data;
+ old = s->bounce[index];
+ s->bounce[index] = (old & ~mask) | data;
break;
case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
index = (offset - EIOINTC_COREISR_START) >> 3;
/* use attrs to get current cpu index */
cpu = vcpu->vcpu_id;
- old = s->coreisr.reg_u64[cpu][index];
+ old = s->coreisr[cpu][index];
/* write 1 to clear interrupt */
- s->coreisr.reg_u64[cpu][index] = old & ~data;
+ s->coreisr[cpu][index] = old & ~data;
data &= old;
while (data) {
irq = __ffs(data);
@@ -281,9 +281,9 @@ static int loongarch_eiointc_write(struct kvm_vcpu *vcpu,
break;
case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
index = (offset - EIOINTC_COREMAP_START) >> 3;
- old = s->coremap.reg_u64[index];
- s->coremap.reg_u64[index] = (old & ~mask) | data;
- data = s->coremap.reg_u64[index];
+ old = s->coremap[index];
+ s->coremap[index] = (old & ~mask) | data;
+ data = s->coremap[index];
eiointc_update_sw_coremap(s, index * 8, data, sizeof(data), true);
break;
default:
@@ -451,10 +451,10 @@ static int kvm_eiointc_ctrl_access(struct kvm_device *dev,
break;
case KVM_DEV_LOONGARCH_EXTIOI_CTRL_LOAD_FINISHED:
eiointc_set_sw_coreisr(s);
- for (i = 0; i < (EIOINTC_IRQS / 4); i++) {
- start_irq = i * 4;
+ for (i = 0; i < (EIOINTC_IRQS / 8); i++) {
+ start_irq = i * 8;
eiointc_update_sw_coremap(s, start_irq,
- s->coremap.reg_u32[i], sizeof(u32), false);
+ s->coremap[i], sizeof(u64), false);
}
break;
default:
@@ -481,34 +481,34 @@ static int kvm_eiointc_regs_access(struct kvm_device *dev,
switch (addr) {
case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
offset = (addr - EIOINTC_NODETYPE_START) / 4;
- p = &s->nodetype.reg_u32[offset];
+ p = s->nodetype + offset * 4;
break;
case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END:
offset = (addr - EIOINTC_IPMAP_START) / 4;
- p = &s->ipmap.reg_u32[offset];
+ p = &s->ipmap + offset * 4;
break;
case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END:
offset = (addr - EIOINTC_ENABLE_START) / 4;
- p = &s->enable.reg_u32[offset];
+ p = s->enable + offset * 4;
break;
case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
offset = (addr - EIOINTC_BOUNCE_START) / 4;
- p = &s->bounce.reg_u32[offset];
+ p = s->bounce + offset * 4;
break;
case EIOINTC_ISR_START ... EIOINTC_ISR_END:
offset = (addr - EIOINTC_ISR_START) / 4;
- p = &s->isr.reg_u32[offset];
+ p = s->isr + offset * 4;
break;
case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
if (cpu >= s->num_cpu)
return -EINVAL;
offset = (addr - EIOINTC_COREISR_START) / 4;
- p = &s->coreisr.reg_u32[cpu][offset];
+ p = s->coreisr[cpu] + offset * 4;
break;
case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
offset = (addr - EIOINTC_COREMAP_START) / 4;
- p = &s->coremap.reg_u32[offset];
+ p = s->coremap + offset * 4;
break;
default:
kvm_err("%s: unknown eiointc register, addr = %d\n", __func__, addr);
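
The ipmap union was collapsed into a single u64 in the header change above, so the per-32-IRQ byte is now picked out with a shift and mask instead of reg_u8 indexing. A quick standalone check that the two views agree; the ipmap value is arbitrary:

#include <assert.h>
#include <stdint.h>

/* Same shift/mask as the hunks above, on a stand-alone u64. */
static uint8_t ipmap_byte(uint64_t ipmap, int irq)
{
	return (ipmap >> (irq / 32 * 8)) & 0xff;
}

int main(void)
{
	uint64_t ipmap = 0x0807060504030201ULL;	/* byte i holds the value i + 1 */

	/* irqs 0..31 use byte 0, irqs 32..63 use byte 1, and so on */
	assert(ipmap_byte(ipmap, 0) == 0x01);
	assert(ipmap_byte(ipmap, 40) == 0x02);
	assert(ipmap_byte(ipmap, 255) == 0x08);
	return 0;
}
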
diff --git a/arch/loongarch/kvm/interrupt.c b/arch/loongarch/kvm/interrupt.c
index 8462083f0301..a6d42d399a59 100644
--- a/arch/loongarch/kvm/interrupt.c
+++ b/arch/loongarch/kvm/interrupt.c
@@ -21,6 +21,7 @@ static unsigned int priority_to_irq[EXCCODE_INT_NUM] = {
[INT_HWI5] = CPU_IP5,
[INT_HWI6] = CPU_IP6,
[INT_HWI7] = CPU_IP7,
+ [INT_AVEC] = CPU_AVEC,
};
static int kvm_irq_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
@@ -31,6 +32,11 @@ static int kvm_irq_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
if (priority < EXCCODE_INT_NUM)
irq = priority_to_irq[priority];
+ if (cpu_has_msgint && (priority == INT_AVEC)) {
+ set_gcsr_estat(irq);
+ return 1;
+ }
+
switch (priority) {
case INT_TI:
case INT_IPI:
@@ -58,6 +64,11 @@ static int kvm_irq_clear(struct kvm_vcpu *vcpu, unsigned int priority)
if (priority < EXCCODE_INT_NUM)
irq = priority_to_irq[priority];
+ if (cpu_has_msgint && (priority == INT_AVEC)) {
+ clear_gcsr_estat(irq);
+ return 1;
+ }
+
switch (priority) {
case INT_TI:
case INT_IPI:
@@ -83,10 +94,10 @@ void kvm_deliver_intr(struct kvm_vcpu *vcpu)
unsigned long *pending = &vcpu->arch.irq_pending;
unsigned long *pending_clr = &vcpu->arch.irq_clear;
- for_each_set_bit(priority, pending_clr, INT_IPI + 1)
+ for_each_set_bit(priority, pending_clr, EXCCODE_INT_NUM)
kvm_irq_clear(vcpu, priority);
- for_each_set_bit(priority, pending, INT_IPI + 1)
+ for_each_set_bit(priority, pending, EXCCODE_INT_NUM)
kvm_irq_deliver(vcpu, priority);
}
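
The hunk above widens the for_each_set_bit() bound from INT_IPI + 1 to EXCCODE_INT_NUM so that priorities above INT_IPI (such as the new AVEC entry) are no longer skipped. A toy illustration of why the bound matters; the bit positions and bounds below are placeholders, not the real enum values:

#include <stdio.h>

#define OLD_BOUND	13	/* stand-in for INT_IPI + 1 */
#define NEW_BOUND	15	/* stand-in for EXCCODE_INT_NUM */

static void deliver(unsigned long pending, int bound)
{
	for (int prio = 0; prio < bound; prio++)
		if (pending & (1UL << prio))
			printf("deliver priority %d\n", prio);
}

int main(void)
{
	unsigned long pending = 1UL << 14;	/* e.g. a new high-numbered priority */

	deliver(pending, OLD_BOUND);	/* prints nothing: bit 14 is out of range */
	deliver(pending, NEW_BOUND);	/* now priority 14 is delivered */
	return 0;
}
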
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 1245a6b35896..6d833599ef2e 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -659,8 +659,7 @@ static int _kvm_get_cpucfg_mask(int id, u64 *v)
*v = GENMASK(31, 0);
return 0;
case LOONGARCH_CPUCFG1:
- /* CPUCFG1_MSGINT is not supported by KVM */
- *v = GENMASK(25, 0);
+ *v = GENMASK(26, 0);
return 0;
case LOONGARCH_CPUCFG2:
/* CPUCFG2 features unconditionally supported by KVM */
@@ -728,6 +727,10 @@ static int kvm_check_cpucfg(int id, u64 val)
return -EINVAL;
switch (id) {
+ case LOONGARCH_CPUCFG1:
+ if ((val & CPUCFG1_MSGINT) && !cpu_has_msgint)
+ return -EINVAL;
+ return 0;
case LOONGARCH_CPUCFG2:
if (!(val & CPUCFG2_LLFTP))
/* Guests must have a constant timer */
@@ -1473,8 +1476,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
return 0;
}
-long kvm_arch_vcpu_async_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
{
void __user *argp = (void __user *)arg;
struct kvm_vcpu *vcpu = filp->private_data;
@@ -1657,6 +1660,12 @@ static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_DMWIN2);
kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_DMWIN3);
kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_LLBCTL);
+ if (cpu_has_msgint) {
+ kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR0);
+ kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR1);
+ kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR2);
+ kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR3);
+ }
/* Restore Root.GINTC from unused Guest.GINTC register */
write_csr_gintc(csr->csrs[LOONGARCH_CSR_GINTC]);
@@ -1746,6 +1755,12 @@ static int _kvm_vcpu_put(struct kvm_vcpu *vcpu, int cpu)
kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN1);
kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN2);
kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN3);
+ if (cpu_has_msgint) {
+ kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR0);
+ kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR1);
+ kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR2);
+ kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR3);
+ }
vcpu->arch.aux_inuse |= KVM_LARCH_SWCSR_LATEST;
diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c
index a49b1c1a3dd1..194ccbcdc3b3 100644
--- a/arch/loongarch/kvm/vm.c
+++ b/arch/loongarch/kvm/vm.c
@@ -6,6 +6,7 @@
#include <linux/kvm_host.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_vcpu.h>
+#include <asm/kvm_csr.h>
#include <asm/kvm_eiointc.h>
#include <asm/kvm_pch_pic.h>
@@ -24,6 +25,23 @@ const struct kvm_stats_header kvm_vm_stats_header = {
sizeof(kvm_vm_stats_desc),
};
+static void kvm_vm_init_features(struct kvm *kvm)
+{
+ unsigned long val;
+
+ val = read_csr_gcfg();
+ if (val & CSR_GCFG_GPMP)
+ kvm->arch.kvm_features |= BIT(KVM_LOONGARCH_VM_FEAT_PMU);
+
+ /* Enable all PV features by default */
+ kvm->arch.pv_features = BIT(KVM_FEATURE_IPI);
+ kvm->arch.kvm_features = BIT(KVM_LOONGARCH_VM_FEAT_PV_IPI);
+ if (kvm_pvtime_supported()) {
+ kvm->arch.pv_features |= BIT(KVM_FEATURE_STEAL_TIME);
+ kvm->arch.kvm_features |= BIT(KVM_LOONGARCH_VM_FEAT_PV_STEALTIME);
+ }
+}
+
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
int i;
@@ -42,11 +60,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
spin_lock_init(&kvm->arch.phyid_map_lock);
kvm_init_vmcs(kvm);
-
- /* Enable all PV features by default */
- kvm->arch.pv_features = BIT(KVM_FEATURE_IPI);
- if (kvm_pvtime_supported())
- kvm->arch.pv_features |= BIT(KVM_FEATURE_STEAL_TIME);
+ kvm_vm_init_features(kvm);
/*
* cpu_vabits means user address space only (a half of total).
@@ -136,18 +150,18 @@ static int kvm_vm_feature_has_attr(struct kvm *kvm, struct kvm_device_attr *attr
if (cpu_has_lbt_mips)
return 0;
return -ENXIO;
- case KVM_LOONGARCH_VM_FEAT_PMU:
- if (cpu_has_pmp)
+ case KVM_LOONGARCH_VM_FEAT_PTW:
+ if (cpu_has_ptw)
return 0;
return -ENXIO;
- case KVM_LOONGARCH_VM_FEAT_PV_IPI:
- return 0;
- case KVM_LOONGARCH_VM_FEAT_PV_STEALTIME:
- if (kvm_pvtime_supported())
+ case KVM_LOONGARCH_VM_FEAT_MSGINT:
+ if (cpu_has_msgint)
return 0;
return -ENXIO;
- case KVM_LOONGARCH_VM_FEAT_PTW:
- if (cpu_has_ptw)
+ case KVM_LOONGARCH_VM_FEAT_PMU:
+ case KVM_LOONGARCH_VM_FEAT_PV_IPI:
+ case KVM_LOONGARCH_VM_FEAT_PV_STEALTIME:
+ if (kvm_vm_support(&kvm->arch, attr->attr))
return 0;
return -ENXIO;
default:
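
The per-VM kvm_features bitmap introduced above lets the attribute check fall through to a single bit test. A standalone sketch of the same pattern; the PV_IPI and PV_STEALTIME numbers match the uapi values shown earlier, while the PMU value here is a placeholder:

#include <stdbool.h>
#include <stdio.h>

#define FEAT_PMU		3	/* placeholder value, for illustration */
#define FEAT_PV_IPI		6
#define FEAT_PV_STEALTIME	7

struct vm_arch {
	unsigned long kvm_features;
};

static bool vm_support(const struct vm_arch *arch, int feature)
{
	return !!(arch->kvm_features & (1UL << feature));
}

int main(void)
{
	struct vm_arch arch = { .kvm_features = 0 };

	arch.kvm_features |= 1UL << FEAT_PV_IPI;	/* always advertised */
	arch.kvm_features |= 1UL << FEAT_PV_STEALTIME;	/* if steal time is available */

	printf("PMU=%d PV_IPI=%d PV_STEALTIME=%d\n",
	       vm_support(&arch, FEAT_PMU),
	       vm_support(&arch, FEAT_PV_IPI),
	       vm_support(&arch, FEAT_PV_STEALTIME));
	return 0;
}
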
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index ab57221fa4dd..cc13cc35f208 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -22,7 +22,6 @@ config KVM
select EXPORT_UASM
select KVM_COMMON
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
- select HAVE_KVM_VCPU_ASYNC_IOCTL
select KVM_MMIO
select KVM_GENERIC_MMU_NOTIFIER
select KVM_GENERIC_HARDWARE_ENABLING
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index a75587018f44..b0fb92fda4d4 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -895,8 +895,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
return r;
}
-long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl,
- unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 2f2702c867f7..c9a2d50ff1b0 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,7 +20,6 @@ if VIRTUALIZATION
config KVM
bool
select KVM_COMMON
- select HAVE_KVM_VCPU_ASYNC_IOCTL
select KVM_VFIO
select HAVE_KVM_IRQ_BYPASS
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 2ba057171ebe..9a89a6d98f97 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -2028,8 +2028,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
return -EINVAL;
}
-long kvm_arch_vcpu_async_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index 4d794573e3db..24585304c02b 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -59,6 +59,9 @@
BIT(IRQ_VS_TIMER) | \
BIT(IRQ_VS_EXT))
+#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
+ KVM_DIRTY_LOG_INITIALLY_SET)
+
struct kvm_vm_stat {
struct kvm_vm_stat_generic generic;
};
@@ -327,4 +330,7 @@ bool kvm_riscv_vcpu_stopped(struct kvm_vcpu *vcpu);
void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu);
+/* Flags representing implementation-specific details */
+DECLARE_STATIC_KEY_FALSE(kvm_riscv_vsstage_tlb_no_gpa);
+
#endif /* __RISCV_KVM_HOST_H__ */
diff --git a/arch/riscv/include/asm/kvm_tlb.h b/arch/riscv/include/asm/kvm_tlb.h
index 38a2f933ad3a..a0e7099bcb85 100644
--- a/arch/riscv/include/asm/kvm_tlb.h
+++ b/arch/riscv/include/asm/kvm_tlb.h
@@ -49,6 +49,7 @@ void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid,
unsigned long gva, unsigned long gvsz,
unsigned long order);
void kvm_riscv_local_hfence_vvma_all(unsigned long vmid);
+void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu);
void kvm_riscv_tlb_flush_process(struct kvm_vcpu *vcpu);
diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h
index 3497489e04db..c1a7e3b40d9c 100644
--- a/arch/riscv/include/asm/kvm_vcpu_sbi.h
+++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h
@@ -69,7 +69,9 @@ struct kvm_vcpu_sbi_extension {
unsigned long reg_size, const void *reg_val);
};
-void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run);
+int kvm_riscv_vcpu_sbi_forward_handler(struct kvm_vcpu *vcpu,
+ struct kvm_run *run,
+ struct kvm_vcpu_sbi_return *retdata);
void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
struct kvm_run *run,
u32 type, u64 flags);
@@ -105,6 +107,7 @@ extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_dbcn;
extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_susp;
extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_sta;
extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_fwft;
+extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_mpxy;
extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_experimental;
extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_vendor;
diff --git a/arch/riscv/include/asm/kvm_vmid.h b/arch/riscv/include/asm/kvm_vmid.h
index ab98e1434fb7..db61b0525a8d 100644
--- a/arch/riscv/include/asm/kvm_vmid.h
+++ b/arch/riscv/include/asm/kvm_vmid.h
@@ -22,6 +22,5 @@ unsigned long kvm_riscv_gstage_vmid_bits(void);
int kvm_riscv_gstage_vmid_init(struct kvm *kvm);
bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid);
void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu);
-void kvm_riscv_gstage_vmid_sanitize(struct kvm_vcpu *vcpu);
#endif
diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h
index 759a4852c09a..54f3ad7ed2e4 100644
--- a/arch/riscv/include/uapi/asm/kvm.h
+++ b/arch/riscv/include/uapi/asm/kvm.h
@@ -23,6 +23,8 @@
#define KVM_INTERRUPT_SET -1U
#define KVM_INTERRUPT_UNSET -2U
+#define KVM_EXIT_FAIL_ENTRY_NO_VSFILE (1ULL << 0)
+
/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs {
};
@@ -211,6 +213,7 @@ enum KVM_RISCV_SBI_EXT_ID {
KVM_RISCV_SBI_EXT_STA,
KVM_RISCV_SBI_EXT_SUSP,
KVM_RISCV_SBI_EXT_FWFT,
+ KVM_RISCV_SBI_EXT_MPXY,
KVM_RISCV_SBI_EXT_MAX,
};
diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig
index c50328212917..77379f77840a 100644
--- a/arch/riscv/kvm/Kconfig
+++ b/arch/riscv/kvm/Kconfig
@@ -23,7 +23,6 @@ config KVM
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_MSI
- select HAVE_KVM_VCPU_ASYNC_IOCTL
select HAVE_KVM_READONLY_MEM
select HAVE_KVM_DIRTY_RING_ACQ_REL
select KVM_COMMON
diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile
index 07197395750e..3b8afb038b35 100644
--- a/arch/riscv/kvm/Makefile
+++ b/arch/riscv/kvm/Makefile
@@ -27,6 +27,7 @@ kvm-y += vcpu_onereg.o
kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_pmu.o
kvm-y += vcpu_sbi.o
kvm-y += vcpu_sbi_base.o
+kvm-y += vcpu_sbi_forward.o
kvm-y += vcpu_sbi_fwft.o
kvm-y += vcpu_sbi_hsm.o
kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_sbi_pmu.o
diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c
index 11422cb95a64..e597e86491c3 100644
--- a/arch/riscv/kvm/aia_imsic.c
+++ b/arch/riscv/kvm/aia_imsic.c
@@ -814,7 +814,7 @@ int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu)
/* For HW acceleration mode, we can't continue */
if (kvm->arch.aia.mode == KVM_DEV_RISCV_AIA_MODE_HWACCEL) {
run->fail_entry.hardware_entry_failure_reason =
- CSR_HSTATUS;
+ KVM_EXIT_FAIL_ENTRY_NO_VSFILE;
run->fail_entry.cpu = vcpu->cpu;
run->exit_reason = KVM_EXIT_FAIL_ENTRY;
return 0;
diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
index 77dc1655b442..45536af521f0 100644
--- a/arch/riscv/kvm/main.c
+++ b/arch/riscv/kvm/main.c
@@ -15,6 +15,18 @@
#include <asm/kvm_nacl.h>
#include <asm/sbi.h>
+DEFINE_STATIC_KEY_FALSE(kvm_riscv_vsstage_tlb_no_gpa);
+
+static void kvm_riscv_setup_vendor_features(void)
+{
+ /* Andes AX66: split two-stage TLBs */
+ if (riscv_cached_mvendorid(0) == ANDES_VENDOR_ID &&
+ (riscv_cached_marchid(0) & 0xFFFF) == 0x8A66) {
+ static_branch_enable(&kvm_riscv_vsstage_tlb_no_gpa);
+ kvm_info("VS-stage TLB does not cache guest physical address and VMID\n");
+ }
+}
+
long kvm_arch_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -160,6 +172,8 @@ static int __init riscv_kvm_init(void)
kvm_info("AIA available with %d guest external interrupts\n",
kvm_riscv_aia_nr_hgei);
+ kvm_riscv_setup_vendor_features();
+
kvm_register_perf_callbacks(NULL);
rc = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
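
kvm_riscv_setup_vendor_features() above keys an erratum-style behaviour off mvendorid/marchid at init time. A user-space sketch of the same match logic; the vendor ID constant is an assumption for illustration, and only the low-16-bit marchid mask mirrors the patch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ANDES_VENDOR_ID	0x31e	/* assumed value, for illustration only */
#define AX66_ARCHID_LOW	0x8A66

static bool vsstage_tlb_no_gpa;

static void setup_vendor_features(uint64_t mvendorid, uint64_t marchid)
{
	/* Match on the low 16 bits of marchid, as the hunk above does */
	if (mvendorid == ANDES_VENDOR_ID && (marchid & 0xFFFF) == AX66_ARCHID_LOW)
		vsstage_tlb_no_gpa = true;
}

int main(void)
{
	setup_vendor_features(ANDES_VENDOR_ID, 0x1238A66);
	printf("extra VS-stage flush needed: %d\n", vsstage_tlb_no_gpa);
	return 0;
}
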
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 58f5f3536ffd..4ab06697bfc0 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -161,8 +161,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
* allocated dirty_bitmap[], dirty pages will be tracked while
* the memory slot is write protected.
*/
- if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES)
+ if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ return;
mmu_wp_memory_region(kvm, new->id);
+ }
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c
index 3c5a70a2b927..ff1aeac4eb8e 100644
--- a/arch/riscv/kvm/tlb.c
+++ b/arch/riscv/kvm/tlb.c
@@ -158,6 +158,36 @@ void kvm_riscv_local_hfence_vvma_all(unsigned long vmid)
csr_write(CSR_HGATP, hgatp);
}
+void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu)
+{
+ unsigned long vmid;
+
+ if (!kvm_riscv_gstage_vmid_bits() ||
+ vcpu->arch.last_exit_cpu == vcpu->cpu)
+ return;
+
+ /*
+ * On RISC-V platforms with hardware VMID support, we share same
+ * VMID for all VCPUs of a particular Guest/VM. This means we might
+ * have stale G-stage TLB entries on the current Host CPU due to
+ * some other VCPU of the same Guest which ran previously on the
+ * current Host CPU.
+ *
+ * To cleanup stale TLB entries, we simply flush all G-stage TLB
+ * entries by VMID whenever underlying Host CPU changes for a VCPU.
+ */
+
+ vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
+ kvm_riscv_local_hfence_gvma_vmid_all(vmid);
+
+ /*
+	 * Flush VS-stage TLB entries for implementations where the VS-stage
+	 * TLB does not cache the guest physical address and VMID.
+ */
+ if (static_branch_unlikely(&kvm_riscv_vsstage_tlb_no_gpa))
+ kvm_riscv_local_hfence_vvma_all(vmid);
+}
+
void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu)
{
kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_RCVD);
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index 5ce35aba6069..a55a95da54d0 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -238,8 +238,8 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
-long kvm_arch_vcpu_async_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
@@ -968,7 +968,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
* Note: This should be done after G-stage VMID has been
* updated using kvm_riscv_gstage_vmid_ver_changed()
*/
- kvm_riscv_gstage_vmid_sanitize(vcpu);
+ kvm_riscv_local_tlb_sanitize(vcpu);
trace_kvm_entry(vcpu);
diff --git a/arch/riscv/kvm/vcpu_insn.c b/arch/riscv/kvm/vcpu_insn.c
index de1f96ea6225..4d89b94128ae 100644
--- a/arch/riscv/kvm/vcpu_insn.c
+++ b/arch/riscv/kvm/vcpu_insn.c
@@ -298,6 +298,22 @@ static int system_opcode_insn(struct kvm_vcpu *vcpu, struct kvm_run *run,
return (rc <= 0) ? rc : 1;
}
+static bool is_load_guest_page_fault(unsigned long scause)
+{
+	/*
+	 * If a g-stage page fault occurs here, the direct approach would be
+	 * to call the g-stage page fault handler, but doing so from this
+	 * path is rather awkward. Since this is a corner case, simply return
+	 * to the guest and re-execute the same PC: the fault is taken again
+	 * and the regular g-stage page fault handler populates the g-stage
+	 * page table.
+	 */
+ return (scause == EXC_LOAD_GUEST_PAGE_FAULT);
+}
+
/**
* kvm_riscv_vcpu_virtual_insn -- Handle virtual instruction trap
*
@@ -323,6 +339,8 @@ int kvm_riscv_vcpu_virtual_insn(struct kvm_vcpu *vcpu, struct kvm_run *run,
ct->sepc,
&utrap);
if (utrap.scause) {
+ if (is_load_guest_page_fault(utrap.scause))
+ return 1;
utrap.sepc = ct->sepc;
kvm_riscv_vcpu_trap_redirect(vcpu, &utrap);
return 1;
@@ -378,6 +396,8 @@ int kvm_riscv_vcpu_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run,
insn = kvm_riscv_vcpu_unpriv_read(vcpu, true, ct->sepc,
&utrap);
if (utrap.scause) {
+ if (is_load_guest_page_fault(utrap.scause))
+ return 1;
/* Redirect trap if we failed to read instruction */
utrap.sepc = ct->sepc;
kvm_riscv_vcpu_trap_redirect(vcpu, &utrap);
@@ -504,6 +524,8 @@ int kvm_riscv_vcpu_mmio_store(struct kvm_vcpu *vcpu, struct kvm_run *run,
insn = kvm_riscv_vcpu_unpriv_read(vcpu, true, ct->sepc,
&utrap);
if (utrap.scause) {
+ if (is_load_guest_page_fault(utrap.scause))
+ return 1;
/* Redirect trap if we failed to read instruction */
utrap.sepc = ct->sepc;
kvm_riscv_vcpu_trap_redirect(vcpu, &utrap);
diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c
index 1b13623380e1..46ab7b989432 100644
--- a/arch/riscv/kvm/vcpu_sbi.c
+++ b/arch/riscv/kvm/vcpu_sbi.c
@@ -83,6 +83,10 @@ static const struct kvm_riscv_sbi_extension_entry sbi_ext[] = {
.ext_ptr = &vcpu_sbi_ext_fwft,
},
{
+ .ext_idx = KVM_RISCV_SBI_EXT_MPXY,
+ .ext_ptr = &vcpu_sbi_ext_mpxy,
+ },
+ {
.ext_idx = KVM_RISCV_SBI_EXT_EXPERIMENTAL,
.ext_ptr = &vcpu_sbi_ext_experimental,
},
@@ -120,7 +124,9 @@ static bool riscv_vcpu_supports_sbi_ext(struct kvm_vcpu *vcpu, int idx)
return sext && scontext->ext_status[sext->ext_idx] != KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE;
}
-void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run)
+int kvm_riscv_vcpu_sbi_forward_handler(struct kvm_vcpu *vcpu,
+ struct kvm_run *run,
+ struct kvm_vcpu_sbi_return *retdata)
{
struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
@@ -137,6 +143,8 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run)
run->riscv_sbi.args[5] = cp->a5;
run->riscv_sbi.ret[0] = SBI_ERR_NOT_SUPPORTED;
run->riscv_sbi.ret[1] = 0;
+ retdata->uexit = true;
+ return 0;
}
void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
diff --git a/arch/riscv/kvm/vcpu_sbi_base.c b/arch/riscv/kvm/vcpu_sbi_base.c
index 5bc570b984f4..06fdd5f69364 100644
--- a/arch/riscv/kvm/vcpu_sbi_base.c
+++ b/arch/riscv/kvm/vcpu_sbi_base.c
@@ -41,8 +41,7 @@ static int kvm_sbi_ext_base_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
* For experimental/vendor extensions
* forward it to the userspace
*/
- kvm_riscv_vcpu_sbi_forward(vcpu, run);
- retdata->uexit = true;
+ return kvm_riscv_vcpu_sbi_forward_handler(vcpu, run, retdata);
} else {
sbi_ext = kvm_vcpu_sbi_find_ext(vcpu, cp->a0);
*out_val = sbi_ext && sbi_ext->probe ?
@@ -71,28 +70,3 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_base = {
.extid_end = SBI_EXT_BASE,
.handler = kvm_sbi_ext_base_handler,
};
-
-static int kvm_sbi_ext_forward_handler(struct kvm_vcpu *vcpu,
- struct kvm_run *run,
- struct kvm_vcpu_sbi_return *retdata)
-{
- /*
- * Both SBI experimental and vendor extensions are
- * unconditionally forwarded to userspace.
- */
- kvm_riscv_vcpu_sbi_forward(vcpu, run);
- retdata->uexit = true;
- return 0;
-}
-
-const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_experimental = {
- .extid_start = SBI_EXT_EXPERIMENTAL_START,
- .extid_end = SBI_EXT_EXPERIMENTAL_END,
- .handler = kvm_sbi_ext_forward_handler,
-};
-
-const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_vendor = {
- .extid_start = SBI_EXT_VENDOR_START,
- .extid_end = SBI_EXT_VENDOR_END,
- .handler = kvm_sbi_ext_forward_handler,
-};
diff --git a/arch/riscv/kvm/vcpu_sbi_forward.c b/arch/riscv/kvm/vcpu_sbi_forward.c
new file mode 100644
index 000000000000..5a3c75eb23c5
--- /dev/null
+++ b/arch/riscv/kvm/vcpu_sbi_forward.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 Ventana Micro Systems Inc.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_vcpu_sbi.h>
+#include <asm/sbi.h>
+
+const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_experimental = {
+ .extid_start = SBI_EXT_EXPERIMENTAL_START,
+ .extid_end = SBI_EXT_EXPERIMENTAL_END,
+ .handler = kvm_riscv_vcpu_sbi_forward_handler,
+};
+
+const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_vendor = {
+ .extid_start = SBI_EXT_VENDOR_START,
+ .extid_end = SBI_EXT_VENDOR_END,
+ .handler = kvm_riscv_vcpu_sbi_forward_handler,
+};
+
+const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_dbcn = {
+ .extid_start = SBI_EXT_DBCN,
+ .extid_end = SBI_EXT_DBCN,
+ .default_disabled = true,
+ .handler = kvm_riscv_vcpu_sbi_forward_handler,
+};
+
+const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_mpxy = {
+ .extid_start = SBI_EXT_MPXY,
+ .extid_end = SBI_EXT_MPXY,
+ .default_disabled = true,
+ .handler = kvm_riscv_vcpu_sbi_forward_handler,
+};
diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c
index b490ed1428a6..506a510b6bff 100644
--- a/arch/riscv/kvm/vcpu_sbi_replace.c
+++ b/arch/riscv/kvm/vcpu_sbi_replace.c
@@ -185,35 +185,3 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_srst = {
.extid_end = SBI_EXT_SRST,
.handler = kvm_sbi_ext_srst_handler,
};
-
-static int kvm_sbi_ext_dbcn_handler(struct kvm_vcpu *vcpu,
- struct kvm_run *run,
- struct kvm_vcpu_sbi_return *retdata)
-{
- struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
- unsigned long funcid = cp->a6;
-
- switch (funcid) {
- case SBI_EXT_DBCN_CONSOLE_WRITE:
- case SBI_EXT_DBCN_CONSOLE_READ:
- case SBI_EXT_DBCN_CONSOLE_WRITE_BYTE:
- /*
- * The SBI debug console functions are unconditionally
- * forwarded to the userspace.
- */
- kvm_riscv_vcpu_sbi_forward(vcpu, run);
- retdata->uexit = true;
- break;
- default:
- retdata->err_val = SBI_ERR_NOT_SUPPORTED;
- }
-
- return 0;
-}
-
-const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_dbcn = {
- .extid_start = SBI_EXT_DBCN,
- .extid_end = SBI_EXT_DBCN,
- .default_disabled = true,
- .handler = kvm_sbi_ext_dbcn_handler,
-};
diff --git a/arch/riscv/kvm/vcpu_sbi_system.c b/arch/riscv/kvm/vcpu_sbi_system.c
index 359be90b0fc5..c6f7e609ac79 100644
--- a/arch/riscv/kvm/vcpu_sbi_system.c
+++ b/arch/riscv/kvm/vcpu_sbi_system.c
@@ -47,9 +47,7 @@ static int kvm_sbi_ext_susp_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
kvm_riscv_vcpu_sbi_request_reset(vcpu, cp->a1, cp->a2);
/* userspace provides the suspend implementation */
- kvm_riscv_vcpu_sbi_forward(vcpu, run);
- retdata->uexit = true;
- break;
+ return kvm_riscv_vcpu_sbi_forward_handler(vcpu, run, retdata);
default:
retdata->err_val = SBI_ERR_NOT_SUPPORTED;
break;
diff --git a/arch/riscv/kvm/vcpu_sbi_v01.c b/arch/riscv/kvm/vcpu_sbi_v01.c
index 368dfddd23d9..188d5ea5b3b8 100644
--- a/arch/riscv/kvm/vcpu_sbi_v01.c
+++ b/arch/riscv/kvm/vcpu_sbi_v01.c
@@ -32,8 +32,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
* The CONSOLE_GETCHAR/CONSOLE_PUTCHAR SBI calls cannot be
* handled in kernel so we forward these to user-space
*/
- kvm_riscv_vcpu_sbi_forward(vcpu, run);
- retdata->uexit = true;
+ ret = kvm_riscv_vcpu_sbi_forward_handler(vcpu, run, retdata);
break;
case SBI_EXT_0_1_SET_TIMER:
#if __riscv_xlen == 32
diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c
index abb1c2bf2542..cf34d448289d 100644
--- a/arch/riscv/kvm/vmid.c
+++ b/arch/riscv/kvm/vmid.c
@@ -122,26 +122,3 @@ void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu)
kvm_for_each_vcpu(i, v, vcpu->kvm)
kvm_make_request(KVM_REQ_UPDATE_HGATP, v);
}
-
-void kvm_riscv_gstage_vmid_sanitize(struct kvm_vcpu *vcpu)
-{
- unsigned long vmid;
-
- if (!kvm_riscv_gstage_vmid_bits() ||
- vcpu->arch.last_exit_cpu == vcpu->cpu)
- return;
-
- /*
- * On RISC-V platforms with hardware VMID support, we share same
- * VMID for all VCPUs of a particular Guest/VM. This means we might
- * have stale G-stage TLB entries on the current Host CPU due to
- * some other VCPU of the same Guest which ran previously on the
- * current Host CPU.
- *
- * To cleanup stale TLB entries, we simply flush all G-stage TLB
- * entries by VMID whenever underlying Host CPU changes for a VCPU.
- */
-
- vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
- kvm_riscv_local_hfence_gvma_vmid_all(vmid);
-}
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index c2ba3d4398c5..ae1223264d3c 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -146,6 +146,7 @@ struct kvm_vcpu_stat {
u64 instruction_diagnose_500;
u64 instruction_diagnose_other;
u64 pfault_sync;
+ u64 signal_exits;
};
#define PGM_OPERATION 0x01
@@ -631,10 +632,8 @@ struct kvm_s390_pv {
struct mmu_notifier mmu_notifier;
};
-struct kvm_arch{
- void *sca;
- int use_esca;
- rwlock_t sca_lock;
+struct kvm_arch {
+ struct esca_block *sca;
debug_info_t *dbf;
struct kvm_s390_float_interrupt float_int;
struct kvm_device *flic;
@@ -650,6 +649,7 @@ struct kvm_arch{
int user_sigp;
int user_stsi;
int user_instr0;
+ int user_operexec;
struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
wait_queue_head_t ipte_wq;
int ipte_lock_count;
diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h
index 810a6b9d9628..c9ae680a28af 100644
--- a/arch/s390/include/asm/stacktrace.h
+++ b/arch/s390/include/asm/stacktrace.h
@@ -66,6 +66,7 @@ struct stack_frame {
unsigned long sie_flags;
unsigned long sie_control_block_phys;
unsigned long sie_guest_asce;
+ unsigned long sie_irq;
};
};
unsigned long gprs[10];
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index cfe27f6579e3..e1a5b5b54e4f 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -67,6 +67,7 @@ int main(void)
OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags);
OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys);
OFFSET(__SF_SIE_GUEST_ASCE, stack_frame, sie_guest_asce);
+ OFFSET(__SF_SIE_IRQ, stack_frame, sie_irq);
DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame));
BLANK();
OFFSET(__SFUSER_BACKCHAIN, stack_frame_user, back_chain);
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index c360087807d8..b7f1553d9ee5 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -193,6 +193,7 @@ SYM_FUNC_START(__sie64a)
mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r14) # copy thread flags
lmg %r0,%r13,0(%r4) # load guest gprs 0-13
mvi __TI_sie(%r14),1
+ stosm __SF_SIE_IRQ(%r15),0x03 # enable interrupts
lctlg %c1,%c1,__SF_SIE_GUEST_ASCE(%r15) # load primary asce
lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer
oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now
@@ -216,6 +217,7 @@ SYM_FUNC_START(__sie64a)
lg %r14,__LC_CURRENT(%r14)
mvi __TI_sie(%r14),0
SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL)
+ stnsm __SF_SIE_IRQ(%r15),0xfc # disable interrupts
lg %r14,__SF_SIE_SAVEAREA(%r15) # load guest register save area
stmg %r0,%r13,0(%r14) # save guest gprs 0-13
xgr %r0,%r0 # clear guest registers to
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index cae908d64550..f4ec8c1ce214 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -20,7 +20,6 @@ config KVM
def_tristate y
prompt "Kernel-based Virtual Machine (KVM) support"
select HAVE_KVM_CPU_RELAX_INTERCEPT
- select HAVE_KVM_VCPU_ASYNC_IOCTL
select KVM_ASYNC_PF
select KVM_ASYNC_PF_SYNC
select KVM_COMMON
@@ -30,6 +29,7 @@ config KVM
select HAVE_KVM_NO_POLL
select KVM_VFIO
select MMU_NOTIFIER
+ select VIRT_XFER_TO_GUEST_WORK
help
Support hosting paravirtualized guest machines using the SIE
virtualization capability on the mainframe. This should work
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 21c2e61fece4..41ca6b0ee7a9 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -109,14 +109,9 @@ struct aste {
int ipte_lock_held(struct kvm *kvm)
{
- if (sclp.has_siif) {
- int rc;
+ if (sclp.has_siif)
+ return kvm->arch.sca->ipte_control.kh != 0;
- read_lock(&kvm->arch.sca_lock);
- rc = kvm_s390_get_ipte_control(kvm)->kh != 0;
- read_unlock(&kvm->arch.sca_lock);
- return rc;
- }
return kvm->arch.ipte_lock_count != 0;
}
@@ -129,19 +124,16 @@ static void ipte_lock_simple(struct kvm *kvm)
if (kvm->arch.ipte_lock_count > 1)
goto out;
retry:
- read_lock(&kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(kvm);
+ ic = &kvm->arch.sca->ipte_control;
old = READ_ONCE(*ic);
do {
if (old.k) {
- read_unlock(&kvm->arch.sca_lock);
cond_resched();
goto retry;
}
new = old;
new.k = 1;
} while (!try_cmpxchg(&ic->val, &old.val, new.val));
- read_unlock(&kvm->arch.sca_lock);
out:
mutex_unlock(&kvm->arch.ipte_mutex);
}
@@ -154,14 +146,12 @@ static void ipte_unlock_simple(struct kvm *kvm)
kvm->arch.ipte_lock_count--;
if (kvm->arch.ipte_lock_count)
goto out;
- read_lock(&kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(kvm);
+ ic = &kvm->arch.sca->ipte_control;
old = READ_ONCE(*ic);
do {
new = old;
new.k = 0;
} while (!try_cmpxchg(&ic->val, &old.val, new.val));
- read_unlock(&kvm->arch.sca_lock);
wake_up(&kvm->arch.ipte_wq);
out:
mutex_unlock(&kvm->arch.ipte_mutex);
@@ -172,12 +162,10 @@ static void ipte_lock_siif(struct kvm *kvm)
union ipte_control old, new, *ic;
retry:
- read_lock(&kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(kvm);
+ ic = &kvm->arch.sca->ipte_control;
old = READ_ONCE(*ic);
do {
if (old.kg) {
- read_unlock(&kvm->arch.sca_lock);
cond_resched();
goto retry;
}
@@ -185,15 +173,13 @@ retry:
new.k = 1;
new.kh++;
} while (!try_cmpxchg(&ic->val, &old.val, new.val));
- read_unlock(&kvm->arch.sca_lock);
}
static void ipte_unlock_siif(struct kvm *kvm)
{
union ipte_control old, new, *ic;
- read_lock(&kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(kvm);
+ ic = &kvm->arch.sca->ipte_control;
old = READ_ONCE(*ic);
do {
new = old;
@@ -201,7 +187,6 @@ static void ipte_unlock_siif(struct kvm *kvm)
if (!new.kh)
new.k = 0;
} while (!try_cmpxchg(&ic->val, &old.val, new.val));
- read_unlock(&kvm->arch.sca_lock);
if (!new.kh)
wake_up(&kvm->arch.ipte_wq);
}
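
With the SCA now always extended, the IPTE lock bits above are flipped directly on kvm->arch.sca->ipte_control via try_cmpxchg(), without taking sca_lock. A user-space sketch of that compare-and-swap retry pattern; the single bit used here is a stand-in for union ipte_control, not the real layout:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define IPTE_K	(1ULL << 63)	/* stand-in for the "k" bit; position is illustrative */

static _Atomic uint64_t ipte_control;

static bool ipte_try_lock(void)
{
	uint64_t old = atomic_load(&ipte_control);

	do {
		if (old & IPTE_K)
			return false;	/* held elsewhere: caller would back off and retry */
	} while (!atomic_compare_exchange_weak(&ipte_control, &old, old | IPTE_K));

	return true;
}

static void ipte_unlock(void)
{
	uint64_t old = atomic_load(&ipte_control);

	while (!atomic_compare_exchange_weak(&ipte_control, &old, old & ~IPTE_K))
		;
}

int main(void)
{
	printf("first lock: %d\n", ipte_try_lock());	/* 1: acquired */
	printf("second lock: %d\n", ipte_try_lock());	/* 0: already held */
	ipte_unlock();
	return 0;
}
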
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index c7908950c1f4..420ae62977e2 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -471,6 +471,9 @@ static int handle_operexc(struct kvm_vcpu *vcpu)
if (vcpu->arch.sie_block->ipa == 0xb256)
return handle_sthyi(vcpu);
+ if (vcpu->kvm->arch.user_operexec)
+ return -EOPNOTSUPP;
+
if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0)
return -EOPNOTSUPP;
rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &newpsw, sizeof(psw_t));
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index f55574af98cc..249cdc822ec5 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -44,70 +44,34 @@ static struct kvm_s390_gib *gib;
/* handle external calls via sigp interpretation facility */
static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id)
{
- int c, scn;
+ struct esca_block *sca = vcpu->kvm->arch.sca;
+ union esca_sigp_ctrl sigp_ctrl = sca->cpu[vcpu->vcpu_id].sigp_ctrl;
if (!kvm_s390_test_cpuflags(vcpu, CPUSTAT_ECALL_PEND))
return 0;
BUG_ON(!kvm_s390_use_sca_entries());
- read_lock(&vcpu->kvm->arch.sca_lock);
- if (vcpu->kvm->arch.use_esca) {
- struct esca_block *sca = vcpu->kvm->arch.sca;
- union esca_sigp_ctrl sigp_ctrl =
- sca->cpu[vcpu->vcpu_id].sigp_ctrl;
-
- c = sigp_ctrl.c;
- scn = sigp_ctrl.scn;
- } else {
- struct bsca_block *sca = vcpu->kvm->arch.sca;
- union bsca_sigp_ctrl sigp_ctrl =
- sca->cpu[vcpu->vcpu_id].sigp_ctrl;
-
- c = sigp_ctrl.c;
- scn = sigp_ctrl.scn;
- }
- read_unlock(&vcpu->kvm->arch.sca_lock);
if (src_id)
- *src_id = scn;
+ *src_id = sigp_ctrl.scn;
- return c;
+ return sigp_ctrl.c;
}
static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id)
{
+ struct esca_block *sca = vcpu->kvm->arch.sca;
+ union esca_sigp_ctrl *sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl;
+ union esca_sigp_ctrl old_val, new_val = {.scn = src_id, .c = 1};
int expect, rc;
BUG_ON(!kvm_s390_use_sca_entries());
- read_lock(&vcpu->kvm->arch.sca_lock);
- if (vcpu->kvm->arch.use_esca) {
- struct esca_block *sca = vcpu->kvm->arch.sca;
- union esca_sigp_ctrl *sigp_ctrl =
- &(sca->cpu[vcpu->vcpu_id].sigp_ctrl);
- union esca_sigp_ctrl new_val = {0}, old_val;
-
- old_val = READ_ONCE(*sigp_ctrl);
- new_val.scn = src_id;
- new_val.c = 1;
- old_val.c = 0;
-
- expect = old_val.value;
- rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value);
- } else {
- struct bsca_block *sca = vcpu->kvm->arch.sca;
- union bsca_sigp_ctrl *sigp_ctrl =
- &(sca->cpu[vcpu->vcpu_id].sigp_ctrl);
- union bsca_sigp_ctrl new_val = {0}, old_val;
- old_val = READ_ONCE(*sigp_ctrl);
- new_val.scn = src_id;
- new_val.c = 1;
- old_val.c = 0;
+ old_val = READ_ONCE(*sigp_ctrl);
+ old_val.c = 0;
- expect = old_val.value;
- rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value);
- }
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ expect = old_val.value;
+ rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value);
if (rc != expect) {
/* another external call is pending */
@@ -119,24 +83,14 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id)
static void sca_clear_ext_call(struct kvm_vcpu *vcpu)
{
+ struct esca_block *sca = vcpu->kvm->arch.sca;
+ union esca_sigp_ctrl *sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl;
+
if (!kvm_s390_use_sca_entries())
return;
kvm_s390_clear_cpuflags(vcpu, CPUSTAT_ECALL_PEND);
- read_lock(&vcpu->kvm->arch.sca_lock);
- if (vcpu->kvm->arch.use_esca) {
- struct esca_block *sca = vcpu->kvm->arch.sca;
- union esca_sigp_ctrl *sigp_ctrl =
- &(sca->cpu[vcpu->vcpu_id].sigp_ctrl);
-
- WRITE_ONCE(sigp_ctrl->value, 0);
- } else {
- struct bsca_block *sca = vcpu->kvm->arch.sca;
- union bsca_sigp_ctrl *sigp_ctrl =
- &(sca->cpu[vcpu->vcpu_id].sigp_ctrl);
- WRITE_ONCE(sigp_ctrl->value, 0);
- }
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ WRITE_ONCE(sigp_ctrl->value, 0);
}
int psw_extint_disabled(struct kvm_vcpu *vcpu)
@@ -1223,7 +1177,7 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
- if (!sclp.has_sigpif)
+ if (!kvm_s390_use_sca_entries())
return test_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs);
return sca_ext_call_pending(vcpu, NULL);
@@ -1548,7 +1502,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
if (kvm_get_vcpu_by_id(vcpu->kvm, src_id) == NULL)
return -EINVAL;
- if (sclp.has_sigpif && !kvm_s390_pv_cpu_get_handle(vcpu))
+ if (kvm_s390_use_sca_entries() && !kvm_s390_pv_cpu_get_handle(vcpu))
return sca_inject_ext_call(vcpu, src_id);
if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs))
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ff3a185f156c..56a50524b3ee 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -13,6 +13,7 @@
#define pr_fmt(fmt) "kvm-s390: " fmt
#include <linux/compiler.h>
+#include <linux/entry-virt.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/fs.h>
@@ -184,7 +185,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, instruction_diagnose_308),
STATS_DESC_COUNTER(VCPU, instruction_diagnose_500),
STATS_DESC_COUNTER(VCPU, instruction_diagnose_other),
- STATS_DESC_COUNTER(VCPU, pfault_sync)
+ STATS_DESC_COUNTER(VCPU, pfault_sync),
+ STATS_DESC_COUNTER(VCPU, signal_exits)
};
const struct kvm_stats_header kvm_vcpu_stats_header = {
@@ -271,7 +273,6 @@ debug_info_t *kvm_s390_dbf_uv;
/* forward declarations */
static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
unsigned long end);
-static int sca_switch_to_extended(struct kvm *kvm);
static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta)
{
@@ -606,6 +607,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SET_GUEST_DEBUG:
case KVM_CAP_S390_DIAG318:
case KVM_CAP_IRQFD_RESAMPLE:
+ case KVM_CAP_S390_USER_OPEREXEC:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:
@@ -631,11 +633,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_NR_VCPUS:
case KVM_CAP_MAX_VCPUS:
case KVM_CAP_MAX_VCPU_ID:
- r = KVM_S390_BSCA_CPU_SLOTS;
+ /*
+ * Return the same value for KVM_CAP_MAX_VCPUS and
+ * KVM_CAP_MAX_VCPU_ID to conform with the KVM API.
+ */
+ r = KVM_S390_ESCA_CPU_SLOTS;
if (!kvm_s390_use_sca_entries())
r = KVM_MAX_VCPUS;
- else if (sclp.has_esca && sclp.has_64bscao)
- r = KVM_S390_ESCA_CPU_SLOTS;
if (ext == KVM_CAP_NR_VCPUS)
r = min_t(unsigned int, num_online_cpus(), r);
break;
@@ -919,6 +923,12 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s",
r ? "(not available)" : "(success)");
break;
+ case KVM_CAP_S390_USER_OPEREXEC:
+ VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_OPEREXEC");
+ kvm->arch.user_operexec = 1;
+ icpt_operexc_on_all_vcpus(kvm);
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -1930,22 +1940,18 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
* Updates the Multiprocessor Topology-Change-Report bit to signal
* the guest with a topology change.
* This is only relevant if the topology facility is present.
- *
- * The SCA version, bsca or esca, doesn't matter as offset is the same.
*/
static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val)
{
union sca_utility new, old;
- struct bsca_block *sca;
+ struct esca_block *sca;
- read_lock(&kvm->arch.sca_lock);
sca = kvm->arch.sca;
old = READ_ONCE(sca->utility);
do {
new = old;
new.mtcr = val;
} while (!try_cmpxchg(&sca->utility.val, &old.val, new.val));
- read_unlock(&kvm->arch.sca_lock);
}
static int kvm_s390_set_topo_change_indication(struct kvm *kvm,
@@ -1966,9 +1972,7 @@ static int kvm_s390_get_topo_change_indication(struct kvm *kvm,
if (!test_kvm_facility(kvm, 11))
return -ENXIO;
- read_lock(&kvm->arch.sca_lock);
- topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr;
- read_unlock(&kvm->arch.sca_lock);
+ topo = kvm->arch.sca->utility.mtcr;
return put_user(topo, (u8 __user *)attr->addr);
}
@@ -2666,14 +2670,6 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
if (kvm_s390_pv_is_protected(kvm))
break;
- /*
- * FMT 4 SIE needs esca. As we never switch back to bsca from
- * esca, we need no cleanup in the error cases below
- */
- r = sca_switch_to_extended(kvm);
- if (r)
- break;
-
mmap_write_lock(kvm->mm);
r = gmap_helper_disable_cow_sharing();
mmap_write_unlock(kvm->mm);
@@ -3316,10 +3312,7 @@ static void kvm_s390_crypto_init(struct kvm *kvm)
static void sca_dispose(struct kvm *kvm)
{
- if (kvm->arch.use_esca)
- free_pages_exact(kvm->arch.sca, sizeof(struct esca_block));
- else
- free_page((unsigned long)(kvm->arch.sca));
+ free_pages_exact(kvm->arch.sca, sizeof(*kvm->arch.sca));
kvm->arch.sca = NULL;
}
@@ -3333,10 +3326,9 @@ void kvm_arch_free_vm(struct kvm *kvm)
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
- gfp_t alloc_flags = GFP_KERNEL_ACCOUNT;
- int i, rc;
+ gfp_t alloc_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
char debug_name[16];
- static unsigned long sca_offset;
+ int i, rc;
rc = -EINVAL;
#ifdef CONFIG_KVM_S390_UCONTROL
@@ -3357,20 +3349,14 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (!sclp.has_64bscao)
alloc_flags |= GFP_DMA;
- rwlock_init(&kvm->arch.sca_lock);
- /* start with basic SCA */
- kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
- if (!kvm->arch.sca)
- goto out_err;
mutex_lock(&kvm_lock);
- sca_offset += 16;
- if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE)
- sca_offset = 0;
- kvm->arch.sca = (struct bsca_block *)
- ((char *) kvm->arch.sca + sca_offset);
+
+ kvm->arch.sca = alloc_pages_exact(sizeof(*kvm->arch.sca), alloc_flags);
mutex_unlock(&kvm_lock);
+ if (!kvm->arch.sca)
+ goto out_err;
- sprintf(debug_name, "kvm-%u", current->pid);
+ snprintf(debug_name, sizeof(debug_name), "kvm-%u", current->pid);
kvm->arch.dbf = debug_register(debug_name, 32, 1, 7 * sizeof(long));
if (!kvm->arch.dbf)
@@ -3547,133 +3533,38 @@ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
static void sca_del_vcpu(struct kvm_vcpu *vcpu)
{
+ struct esca_block *sca = vcpu->kvm->arch.sca;
+
if (!kvm_s390_use_sca_entries())
return;
- read_lock(&vcpu->kvm->arch.sca_lock);
- if (vcpu->kvm->arch.use_esca) {
- struct esca_block *sca = vcpu->kvm->arch.sca;
- clear_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn);
- sca->cpu[vcpu->vcpu_id].sda = 0;
- } else {
- struct bsca_block *sca = vcpu->kvm->arch.sca;
-
- clear_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn);
- sca->cpu[vcpu->vcpu_id].sda = 0;
- }
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ clear_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn);
+ sca->cpu[vcpu->vcpu_id].sda = 0;
}
static void sca_add_vcpu(struct kvm_vcpu *vcpu)
{
- if (!kvm_s390_use_sca_entries()) {
- phys_addr_t sca_phys = virt_to_phys(vcpu->kvm->arch.sca);
-
- /* we still need the basic sca for the ipte control */
- vcpu->arch.sie_block->scaoh = sca_phys >> 32;
- vcpu->arch.sie_block->scaol = sca_phys;
- return;
- }
- read_lock(&vcpu->kvm->arch.sca_lock);
- if (vcpu->kvm->arch.use_esca) {
- struct esca_block *sca = vcpu->kvm->arch.sca;
- phys_addr_t sca_phys = virt_to_phys(sca);
-
- sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block);
- vcpu->arch.sie_block->scaoh = sca_phys >> 32;
- vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK;
- vcpu->arch.sie_block->ecb2 |= ECB2_ESCA;
- set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn);
- } else {
- struct bsca_block *sca = vcpu->kvm->arch.sca;
- phys_addr_t sca_phys = virt_to_phys(sca);
-
- sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block);
- vcpu->arch.sie_block->scaoh = sca_phys >> 32;
- vcpu->arch.sie_block->scaol = sca_phys;
- set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn);
- }
- read_unlock(&vcpu->kvm->arch.sca_lock);
-}
-
-/* Basic SCA to Extended SCA data copy routines */
-static inline void sca_copy_entry(struct esca_entry *d, struct bsca_entry *s)
-{
- d->sda = s->sda;
- d->sigp_ctrl.c = s->sigp_ctrl.c;
- d->sigp_ctrl.scn = s->sigp_ctrl.scn;
-}
-
-static void sca_copy_b_to_e(struct esca_block *d, struct bsca_block *s)
-{
- int i;
+ struct esca_block *sca = vcpu->kvm->arch.sca;
+ phys_addr_t sca_phys = virt_to_phys(sca);
- d->ipte_control = s->ipte_control;
- d->mcn[0] = s->mcn;
- for (i = 0; i < KVM_S390_BSCA_CPU_SLOTS; i++)
- sca_copy_entry(&d->cpu[i], &s->cpu[i]);
-}
+ /* we still need the sca header for the ipte control */
+ vcpu->arch.sie_block->scaoh = sca_phys >> 32;
+ vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK;
+ vcpu->arch.sie_block->ecb2 |= ECB2_ESCA;
-static int sca_switch_to_extended(struct kvm *kvm)
-{
- struct bsca_block *old_sca = kvm->arch.sca;
- struct esca_block *new_sca;
- struct kvm_vcpu *vcpu;
- unsigned long vcpu_idx;
- u32 scaol, scaoh;
- phys_addr_t new_sca_phys;
-
- if (kvm->arch.use_esca)
- return 0;
-
- new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!new_sca)
- return -ENOMEM;
-
- new_sca_phys = virt_to_phys(new_sca);
- scaoh = new_sca_phys >> 32;
- scaol = new_sca_phys & ESCA_SCAOL_MASK;
-
- kvm_s390_vcpu_block_all(kvm);
- write_lock(&kvm->arch.sca_lock);
-
- sca_copy_b_to_e(new_sca, old_sca);
-
- kvm_for_each_vcpu(vcpu_idx, vcpu, kvm) {
- vcpu->arch.sie_block->scaoh = scaoh;
- vcpu->arch.sie_block->scaol = scaol;
- vcpu->arch.sie_block->ecb2 |= ECB2_ESCA;
- }
- kvm->arch.sca = new_sca;
- kvm->arch.use_esca = 1;
-
- write_unlock(&kvm->arch.sca_lock);
- kvm_s390_vcpu_unblock_all(kvm);
-
- free_page((unsigned long)old_sca);
+ if (!kvm_s390_use_sca_entries())
+ return;
- VM_EVENT(kvm, 2, "Switched to ESCA (0x%p -> 0x%p)",
- old_sca, kvm->arch.sca);
- return 0;
+ set_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn);
+ sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block);
}
static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
{
- int rc;
-
- if (!kvm_s390_use_sca_entries()) {
- if (id < KVM_MAX_VCPUS)
- return true;
- return false;
- }
- if (id < KVM_S390_BSCA_CPU_SLOTS)
- return true;
- if (!sclp.has_esca || !sclp.has_64bscao)
- return false;
-
- rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm);
+ if (!kvm_s390_use_sca_entries())
+ return id < KVM_MAX_VCPUS;
- return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS;
+ return id < KVM_S390_ESCA_CPU_SLOTS;
}
/* needs disabled preemption to protect from TOD sync and vcpu_load/put */
@@ -3919,7 +3810,7 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->eca |= ECA_IB;
if (sclp.has_siif)
vcpu->arch.sie_block->eca |= ECA_SII;
- if (sclp.has_sigpif)
+ if (kvm_s390_use_sca_entries())
vcpu->arch.sie_block->eca |= ECA_SIGPI;
if (test_kvm_facility(vcpu->kvm, 129)) {
vcpu->arch.sie_block->eca |= ECA_VX;
@@ -4366,8 +4257,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- int ret = 0;
-
vcpu_load(vcpu);
vcpu->run->s.regs.fpc = fpu->fpc;
@@ -4378,7 +4267,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs));
vcpu_put(vcpu);
- return ret;
+ return 0;
}
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
@@ -4786,9 +4675,6 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->gg14 = vcpu->run->s.regs.gprs[14];
vcpu->arch.sie_block->gg15 = vcpu->run->s.regs.gprs[15];
- if (need_resched())
- schedule();
-
if (!kvm_is_ucontrol(vcpu->kvm)) {
rc = kvm_s390_deliver_pending_interrupts(vcpu);
if (rc || guestdbg_exit_pending(vcpu))
@@ -5073,13 +4959,8 @@ int noinstr kvm_s390_enter_exit_sie(struct kvm_s390_sie_block *scb,
* The guest_state_{enter,exit}_irqoff() functions inform lockdep and
* tracing that entry to the guest will enable host IRQs, and exit from
* the guest will disable host IRQs.
- *
- * We must not use lockdep/tracing/RCU in this critical section, so we
- * use the low-level arch_local_irq_*() helpers to enable/disable IRQs.
*/
- arch_local_irq_enable();
ret = sie64a(scb, gprs, gasce);
- arch_local_irq_disable();
guest_state_exit_irqoff();
@@ -5098,12 +4979,12 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
*/
kvm_vcpu_srcu_read_lock(vcpu);
- do {
+ while (true) {
rc = vcpu_pre_run(vcpu);
+ kvm_vcpu_srcu_read_unlock(vcpu);
if (rc || guestdbg_exit_pending(vcpu))
break;
- kvm_vcpu_srcu_read_unlock(vcpu);
/*
* As PF_VCPU will be used in fault handler, between
* guest_timing_enter_irqoff and guest_timing_exit_irqoff
@@ -5115,7 +4996,17 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
sizeof(sie_page->pv_grregs));
}
+xfer_to_guest_mode_check:
local_irq_disable();
+ xfer_to_guest_mode_prepare();
+ if (xfer_to_guest_mode_work_pending()) {
+ local_irq_enable();
+ rc = kvm_xfer_to_guest_mode_handle_work(vcpu);
+ if (rc)
+ break;
+ goto xfer_to_guest_mode_check;
+ }
+
guest_timing_enter_irqoff();
__disable_cpu_timer_accounting(vcpu);
@@ -5145,9 +5036,12 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
kvm_vcpu_srcu_read_lock(vcpu);
rc = vcpu_post_run(vcpu, exit_reason);
- } while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc);
+ if (rc || guestdbg_exit_pending(vcpu)) {
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ break;
+ }
+ }
- kvm_vcpu_srcu_read_unlock(vcpu);
return rc;
}
@@ -5363,6 +5257,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
if (signal_pending(current) && !rc) {
kvm_run->exit_reason = KVM_EXIT_INTR;
+ vcpu->stat.signal_exits++;
rc = -EINTR;
}
@@ -5729,8 +5624,8 @@ static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu,
return r;
}
-long kvm_arch_vcpu_async_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
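For reference, the work-handling pattern that __vcpu_run() and (further below) do_vsie_run() now follow comes from the generic entry/virt helpers included via <linux/entry-virt.h>. A minimal sketch of that loop, using only the helpers named in the hunks above and a hypothetical wrapper name (this is an illustration, not code from the patch):

/*
 * Minimal sketch (not part of the patch): resolve all pending exit-to-user
 * work before entering the guest, re-checking with IRQs disabled so nothing
 * can sneak in between the check and the actual guest entry.
 */
static int prepare_to_enter_guest(struct kvm_vcpu *vcpu)
{
	int ret;

	for (;;) {
		local_irq_disable();
		xfer_to_guest_mode_prepare();
		if (!xfer_to_guest_mode_work_pending())
			return 0;	/* caller enters SIE with IRQs still off */

		/* Signals, need_resched etc. must be handled with IRQs on. */
		local_irq_enable();
		ret = kvm_xfer_to_guest_mode_handle_work(vcpu);
		if (ret)
			return ret;
	}
}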
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index c44fe0c3a097..65c950760993 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -570,13 +570,6 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu);
int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu);
int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu);
-/* support for Basic/Extended SCA handling */
-static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm)
-{
- struct bsca_block *sca = kvm->arch.sca; /* SCA version doesn't matter */
-
- return &sca->ipte_control;
-}
static inline int kvm_s390_use_sca_entries(void)
{
/*
@@ -584,7 +577,7 @@ static inline int kvm_s390_use_sca_entries(void)
* might use the entries. By not setting the entries and keeping them
* invalid, hardware will not access them but intercept.
*/
- return sclp.has_sigpif;
+ return sclp.has_sigpif && sclp.has_esca;
}
void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
struct mcck_volatile_info *mcck_info);
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 347268f89f2f..b526621d2a1b 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -782,7 +782,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu))
rc = set_validity_icpt(scb_s, 0x0011U);
else if ((gpa & PAGE_MASK) !=
- ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK))
+ ((gpa + offsetof(struct bsca_block, cpu[0]) - 1) & PAGE_MASK))
rc = set_validity_icpt(scb_s, 0x003bU);
if (!rc) {
rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
@@ -1180,12 +1180,23 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
current->thread.gmap_int_code = 0;
barrier();
if (!kvm_s390_vcpu_sie_inhibited(vcpu)) {
+xfer_to_guest_mode_check:
local_irq_disable();
+ xfer_to_guest_mode_prepare();
+ if (xfer_to_guest_mode_work_pending()) {
+ local_irq_enable();
+ rc = kvm_xfer_to_guest_mode_handle_work(vcpu);
+ if (rc)
+ goto skip_sie;
+ goto xfer_to_guest_mode_check;
+ }
guest_timing_enter_irqoff();
rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce);
guest_timing_exit_irqoff();
local_irq_enable();
}
+
+skip_sie:
barrier();
vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE;
@@ -1345,13 +1356,11 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
* but rewind the PSW to re-enter SIE once that's completed
* instead of passing a "no action" intercept to the guest.
*/
- if (signal_pending(current) ||
- kvm_s390_vcpu_has_irq(vcpu, 0) ||
+ if (kvm_s390_vcpu_has_irq(vcpu, 0) ||
kvm_s390_vcpu_sie_inhibited(vcpu)) {
kvm_s390_rewind_psw(vcpu, 4);
break;
}
- cond_resched();
}
if (rc == -EFAULT) {
@@ -1483,8 +1492,7 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
if (unlikely(scb_addr & 0x1ffUL))
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
- if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) ||
- kvm_s390_vcpu_sie_inhibited(vcpu)) {
+ if (kvm_s390_vcpu_has_irq(vcpu, 0) || kvm_s390_vcpu_sie_inhibited(vcpu)) {
kvm_s390_rewind_psw(vcpu, 4);
return 0;
}
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index d90ce601917c..c3b53beb1300 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -339,6 +339,7 @@
#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */
#define X86_FEATURE_AMD_IBRS_SAME_MODE (13*32+19) /* Indirect Branch Restricted Speculation same mode protection*/
+#define X86_FEATURE_EFER_LMSLE_MBZ (13*32+20) /* EFER.LMSLE must be zero */
#define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */
#define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */
@@ -506,6 +507,12 @@
#define X86_FEATURE_SGX_EUPDATESVN (21*32+17) /* Support for ENCLS[EUPDATESVN] instruction */
#define X86_FEATURE_SDCIAE (21*32+18) /* L3 Smart Data Cache Injection Allocation Enforcement */
+#define X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO (21*32+19) /*
+ * Clear CPU buffers before VM-Enter if the vCPU
+ * can access host MMIO (ignored for all intents
+ * and purposes if CLEAR_CPU_BUF_VM is set).
+ */
+#define X86_FEATURE_X2AVIC_EXT (21*32+20) /* AMD SVM x2AVIC support for 4k vCPUs */
/*
* BUG word(s)
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index f00c09ffe6a9..6b6d472baa0b 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -5,7 +5,7 @@
#include <linux/threads.h>
typedef struct {
-#if IS_ENABLED(CONFIG_KVM_INTEL)
+#if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL)
u8 kvm_cpu_l1tf_flush_l1d;
#endif
unsigned int __nmi_count; /* arch dependent */
@@ -68,7 +68,7 @@ extern u64 arch_irq_stat(void);
DECLARE_PER_CPU_CACHE_HOT(u16, __softirq_pending);
#define local_softirq_pending_ref __softirq_pending
-#if IS_ENABLED(CONFIG_KVM_INTEL)
+#if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL)
/*
* This function is called from noinstr interrupt contexts
* and must be inlined to not get instrumentation.
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index fdf178443f85..de709fb5bd76 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -128,6 +128,7 @@ KVM_X86_OP(enable_smi_window)
KVM_X86_OP_OPTIONAL(dev_get_attr)
KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl)
+KVM_X86_OP_OPTIONAL(vcpu_mem_enc_unlocked_ioctl)
KVM_X86_OP_OPTIONAL(mem_enc_register_region)
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 48598d017d6f..5a3bfa293e8b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1055,9 +1055,6 @@ struct kvm_vcpu_arch {
/* be preempted when it's in kernel-mode(cpl=0) */
bool preempted_in_kernel;
- /* Flush the L1 Data cache for L1TF mitigation on VMENTER */
- bool l1tf_flush_l1d;
-
/* Host CPU on which VM-entry was most recently attempted */
int last_vmentry_cpu;
@@ -1456,8 +1453,6 @@ struct kvm_arch {
bool use_master_clock;
u64 master_kernel_ns;
u64 master_cycle_now;
- struct delayed_work kvmclock_update_work;
- struct delayed_work kvmclock_sync_work;
#ifdef CONFIG_KVM_HYPERV
struct kvm_hv hyperv;
@@ -1848,15 +1843,15 @@ struct kvm_x86_ops {
void *external_spt);
/* Update the external page table from spte getting set. */
int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
- kvm_pfn_t pfn_for_gfn);
+ u64 mirror_spte);
/* Update external page tables for page table about to be freed. */
int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
void *external_spt);
/* Update external page table from spte getting removed, and flush TLB. */
- int (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
- kvm_pfn_t pfn_for_gfn);
+ void (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ u64 mirror_spte);
bool (*has_wbinvd_exit)(void);
@@ -1914,6 +1909,7 @@ struct kvm_x86_ops {
int (*dev_get_attr)(u32 group, u64 attr, u64 *val);
int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp);
+ int (*vcpu_mem_enc_unlocked_ioctl)(struct kvm_vcpu *vcpu, void __user *argp);
int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
@@ -2143,6 +2139,11 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
* the gfn, i.e. retrying the instruction will hit a
* !PRESENT fault, which results in a new shadow page
* and sends KVM back to square one.
+ *
+ * EMULTYPE_SKIP_SOFT_INT - Set in combination with EMULTYPE_SKIP to only skip
+ * an instruction if it could generate a given software
+ * interrupt, which must be encoded via
+ * EMULTYPE_SET_SOFT_INT_VECTOR().
*/
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
@@ -2153,6 +2154,10 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
#define EMULTYPE_PF (1 << 6)
#define EMULTYPE_COMPLETE_USER_EXIT (1 << 7)
#define EMULTYPE_WRITE_PF_TO_SP (1 << 8)
+#define EMULTYPE_SKIP_SOFT_INT (1 << 9)
+
+#define EMULTYPE_SET_SOFT_INT_VECTOR(v) ((u32)((v) & 0xff) << 16)
+#define EMULTYPE_GET_SOFT_INT_VECTOR(e) (((e) >> 16) & 0xff)
static inline bool kvm_can_emulate_event_vectoring(int emul_type)
{
@@ -2167,6 +2172,7 @@ void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu,
void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu);
void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa);
+void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason);
void kvm_enable_efer_bits(u64);
bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
@@ -2378,7 +2384,6 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
int kvm_add_user_return_msr(u32 msr);
int kvm_find_user_return_msr(u32 msr);
int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
-void kvm_user_return_msr_update_cache(unsigned int index, u64 val);
u64 kvm_get_user_return_msr(unsigned int slot);
static inline bool kvm_is_supported_user_return_msr(u32 msr)
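The two EMULTYPE_*_SOFT_INT_VECTOR() macros added above are plain bit packing: the vector lives in bits 16-23 of the emulation-type word, above the flag bits. A user-space-compilable round trip (uint32_t standing in for the kernel's u32) illustrates the encoding:

#include <assert.h>
#include <stdint.h>

#define EMULTYPE_SET_SOFT_INT_VECTOR(v)	((uint32_t)((v) & 0xff) << 16)
#define EMULTYPE_GET_SOFT_INT_VECTOR(e)	(((e) >> 16) & 0xff)

int main(void)
{
	/* Flag bits stay in bits 0-9; the vector (e.g. INT 0x80) rides in bits 16-23. */
	uint32_t emul_type = (1u << 9) /* EMULTYPE_SKIP_SOFT_INT */ |
			     EMULTYPE_SET_SOFT_INT_VECTOR(0x80);

	assert(EMULTYPE_GET_SOFT_INT_VECTOR(emul_type) == 0x80);
	assert(emul_type & (1u << 9));
	return 0;
}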
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 08ed5a2e46a5..a6526c5be5ca 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -308,24 +308,26 @@
* CFLAGS.ZF.
* Note: Only the memory operand variant of VERW clears the CPU buffers.
*/
-.macro __CLEAR_CPU_BUFFERS feature
#ifdef CONFIG_X86_64
- ALTERNATIVE "", "verw x86_verw_sel(%rip)", \feature
+#define VERW verw x86_verw_sel(%rip)
#else
- /*
- * In 32bit mode, the memory operand must be a %cs reference. The data
- * segments may not be usable (vm86 mode), and the stack segment may not
- * be flat (ESPFIX32).
- */
- ALTERNATIVE "", "verw %cs:x86_verw_sel", \feature
+/*
+ * In 32bit mode, the memory operand must be a %cs reference. The data segments
+ * may not be usable (vm86 mode), and the stack segment may not be flat (ESPFIX32).
+ */
+#define VERW verw %cs:x86_verw_sel
#endif
-.endm
-#define CLEAR_CPU_BUFFERS \
- __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF
+/*
+ * Provide a stringified VERW macro for simple usage, and a non-stringified
+ * VERW macro for use in more elaborate sequences, e.g. to encode a conditional
+ * VERW within an ALTERNATIVE.
+ */
+#define __CLEAR_CPU_BUFFERS __stringify(VERW)
-#define VM_CLEAR_CPU_BUFFERS \
- __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF_VM
+/* If necessary, emit VERW on exit-to-userspace to clear CPU buffers. */
+#define CLEAR_CPU_BUFFERS \
+ ALTERNATIVE "", __CLEAR_CPU_BUFFERS, X86_FEATURE_CLEAR_CPU_BUF
#ifdef CONFIG_X86_64
.macro CLEAR_BRANCH_HISTORY
@@ -580,8 +582,6 @@ DECLARE_STATIC_KEY_FALSE(cpu_buf_idle_clear);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
-DECLARE_STATIC_KEY_FALSE(cpu_buf_vm_clear);
-
extern u16 x86_verw_sel;
#include <asm/segment.h>
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 0581c477d466..56aa99503dc4 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -279,7 +279,7 @@ enum avic_ipi_failure_cause {
AVIC_IPI_FAILURE_INVALID_IPI_VECTOR,
};
-#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(8, 0)
+#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(11, 0)
/*
* For AVIC, the max index allowed for physical APIC ID table is 0xfe (254), as
@@ -289,11 +289,14 @@ enum avic_ipi_failure_cause {
/*
* For x2AVIC, the max index allowed for physical APIC ID table is 0x1ff (511).
+ * With X86_FEATURE_X2AVIC_EXT, the max index is increased to 0xfff (4095).
*/
#define X2AVIC_MAX_PHYSICAL_ID 0x1FFUL
+#define X2AVIC_4K_MAX_PHYSICAL_ID 0xFFFUL
static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID);
static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID);
+static_assert((X2AVIC_4K_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_4K_MAX_PHYSICAL_ID);
#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0)
#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3)
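As a quick sanity check of the widened index mask, the arithmetic below (with a simplified local GENMASK_ULL standing in for the kernel's definition) shows why the 4k-vCPU maximum only fits once the mask covers bits 11:0:

#include <assert.h>
#include <stdint.h>

/* Simplified stand-in for the kernel's GENMASK_ULL(). */
#define GENMASK_ULL(h, l)	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

int main(void)
{
	uint64_t old_mask = GENMASK_ULL(8, 0);		/* 0x1ff */
	uint64_t new_mask = GENMASK_ULL(11, 0);		/* 0xfff */

	assert((0xfffULL & new_mask) == 0xfffULL);	/* X2AVIC_4K_MAX_PHYSICAL_ID fits */
	assert((0xfffULL & old_mask) != 0xfffULL);	/* ...but not under the old mask */
	return 0;
}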
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index d420c9c066d4..7ceff6583652 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -502,6 +502,7 @@ struct kvm_sync_regs {
/* vendor-specific groups and attributes for system fd */
#define KVM_X86_GRP_SEV 1
# define KVM_X86_SEV_VMSA_FEATURES 0
+# define KVM_X86_SNP_POLICY_BITS 1
struct kvm_vmx_nested_state_data {
__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index d8660770dc6a..d0a2847a4bb0 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -145,14 +145,6 @@ EXPORT_SYMBOL_GPL(cpu_buf_idle_clear);
*/
DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
-/*
- * Controls CPU Fill buffer clear before VMenter. This is a subset of
- * X86_FEATURE_CLEAR_CPU_BUF, and should only be enabled when KVM-only
- * mitigation is required.
- */
-DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear);
-EXPORT_SYMBOL_FOR_KVM(cpu_buf_vm_clear);
-
#undef pr_fmt
#define pr_fmt(fmt) "mitigations: " fmt
@@ -349,8 +341,8 @@ static enum rfds_mitigations rfds_mitigation __ro_after_init =
IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF;
/*
- * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing
- * through X86_FEATURE_CLEAR_CPU_BUF on kernel and guest entry.
+ * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing on exit to
+ * userspace *and* on entry to KVM guests.
*/
static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init;
@@ -396,6 +388,7 @@ static void __init mds_apply_mitigation(void)
if (mds_mitigation == MDS_MITIGATION_FULL ||
mds_mitigation == MDS_MITIGATION_VMWERV) {
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
(mds_nosmt || smt_mitigations == SMT_MITIGATIONS_ON))
cpu_smt_disable(false);
@@ -507,6 +500,7 @@ static void __init taa_apply_mitigation(void)
* present on host, enable the mitigation for UCODE_NEEDED as well.
*/
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
if (taa_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)
cpu_smt_disable(false);
@@ -608,9 +602,9 @@ static void __init mmio_apply_mitigation(void)
*/
if (verw_clear_cpu_buf_mitigation_selected) {
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
- static_branch_disable(&cpu_buf_vm_clear);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
} else {
- static_branch_enable(&cpu_buf_vm_clear);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO);
}
/*
@@ -699,8 +693,10 @@ static void __init rfds_update_mitigation(void)
static void __init rfds_apply_mitigation(void)
{
- if (rfds_mitigation == RFDS_MITIGATION_VERW)
+ if (rfds_mitigation == RFDS_MITIGATION_VERW) {
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ }
}
static __init int rfds_parse_cmdline(char *str)
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index cde4b6cd3471..42c7eac0c387 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -53,6 +53,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
+ { X86_FEATURE_X2AVIC_EXT, CPUID_ECX, 6, 0x8000000a, 0 },
{ X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 },
{ X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
{ X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 52524e0ca97f..d563a948318b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -1135,6 +1135,7 @@ void kvm_set_cpu_caps(void)
F(AMD_STIBP),
F(AMD_STIBP_ALWAYS_ON),
F(AMD_IBRS_SAME_MODE),
+ PASSTHROUGH_F(EFER_LMSLE_MBZ),
F(AMD_PSFD),
F(AMD_IBPB_RET),
);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4e3da5b497b8..c8e292e9a24d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -81,9 +81,8 @@
*/
/* Operand sizes: 8-bit operands or specified/overridden size. */
-#define ByteOp (1<<0) /* 8-bit operands. */
-/* Destination operand type. */
-#define DstShift 1
+#define ByteOp (1<<0) /* 8-bit operands. */
+#define DstShift 1 /* Destination operand type at bits 1-5 */
#define ImplicitOps (OpImplicit << DstShift)
#define DstReg (OpReg << DstShift)
#define DstMem (OpMem << DstShift)
@@ -95,8 +94,7 @@
#define DstDX (OpDX << DstShift)
#define DstAccLo (OpAccLo << DstShift)
#define DstMask (OpMask << DstShift)
-/* Source operand type. */
-#define SrcShift 6
+#define SrcShift 6 /* Source operand type at bits 6-10 */
#define SrcNone (OpNone << SrcShift)
#define SrcReg (OpReg << SrcShift)
#define SrcMem (OpMem << SrcShift)
@@ -119,10 +117,10 @@
#define SrcAccHi (OpAccHi << SrcShift)
#define SrcMask (OpMask << SrcShift)
#define BitOp (1<<11)
-#define MemAbs (1<<12) /* Memory operand is absolute displacement */
+#define MemAbs (1<<12) /* Memory operand is absolute displacement */
#define String (1<<13) /* String instruction (rep capable) */
#define Stack (1<<14) /* Stack instruction (push/pop) */
-#define GroupMask (7<<15) /* Opcode uses one of the group mechanisms */
+#define GroupMask (7<<15) /* Group mechanisms, at bits 15-17 */
#define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */
#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */
#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
@@ -131,11 +129,8 @@
#define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */
#define ModeDual (7<<15) /* Different instruction for 32/64 bit */
#define Sse (1<<18) /* SSE Vector instruction */
-/* Generic ModRM decode. */
-#define ModRM (1<<19)
-/* Destination is only written; never read. */
-#define Mov (1<<20)
-/* Misc flags */
+#define ModRM (1<<19) /* Generic ModRM decode. */
+#define Mov (1<<20) /* Destination is only written; never read. */
#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
#define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */
#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
@@ -143,11 +138,11 @@
#define Undefined (1<<25) /* No Such Instruction */
#define Lock (1<<26) /* lock prefix is allowed for the instruction */
#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
-#define No64 (1<<28)
+#define No64 (1<<28) /* Instruction generates #UD in 64-bit mode */
#define PageTable (1 << 29) /* instruction used to write page table */
#define NotImpl (1 << 30) /* instruction is not implemented */
-/* Source 2 operand type */
-#define Src2Shift (31)
+#define Avx ((u64)1 << 31) /* Instruction uses VEX prefix */
+#define Src2Shift (32) /* Source 2 operand type at bits 32-36 */
#define Src2None (OpNone << Src2Shift)
#define Src2Mem (OpMem << Src2Shift)
#define Src2CL (OpCL << Src2Shift)
@@ -161,12 +156,13 @@
#define Src2FS (OpFS << Src2Shift)
#define Src2GS (OpGS << Src2Shift)
#define Src2Mask (OpMask << Src2Shift)
+/* free: 37-39 */
#define Mmx ((u64)1 << 40) /* MMX Vector instruction */
-#define AlignMask ((u64)7 << 41)
+#define AlignMask ((u64)3 << 41) /* Memory alignment requirement at bits 41-42 */
#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */
#define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */
-#define Avx ((u64)3 << 41) /* Advanced Vector Extensions */
-#define Aligned16 ((u64)4 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */
+#define Aligned16 ((u64)3 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */
+/* free: 43-44 */
#define NoWrite ((u64)1 << 45) /* No writeback */
#define SrcWrite ((u64)1 << 46) /* Write back src operand */
#define NoMod ((u64)1 << 47) /* Mod field is ignored */
@@ -243,6 +239,13 @@ enum x86_transfer_type {
X86_TRANSFER_TASK_SWITCH,
};
+enum rex_bits {
+ REX_B = 1,
+ REX_X = 2,
+ REX_R = 4,
+ REX_W = 8,
+};
+
static void writeback_registers(struct x86_emulate_ctxt *ctxt)
{
unsigned long dirty = ctxt->regs_dirty;
@@ -622,7 +625,6 @@ static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size)
switch (alignment) {
case Unaligned:
- case Avx:
return 1;
case Aligned16:
return 16;
@@ -924,7 +926,7 @@ static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg,
int byteop)
{
void *p;
- int highbyte_regs = (ctxt->rex_prefix == 0) && byteop;
+ int highbyte_regs = (ctxt->rex_prefix == REX_NONE) && byteop;
if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1;
@@ -1030,6 +1032,7 @@ static void fetch_register_operand(struct operand *op)
op->val = *(u64 *)op->addr.reg;
break;
}
+ op->orig_val = op->val;
}
static int em_fninit(struct x86_emulate_ctxt *ctxt)
@@ -1075,17 +1078,17 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
- struct operand *op)
+static void __decode_register_operand(struct x86_emulate_ctxt *ctxt,
+ struct operand *op, int reg)
{
- unsigned int reg;
-
- if (ctxt->d & ModRM)
- reg = ctxt->modrm_reg;
- else
- reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
-
- if (ctxt->d & Sse) {
+ if ((ctxt->d & Avx) && ctxt->op_bytes == 32) {
+ op->type = OP_YMM;
+ op->bytes = 32;
+ op->addr.xmm = reg;
+ kvm_read_avx_reg(reg, &op->vec_val2);
+ return;
+ }
+ if (ctxt->d & (Avx|Sse)) {
op->type = OP_XMM;
op->bytes = 16;
op->addr.xmm = reg;
@@ -1103,9 +1106,20 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
op->type = OP_REG;
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp);
-
fetch_register_operand(op);
- op->orig_val = op->val;
+}
+
+static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ unsigned int reg;
+
+ if (ctxt->d & ModRM)
+ reg = ctxt->modrm_reg;
+ else
+ reg = (ctxt->b & 7) | (ctxt->rex_bits & REX_B ? 8 : 0);
+
+ __decode_register_operand(ctxt, op, reg);
}
static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg)
@@ -1122,9 +1136,9 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
int rc = X86EMUL_CONTINUE;
ulong modrm_ea = 0;
- ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */
- index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */
- base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */
+ ctxt->modrm_reg = (ctxt->rex_bits & REX_R ? 8 : 0);
+ index_reg = (ctxt->rex_bits & REX_X ? 8 : 0);
+ base_reg = (ctxt->rex_bits & REX_B ? 8 : 0);
ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6;
ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
@@ -1132,24 +1146,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
ctxt->modrm_seg = VCPU_SREG_DS;
if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) {
- op->type = OP_REG;
- op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
- op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
- ctxt->d & ByteOp);
- if (ctxt->d & Sse) {
- op->type = OP_XMM;
- op->bytes = 16;
- op->addr.xmm = ctxt->modrm_rm;
- kvm_read_sse_reg(ctxt->modrm_rm, &op->vec_val);
- return rc;
- }
- if (ctxt->d & Mmx) {
- op->type = OP_MM;
- op->bytes = 8;
- op->addr.mm = ctxt->modrm_rm & 7;
- return rc;
- }
- fetch_register_operand(op);
+ __decode_register_operand(ctxt, op, ctxt->modrm_rm);
return rc;
}
@@ -1783,7 +1780,15 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
op->data,
op->bytes * op->count);
case OP_XMM:
- kvm_write_sse_reg(op->addr.xmm, &op->vec_val);
+ if (!(ctxt->d & Avx)) {
+ kvm_write_sse_reg(op->addr.xmm, &op->vec_val);
+ break;
+ }
+ /* full YMM write but with high bytes cleared */
+ memset(op->valptr + 16, 0, 16);
+ fallthrough;
+ case OP_YMM:
+ kvm_write_avx_reg(op->addr.xmm, &op->vec_val2);
break;
case OP_MM:
kvm_write_mmx_reg(op->addr.mm, &op->mm_val);
@@ -2466,7 +2471,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
setup_syscalls_segments(&cs, &ss);
- if ((ctxt->rex_prefix & 0x8) != 0x0)
+ if (ctxt->rex_bits & REX_W)
usermode = X86EMUL_MODE_PROT64;
else
usermode = X86EMUL_MODE_PROT32;
@@ -3958,6 +3963,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
+static const struct opcode ud = I(SrcNone, emulate_ud);
+
static const struct opcode group7_rm0[] = {
N,
I(SrcNone | Priv | EmulateOnUD, em_hypercall),
@@ -4114,7 +4121,7 @@ static const struct group_dual group15 = { {
} };
static const struct gprefix pfx_0f_6f_0f_7f = {
- I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
+ I(Mmx, em_mov), I(Sse | Avx | Aligned, em_mov), N, I(Sse | Avx | Unaligned, em_mov),
};
static const struct instr_dual instr_dual_0f_2b = {
@@ -4133,8 +4140,8 @@ static const struct gprefix pfx_0f_28_0f_29 = {
I(Aligned, em_mov), I(Aligned, em_mov), N, N,
};
-static const struct gprefix pfx_0f_e7 = {
- N, I(Sse, em_mov), N, N,
+static const struct gprefix pfx_0f_e7_0f_38_2a = {
+ N, I(Sse | Avx, em_mov), N, N,
};
static const struct escape escape_d9 = { {
@@ -4347,8 +4354,8 @@ static const struct opcode twobyte_table[256] = {
DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
/* 0x10 - 0x1F */
- GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_10_0f_11),
- GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_10_0f_11),
+ GP(ModRM | DstReg | SrcMem | Mov | Sse | Avx, &pfx_0f_10_0f_11),
+ GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_10_0f_11),
N, N, N, N, N, N,
D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 4 * prefetch + 4 * reserved NOP */
D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
@@ -4364,9 +4371,9 @@ static const struct opcode twobyte_table[256] = {
IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write,
check_dr_write),
N, N, N, N,
- GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29),
- GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29),
- N, GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_2b),
+ GP(ModRM | DstReg | SrcMem | Mov | Sse | Avx, &pfx_0f_28_0f_29),
+ GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_28_0f_29),
+ N, GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_2b),
N, N, N, N,
/* 0x30 - 0x3F */
II(ImplicitOps | Priv, em_wrmsr, wrmsr),
@@ -4431,7 +4438,7 @@ static const struct opcode twobyte_table[256] = {
/* 0xD0 - 0xDF */
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
/* 0xE0 - 0xEF */
- N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7),
+ N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7_0f_38_2a),
N, N, N, N, N, N, N, N,
/* 0xF0 - 0xFF */
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
@@ -4458,8 +4465,13 @@ static const struct gprefix three_byte_0f_38_f1 = {
* byte.
*/
static const struct opcode opcode_map_0f_38[256] = {
- /* 0x00 - 0x7f */
- X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
+ /* 0x00 - 0x1f */
+ X16(N), X16(N),
+ /* 0x20 - 0x2f */
+ X8(N),
+ X2(N), GP(SrcReg | DstMem | ModRM | Mov | Aligned, &pfx_0f_e7_0f_38_2a), N, N, N, N, N,
+ /* 0x30 - 0x7f */
+ X16(N), X16(N), X16(N), X16(N), X16(N),
/* 0x80 - 0xef */
X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
/* 0xf0 - 0xf1 */
@@ -4618,14 +4630,12 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
fetch_register_operand(op);
- op->orig_val = op->val;
break;
case OpAccLo:
op->type = OP_REG;
op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes;
op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
fetch_register_operand(op);
- op->orig_val = op->val;
break;
case OpAccHi:
if (ctxt->d & ByteOp) {
@@ -4636,7 +4646,6 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
op->bytes = ctxt->op_bytes;
op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
fetch_register_operand(op);
- op->orig_val = op->val;
break;
case OpDI:
op->type = OP_MEM;
@@ -4755,12 +4764,87 @@ done:
return rc;
}
+static int x86_decode_avx(struct x86_emulate_ctxt *ctxt,
+ u8 vex_1st, u8 vex_2nd, struct opcode *opcode)
+{
+ u8 vex_3rd, map, pp, l, v;
+ int rc = X86EMUL_CONTINUE;
+
+ if (ctxt->rep_prefix || ctxt->op_prefix || ctxt->rex_prefix)
+ goto ud;
+
+ if (vex_1st == 0xc5) {
+ /* Expand RVVVVlpp to VEX3 format */
+ vex_3rd = vex_2nd & ~0x80; /* VVVVlpp from VEX2, w=0 */
+ vex_2nd = (vex_2nd & 0x80) | 0x61; /* R from VEX2, X=1 B=1 mmmmm=00001 */
+ } else {
+ vex_3rd = insn_fetch(u8, ctxt);
+ }
+
+ /* vex_2nd = RXBmmmmm, vex_3rd = wVVVVlpp. Fix polarity */
+ vex_2nd ^= 0xE0; /* binary 11100000 */
+ vex_3rd ^= 0x78; /* binary 01111000 */
+
+ ctxt->rex_prefix = REX_PREFIX;
+ ctxt->rex_bits = (vex_2nd & 0xE0) >> 5; /* RXB */
+ ctxt->rex_bits |= (vex_3rd & 0x80) >> 4; /* w */
+ if (ctxt->rex_bits && ctxt->mode != X86EMUL_MODE_PROT64)
+ goto ud;
+
+ map = vex_2nd & 0x1f;
+ v = (vex_3rd >> 3) & 0xf;
+ l = vex_3rd & 0x4;
+ pp = vex_3rd & 0x3;
+
+ ctxt->b = insn_fetch(u8, ctxt);
+ switch (map) {
+ case 1:
+ ctxt->opcode_len = 2;
+ *opcode = twobyte_table[ctxt->b];
+ break;
+ case 2:
+ ctxt->opcode_len = 3;
+ *opcode = opcode_map_0f_38[ctxt->b];
+ break;
+ case 3:
+ /* no 0f 3a instructions are supported yet */
+ return X86EMUL_UNHANDLEABLE;
+ default:
+ goto ud;
+ }
+
+ /*
+	 * No three operand instructions are supported yet; those that
+	 * *are* marked with the Avx flag treat the VVVV field as reserved.
+	 * No three operand instructions are supported yet; those that
+	 * *are* marked with the Avx flag treat the VVVV field as reserved.
+ */
+ if (v)
+ goto ud;
+
+ if (l)
+ ctxt->op_bytes = 32;
+ else
+ ctxt->op_bytes = 16;
+
+ switch (pp) {
+ case 0: break;
+ case 1: ctxt->op_prefix = true; break;
+ case 2: ctxt->rep_prefix = 0xf3; break;
+ case 3: ctxt->rep_prefix = 0xf2; break;
+ }
+
+done:
+ return rc;
+ud:
+ *opcode = ud;
+ return rc;
+}
+
int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type)
{
int rc = X86EMUL_CONTINUE;
int mode = ctxt->mode;
int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
- bool op_prefix = false;
+ bool vex_prefix = false;
bool has_seg_override = false;
struct opcode opcode;
u16 dummy;
@@ -4812,7 +4896,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
for (;;) {
switch (ctxt->b = insn_fetch(u8, ctxt)) {
case 0x66: /* operand-size override */
- op_prefix = true;
+ ctxt->op_prefix = true;
/* switch between 2/4 bytes */
ctxt->op_bytes = def_op_bytes ^ 6;
break;
@@ -4851,7 +4935,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
case 0x40 ... 0x4f: /* REX */
if (mode != X86EMUL_MODE_PROT64)
goto done_prefixes;
- ctxt->rex_prefix = ctxt->b;
+ ctxt->rex_prefix = REX_PREFIX;
+ ctxt->rex_bits = ctxt->b & 0xf;
continue;
case 0xf0: /* LOCK */
ctxt->lock_prefix = 1;
@@ -4865,20 +4950,33 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
}
/* Any legacy prefix after a REX prefix nullifies its effect. */
-
- ctxt->rex_prefix = 0;
+ ctxt->rex_prefix = REX_NONE;
+ ctxt->rex_bits = 0;
}
done_prefixes:
/* REX prefix. */
- if (ctxt->rex_prefix & 8)
- ctxt->op_bytes = 8; /* REX.W */
+ if (ctxt->rex_bits & REX_W)
+ ctxt->op_bytes = 8;
/* Opcode byte(s). */
- opcode = opcode_table[ctxt->b];
- /* Two-byte opcode? */
- if (ctxt->b == 0x0f) {
+ if (ctxt->b == 0xc4 || ctxt->b == 0xc5) {
+ /* VEX or LDS/LES */
+ u8 vex_2nd = insn_fetch(u8, ctxt);
+ if (mode != X86EMUL_MODE_PROT64 && (vex_2nd & 0xc0) != 0xc0) {
+ opcode = opcode_table[ctxt->b];
+ ctxt->modrm = vex_2nd;
+ /* the Mod/RM byte has been fetched already! */
+ goto done_modrm;
+ }
+
+ vex_prefix = true;
+ rc = x86_decode_avx(ctxt, ctxt->b, vex_2nd, &opcode);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ } else if (ctxt->b == 0x0f) {
+ /* Two- or three-byte opcode */
ctxt->opcode_len = 2;
ctxt->b = insn_fetch(u8, ctxt);
opcode = twobyte_table[ctxt->b];
@@ -4889,18 +4987,16 @@ done_prefixes:
ctxt->b = insn_fetch(u8, ctxt);
opcode = opcode_map_0f_38[ctxt->b];
}
+ } else {
+ /* Opcode byte(s). */
+ opcode = opcode_table[ctxt->b];
}
- ctxt->d = opcode.flags;
- if (ctxt->d & ModRM)
+ if (opcode.flags & ModRM)
ctxt->modrm = insn_fetch(u8, ctxt);
- /* vex-prefix instructions are not implemented */
- if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) &&
- (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) {
- ctxt->d = NotImpl;
- }
-
+done_modrm:
+ ctxt->d = opcode.flags;
while (ctxt->d & GroupMask) {
switch (ctxt->d & GroupMask) {
case Group:
@@ -4919,9 +5015,9 @@ done_prefixes:
opcode = opcode.u.group[goffset];
break;
case Prefix:
- if (ctxt->rep_prefix && op_prefix)
+ if (ctxt->rep_prefix && ctxt->op_prefix)
return EMULATION_FAILED;
- simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix;
+ simd_prefix = ctxt->op_prefix ? 0x66 : ctxt->rep_prefix;
switch (simd_prefix) {
case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
@@ -4966,6 +5062,19 @@ done_prefixes:
if (ctxt->d == 0)
return EMULATION_FAILED;
+ if (unlikely(vex_prefix)) {
+ /*
+ * Only specifically marked instructions support VEX. Since many
+ * instructions support it but are not annotated, return not implemented
+ * rather than #UD.
+ */
+ if (!(ctxt->d & Avx))
+ return EMULATION_FAILED;
+
+ if (!(ctxt->d & AlignMask))
+ ctxt->d |= Unaligned;
+ }
+
ctxt->execute = opcode.u.execute;
/*
@@ -5036,8 +5145,10 @@ done_prefixes:
if ((ctxt->d & No16) && ctxt->op_bytes == 2)
ctxt->op_bytes = 4;
- if (ctxt->d & Sse)
- ctxt->op_bytes = 16;
+ if (vex_prefix)
+ ;
+ else if (ctxt->d & Sse)
+ ctxt->op_bytes = 16, ctxt->d &= ~Avx;
else if (ctxt->d & Mmx)
ctxt->op_bytes = 8;
}
@@ -5137,8 +5248,10 @@ void init_decode_cache(struct x86_emulate_ctxt *ctxt)
{
/* Clear fields that are set conditionally but read without a guard. */
ctxt->rip_relative = false;
- ctxt->rex_prefix = 0;
+ ctxt->rex_prefix = REX_NONE;
+ ctxt->rex_bits = 0;
ctxt->lock_prefix = 0;
+ ctxt->op_prefix = false;
ctxt->rep_prefix = 0;
ctxt->regs_valid = 0;
ctxt->regs_dirty = 0;
@@ -5168,20 +5281,34 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts)
}
if (unlikely(ctxt->d &
- (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
+ (No64|Undefined|Avx|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
(ctxt->d & Undefined)) {
rc = emulate_ud(ctxt);
goto done;
}
- if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
- || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
+ if ((ctxt->d & (Avx|Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) {
rc = emulate_ud(ctxt);
goto done;
}
- if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+ if (ctxt->d & Avx) {
+ u64 xcr = 0;
+ if (!(ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE)
+ || ops->get_xcr(ctxt, 0, &xcr)
+ || !(xcr & XFEATURE_MASK_YMM)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+ } else if (ctxt->d & Sse) {
+ if (!(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+ }
+
+ if ((ctxt->d & (Avx|Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
rc = emulate_nm(ctxt);
goto done;
}
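To make the VEX2-to-VEX3 expansion in x86_decode_avx() concrete, the user-space snippet below walks the two prefix bytes of vmovdqa (%rax), %ymm0 (c5 fd 6f 00) through the same transformations and checks the decoded fields (illustrative only, not patch code):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint8_t vex_2nd = 0xfd;		/* C5 payload: ~R=1 ~vvvv=1111 L=1 pp=01 */
	uint8_t vex_3rd;

	/* Expand RVVVVlpp to the VEX3 layout (W=0, X=B=1, mmmmm=00001). */
	vex_3rd = vex_2nd & ~0x80;
	vex_2nd = (vex_2nd & 0x80) | 0x61;

	/* Undo the inverted polarity of RXB and vvvv. */
	vex_2nd ^= 0xe0;
	vex_3rd ^= 0x78;

	assert((vex_2nd & 0x1f) == 1);		/* mmmmm = 1: 0f opcode map    */
	assert((vex_2nd & 0xe0) == 0);		/* REX.R/X/B all clear         */
	assert(((vex_3rd >> 3) & 0xf) == 0);	/* vvvv unused, as required    */
	assert(vex_3rd & 0x4);			/* VEX.L = 1: 256-bit operands */
	assert((vex_3rd & 0x3) == 1);		/* pp = 1: implied 0x66 prefix */
	return 0;
}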
diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h
index 3ba12888bf66..f898781b6a06 100644
--- a/arch/x86/kvm/fpu.h
+++ b/arch/x86/kvm/fpu.h
@@ -15,6 +15,58 @@ typedef u32 __attribute__((vector_size(16))) sse128_t;
#define sse128_l3(x) ({ __sse128_u t; t.vec = x; t.as_u32[3]; })
#define sse128(lo, hi) ({ __sse128_u t; t.as_u64[0] = lo; t.as_u64[1] = hi; t.vec; })
+typedef u32 __attribute__((vector_size(32))) avx256_t;
+
+static inline void _kvm_read_avx_reg(int reg, avx256_t *data)
+{
+ switch (reg) {
+ case 0: asm("vmovdqa %%ymm0, %0" : "=m"(*data)); break;
+ case 1: asm("vmovdqa %%ymm1, %0" : "=m"(*data)); break;
+ case 2: asm("vmovdqa %%ymm2, %0" : "=m"(*data)); break;
+ case 3: asm("vmovdqa %%ymm3, %0" : "=m"(*data)); break;
+ case 4: asm("vmovdqa %%ymm4, %0" : "=m"(*data)); break;
+ case 5: asm("vmovdqa %%ymm5, %0" : "=m"(*data)); break;
+ case 6: asm("vmovdqa %%ymm6, %0" : "=m"(*data)); break;
+ case 7: asm("vmovdqa %%ymm7, %0" : "=m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("vmovdqa %%ymm8, %0" : "=m"(*data)); break;
+ case 9: asm("vmovdqa %%ymm9, %0" : "=m"(*data)); break;
+ case 10: asm("vmovdqa %%ymm10, %0" : "=m"(*data)); break;
+ case 11: asm("vmovdqa %%ymm11, %0" : "=m"(*data)); break;
+ case 12: asm("vmovdqa %%ymm12, %0" : "=m"(*data)); break;
+ case 13: asm("vmovdqa %%ymm13, %0" : "=m"(*data)); break;
+ case 14: asm("vmovdqa %%ymm14, %0" : "=m"(*data)); break;
+ case 15: asm("vmovdqa %%ymm15, %0" : "=m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static inline void _kvm_write_avx_reg(int reg, const avx256_t *data)
+{
+ switch (reg) {
+ case 0: asm("vmovdqa %0, %%ymm0" : : "m"(*data)); break;
+ case 1: asm("vmovdqa %0, %%ymm1" : : "m"(*data)); break;
+ case 2: asm("vmovdqa %0, %%ymm2" : : "m"(*data)); break;
+ case 3: asm("vmovdqa %0, %%ymm3" : : "m"(*data)); break;
+ case 4: asm("vmovdqa %0, %%ymm4" : : "m"(*data)); break;
+ case 5: asm("vmovdqa %0, %%ymm5" : : "m"(*data)); break;
+ case 6: asm("vmovdqa %0, %%ymm6" : : "m"(*data)); break;
+ case 7: asm("vmovdqa %0, %%ymm7" : : "m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("vmovdqa %0, %%ymm8" : : "m"(*data)); break;
+ case 9: asm("vmovdqa %0, %%ymm9" : : "m"(*data)); break;
+ case 10: asm("vmovdqa %0, %%ymm10" : : "m"(*data)); break;
+ case 11: asm("vmovdqa %0, %%ymm11" : : "m"(*data)); break;
+ case 12: asm("vmovdqa %0, %%ymm12" : : "m"(*data)); break;
+ case 13: asm("vmovdqa %0, %%ymm13" : : "m"(*data)); break;
+ case 14: asm("vmovdqa %0, %%ymm14" : : "m"(*data)); break;
+ case 15: asm("vmovdqa %0, %%ymm15" : : "m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
static inline void _kvm_read_sse_reg(int reg, sse128_t *data)
{
switch (reg) {
@@ -109,6 +161,20 @@ static inline void kvm_fpu_put(void)
fpregs_unlock();
}
+static inline void kvm_read_avx_reg(int reg, avx256_t *data)
+{
+ kvm_fpu_get();
+ _kvm_read_avx_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_write_avx_reg(int reg, const avx256_t *data)
+{
+ kvm_fpu_get();
+ _kvm_write_avx_reg(reg, data);
+ kvm_fpu_put();
+}
+
static inline void kvm_read_sse_reg(int reg, sse128_t *data)
{
kvm_fpu_get();
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 38595ecb990d..de92292eb1f5 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1568,7 +1568,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
* only, there can be valuable data in the rest which needs
* to be preserved e.g. on migration.
*/
- if (__put_user(0, (u32 __user *)addr))
+ if (put_user(0, (u32 __user *)addr))
return 1;
hv_vcpu->hv_vapic = data;
kvm_vcpu_mark_page_dirty(vcpu, gfn);
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 7b5ddb787a25..fb3dab4b5a53 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -237,6 +237,7 @@ struct x86_emulate_ops {
bool (*is_smm)(struct x86_emulate_ctxt *ctxt);
int (*leave_smm)(struct x86_emulate_ctxt *ctxt);
void (*triple_fault)(struct x86_emulate_ctxt *ctxt);
+ int (*get_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr);
int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr);
gva_t (*get_untagged_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr,
@@ -248,7 +249,7 @@ struct x86_emulate_ops {
/* Type, address-of, and value of an instruction's operand. */
struct operand {
- enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
+ enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_YMM, OP_MM, OP_NONE } type;
unsigned int bytes;
unsigned int count;
union {
@@ -267,11 +268,12 @@ struct operand {
union {
unsigned long val;
u64 val64;
- char valptr[sizeof(sse128_t)];
+ char valptr[sizeof(avx256_t)];
sse128_t vec_val;
+ avx256_t vec_val2;
u64 mm_val;
void *data;
- };
+ } __aligned(32);
};
#define X86_MAX_INSTRUCTION_LENGTH 15
@@ -317,6 +319,14 @@ typedef void (*fastop_t)(struct fastop *);
#define NR_EMULATOR_GPRS 8
#endif
+/*
+ * Distinguish between no prefix, REX, or in the future REX2.
+ */
+enum rex_type {
+ REX_NONE,
+ REX_PREFIX,
+};
+
struct x86_emulate_ctxt {
void *vcpu;
const struct x86_emulate_ops *ops;
@@ -348,6 +358,7 @@ struct x86_emulate_ctxt {
u8 opcode_len;
u8 b;
u8 intercept;
+ bool op_prefix;
u8 op_bytes;
u8 ad_bytes;
union {
@@ -357,7 +368,8 @@ struct x86_emulate_ctxt {
int (*check_perm)(struct x86_emulate_ctxt *ctxt);
bool rip_relative;
- u8 rex_prefix;
+ enum rex_type rex_prefix;
+ u8 rex_bits;
u8 lock_prefix;
u8 rep_prefix;
/* bitmaps of registers in _regs[] that can be read */
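The operand union gains explicit 32-byte alignment because its storage becomes the memory operand of the vmovdqa instructions in the new fpu.h helpers, and vmovdqa faults on an unaligned address. A user-space analog (requires an AVX-capable CPU; not kernel code, just an illustration of the same constraint):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t avx256_t __attribute__((vector_size(32)));

int main(void)
{
	/* vmovdqa would #GP if &v were not 32-byte aligned. */
	avx256_t v __attribute__((aligned(32)));

	asm volatile("vmovdqa %%ymm0, %0" : "=m"(v));
	printf("ymm0[31:0] = %u\n", (unsigned)v[0]);
	return 0;
}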
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0ae7f913d782..1597dd0b0cc6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2126,23 +2126,41 @@ static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg)
static void advance_periodic_target_expiration(struct kvm_lapic *apic)
{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
ktime_t now = ktime_get();
u64 tscl = rdtsc();
ktime_t delta;
/*
- * Synchronize both deadlines to the same time source or
- * differences in the periods (caused by differences in the
- * underlying clocks or numerical approximation errors) will
- * cause the two to drift apart over time as the errors
- * accumulate.
+ * Use kernel time as the time source for both the hrtimer deadline and
+ * TSC-based deadline so that they stay synchronized. Computing each
+ * deadline independently will cause the two deadlines to drift apart
+ * over time as differences in the periods accumulate, e.g. due to
+ * differences in the underlying clocks or numerical approximation errors.
*/
- apic->lapic_timer.target_expiration =
- ktime_add_ns(apic->lapic_timer.target_expiration,
- apic->lapic_timer.period);
- delta = ktime_sub(apic->lapic_timer.target_expiration, now);
- apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
- nsec_to_cycles(apic->vcpu, delta);
+ ktimer->target_expiration = ktime_add_ns(ktimer->target_expiration,
+ ktimer->period);
+
+ /*
+ * If the new expiration is in the past, e.g. because userspace stopped
+ * running the VM for an extended duration, then force the expiration
+ * to "now" and don't try to play catch-up with the missed events. KVM
+ * will only deliver a single interrupt regardless of how many events
+ * are pending, i.e. restarting the timer with an expiration in the
+ * past will do nothing more than waste host cycles, and can even lead
+ * to a hard lockup in extreme cases.
+ */
+ if (ktime_before(ktimer->target_expiration, now))
+ ktimer->target_expiration = now;
+
+ /*
+ * Note, ensuring the expiration isn't in the past also prevents delta
+ * from going negative, which could cause the TSC deadline to become
+	 * excessively large due to it being an unsigned value.
+ */
+ delta = ktime_sub(ktimer->target_expiration, now);
+ ktimer->tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
+ nsec_to_cycles(apic->vcpu, delta);
}
static void start_sw_period(struct kvm_lapic *apic)
@@ -2970,9 +2988,9 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
apic_timer_expired(apic, true);
- if (lapic_is_periodic(apic)) {
+ if (lapic_is_periodic(apic) && !WARN_ON_ONCE(!apic->lapic_timer.period)) {
advance_periodic_target_expiration(apic);
- hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ hrtimer_set_expires(&ktimer->timer, ktimer->target_expiration);
return HRTIMER_RESTART;
} else
return HRTIMER_NORESTART;
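The net effect of the lapic.c hunk above is a small piece of arithmetic: advance the deadline by one period, but clamp it to "now" so a long pause produces a single interrupt rather than a storm of catch-up events. Distilled into plain integers (the kernel uses ktime_t; this sketch is illustrative only):

#include <assert.h>
#include <stdint.h>

static uint64_t advance_expiration(uint64_t expiration_ns, uint64_t period_ns,
				   uint64_t now_ns)
{
	expiration_ns += period_ns;
	if (expiration_ns < now_ns)	/* missed events are coalesced, not replayed */
		expiration_ns = now_ns;
	return expiration_ns;
}

int main(void)
{
	/* Normal tick: next deadline is exactly one period later. */
	assert(advance_expiration(1000, 100, 1050) == 1100);

	/* VM paused for ~10 periods: deadline is clamped to "now". */
	assert(advance_expiration(1000, 100, 2000) == 2000);
	return 0;
}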
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index f63074048ec6..830f46145692 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -235,8 +235,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return -(u32)fault & errcode;
}
-bool kvm_mmu_may_ignore_guest_pat(struct kvm *kvm);
-
int kvm_mmu_post_init_vm(struct kvm *kvm);
void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
@@ -257,8 +255,7 @@ extern bool tdp_mmu_enabled;
#define tdp_mmu_enabled false
#endif
-bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa);
-int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level);
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn);
static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 667d66cf76d5..02c450686b4a 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4859,7 +4859,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
*/
BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
- vcpu->arch.l1tf_flush_l1d = true;
+ kvm_request_l1tf_flush_l1d();
if (!flags) {
trace_kvm_page_fault(vcpu, fault_address, error_code);
@@ -4924,7 +4924,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return direct_page_fault(vcpu, fault);
}
-int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level)
+static int kvm_tdp_page_prefault(struct kvm_vcpu *vcpu, gpa_t gpa,
+ u64 error_code, u8 *level)
{
int r;
@@ -4966,7 +4967,6 @@ int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level
return -EIO;
}
}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_map_page);
long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
struct kvm_pre_fault_memory *range)
@@ -5002,7 +5002,7 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
* Shadow paging uses GVA for kvm page fault, so restrict to
* two-dimensional paging.
*/
- r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level);
+ r = kvm_tdp_page_prefault(vcpu, range->gpa | direct_bits, error_code, &level);
if (r < 0)
return r;
@@ -5014,6 +5014,86 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
return min(range->size, end - range->gpa);
}
+#ifdef CONFIG_KVM_GUEST_MEMFD
+static void kvm_assert_gmem_invalidate_lock_held(struct kvm_memory_slot *slot)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ if (WARN_ON_ONCE(!kvm_slot_has_gmem(slot)) ||
+ WARN_ON_ONCE(!slot->gmem.file) ||
+ WARN_ON_ONCE(!file_count(slot->gmem.file)))
+ return;
+
+ lockdep_assert_held(&file_inode(slot->gmem.file)->i_mapping->invalidate_lock);
+#endif
+}
+
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+{
+ struct kvm_page_fault fault = {
+ .addr = gfn_to_gpa(gfn),
+ .error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS,
+ .prefetch = true,
+ .is_tdp = true,
+ .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm),
+
+ .max_level = PG_LEVEL_4K,
+ .req_level = PG_LEVEL_4K,
+ .goal_level = PG_LEVEL_4K,
+ .is_private = true,
+
+ .gfn = gfn,
+ .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
+ .pfn = pfn,
+ .map_writable = true,
+ };
+ struct kvm *kvm = vcpu->kvm;
+ int r;
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * Mapping a pre-determined private pfn is intended only for use when
+ * populating a guest_memfd instance. Assert that the slot is backed
+ * by guest_memfd and that the gmem instance's invalidate_lock is held.
+ */
+ kvm_assert_gmem_invalidate_lock_held(fault.slot);
+
+ if (KVM_BUG_ON(!tdp_mmu_enabled, kvm))
+ return -EIO;
+
+ if (kvm_gfn_is_write_tracked(kvm, fault.slot, fault.gfn))
+ return -EPERM;
+
+ r = kvm_mmu_reload(vcpu);
+ if (r)
+ return r;
+
+ r = mmu_topup_memory_caches(vcpu, false);
+ if (r)
+ return r;
+
+ do {
+ if (signal_pending(current))
+ return -EINTR;
+
+ if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu))
+ return -EIO;
+
+ cond_resched();
+
+ guard(read_lock)(&kvm->mmu_lock);
+
+ r = kvm_tdp_mmu_map(vcpu, &fault);
+ } while (r == RET_PF_RETRY);
+
+ if (r != RET_PF_FIXED)
+ return -EIO;
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_map_private_pfn);
+#endif
+
static void nonpaging_init_context(struct kvm_mmu *context)
{
context->page_fault = nonpaging_page_fault;
@@ -5997,7 +6077,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
out:
return r;
}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
@@ -6863,6 +6942,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
write_unlock(&kvm->mmu_lock);
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_zap_gfn_range);
static bool slot_rmap_write_protect(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
@@ -7204,7 +7284,6 @@ restart:
return need_tlb_flush;
}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_zap_gfn_range);
static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
@@ -7364,6 +7443,9 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
{
WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
+ if (!enable_mmio_caching)
+ return;
+
gen &= MMIO_SPTE_GEN_MASK;
/*
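
A generic sketch of the map-retry pattern used by kvm_tdp_mmu_map_private_pfn() above; the KVM-specific pieces (mmu_lock scope, RET_PF_* codes, signal and VM-dead checks) are replaced by stand-ins, so this only illustrates the control flow, not the real API.

#include <stdbool.h>
#include <stdio.h>

enum { RET_PF_RETRY, RET_PF_FIXED, RET_PF_ERROR };

static int attempts;

static bool signal_pending(void) { return false; }	/* stand-in */
static bool vm_dead(void)        { return false; }	/* stand-in */
/* Pretend the first two attempts race with an invalidation and must retry. */
static int try_map_once(void)    { return ++attempts < 3 ? RET_PF_RETRY : RET_PF_FIXED; }

static int map_with_retry(void)
{
	int r;

	do {
		/* Conditions that must be able to interrupt the loop. */
		if (signal_pending())
			return -4;	/* -EINTR */
		if (vm_dead())
			return -5;	/* -EIO */

		/* In KVM this single attempt runs under a scoped read lock. */
		r = try_map_once();
	} while (r == RET_PF_RETRY);

	return r == RET_PF_FIXED ? 0 : -5;
}

int main(void)
{
	printf("map_with_retry() = %d after %d attempts\n", map_with_retry(), attempts);
	return 0;
}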
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index ed5c01df21ba..73cdcbccc89e 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -39,16 +39,6 @@
#define INVALID_PAE_ROOT 0
#define IS_VALID_PAE_ROOT(x) (!!(x))
-static inline hpa_t kvm_mmu_get_dummy_root(void)
-{
- return my_zero_pfn(0) << PAGE_SHIFT;
-}
-
-static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page)
-{
- return is_zero_pfn(shadow_page >> PAGE_SHIFT);
-}
-
typedef u64 __rcu *tdp_ptep_t;
struct kvm_mmu_page {
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index ed762bb4b007..901cd2bd40b8 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -402,7 +402,7 @@ retry_walk:
goto error;
ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
- if (unlikely(__get_user(pte, ptep_user)))
+ if (unlikely(get_user(pte, ptep_user)))
goto error;
walker->ptep_user[walker->level - 1] = ptep_user;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 37647afde7d3..85a0473809b0 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -292,7 +292,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
}
- if (static_branch_unlikely(&cpu_buf_vm_clear) &&
+ if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) &&
!kvm_vcpu_can_access_host_mmio(vcpu) &&
kvm_is_mmio_pfn(pfn, &is_host_mmio))
kvm_track_host_mmio_mapping(vcpu);
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 3133f066927e..91ce29fd6f1b 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -246,6 +246,16 @@ static inline int spte_index(u64 *sptep)
*/
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
+static inline hpa_t kvm_mmu_get_dummy_root(void)
+{
+ return my_zero_pfn(0) << PAGE_SHIFT;
+}
+
+static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page)
+{
+ return is_zero_pfn(shadow_page >> PAGE_SHIFT);
+}
+
static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
{
struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c5734ca5c17d..9c26038f6b77 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -362,9 +362,6 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
int level)
{
- kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
- int ret;
-
/*
* External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
* PTs are removed in a special order, involving free_external_spt().
@@ -377,9 +374,8 @@ static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
/* Zapping leaf spte is allowed only when write lock is held. */
lockdep_assert_held_write(&kvm->mmu_lock);
- /* Because write lock is held, operation should success. */
- ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn);
- KVM_BUG_ON(ret, kvm);
+
+ kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte);
}
/**
@@ -519,7 +515,6 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp
bool was_present = is_shadow_present_pte(old_spte);
bool is_present = is_shadow_present_pte(new_spte);
bool is_leaf = is_present && is_last_spte(new_spte, level);
- kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
int ret = 0;
KVM_BUG_ON(was_present, kvm);
@@ -538,7 +533,7 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp
* external page table, or leaf.
*/
if (is_leaf) {
- ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn);
+ ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte);
} else {
void *external_spt = get_external_spt(gfn, new_spte, level);
@@ -1273,6 +1268,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
struct kvm_mmu_page *sp;
int ret = RET_PF_RETRY;
+ KVM_MMU_WARN_ON(!root || root->role.invalid);
+
kvm_mmu_hugepage_adjust(vcpu, fault);
trace_kvm_mmu_spte_requested(fault);
@@ -1939,13 +1936,16 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
*
* Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
*/
-static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
- struct kvm_mmu_page *root)
+int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ int *root_level)
{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
struct tdp_iter iter;
gfn_t gfn = addr >> PAGE_SHIFT;
int leaf = -1;
+ *root_level = vcpu->arch.mmu->root_role.level;
+
for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
@@ -1954,36 +1954,6 @@ static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
return leaf;
}
-int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
- int *root_level)
-{
- struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
- *root_level = vcpu->arch.mmu->root_role.level;
-
- return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root);
-}
-
-bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa)
-{
- struct kvm *kvm = vcpu->kvm;
- bool is_direct = kvm_is_addr_direct(kvm, gpa);
- hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa :
- vcpu->arch.mmu->mirror_root_hpa;
- u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte;
- int leaf;
-
- lockdep_assert_held(&kvm->mmu_lock);
- rcu_read_lock();
- leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root));
- rcu_read_unlock();
- if (leaf < 0)
- return false;
-
- spte = sptes[leaf];
- return is_shadow_present_pte(spte) && is_last_spte(spte, leaf);
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_gpa_is_mapped);
-
/*
* Returns the last level spte pointer of the shadow page walk for the given
* gpa, and sets *spte to the spte value. This spte may be non-preset. If no
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index fef00546c885..6b77b2033208 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -106,7 +106,7 @@ static u32 next_vm_id = 0;
static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
static bool x2avic_enabled;
-
+static u32 x2avic_max_physical_id;
static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
bool intercept)
@@ -158,12 +158,40 @@ static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
svm->x2avic_msrs_intercepted = intercept;
}
+static u32 __avic_get_max_physical_id(struct kvm *kvm, struct kvm_vcpu *vcpu)
+{
+ u32 arch_max;
+
+ /*
+ * Return the largest size (x2APIC) when querying without a vCPU, e.g.
+ * to allocate the per-VM table..
+ * to allocate the per-VM table.
+ if (x2avic_enabled && (!vcpu || apic_x2apic_mode(vcpu->arch.apic)))
+ arch_max = x2avic_max_physical_id;
+ else
+ arch_max = AVIC_MAX_PHYSICAL_ID;
+
+ /*
+ * Despite its name, KVM_CAP_MAX_VCPU_ID represents the maximum APIC ID
+ * plus one, so the max possible APIC ID is one less than that.
+ */
+ return min(kvm->arch.max_vcpu_ids - 1, arch_max);
+}
+
+static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu)
+{
+ return __avic_get_max_physical_id(vcpu->kvm, vcpu);
+}
+
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
+
vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
+ vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu);
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
@@ -176,7 +204,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
*/
if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
vmcb->control.int_ctl |= X2APIC_MODE_MASK;
- vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
+
/* Disabling MSR intercept for x2APIC registers */
avic_set_x2apic_msr_interception(svm, false);
} else {
@@ -186,8 +214,6 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
*/
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
- /* For xAVIC and hybrid-xAVIC modes */
- vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
/* Enabling MSR intercept for x2APIC registers */
avic_set_x2apic_msr_interception(svm, true);
}
@@ -247,6 +273,30 @@ static int avic_ga_log_notifier(u32 ga_tag)
return 0;
}
+static int avic_get_physical_id_table_order(struct kvm *kvm)
+{
+ /* Provision for the maximum physical ID supported in x2avic mode */
+ return get_order((__avic_get_max_physical_id(kvm, NULL) + 1) * sizeof(u64));
+}
+
+int avic_alloc_physical_id_table(struct kvm *kvm)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+ if (!irqchip_in_kernel(kvm) || !enable_apicv)
+ return 0;
+
+ if (kvm_svm->avic_physical_id_table)
+ return 0;
+
+ kvm_svm->avic_physical_id_table = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ avic_get_physical_id_table_order(kvm));
+ if (!kvm_svm->avic_physical_id_table)
+ return -ENOMEM;
+
+ return 0;
+}
+
void avic_vm_destroy(struct kvm *kvm)
{
unsigned long flags;
@@ -256,7 +306,8 @@ void avic_vm_destroy(struct kvm *kvm)
return;
free_page((unsigned long)kvm_svm->avic_logical_id_table);
- free_page((unsigned long)kvm_svm->avic_physical_id_table);
+ free_pages((unsigned long)kvm_svm->avic_physical_id_table,
+ avic_get_physical_id_table_order(kvm));
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
hash_del(&kvm_svm->hnode);
@@ -274,10 +325,6 @@ int avic_vm_init(struct kvm *kvm)
if (!enable_apicv)
return 0;
- kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
- if (!kvm_svm->avic_physical_id_table)
- goto free_avic;
-
kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!kvm_svm->avic_logical_id_table)
goto free_avic;
@@ -342,7 +389,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
* fully initialized AVIC.
*/
if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
- (id > X2AVIC_MAX_PHYSICAL_ID)) {
+ (id > x2avic_max_physical_id)) {
kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
vcpu->arch.apic->apicv_active = false;
return 0;
@@ -562,7 +609,7 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
u32 icrl = svm->vmcb->control.exit_info_1;
u32 id = svm->vmcb->control.exit_info_2 >> 32;
- u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
+ u32 index = svm->vmcb->control.exit_info_2 & AVIC_PHYSICAL_MAX_INDEX_MASK;
struct kvm_lapic *apic = vcpu->arch.apic;
trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
@@ -962,7 +1009,8 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
- if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >=
+ PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm)))
return;
/*
@@ -1024,7 +1072,8 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
lockdep_assert_preemption_disabled();
- if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >=
+ PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm)))
return;
/*
@@ -1226,10 +1275,15 @@ bool __init avic_hardware_setup(void)
/* AVIC is a prerequisite for x2AVIC. */
x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
- if (x2avic_enabled)
- pr_info("x2AVIC enabled\n");
- else
+ if (x2avic_enabled) {
+ if (cpu_feature_enabled(X86_FEATURE_X2AVIC_EXT))
+ x2avic_max_physical_id = X2AVIC_4K_MAX_PHYSICAL_ID;
+ else
+ x2avic_max_physical_id = X2AVIC_MAX_PHYSICAL_ID;
+ pr_info("x2AVIC enabled (max %u vCPUs)\n", x2avic_max_physical_id + 1);
+ } else {
svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
+ }
/*
* Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
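
A sketch of the physical-ID-table sizing logic above: one 8-byte entry per possible APIC ID (IDs 0..max inclusive), rounded up to a power-of-two number of pages. get_order() is re-implemented here for illustration, and the ID limits are assumptions for the example, not the architectural X2AVIC_4K_MAX_PHYSICAL_ID value.

#include <stdio.h>

#define PAGE_SIZE	4096u
#define AVIC_MAX_ID	255u	/* xAVIC: 8-bit APIC IDs */
#define X2AVIC_MAX_ID	511u	/* assumed x2AVIC limit, illustration only */

static unsigned int get_order(unsigned long size)
{
	unsigned long pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
	unsigned int order = 0;

	while ((1ul << order) < pages)
		order++;
	return order;
}

int main(void)
{
	unsigned long max_ids[] = { AVIC_MAX_ID, X2AVIC_MAX_ID, 4095 };

	for (unsigned int i = 0; i < 3; i++) {
		/* (max ID + 1) entries of 8 bytes each. */
		unsigned long bytes = (max_ids[i] + 1) * sizeof(unsigned long long);

		printf("max id %lu -> %lu bytes -> order %u (%lu pages)\n",
		       max_ids[i], bytes, get_order(bytes),
		       1ul << get_order(bytes));
	}
	return 0;
}

For a 255-entry xAVIC table this still fits in a single page (order 0), while a hypothetical 4096-ID table needs order 3, which is why the allocation and free paths switch to __get_free_pages()/free_pages() with a computed order.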
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index da6e80b3ac35..c81005b24522 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -613,6 +613,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
struct kvm_vcpu *vcpu = &svm->vcpu;
nested_vmcb02_compute_g_pat(svm);
+ vmcb_mark_dirty(vmcb02, VMCB_NPT);
/* Load the nested guest state */
if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
@@ -751,6 +752,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
+ vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP);
/*
* Stash vmcb02's counter if the guest hasn't moved past the guilty
@@ -1430,16 +1432,6 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
case SVM_EXIT_IOIO:
vmexit = nested_svm_intercept_ioio(svm);
break;
- case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
- if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
- if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
- vmexit = NESTED_EXIT_DONE;
- break;
- }
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
/*
* Host-intercepted exceptions have been checked already in
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0835c664fbfd..f59c65abe3cf 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -65,20 +65,24 @@ module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 04
#define AP_RESET_HOLD_NAE_EVENT 1
#define AP_RESET_HOLD_MSR_PROTO 2
-/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */
-#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0)
-#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8)
-#define SNP_POLICY_MASK_SMT BIT_ULL(16)
-#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17)
-#define SNP_POLICY_MASK_DEBUG BIT_ULL(19)
-#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20)
-
-#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \
- SNP_POLICY_MASK_API_MAJOR | \
- SNP_POLICY_MASK_SMT | \
- SNP_POLICY_MASK_RSVD_MBO | \
- SNP_POLICY_MASK_DEBUG | \
- SNP_POLICY_MASK_SINGLE_SOCKET)
+/*
+ * SEV-SNP policy bits that can be supported by KVM. These include policy bits
+ * that have implementation support within KVM or policy bits that do not
+ * require implementation support within KVM to enforce the policy.
+ */
+#define KVM_SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \
+ SNP_POLICY_MASK_API_MAJOR | \
+ SNP_POLICY_MASK_SMT | \
+ SNP_POLICY_MASK_RSVD_MBO | \
+ SNP_POLICY_MASK_DEBUG | \
+ SNP_POLICY_MASK_SINGLE_SOCKET | \
+ SNP_POLICY_MASK_CXL_ALLOW | \
+ SNP_POLICY_MASK_MEM_AES_256_XTS | \
+ SNP_POLICY_MASK_RAPL_DIS | \
+ SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM | \
+ SNP_POLICY_MASK_PAGE_SWAP_DISABLE)
+
+static u64 snp_supported_policy_bits __ro_after_init;
#define INITIAL_VMSA_GPA 0xFFFFFFFFF000
@@ -2143,6 +2147,10 @@ int sev_dev_get_attr(u32 group, u64 attr, u64 *val)
*val = sev_supported_vmsa_features;
return 0;
+ case KVM_X86_SNP_POLICY_BITS:
+ *val = snp_supported_policy_bits;
+ return 0;
+
default:
return -ENXIO;
}
@@ -2207,7 +2215,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (params.flags)
return -EINVAL;
- if (params.policy & ~SNP_POLICY_MASK_VALID)
+ if (params.policy & ~snp_supported_policy_bits)
return -EINVAL;
/* Check for policy bits that must be set */
@@ -3100,8 +3108,11 @@ out:
else if (sev_snp_supported)
sev_snp_supported = is_sev_snp_initialized();
- if (sev_snp_supported)
+ if (sev_snp_supported) {
+ snp_supported_policy_bits = sev_get_snp_policy_bits() &
+ KVM_SNP_POLICY_MASK_VALID;
nr_ciphertext_hiding_asids = init_args.max_snp_asid;
+ }
/*
* If ciphertext hiding is enabled, the joint SEV-ES/SEV-SNP
@@ -5085,10 +5096,10 @@ struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
/* Check if the SEV policy allows debugging */
if (sev_snp_guest(vcpu->kvm)) {
- if (!(sev->policy & SNP_POLICY_DEBUG))
+ if (!(sev->policy & SNP_POLICY_MASK_DEBUG))
return NULL;
} else {
- if (sev->policy & SEV_POLICY_NODBG)
+ if (sev->policy & SEV_POLICY_MASK_NODBG)
return NULL;
}
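
A sketch of the policy-bit validation shown above for snp_launch_start(): reject any guest policy bit the host does not advertise as supported. The bit positions are the ones visible in the removed SNP_POLICY_MASK_* hunk; the "supported" mask here is a stand-in, not the real snp_supported_policy_bits value.

#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(n)		(1ULL << (n))
#define POLICY_SMT		BIT_ULL(16)
#define POLICY_RSVD_MBO		BIT_ULL(17)
#define POLICY_DEBUG		BIT_ULL(19)
#define POLICY_SINGLE_SOCKET	BIT_ULL(20)

/* Hypothetical host-supported mask (cf. snp_supported_policy_bits). */
static const uint64_t supported = POLICY_SMT | POLICY_RSVD_MBO | POLICY_DEBUG;

static int check_policy(uint64_t policy)
{
	/* Any bit outside the supported mask fails the launch. */
	if (policy & ~supported)
		return -22;	/* -EINVAL */
	return 0;
}

int main(void)
{
	printf("%d\n", check_policy(POLICY_SMT | POLICY_RSVD_MBO));	/* 0 */
	printf("%d\n", check_policy(POLICY_SMT | POLICY_SINGLE_SOCKET));/* -22 */
	return 0;
}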
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9d29b2e7e855..f56c2d895011 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -272,6 +272,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
}
static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
+ int emul_type,
bool commit_side_effects)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -293,7 +294,7 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
if (unlikely(!commit_side_effects))
old_rflags = svm->vmcb->save.rflags;
- if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
+ if (!kvm_emulate_instruction(vcpu, emul_type))
return 0;
if (unlikely(!commit_side_effects))
@@ -311,11 +312,13 @@ done:
static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
- return __svm_skip_emulated_instruction(vcpu, true);
+ return __svm_skip_emulated_instruction(vcpu, EMULTYPE_SKIP, true);
}
-static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
+static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu, u8 vector)
{
+ const int emul_type = EMULTYPE_SKIP | EMULTYPE_SKIP_SOFT_INT |
+ EMULTYPE_SET_SOFT_INT_VECTOR(vector);
unsigned long rip, old_rip = kvm_rip_read(vcpu);
struct vcpu_svm *svm = to_svm(vcpu);
@@ -331,7 +334,7 @@ static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
* in use, the skip must not commit any side effects such as clearing
* the interrupt shadow or RFLAGS.RF.
*/
- if (!__svm_skip_emulated_instruction(vcpu, !nrips))
+ if (!__svm_skip_emulated_instruction(vcpu, emul_type, !nrips))
return -EIO;
rip = kvm_rip_read(vcpu);
@@ -367,7 +370,7 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu)
kvm_deliver_exception_payload(vcpu, ex);
if (kvm_exception_is_soft(ex->vector) &&
- svm_update_soft_interrupt_rip(vcpu))
+ svm_update_soft_interrupt_rip(vcpu, ex->vector))
return;
svm->vmcb->control.event_inj = ex->vector
@@ -1198,6 +1201,11 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
svm->vmcb = target_vmcb->ptr;
}
+static int svm_vcpu_precreate(struct kvm *kvm)
+{
+ return avic_alloc_physical_id_table(kvm);
+}
+
static int svm_vcpu_create(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
@@ -3442,13 +3450,8 @@ static bool svm_check_exit_valid(u64 exit_code)
static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
{
- vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
dump_vmcb(vcpu);
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = exit_code;
- vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ kvm_prepare_unexpected_reason_exit(vcpu, exit_code);
return 0;
}
@@ -3633,11 +3636,12 @@ static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
+ struct kvm_queued_interrupt *intr = &vcpu->arch.interrupt;
struct vcpu_svm *svm = to_svm(vcpu);
u32 type;
- if (vcpu->arch.interrupt.soft) {
- if (svm_update_soft_interrupt_rip(vcpu))
+ if (intr->soft) {
+ if (svm_update_soft_interrupt_rip(vcpu, intr->nr))
return;
type = SVM_EVTINJ_TYPE_SOFT;
@@ -3645,12 +3649,10 @@ static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
type = SVM_EVTINJ_TYPE_INTR;
}
- trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
- vcpu->arch.interrupt.soft, reinjected);
+ trace_kvm_inj_virq(intr->nr, intr->soft, reinjected);
++vcpu->stat.irq_injections;
- svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
- SVM_EVTINJ_VALID | type;
+ svm->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type;
}
void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
@@ -4251,7 +4253,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
clgi();
- kvm_load_guest_xsave_state(vcpu);
/*
* Hardware only context switches DEBUGCTL if LBR virtualization is
@@ -4294,7 +4295,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
update_debugctlmsr(vcpu->arch.host_debugctl);
- kvm_load_host_xsave_state(vcpu);
stgi();
/* Any pending NMI will happen here */
@@ -4326,14 +4326,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
- /*
- * We need to handle MC intercepts here before the vcpu has a chance to
- * change the physical cpu
- */
- if (unlikely(svm->vmcb->control.exit_code ==
- SVM_EXIT_EXCP_BASE + MC_VECTOR))
- svm_handle_mce(vcpu);
-
trace_kvm_exit(vcpu, KVM_ISA_SVM);
svm_complete_interrupts(vcpu);
@@ -4526,31 +4518,45 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
case SVM_EXIT_WRITE_CR0: {
unsigned long cr0, val;
- if (info->intercept == x86_intercept_cr_write)
+ /*
+ * Adjust the exit code accordingly if a CR other than CR0 is
+ * being written, and skip straight to the common handling as
+ * only CR0 has an additional selective intercept.
+ */
+ if (info->intercept == x86_intercept_cr_write && info->modrm_reg) {
icpt_info.exit_code += info->modrm_reg;
-
- if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
- info->intercept == x86_intercept_clts)
break;
+ }
- if (!(vmcb12_is_intercept(&svm->nested.ctl,
- INTERCEPT_SELECTIVE_CR0)))
+ /*
+ * Convert the exit_code to SVM_EXIT_CR0_SEL_WRITE if a
+ * selective CR0 intercept is triggered (the common logic will
+ * treat the selective intercept as being enabled). Note, the
+ * unconditional intercept has higher priority, i.e. this is
+ * only relevant if *only* the selective intercept is enabled.
+ */
+ if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_CR0_WRITE) ||
+ !(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))
break;
- cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
- val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
+ /* CLTS never triggers INTERCEPT_SELECTIVE_CR0 */
+ if (info->intercept == x86_intercept_clts)
+ break;
+ /* LMSW always triggers INTERCEPT_SELECTIVE_CR0 */
if (info->intercept == x86_intercept_lmsw) {
- cr0 &= 0xfUL;
- val &= 0xfUL;
- /* lmsw can't clear PE - catch this here */
- if (cr0 & X86_CR0_PE)
- val |= X86_CR0_PE;
+ icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ break;
}
+ /*
+ * MOV-to-CR0 only triggers INTERCEPT_SELECTIVE_CR0 if any bit
+ * other than SVM_CR0_SELECTIVE_MASK is changed.
+ */
+ cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
+ val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
if (cr0 ^ val)
icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
-
break;
}
case SVM_EXIT_READ_DR0:
@@ -4622,8 +4628,16 @@ out:
static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
- if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
+ switch (to_svm(vcpu)->vmcb->control.exit_code) {
+ case SVM_EXIT_EXCP_BASE + MC_VECTOR:
+ svm_handle_mce(vcpu);
+ break;
+ case SVM_EXIT_INTR:
vcpu->arch.at_instruction_boundary = true;
+ break;
+ default:
+ break;
+ }
}
static void svm_setup_mce(struct kvm_vcpu *vcpu)
@@ -5012,6 +5026,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = {
.emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu,
.has_emulated_msr = svm_has_emulated_msr,
+ .vcpu_precreate = svm_vcpu_precreate,
.vcpu_create = svm_vcpu_create,
.vcpu_free = svm_vcpu_free,
.vcpu_reset = svm_vcpu_reset,
@@ -5316,7 +5331,9 @@ static __init int svm_hardware_setup(void)
if (nested) {
pr_info("Nested Virtualization enabled\n");
- kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
+ kvm_enable_efer_bits(EFER_SVME);
+ if (!boot_cpu_has(X86_FEATURE_EFER_LMSLE_MBZ))
+ kvm_enable_efer_bits(EFER_LMSLE);
r = nested_svm_init_msrpm_merge_offsets();
if (r)
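
A sketch of the selective-CR0 decision rewritten in svm_check_intercept() above: a MOV-to-CR0 only counts as a selective-CR0 hit if bits outside the selective mask change. The mask value (TS | MP) is an assumption for the example, based on the APM's description of selective CR0 write intercepts.

#include <stdbool.h>
#include <stdio.h>

#define X86_CR0_PE	(1ul << 0)
#define X86_CR0_MP	(1ul << 1)
#define X86_CR0_TS	(1ul << 3)
#define X86_CR0_PG	(1ul << 31)

#define CR0_SELECTIVE_MASK	(X86_CR0_TS | X86_CR0_MP)	/* assumed value */

static bool selective_cr0_hit(unsigned long old_cr0, unsigned long new_cr0)
{
	/* Mask off the bits that never trigger the selective intercept... */
	old_cr0 &= ~CR0_SELECTIVE_MASK;
	new_cr0 &= ~CR0_SELECTIVE_MASK;

	/* ...then any remaining difference is a selective-CR0 hit. */
	return old_cr0 != new_cr0;
}

int main(void)
{
	/* Toggling only TS: no selective-CR0 hit. */
	printf("%d\n", selective_cr0_hit(X86_CR0_PE, X86_CR0_PE | X86_CR0_TS));
	/* Toggling PG: selective-CR0 hit. */
	printf("%d\n", selective_cr0_hit(X86_CR0_PE, X86_CR0_PE | X86_CR0_PG));
	return 0;
}

CLTS and LMSW are handled separately in the patch because they never and always trigger the selective intercept, respectively, regardless of which bits change.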
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index dd78e6402345..9e151dbdef25 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -117,9 +117,6 @@ struct kvm_sev_info {
cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */
};
-#define SEV_POLICY_NODBG BIT_ULL(0)
-#define SNP_POLICY_DEBUG BIT_ULL(19)
-
struct kvm_svm {
struct kvm kvm;
@@ -807,6 +804,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
bool __init avic_hardware_setup(void);
void avic_hardware_unsetup(void);
+int avic_alloc_physical_id_table(struct kvm *kvm);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 235c4af6b692..3392bcadfb89 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -52,11 +52,23 @@
* there must not be any returns or indirect branches between this code
* and vmentry.
*/
- movl SVM_spec_ctrl(%_ASM_DI), %eax
- cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
+#ifdef CONFIG_X86_64
+ mov SVM_spec_ctrl(%rdi), %rdx
+ cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx
+ je 801b
+ movl %edx, %eax
+ shr $32, %rdx
+#else
+ mov SVM_spec_ctrl(%edi), %eax
+ mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx
+ xor %eax, %ecx
+ mov SVM_spec_ctrl + 4(%edi), %edx
+ mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %esi
+ xor %edx, %esi
+ or %esi, %ecx
je 801b
+#endif
mov $MSR_IA32_SPEC_CTRL, %ecx
- xor %edx, %edx
wrmsr
jmp 801b
.endm
@@ -81,17 +93,31 @@
jnz 998f
rdmsr
movl %eax, SVM_spec_ctrl(%_ASM_DI)
+ movl %edx, SVM_spec_ctrl + 4(%_ASM_DI)
998:
-
/* Now restore the host value of the MSR if different from the guest's. */
- movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
- cmp SVM_spec_ctrl(%_ASM_DI), %eax
+#ifdef CONFIG_X86_64
+ mov PER_CPU_VAR(x86_spec_ctrl_current), %rdx
+ cmp SVM_spec_ctrl(%rdi), %rdx
je 901b
- xor %edx, %edx
+ movl %edx, %eax
+ shr $32, %rdx
+#else
+ mov PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ mov SVM_spec_ctrl(%edi), %esi
+ xor %eax, %esi
+ mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edx
+ mov SVM_spec_ctrl + 4(%edi), %edi
+ xor %edx, %edi
+ or %edi, %esi
+ je 901b
+#endif
wrmsr
jmp 901b
.endm
+#define SVM_CLEAR_CPU_BUFFERS \
+ ALTERNATIVE "", __CLEAR_CPU_BUFFERS, X86_FEATURE_CLEAR_CPU_BUF_VM
/**
* __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
@@ -134,7 +160,7 @@ SYM_FUNC_START(__svm_vcpu_run)
mov %_ASM_ARG1, %_ASM_DI
.endif
- /* Clobbers RAX, RCX, RDX. */
+ /* Clobbers RAX, RCX, RDX (and ESI on 32-bit), consumes RDI (@svm). */
RESTORE_GUEST_SPEC_CTRL
/*
@@ -170,7 +196,7 @@ SYM_FUNC_START(__svm_vcpu_run)
mov VCPU_RDI(%_ASM_DI), %_ASM_DI
/* Clobbers EFLAGS.ZF */
- VM_CLEAR_CPU_BUFFERS
+ SVM_CLEAR_CPU_BUFFERS
/* Enter guest mode */
3: vmrun %_ASM_AX
@@ -211,7 +237,10 @@ SYM_FUNC_START(__svm_vcpu_run)
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT
- /* Clobbers RAX, RCX, RDX. */
+ /*
+ * Clobbers RAX, RCX, RDX (and ESI, EDI on 32-bit), consumes RDI (@svm)
+ * and RSP (pointer to @spec_ctrl_intercepted).
+ */
RESTORE_HOST_SPEC_CTRL
/*
@@ -331,7 +360,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
mov %rdi, SEV_ES_RDI (%rdx)
mov %rsi, SEV_ES_RSI (%rdx)
- /* Clobbers RAX, RCX, RDX (@hostsa). */
+ /* Clobbers RAX, RCX, and RDX (@hostsa), consumes RDI (@svm). */
RESTORE_GUEST_SPEC_CTRL
/* Get svm->current_vmcb->pa into RAX. */
@@ -339,7 +368,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
mov KVM_VMCB_pa(%rax), %rax
/* Clobbers EFLAGS.ZF */
- VM_CLEAR_CPU_BUFFERS
+ SVM_CLEAR_CPU_BUFFERS
/* Enter guest mode */
1: vmrun %rax
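
A C sketch of what the 64-bit path in the assembly above does: compare the full 64-bit SPEC_CTRL values and, only when they differ, split the value into the EDX:EAX halves that WRMSR expects. wrmsr() is a stub here; only the compare-and-split logic is the point.

#include <stdint.h>
#include <stdio.h>

#define MSR_IA32_SPEC_CTRL	0x48

static void wrmsr(uint32_t msr, uint32_t lo, uint32_t hi)
{
	printf("wrmsr(0x%x, lo=0x%x, hi=0x%x)\n", msr, lo, hi);
}

static void restore_spec_ctrl(uint64_t new_val, uint64_t current_val)
{
	if (new_val == current_val)	/* "je 801b": values match, skip WRMSR */
		return;

	/* movl %edx,%eax ; shr $32,%rdx in the 64-bit path above. */
	uint32_t lo = (uint32_t)new_val;
	uint32_t hi = (uint32_t)(new_val >> 32);

	wrmsr(MSR_IA32_SPEC_CTRL, lo, hi);
}

int main(void)
{
	restore_spec_ctrl(0x100000001ull, 0x1ull);	/* differs -> WRMSR */
	restore_spec_ctrl(0x1ull, 0x1ull);		/* equal -> skipped */
	return 0;
}

The 32-bit path in the assembly reaches the same "are they equal?" answer without 64-bit registers by XORing the low and high halves separately and ORing the results, which is why it clobbers ESI/EDI.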
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 0eb2773b2ae2..a46ccd670785 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -831,6 +831,14 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
return tdx_vcpu_ioctl(vcpu, argp);
}
+static int vt_vcpu_mem_enc_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ if (!is_td_vcpu(vcpu))
+ return -EINVAL;
+
+ return tdx_vcpu_unlocked_ioctl(vcpu, argp);
+}
+
static int vt_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
bool is_private)
{
@@ -1005,6 +1013,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl),
.vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl),
+ .vcpu_mem_enc_unlocked_ioctl = vt_op_tdx_only(vcpu_mem_enc_unlocked_ioctl),
.gmem_max_mapping_level = vt_op_tdx_only(gmem_max_mapping_level)
};
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index bcea087b642f..40777278eabb 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -23,8 +23,8 @@
static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
-static bool __read_mostly nested_early_check = 0;
-module_param(nested_early_check, bool, S_IRUGO);
+static bool __ro_after_init warn_on_missed_cc;
+module_param(warn_on_missed_cc, bool, 0444);
#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
@@ -555,6 +555,9 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
return -EINVAL;
+ if (CC(!nested_cpu_has_vid(vmcs12) && vmcs12->tpr_threshold >> 4))
+ return -EINVAL;
+
return 0;
}
@@ -761,7 +764,7 @@ static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
vmcs12->vmcs_link_pointer, VMCS12_SIZE))
return;
- kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
+ kvm_read_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
VMCS12_SIZE);
}
@@ -780,7 +783,7 @@ static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
vmcs12->vmcs_link_pointer, VMCS12_SIZE))
return;
- kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
+ kvm_write_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
VMCS12_SIZE);
}
@@ -2296,15 +2299,6 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
return;
vmx->nested.vmcs02_initialized = true;
- /*
- * We don't care what the EPTP value is we just need to guarantee
- * it's valid so we don't get a false positive when doing early
- * consistency checks.
- */
- if (enable_ept && nested_early_check)
- vmcs_write64(EPT_POINTER,
- construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
-
if (vmx->ve_info)
vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));
@@ -2749,7 +2743,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
vcpu->arch.pat = vmcs12->guest_ia32_pat;
} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
- vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
+ vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat);
}
vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
@@ -2961,6 +2955,10 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
}
}
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING) &&
+ CC(!vmcs12->tsc_multiplier))
+ return -EINVAL;
+
return 0;
}
@@ -3078,6 +3076,38 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
return 0;
}
+static int nested_vmx_check_controls_late(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ void *vapic = to_vmx(vcpu)->nested.virtual_apic_map.hva;
+ u32 vtpr = vapic ? (*(u32 *)(vapic + APIC_TASKPRI)) >> 4 : 0;
+
+ /*
+ * Don't bother with the consistency checks if KVM isn't configured to
+ * WARN on missed consistency checks, as KVM needs to rely on hardware
+ * to fully detect an illegal vTPR vs. TPR Threshold combination due to
+ * the vTPR being writable by L1 at all times (it's an in-memory value,
+ * not a VMCS field). I.e. even if the check passes now, it might fail
+ * at the actual VM-Enter.
+ *
+ * Keying off the module param also allows treating an invalid vAPIC
+ * mapping as a consistency check failure without increasing the risk
+ * of breaking a "real" VM.
+ */
+ if (!warn_on_missed_cc)
+ return 0;
+
+ if ((exec_controls_get(to_vmx(vcpu)) & CPU_BASED_TPR_SHADOW) &&
+ nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) &&
+ !nested_cpu_has_vid(vmcs12) &&
+ !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+ (CC(!vapic) ||
+ CC((vmcs12->tpr_threshold & GENMASK(3, 0)) > (vtpr & GENMASK(3, 0)))))
+ return -EINVAL;
+
+ return 0;
+}
+
static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
@@ -3333,84 +3363,6 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
return 0;
}
-static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4;
- bool vm_fail;
-
- if (!nested_early_check)
- return 0;
-
- if (vmx->msr_autoload.host.nr)
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
- if (vmx->msr_autoload.guest.nr)
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
-
- preempt_disable();
-
- vmx_prepare_switch_to_guest(vcpu);
-
- /*
- * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
- * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
- * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
- * there is no need to preserve other bits or save/restore the field.
- */
- vmcs_writel(GUEST_RFLAGS, 0);
-
- cr3 = __get_current_cr3_fast();
- if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
- vmcs_writel(HOST_CR3, cr3);
- vmx->loaded_vmcs->host_state.cr3 = cr3;
- }
-
- cr4 = cr4_read_shadow();
- if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
- vmcs_writel(HOST_CR4, cr4);
- vmx->loaded_vmcs->host_state.cr4 = cr4;
- }
-
- vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
- __vmx_vcpu_run_flags(vmx));
-
- if (vmx->msr_autoload.host.nr)
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
- if (vmx->msr_autoload.guest.nr)
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
-
- if (vm_fail) {
- u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
-
- preempt_enable();
-
- trace_kvm_nested_vmenter_failed(
- "early hardware check VM-instruction error: ", error);
- WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
- }
-
- /*
- * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
- */
- if (hw_breakpoint_active())
- set_debugreg(__this_cpu_read(cpu_dr7), 7);
- local_irq_enable();
- preempt_enable();
-
- /*
- * A non-failing VMEntry means we somehow entered guest mode with
- * an illegal RIP, and that's just the tip of the iceberg. There
- * is no telling what memory has been modified or what state has
- * been exposed to unknown code. Hitting this all but guarantees
- * a (very critical) hardware issue.
- */
- WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
- VMX_EXIT_REASONS_FAILED_VMENTRY));
-
- return 0;
-}
-
#ifdef CONFIG_KVM_HYPERV
static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
{
@@ -3667,22 +3619,18 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
&vmx->nested.pre_vmenter_ssp_tbl);
/*
- * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
- * nested early checks are disabled. In the event of a "late" VM-Fail,
- * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
- * software model to the pre-VMEntry host state. When EPT is disabled,
- * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
- * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
- * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
- * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
- * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
- * guaranteed to be overwritten with a shadow CR3 prior to re-entering
- * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
- * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
- * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
- * path would need to manually save/restore vmcs01.GUEST_CR3.
+ * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled. In the
+ * event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but
+ * not KVM, KVM must unwind its software model to the pre-VM-Entry host
+ * state. When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not
+ * L1's "real" CR3, which causes nested_vmx_restore_host_state() to
+ * corrupt vcpu->arch.cr3. Stuffing vmcs01.GUEST_CR3 results in the
+ * unwind naturally setting arch.cr3 to the correct value. Smashing
+ * vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind,
+ * reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be
+ * overwritten with a shadow CR3 prior to re-entering L1.
*/
- if (!enable_ept && !nested_early_check)
+ if (!enable_ept)
vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
@@ -3695,7 +3643,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
}
- if (nested_vmx_check_vmentry_hw(vcpu)) {
+ if (nested_vmx_check_controls_late(vcpu, vmcs12)) {
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
return NVMX_VMENTRY_VMFAIL;
}
@@ -3880,7 +3828,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
goto vmentry_failed;
/* Hide L1D cache contents from the nested guest. */
- vmx->vcpu.arch.l1tf_flush_l1d = true;
+ kvm_request_l1tf_flush_l1d();
/*
* Must happen outside of nested_vmx_enter_non_root_mode() as it will
@@ -5164,12 +5112,13 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
/*
* The only expected VM-instruction error is "VM entry with
* invalid control field(s)." Anything else indicates a
- * problem with L0. And we should never get here with a
- * VMFail of any type if early consistency checks are enabled.
+ * problem with L0.
*/
WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- WARN_ON_ONCE(nested_early_check);
+
+ /* VM-Fail at VM-Entry means KVM missed a consistency check. */
+ WARN_ON_ONCE(warn_on_missed_cc);
}
/*
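
A worked sketch of the vTPR vs. TPR-threshold consistency check added above: when the TPR shadow is used without virtual-interrupt delivery or APIC-access virtualization, bits 3:0 of TPR_THRESHOLD must not exceed bits 7:4 of the TPR in the virtual-APIC page. The values below are made up.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool tpr_threshold_invalid(uint32_t tpr_threshold, uint32_t vapic_tpr)
{
	uint32_t vtpr_class = (vapic_tpr >> 4) & 0xf;	/* TPR[7:4] */

	/* Mirrors the CC(...) comparison in nested_vmx_check_controls_late(). */
	return (tpr_threshold & 0xf) > vtpr_class;
}

int main(void)
{
	printf("%d\n", tpr_threshold_invalid(0x3, 0x50));	/* 3 <= 5: valid   */
	printf("%d\n", tpr_threshold_invalid(0x7, 0x20));	/* 7 >  2: invalid */
	return 0;
}

Because L1 can rewrite the in-memory vTPR at any time, this check can only catch the combination opportunistically, which is why the patch keys it off the warn_on_missed_cc module param rather than enforcing it unconditionally.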
diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h
index 2f20fb170def..6a87a12135fb 100644
--- a/arch/x86/kvm/vmx/run_flags.h
+++ b/arch/x86/kvm/vmx/run_flags.h
@@ -2,12 +2,8 @@
#ifndef __KVM_X86_VMX_RUN_FLAGS_H
#define __KVM_X86_VMX_RUN_FLAGS_H
-#define VMX_RUN_VMRESUME_SHIFT 0
-#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1
-#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT 2
-
-#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT)
-#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT)
-#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT)
+#define VMX_RUN_VMRESUME BIT(0)
+#define VMX_RUN_SAVE_SPEC_CTRL BIT(1)
+#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(2)
#endif /* __KVM_X86_VMX_RUN_FLAGS_H */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 0a49c863c811..2d7a4d52ccfb 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -24,20 +24,33 @@
#undef pr_fmt
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define pr_tdx_error(__fn, __err) \
- pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err)
+#define __TDX_BUG_ON(__err, __f, __kvm, __fmt, __args...) \
+({ \
+ struct kvm *_kvm = (__kvm); \
+ bool __ret = !!(__err); \
+ \
+ if (WARN_ON_ONCE(__ret && (!_kvm || !_kvm->vm_bugged))) { \
+ if (_kvm) \
+ kvm_vm_bugged(_kvm); \
+ pr_err_ratelimited("SEAMCALL " __f " failed: 0x%llx" __fmt "\n",\
+ __err, __args); \
+ } \
+ unlikely(__ret); \
+})
+
+#define TDX_BUG_ON(__err, __fn, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, "%s", "")
+
+#define TDX_BUG_ON_1(__err, __fn, a1, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx", a1)
+
+#define TDX_BUG_ON_2(__err, __fn, a1, a2, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx", a1, a2)
+
+#define TDX_BUG_ON_3(__err, __fn, a1, a2, a3, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx, " #a3 " 0x%llx", \
+ a1, a2, a3)
-#define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \
- pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__)
-
-#define pr_tdx_error_1(__fn, __err, __rcx) \
- __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx)
-
-#define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \
- __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx)
-
-#define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \
- __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8)
bool enable_tdx __ro_after_init;
module_param_named(tdx, enable_tdx, bool, 0444);
@@ -281,25 +294,34 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
vcpu->cpu = -1;
}
-static void tdx_no_vcpus_enter_start(struct kvm *kvm)
-{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
-
- kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
-}
-
-static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
-{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
-}
+/*
+ * Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single
+ * retry (if necessary) after forcing vCPUs to exit and wait for the operation
+ * to complete. All flows that remove/block S-EPT entries run with mmu_lock
+ * held for write, i.e. are mutually exclusive with each other, but they aren't
+ * mutually exclusive with running vCPUs, and so can fail with "operand busy"
+ * if a vCPU acquires a relevant lock in the TDX-Module, e.g. when doing TDCALL.
+ *
+ * Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs.
+ */
+#define tdh_do_no_vcpus(tdh_func, kvm, args...) \
+({ \
+ struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm); \
+ u64 __err; \
+ \
+ lockdep_assert_held_write(&kvm->mmu_lock); \
+ \
+ __err = tdh_func(args); \
+ if (unlikely(tdx_operand_busy(__err))) { \
+ WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true); \
+ kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); \
+ \
+ __err = tdh_func(args); \
+ \
+ WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false); \
+ } \
+ __err; \
+})
/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
static int __tdx_reclaim_page(struct page *page)
@@ -313,10 +335,9 @@ static int __tdx_reclaim_page(struct page *page)
* before the HKID is released and control pages have also been
* released at this point, so there is no possibility of contention.
*/
- if (WARN_ON_ONCE(err)) {
- pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
+ if (TDX_BUG_ON_3(err, TDH_PHYMEM_PAGE_RECLAIM, rcx, rdx, r8, NULL))
return -EIO;
- }
+
return 0;
}
@@ -404,8 +425,8 @@ static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
return;
smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
- if (KVM_BUG_ON(arg.err, vcpu->kvm))
- pr_tdx_error(TDH_VP_FLUSH, arg.err);
+
+ TDX_BUG_ON(arg.err, TDH_VP_FLUSH, vcpu->kvm);
}
void tdx_disable_virtualization_cpu(void)
@@ -464,8 +485,7 @@ static void smp_func_do_phymem_cache_wb(void *unused)
}
out:
- if (WARN_ON_ONCE(err))
- pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
+ TDX_BUG_ON(err, TDH_PHYMEM_CACHE_WB, NULL);
}
void tdx_mmu_release_hkid(struct kvm *kvm)
@@ -504,8 +524,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
err = tdh_mng_vpflushdone(&kvm_tdx->td);
if (err == TDX_FLUSHVP_NOT_DONE)
goto out;
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
+ if (TDX_BUG_ON(err, TDH_MNG_VPFLUSHDONE, kvm)) {
pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
kvm_tdx->hkid);
goto out;
@@ -528,8 +547,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
* tdh_mng_key_freeid() will fail.
*/
err = tdh_mng_key_freeid(&kvm_tdx->td);
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error(TDH_MNG_KEY_FREEID, err);
+ if (TDX_BUG_ON(err, TDH_MNG_KEY_FREEID, kvm)) {
pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
kvm_tdx->hkid);
} else {
@@ -580,10 +598,9 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm)
* when it is reclaiming TDCS).
*/
err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
+ if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
return;
- }
+
tdx_quirk_reset_page(kvm_tdx->td.tdr_page);
__free_page(kvm_tdx->td.tdr_page);
@@ -606,11 +623,8 @@ static int tdx_do_tdh_mng_key_config(void *param)
/* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
err = tdh_mng_key_config(&kvm_tdx->td);
-
- if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
- pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
+ if (TDX_BUG_ON(err, TDH_MNG_KEY_CONFIG, &kvm_tdx->kvm))
return -EIO;
- }
return 0;
}
@@ -763,25 +777,6 @@ static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
}
-/*
- * Compared to vmx_prepare_switch_to_guest(), there is not much to do
- * as SEAMCALL/SEAMRET calls take care of most of save and restore.
- */
-void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vt *vt = to_vt(vcpu);
-
- if (vt->guest_state_loaded)
- return;
-
- if (likely(is_64bit_mm(current->mm)))
- vt->msr_host_kernel_gs_base = current->thread.gsbase;
- else
- vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
-
- vt->guest_state_loaded = true;
-}
-
struct tdx_uret_msr {
u32 msr;
unsigned int slot;
@@ -795,19 +790,38 @@ static struct tdx_uret_msr tdx_uret_msrs[] = {
{.msr = MSR_TSC_AUX,},
};
-static void tdx_user_return_msr_update_cache(void)
+void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vt *vt = to_vt(vcpu);
int i;
+ if (vt->guest_state_loaded)
+ return;
+
+ if (likely(is_64bit_mm(current->mm)))
+ vt->msr_host_kernel_gs_base = current->thread.gsbase;
+ else
+ vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+
+ vt->guest_state_loaded = true;
+
+ /*
+ * Explicitly set user-return MSRs that are clobbered by the TDX-Module
+ * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be
+ * written by the TDX-Module. Don't rely on the TDX-Module to actually
+ * clobber the MSRs, as the contract is poorly defined and not upheld.
+ * E.g. the TDX-Module will synthesize an EPT Violation without doing
+ * VM-Enter if it suspects a zero-step attack, and never "restore" VMM
+ * state.
+ */
for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
- kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
- tdx_uret_msrs[i].defval);
+ kvm_set_user_return_msr(tdx_uret_msrs[i].slot,
+ tdx_uret_msrs[i].defval, -1ull);
}
static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
{
struct vcpu_vt *vt = to_vt(vcpu);
- struct vcpu_tdx *tdx = to_tdx(vcpu);
if (!vt->guest_state_loaded)
return;
@@ -815,11 +829,6 @@ static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
++vcpu->stat.host_state_reload;
wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
- if (tdx->guest_entered) {
- tdx_user_return_msr_update_cache();
- tdx->guest_entered = false;
- }
-
vt->guest_state_loaded = false;
}
@@ -829,19 +838,52 @@ void tdx_vcpu_put(struct kvm_vcpu *vcpu)
tdx_prepare_switch_to_host(vcpu);
}
+/*
+ * Life cycles for a TD and a vCPU:
+ * 1. KVM_CREATE_VM ioctl.
+ * TD state is TD_STATE_UNINITIALIZED.
+ * hkid is not assigned at this stage.
+ * 2. KVM_TDX_INIT_VM ioctl.
+ * TD transitions to TD_STATE_INITIALIZED.
+ * hkid is assigned after this stage.
+ * 3. KVM_CREATE_VCPU ioctl. (only when TD is TD_STATE_INITIALIZED).
+ * 3.1 tdx_vcpu_create() transitions vCPU state to VCPU_TD_STATE_UNINITIALIZED.
+ * 3.2 vcpu_load() and vcpu_put() in kvm_arch_vcpu_create().
+ * 3.3 (conditional) if any error encountered after kvm_arch_vcpu_create()
+ * kvm_arch_vcpu_destroy() --> tdx_vcpu_free().
+ * 4. KVM_TDX_INIT_VCPU ioctl.
+ * tdx_vcpu_init() transitions vCPU state to VCPU_TD_STATE_INITIALIZED.
+ * vCPU control structures are allocated at this stage.
+ * 5. kvm_destroy_vm().
+ * 5.1 tdx_mmu_release_hkid(): (1) tdh_vp_flush(), disassociates all vCPUs.
+ * (2) puts hkid to !assigned state.
+ * 5.2 kvm_destroy_vcpus() --> tdx_vcpu_free():
+ * transitions vCPU to VCPU_TD_STATE_UNINITIALIZED state.
+ * 5.3 tdx_vm_destroy()
+ * transitions TD to TD_STATE_UNINITIALIZED state.
+ *
+ * tdx_vcpu_free() can be invoked only at 3.3 or 5.2.
+ * - If at 3.3, hkid is still assigned, but the vCPU must be in
+ * VCPU_TD_STATE_UNINITIALIZED state.
+ * - If at 5.2, hkid must be !assigned and all vCPUs must be in
+ * VCPU_TD_STATE_INITIALIZED state and have been disassociated.
+ */
void tdx_vcpu_free(struct kvm_vcpu *vcpu)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
struct vcpu_tdx *tdx = to_tdx(vcpu);
int i;
+ if (vcpu->cpu != -1) {
+ KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm);
+ tdx_flush_vp_on_cpu(vcpu);
+ return;
+ }
+
/*
* It is not possible to reclaim pages while hkid is assigned. It might
- * be assigned if:
- * 1. the TD VM is being destroyed but freeing hkid failed, in which
- * case the pages are leaked
- * 2. TD VCPU creation failed and this on the error path, in which case
- * there is nothing to do anyway
+ * be assigned if the TD VM is being destroyed but freeing hkid failed,
+ * in which case the pages are leaked.
*/
if (is_hkid_assigned(kvm_tdx))
return;
@@ -856,7 +898,7 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
}
if (tdx->vp.tdvpr_page) {
tdx_reclaim_control_page(tdx->vp.tdvpr_page);
- tdx->vp.tdvpr_page = 0;
+ tdx->vp.tdvpr_page = NULL;
tdx->vp.tdvpr_pa = 0;
}
@@ -1059,7 +1101,6 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
update_debugctlmsr(vcpu->arch.host_debugctl);
tdx_load_host_xsave_state(vcpu);
- tdx->guest_entered = true;
vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
@@ -1069,9 +1110,6 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
return EXIT_FASTPATH_NONE;
- if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
- kvm_machine_check();
-
trace_kvm_exit(vcpu, KVM_ISA_VMX);
if (unlikely(tdx_failed_vmentry(vcpu)))
@@ -1583,137 +1621,79 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
}
-static void tdx_unpin(struct kvm *kvm, struct page *page)
+static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ kvm_pfn_t pfn)
{
- put_page(page);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err, entry, level_state;
+ gpa_t gpa = gfn_to_gpa(gfn);
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm) ||
+ KVM_BUG_ON(!kvm_tdx->page_add_src, kvm))
+ return -EIO;
+
+ err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
+ kvm_tdx->page_add_src, &entry, &level_state);
+ if (unlikely(tdx_operand_busy(err)))
+ return -EBUSY;
+
+ if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_ADD, entry, level_state, kvm))
+ return -EIO;
+
+ return 0;
}
static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, struct page *page)
+ enum pg_level level, kvm_pfn_t pfn)
{
int tdx_level = pg_level_to_tdx_sept_level(level);
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct page *page = pfn_to_page(pfn);
gpa_t gpa = gfn_to_gpa(gfn);
u64 entry, level_state;
u64 err;
err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
- if (unlikely(tdx_operand_busy(err))) {
- tdx_unpin(kvm, page);
+ if (unlikely(tdx_operand_busy(err)))
return -EBUSY;
- }
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
- tdx_unpin(kvm, page);
+ if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_AUG, entry, level_state, kvm))
return -EIO;
- }
-
- return 0;
-}
-
-/*
- * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
- * callback tdx_gmem_post_populate() then maps pages into private memory.
- * through the a seamcall TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the
- * private EPT structures for the page to have been built before, which is
- * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
- * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
- * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
- * are no half-initialized shared EPT pages.
- */
-static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
-{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
- if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
- return -EINVAL;
- /* nr_premapped will be decreased when tdh_mem_page_add() is called. */
- atomic64_inc(&kvm_tdx->nr_premapped);
return 0;
}
static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
+ enum pg_level level, u64 mirror_spte)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- struct page *page = pfn_to_page(pfn);
+ kvm_pfn_t pfn = spte_to_pfn(mirror_spte);
/* TODO: handle large pages. */
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
- return -EINVAL;
+ return -EIO;
- /*
- * Because guest_memfd doesn't support page migration with
- * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
- * migration. Until guest_memfd supports page migration, prevent page
- * migration.
- * TODO: Once guest_memfd introduces callback on page migration,
- * implement it and remove get_page/put_page().
- */
- get_page(page);
+ WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) ||
+ (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
/*
- * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
- * barrier in tdx_td_finalize().
+ * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
+ * before kvm_tdx->state. Userspace must not be allowed to pre-fault
+ * arbitrary memory until the initial memory image is finalized. Pairs
+ * with the smp_wmb() in tdx_td_finalize().
*/
smp_rmb();
- if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
- return tdx_mem_page_aug(kvm, gfn, level, page);
-
- return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
-}
-
-static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, struct page *page)
-{
- int tdx_level = pg_level_to_tdx_sept_level(level);
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- gpa_t gpa = gfn_to_gpa(gfn);
- u64 err, entry, level_state;
-
- /* TODO: handle large pages. */
- if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
- return -EINVAL;
-
- if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
- return -EINVAL;
/*
- * When zapping private page, write lock is held. So no race condition
- * with other vcpu sept operation.
- * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
+ * If the TD isn't finalized/runnable, then userspace is initializing
+ * the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD.
*/
- err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
- &level_state);
-
- if (unlikely(tdx_operand_busy(err))) {
- /*
- * The second retry is expected to succeed after kicking off all
- * other vCPUs and prevent them from invoking TDH.VP.ENTER.
- */
- tdx_no_vcpus_enter_start(kvm);
- err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
- &level_state);
- tdx_no_vcpus_enter_stop(kvm);
- }
-
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
- return -EIO;
- }
-
- err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
+ if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
+ return tdx_mem_page_add(kvm, gfn, level, pfn);
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
- return -EIO;
- }
- tdx_quirk_reset_page(page);
- tdx_unpin(kvm, page);
- return 0;
+ return tdx_mem_page_aug(kvm, gfn, level, pfn);
}
static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
@@ -1729,81 +1709,13 @@ static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
if (unlikely(tdx_operand_busy(err)))
return -EBUSY;
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
+ if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
return -EIO;
- }
return 0;
}
/*
- * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is
- * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called
- * successfully.
- *
- * Since tdh_mem_sept_add() must have been invoked successfully before a
- * non-leaf entry present in the mirrored page table, the SEPT ZAP related
- * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead
- * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the
- * SEPT.
- *
- * Further check if the returned entry from SEPT walking is with RWX permissions
- * to filter out anything unexpected.
- *
- * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from
- * level_state returned from a SEAMCALL error is the same as that passed into
- * the SEAMCALL.
- */
-static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
- u64 entry, int level)
-{
- if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
- return false;
-
- if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
- return false;
-
- if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
- return false;
-
- return true;
-}
-
-static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, struct page *page)
-{
- int tdx_level = pg_level_to_tdx_sept_level(level);
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
- u64 err, entry, level_state;
-
- /* For now large page isn't supported yet. */
- WARN_ON_ONCE(level != PG_LEVEL_4K);
-
- err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
-
- if (unlikely(tdx_operand_busy(err))) {
- /* After no vCPUs enter, the second retry is expected to succeed */
- tdx_no_vcpus_enter_start(kvm);
- err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
- tdx_no_vcpus_enter_stop(kvm);
- }
- if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
- !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
- atomic64_dec(&kvm_tdx->nr_premapped);
- tdx_unpin(kvm, page);
- return 0;
- }
-
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
- return -EIO;
- }
- return 1;
-}
-
-/*
* Ensure shared and private EPTs to be flushed on all vCPUs.
* tdh_mem_track() is the only caller that increases TD epoch. An increase in
* the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
@@ -1836,18 +1748,15 @@ static void tdx_track(struct kvm *kvm)
if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
return;
+ /*
+ * The full sequence of TDH.MEM.TRACK and forcing vCPUs out of guest
+ * mode must be serialized, as TDH.MEM.TRACK will fail if the previous
+ * tracking epoch hasn't completed.
+ */
lockdep_assert_held_write(&kvm->mmu_lock);
- err = tdh_mem_track(&kvm_tdx->td);
- if (unlikely(tdx_operand_busy(err))) {
- /* After no vCPUs enter, the second retry is expected to succeed */
- tdx_no_vcpus_enter_start(kvm);
- err = tdh_mem_track(&kvm_tdx->td);
- tdx_no_vcpus_enter_stop(kvm);
- }
-
- if (KVM_BUG_ON(err, kvm))
- pr_tdx_error(TDH_MEM_TRACK, err);
+ err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td);
+ TDX_BUG_ON(err, TDH_MEM_TRACK, kvm);
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}
@@ -1866,7 +1775,7 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
* and slot move/deletion.
*/
if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
- return -EINVAL;
+ return -EIO;
/*
* The HKID assigned to this TD was already freed and cache was
@@ -1875,11 +1784,16 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
return tdx_reclaim_page(virt_to_page(private_spt));
}
-static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
+static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, u64 mirror_spte)
{
- struct page *page = pfn_to_page(pfn);
- int ret;
+ struct page *page = pfn_to_page(spte_to_pfn(mirror_spte));
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u64 err, entry, level_state;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
/*
* HKID is released after all private pages have been removed, and set
@@ -1887,11 +1801,16 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
* there can't be anything populated in the private EPT.
*/
if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
- return -EINVAL;
+ return;
- ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
- if (ret <= 0)
- return ret;
+ /* TODO: handle large pages. */
+ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+ return;
+
+ err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa,
+ tdx_level, &entry, &level_state);
+ if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm))
+ return;
/*
* TDX requires TLB tracking before dropping private page. Do
@@ -1899,7 +1818,21 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
*/
tdx_track(kvm);
- return tdx_sept_drop_private_spte(kvm, gfn, level, page);
+ /*
+	 * When zapping a private page, mmu_lock is held for write, so there is
+	 * no race with other vCPUs' S-EPT operations. Races with TDH.VP.ENTER
+	 * (due to the 0-step mitigation) and guest TDCALLs are still possible.
+ */
+ err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa,
+ tdx_level, &entry, &level_state);
+ if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm))
+ return;
+
+ err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
+ if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
+ return;
+
+ tdx_quirk_reset_page(page);
}
void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
@@ -2145,11 +2078,7 @@ int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
}
unhandled_exit:
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = vp_enter_ret;
- vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ kvm_prepare_unexpected_reason_exit(vcpu, vp_enter_ret);
return 0;
}
@@ -2282,37 +2211,28 @@ static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
if (cmd->flags)
return -EINVAL;
- caps = kzalloc(sizeof(*caps) +
- sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
- GFP_KERNEL);
- if (!caps)
- return -ENOMEM;
-
user_caps = u64_to_user_ptr(cmd->data);
- if (get_user(nr_user_entries, &user_caps->cpuid.nent)) {
- ret = -EFAULT;
- goto out;
- }
+ if (get_user(nr_user_entries, &user_caps->cpuid.nent))
+ return -EFAULT;
- if (nr_user_entries < td_conf->num_cpuid_config) {
- ret = -E2BIG;
- goto out;
- }
+ if (nr_user_entries < td_conf->num_cpuid_config)
+ return -E2BIG;
+
+ caps = kzalloc(struct_size(caps, cpuid.entries,
+ td_conf->num_cpuid_config), GFP_KERNEL);
+ if (!caps)
+ return -ENOMEM;
ret = init_kvm_tdx_caps(td_conf, caps);
if (ret)
goto out;
- if (copy_to_user(user_caps, caps, sizeof(*caps))) {
+ if (copy_to_user(user_caps, caps, struct_size(caps, cpuid.entries,
+ caps->cpuid.nent))) {
ret = -EFAULT;
goto out;
}
- if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
- caps->cpuid.nent *
- sizeof(caps->cpuid.entries[0])))
- ret = -EFAULT;
-
out:
/* kfree() accepts NULL. */
kfree(caps);
@@ -2537,8 +2457,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
goto free_packages;
}
- if (WARN_ON_ONCE(err)) {
- pr_tdx_error(TDH_MNG_CREATE, err);
+ if (TDX_BUG_ON(err, TDH_MNG_CREATE, kvm)) {
ret = -EIO;
goto free_packages;
}
@@ -2579,8 +2498,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
ret = -EAGAIN;
goto teardown;
}
- if (WARN_ON_ONCE(err)) {
- pr_tdx_error(TDH_MNG_ADDCX, err);
+ if (TDX_BUG_ON(err, TDH_MNG_ADDCX, kvm)) {
ret = -EIO;
goto teardown;
}
@@ -2597,8 +2515,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
*seamcall_err = err;
ret = -EINVAL;
goto teardown;
- } else if (WARN_ON_ONCE(err)) {
- pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
+ } else if (TDX_BUG_ON_1(err, TDH_MNG_INIT, rcx, kvm)) {
ret = -EIO;
goto teardown;
}
@@ -2642,7 +2559,7 @@ free_tdcs:
free_tdr:
if (tdr_page)
__free_page(tdr_page);
- kvm_tdx->td.tdr_page = 0;
+ kvm_tdx->td.tdr_page = NULL;
free_hkid:
tdx_hkid_free(kvm_tdx);
@@ -2747,11 +2664,53 @@ err_out:
return -EIO;
}
+typedef void *tdx_vm_state_guard_t;
+
+static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm)
+{
+ int r;
+
+ mutex_lock(&kvm->lock);
+
+ if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) {
+ r = -EBUSY;
+ goto out_err;
+ }
+
+ r = kvm_lock_all_vcpus(kvm);
+ if (r)
+ goto out_err;
+
+ /*
+ * Note the unintuitive ordering! vcpu->mutex must be taken outside
+ * kvm->slots_lock!
+ */
+ mutex_lock(&kvm->slots_lock);
+ return kvm;
+
+out_err:
+ mutex_unlock(&kvm->lock);
+ return ERR_PTR(r);
+}
+
+static void tdx_release_vm_state_locks(struct kvm *kvm)
+{
+ mutex_unlock(&kvm->slots_lock);
+ kvm_unlock_all_vcpus(kvm);
+ mutex_unlock(&kvm->lock);
+}
+
+DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t,
+ if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T),
+ tdx_acquire_vm_state_locks(kvm), struct kvm *kvm);
+
static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
+ struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data);
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
struct kvm_tdx_init_vm *init_vm;
struct td_params *td_params = NULL;
+ u32 nr_user_entries;
int ret;
BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
@@ -2763,28 +2722,16 @@ static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
if (cmd->flags)
return -EINVAL;
- init_vm = kmalloc(sizeof(*init_vm) +
- sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
- GFP_KERNEL);
- if (!init_vm)
- return -ENOMEM;
-
- if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
- ret = -EFAULT;
- goto out;
- }
+ if (get_user(nr_user_entries, &user_data->cpuid.nent))
+ return -EFAULT;
- if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
- ret = -E2BIG;
- goto out;
- }
+ if (nr_user_entries > KVM_MAX_CPUID_ENTRIES)
+ return -E2BIG;
- if (copy_from_user(init_vm->cpuid.entries,
- u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
- flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
- ret = -EFAULT;
- goto out;
- }
+ init_vm = memdup_user(user_data,
+ struct_size(user_data, cpuid.entries, nr_user_entries));
+ if (IS_ERR(init_vm))
+ return PTR_ERR(init_vm);
if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
ret = -EINVAL;
@@ -2868,24 +2815,14 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- guard(mutex)(&kvm->slots_lock);
-
if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
return -EINVAL;
- /*
- * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
- * TDH.MEM.PAGE.ADD().
- */
- if (atomic64_read(&kvm_tdx->nr_premapped))
- return -EINVAL;
cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
if (tdx_operand_busy(cmd->hw_error))
return -EBUSY;
- if (KVM_BUG_ON(cmd->hw_error, kvm)) {
- pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error);
+ if (TDX_BUG_ON(cmd->hw_error, TDH_MR_FINALIZE, kvm))
return -EIO;
- }
kvm_tdx->state = TD_STATE_RUNNABLE;
/* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
@@ -2894,27 +2831,38 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
return 0;
}
-int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
+static int tdx_get_cmd(void __user *argp, struct kvm_tdx_cmd *cmd)
{
- struct kvm_tdx_cmd tdx_cmd;
- int r;
-
- if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd)))
+ if (copy_from_user(cmd, argp, sizeof(*cmd)))
return -EFAULT;
/*
- * Userspace should never set hw_error. It is used to fill
- * hardware-defined error by the kernel.
+ * Userspace should never set hw_error. KVM writes hw_error to report
+	 * hardware-defined errors back to userspace.
*/
- if (tdx_cmd.hw_error)
+ if (cmd->hw_error)
return -EINVAL;
- mutex_lock(&kvm->lock);
+ return 0;
+}
+
+int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_tdx_cmd tdx_cmd;
+ int r;
+
+ r = tdx_get_cmd(argp, &tdx_cmd);
+ if (r)
+ return r;
+
+ if (tdx_cmd.id == KVM_TDX_CAPABILITIES)
+ return tdx_get_capabilities(&tdx_cmd);
+
+ CLASS(tdx_vm_state_guard, guard)(kvm);
+ if (IS_ERR(guard))
+ return PTR_ERR(guard);
switch (tdx_cmd.id) {
- case KVM_TDX_CAPABILITIES:
- r = tdx_get_capabilities(&tdx_cmd);
- break;
case KVM_TDX_INIT_VM:
r = tdx_td_init(kvm, &tdx_cmd);
break;
@@ -2922,15 +2870,12 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
r = tdx_td_finalize(kvm, &tdx_cmd);
break;
default:
- r = -EINVAL;
- goto out;
+ return -EINVAL;
}
if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
- r = -EFAULT;
+ return -EFAULT;
-out:
- mutex_unlock(&kvm->lock);
return r;
}
@@ -2972,16 +2917,14 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
}
err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
- if (KVM_BUG_ON(err, vcpu->kvm)) {
+ if (TDX_BUG_ON(err, TDH_VP_CREATE, vcpu->kvm)) {
ret = -EIO;
- pr_tdx_error(TDH_VP_CREATE, err);
goto free_tdcx;
}
for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
- if (KVM_BUG_ON(err, vcpu->kvm)) {
- pr_tdx_error(TDH_VP_ADDCX, err);
+ if (TDX_BUG_ON(err, TDH_VP_ADDCX, vcpu->kvm)) {
/*
* Pages already added are reclaimed by the vcpu_free
* method, but the rest are freed here.
@@ -2994,10 +2937,19 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
}
}
- err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
- if (KVM_BUG_ON(err, vcpu->kvm)) {
- pr_tdx_error(TDH_VP_INIT, err);
- return -EIO;
+ /*
+ * tdh_vp_init() can take an exclusive lock of the TDR resource inside
+ * the TDX-Module. The TDR resource is also taken as shared in several
+ * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention
+ * (TDX-Module locks are try-lock implementations with no slow path).
+ * Take mmu_lock for write to reflect the nature of the lock taken by
+ * the TDX-Module, and to ensure the no-fail MMU paths succeed, e.g. if
+ * a concurrent PUNCH_HOLE on guest_memfd triggers removal of SPTEs.
+ */
+ scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
+ err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
+ if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
+ return -EIO;
}
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -3016,7 +2968,7 @@ free_tdcx:
free_tdvpr:
if (tdx->vp.tdvpr_page)
__free_page(tdx->vp.tdvpr_page);
- tdx->vp.tdvpr_page = 0;
+ tdx->vp.tdvpr_page = NULL;
tdx->vp.tdvpr_pa = 0;
return ret;
@@ -3054,7 +3006,8 @@ static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_i
static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
{
- struct kvm_cpuid2 __user *output, *td_cpuid;
+ struct kvm_cpuid2 __user *output;
+ struct kvm_cpuid2 *td_cpuid;
int r = 0, i = 0, leaf;
u32 level;
@@ -3167,15 +3120,15 @@ struct tdx_gmem_post_populate_arg {
static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
void __user *src, int order, void *_arg)
{
- u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
struct tdx_gmem_post_populate_arg *arg = _arg;
- struct kvm_vcpu *vcpu = arg->vcpu;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err, entry, level_state;
gpa_t gpa = gfn_to_gpa(gfn);
- u8 level = PG_LEVEL_4K;
struct page *src_page;
int ret, i;
- u64 err, entry, level_state;
+
+ if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm))
+ return -EIO;
/*
* Get the source page if it has been faulted in. Return failure if the
@@ -3187,49 +3140,29 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
if (ret != 1)
return -ENOMEM;
- ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
- if (ret < 0)
- goto out;
-
- /*
- * The private mem cannot be zapped after kvm_tdp_map_page()
- * because all paths are covered by slots_lock and the
- * filemap invalidate lock. Check that they are indeed enough.
- */
- if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
- scoped_guard(read_lock, &kvm->mmu_lock) {
- if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
- ret = -EIO;
- goto out;
- }
- }
- }
+ kvm_tdx->page_add_src = src_page;
+ ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
+ kvm_tdx->page_add_src = NULL;
- ret = 0;
- err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
- src_page, &entry, &level_state);
- if (err) {
- ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO;
- goto out;
- }
+ put_page(src_page);
- if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
- atomic64_dec(&kvm_tdx->nr_premapped);
+ if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION))
+ return ret;
- if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
- for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
- err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry,
- &level_state);
- if (err) {
- ret = -EIO;
- break;
- }
- }
+ /*
+ * Note, MR.EXTEND can fail if the S-EPT mapping is somehow removed
+ * between mapping the pfn and now, but slots_lock prevents memslot
+ * updates, filemap_invalidate_lock() prevents guest_memfd updates,
+ * mmu_notifier events can't reach S-EPT entries, and KVM's internal
+ * zapping flows are mutually exclusive with S-EPT mappings.
+ */
+ for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
+ err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, &level_state);
+ if (TDX_BUG_ON_2(err, TDH_MR_EXTEND, entry, level_state, kvm))
+ return -EIO;
}
-out:
- put_page(src_page);
- return ret;
+ return 0;
}
static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
@@ -3245,8 +3178,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
if (tdx->state != VCPU_TD_STATE_INITIALIZED)
return -EINVAL;
- guard(mutex)(&kvm->slots_lock);
-
/* Once TD is finalized, the initial guest memory is fixed. */
if (kvm_tdx->state == TD_STATE_RUNNABLE)
return -EINVAL;
@@ -3264,7 +3195,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
!vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
return -EINVAL;
- kvm_mmu_reload(vcpu);
ret = 0;
while (region.nr_pages) {
if (signal_pending(current)) {
@@ -3301,28 +3231,57 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
return ret;
}
-int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
struct kvm_tdx_cmd cmd;
- int ret;
+ int r;
- if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
- return -EINVAL;
+ r = tdx_get_cmd(argp, &cmd);
+ if (r)
+ return r;
- if (copy_from_user(&cmd, argp, sizeof(cmd)))
- return -EFAULT;
+ CLASS(tdx_vm_state_guard, guard)(kvm);
+ if (IS_ERR(guard))
+ return PTR_ERR(guard);
- if (cmd.hw_error)
+ if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
return -EINVAL;
+ vcpu_load(vcpu);
+
switch (cmd.id) {
+ case KVM_TDX_INIT_MEM_REGION:
+ r = tdx_vcpu_init_mem_region(vcpu, &cmd);
+ break;
case KVM_TDX_INIT_VCPU:
- ret = tdx_vcpu_init(vcpu, &cmd);
+ r = tdx_vcpu_init(vcpu, &cmd);
break;
- case KVM_TDX_INIT_MEM_REGION:
- ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
+ default:
+ r = -ENOIOCTLCMD;
break;
+ }
+
+ vcpu_put(vcpu);
+
+ return r;
+}
+
+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct kvm_tdx_cmd cmd;
+ int ret;
+
+ if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
+ return -EINVAL;
+
+ ret = tdx_get_cmd(argp, &cmd);
+ if (ret)
+ return ret;
+
+ switch (cmd.id) {
case KVM_TDX_GET_CPUID:
ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
break;
@@ -3447,10 +3406,6 @@ static int __init __tdx_bringup(void)
/*
* Check if MSRs (tdx_uret_msrs) can be saved/restored
* before returning to user space.
- *
- * this_cpu_ptr(user_return_msrs)->registered isn't checked
- * because the registration is done at vcpu runtime by
- * tdx_user_return_msr_update_cache().
*/
tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
if (tdx_uret_msrs[i].slot == -1) {
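[Editorial aside, not part of the patch: the tdx_get_capabilities() and tdx_td_init() hunks above replace open-coded "sizeof(*caps) + n * sizeof(entry)" arithmetic with struct_size() and memdup_user(). As a rough illustration of the sizing idea only -- plain userspace C with made-up demo_caps/demo_entry types rather than the real KVM structures; the kernel's struct_size() in <linux/overflow.h> additionally saturates on overflow instead of wrapping -- a minimal sketch:]

#include <stdio.h>
#include <stdlib.h>

struct demo_entry {
	unsigned int function;
	unsigned int index;
};

struct demo_caps {
	unsigned int nent;
	struct demo_entry entries[];	/* flexible array member */
};

int main(void)
{
	unsigned int nent = 4;

	/* Equivalent of struct_size(caps, entries, nent): header plus n elements. */
	size_t sz = sizeof(struct demo_caps) + (size_t)nent * sizeof(struct demo_entry);
	struct demo_caps *caps = calloc(1, sz);

	if (!caps)
		return 1;

	caps->nent = nent;
	printf("allocated %zu bytes for %u entries\n", sz, caps->nent);
	free(caps);
	return 0;
}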
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index ca39a9391db1..45b5183ccb36 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -36,8 +36,12 @@ struct kvm_tdx {
struct tdx_td td;
- /* For KVM_TDX_INIT_MEM_REGION. */
- atomic64_t nr_premapped;
+ /*
+ * Scratch pointer used to pass the source page to tdx_mem_page_add().
+ * Protected by slots_lock, and non-NULL only when mapping a private
+ * pfn via tdx_gmem_post_populate().
+ */
+ struct page *page_add_src;
/*
* Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do
@@ -67,7 +71,6 @@ struct vcpu_tdx {
u64 vp_enter_ret;
enum vcpu_tdx_state state;
- bool guest_entered;
u64 map_gpa_next;
u64 map_gpa_end;
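[Editorial aside, not part of the patch: the tdx.c hunks above also introduce a tdx_vm_state_guard cleanup class via DEFINE_CLASS()/CLASS() so that tdx_vm_ioctl() and tdx_vcpu_unlocked_ioctl() drop kvm->lock, the vCPU mutexes, and slots_lock on every return path. Below is a rough userspace sketch of the same scope-based cleanup idea, using the compiler's cleanup attribute and a pthread mutex instead of the kernel's <linux/cleanup.h> machinery; demo_vm, VM_STATE_GUARD, and the lock/unlock helpers are illustrative names, and the ERR_PTR-based failure handling of the real guard is omitted.]

#include <pthread.h>
#include <stdio.h>

struct demo_vm {
	pthread_mutex_t lock;
};

static struct demo_vm *vm_state_lock(struct demo_vm *vm)
{
	pthread_mutex_lock(&vm->lock);
	return vm;
}

/* Destructor invoked by the cleanup attribute with a pointer to the guard. */
static void vm_state_unlock(struct demo_vm **vmp)
{
	pthread_mutex_unlock(&(*vmp)->lock);
}

/* Declares a guard variable whose destructor runs when the scope exits. */
#define VM_STATE_GUARD(name, vm) \
	struct demo_vm *name __attribute__((cleanup(vm_state_unlock))) = vm_state_lock(vm)

int main(void)
{
	struct demo_vm vm = { .lock = PTHREAD_MUTEX_INITIALIZER };

	{
		VM_STATE_GUARD(guard, &vm);
		(void)guard;	/* silence unused-variable warnings */
		printf("lock held inside the scope\n");
	}	/* vm_state_unlock() runs here automatically */

	printf("lock released after the scope\n");
	return 0;
}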
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index bc255d709d8a..4426d34811fc 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -71,6 +71,7 @@
* @regs: unsigned long * (to guest registers)
* @flags: VMX_RUN_VMRESUME: use VMRESUME instead of VMLAUNCH
* VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl
+ * VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO: vCPU can access host MMIO
*
* Returns:
* 0 on VM-Exit, 1 on VM-Fail
@@ -92,7 +93,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
/* Save @vmx for SPEC_CTRL handling */
push %_ASM_ARG1
- /* Save @flags for SPEC_CTRL handling */
+ /* Save @flags (used for VMLAUNCH vs. VMRESUME and mitigations). */
push %_ASM_ARG3
/*
@@ -101,9 +102,6 @@ SYM_FUNC_START(__vmx_vcpu_run)
*/
push %_ASM_ARG2
- /* Copy @flags to EBX, _ASM_ARG3 is volatile. */
- mov %_ASM_ARG3L, %ebx
-
lea (%_ASM_SP), %_ASM_ARG2
call vmx_update_host_rsp
@@ -118,13 +116,23 @@ SYM_FUNC_START(__vmx_vcpu_run)
* and vmentry.
*/
mov 2*WORD_SIZE(%_ASM_SP), %_ASM_DI
- movl VMX_spec_ctrl(%_ASM_DI), %edi
- movl PER_CPU_VAR(x86_spec_ctrl_current), %esi
- cmp %edi, %esi
+#ifdef CONFIG_X86_64
+ mov VMX_spec_ctrl(%rdi), %rdx
+ cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx
je .Lspec_ctrl_done
+ movl %edx, %eax
+ shr $32, %rdx
+#else
+ mov VMX_spec_ctrl(%edi), %eax
+ mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx
+ xor %eax, %ecx
+ mov VMX_spec_ctrl + 4(%edi), %edx
+ mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edi
+ xor %edx, %edi
+ or %edi, %ecx
+ je .Lspec_ctrl_done
+#endif
mov $MSR_IA32_SPEC_CTRL, %ecx
- xor %edx, %edx
- mov %edi, %eax
wrmsr
.Lspec_ctrl_done:
@@ -137,9 +145,6 @@ SYM_FUNC_START(__vmx_vcpu_run)
/* Load @regs to RAX. */
mov (%_ASM_SP), %_ASM_AX
- /* Check if vmlaunch or vmresume is needed */
- bt $VMX_RUN_VMRESUME_SHIFT, %ebx
-
/* Load guest registers. Don't clobber flags. */
mov VCPU_RCX(%_ASM_AX), %_ASM_CX
mov VCPU_RDX(%_ASM_AX), %_ASM_DX
@@ -160,11 +165,23 @@ SYM_FUNC_START(__vmx_vcpu_run)
/* Load guest RAX. This kills the @regs pointer! */
mov VCPU_RAX(%_ASM_AX), %_ASM_AX
- /* Clobbers EFLAGS.ZF */
- CLEAR_CPU_BUFFERS
-
- /* Check EFLAGS.CF from the VMX_RUN_VMRESUME bit test above. */
- jnc .Lvmlaunch
+ /*
+ * Note, ALTERNATIVE_2 works in reverse order. If CLEAR_CPU_BUF_VM is
+ * enabled, do VERW unconditionally. If CPU_BUF_VM_MMIO is enabled,
+ * check @flags to see if the vCPU has access to host MMIO, and if so,
+ * do VERW. Else, do nothing (no mitigations needed/enabled).
+ */
+ ALTERNATIVE_2 "", \
+ __stringify(testl $VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO, WORD_SIZE(%_ASM_SP); \
+ jz .Lskip_mmio_verw; \
+ VERW; \
+ .Lskip_mmio_verw:), \
+ X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO, \
+ __stringify(VERW), X86_FEATURE_CLEAR_CPU_BUF_VM
+
+ /* Check @flags to see if VMLAUNCH or VMRESUME is needed. */
+ testl $VMX_RUN_VMRESUME, WORD_SIZE(%_ASM_SP)
+ jz .Lvmlaunch
/*
* After a successful VMRESUME/VMLAUNCH, control flow "magically"
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 91b6f2f3edc2..4cbe8c84b636 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -203,6 +203,7 @@ module_param(pt_mode, int, S_IRUGO);
struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
+#ifdef CONFIG_CPU_MITIGATIONS
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);
@@ -225,7 +226,7 @@ static const struct {
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;
-static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
+static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
struct page *page;
unsigned int i;
@@ -302,6 +303,26 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
return 0;
}
+static int vmx_setup_l1d_flush(void)
+{
+ /*
+	 * Hand in the mitigation value that was stored by the pre-module-init
+	 * parameter parser. If no parameter was given, it will contain 'auto',
+	 * which is turned into the default 'cond' mitigation mode.
+ */
+ return __vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+}
+
+static void vmx_cleanup_l1d_flush(void)
+{
+ if (vmx_l1d_flush_pages) {
+ free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
+ vmx_l1d_flush_pages = NULL;
+ }
+ /* Restore state so sysfs ignores VMX */
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+
static int vmentry_l1d_flush_parse(const char *s)
{
unsigned int i;
@@ -339,7 +360,7 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
}
mutex_lock(&vmx_l1d_flush_mutex);
- ret = vmx_setup_l1d_flush(l1tf);
+ ret = __vmx_setup_l1d_flush(l1tf);
mutex_unlock(&vmx_l1d_flush_mutex);
return ret;
}
@@ -352,6 +373,101 @@ static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}
+/*
+ * Software based L1D cache flush which is used when microcode providing
+ * the cache control MSR is not loaded.
+ *
+ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
+ * flushing it requires reading in 64 KiB because the replacement algorithm
+ * is not exactly LRU. This could be sized at runtime via topology
+ * information, but as all relevant affected CPUs have a 32 KiB L1D cache
+ * there is no point in doing so.
+ */
+static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ if (!static_branch_unlikely(&vmx_l1d_should_flush))
+ return;
+
+ /*
+ * This code is only executed when the flush mode is 'cond' or
+	 * 'always'.
+ */
+ if (static_branch_likely(&vmx_l1d_flush_cond)) {
+ /*
+		 * Clear the per-cpu flush bit; it gets set again if the vCPU
+ * is reloaded, i.e. if the vCPU is scheduled out or if KVM
+ * exits to userspace, or if KVM reaches one of the unsafe
+ * VMEXIT handlers, e.g. if KVM calls into the emulator,
+ * or from the interrupt handlers.
+ */
+ if (!kvm_get_cpu_l1tf_flush_l1d())
+ return;
+ kvm_clear_cpu_l1tf_flush_l1d();
+ }
+
+ vcpu->stat.l1d_flush++;
+
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ return;
+ }
+
+ asm volatile(
+ /* First ensure the pages are in the TLB */
+ "xorl %%eax, %%eax\n"
+ ".Lpopulate_tlb:\n\t"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $4096, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lpopulate_tlb\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "cpuid\n\t"
+ /* Now fill the cache */
+ "xorl %%eax, %%eax\n"
+ ".Lfill_cache:\n"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $64, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lfill_cache\n\t"
+ "lfence\n"
+ :: [flush_pages] "r" (vmx_l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ebx", "ecx", "edx");
+}
+
+#else /* CONFIG_CPU_MITIGATIONS */
+static int vmx_setup_l1d_flush(void)
+{
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER;
+ return 0;
+}
+static void vmx_cleanup_l1d_flush(void)
+{
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+}
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+ pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n");
+ return 0;
+}
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+ return sysfs_emit(s, "never\n");
+}
+#endif
+
+static const struct kernel_param_ops vmentry_l1d_flush_ops = {
+ .set = vmentry_l1d_flush_set,
+ .get = vmentry_l1d_flush_get,
+};
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
+
static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
{
u64 msr;
@@ -404,12 +520,6 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
vmx->disable_fb_clear = false;
}
-static const struct kernel_param_ops vmentry_l1d_flush_ops = {
- .set = vmentry_l1d_flush_set,
- .get = vmentry_l1d_flush_get,
-};
-module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
-
static u32 vmx_segment_access_rights(struct kvm_segment *var);
void vmx_vmexit(void);
@@ -752,7 +862,7 @@ static void __loaded_vmcs_clear(void *arg)
loaded_vmcs->launched = 0;
}
-void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
int cpu = loaded_vmcs->cpu;
@@ -903,7 +1013,7 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
flags |= VMX_RUN_SAVE_SPEC_CTRL;
- if (static_branch_unlikely(&cpu_buf_vm_clear) &&
+ if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) &&
kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
@@ -3219,6 +3329,40 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
return to_vmx(vcpu)->vpid;
}
+static u64 construct_eptp(hpa_t root_hpa)
+{
+ u64 eptp = root_hpa | VMX_EPTP_MT_WB;
+ struct kvm_mmu_page *root;
+
+ if (kvm_mmu_is_dummy_root(root_hpa))
+ return eptp | VMX_EPTP_PWL_4;
+
+ /*
+ * EPT roots should always have an associated MMU page. Return a "bad"
+	 * EPTP to induce VM-Fail instead of continuing on in an unknown state.
+ */
+ root = root_to_sp(root_hpa);
+ if (WARN_ON_ONCE(!root))
+ return INVALID_PAGE;
+
+ eptp |= (root->role.level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+
+ if (enable_ept_ad_bits && !root->role.ad_disabled)
+ eptp |= VMX_EPTP_AD_ENABLE_BIT;
+
+ return eptp;
+}
+
+static void vmx_flush_tlb_ept_root(hpa_t root_hpa)
+{
+ u64 eptp = construct_eptp(root_hpa);
+
+ if (VALID_PAGE(eptp))
+ ept_sync_context(eptp);
+ else
+ ept_sync_global();
+}
+
void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -3229,8 +3373,7 @@ void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
return;
if (enable_ept)
- ept_sync_context(construct_eptp(vcpu, root_hpa,
- mmu->root_role.level));
+ vmx_flush_tlb_ept_root(root_hpa);
else
vpid_sync_context(vmx_get_current_vpid(vcpu));
}
@@ -3396,30 +3539,16 @@ static int vmx_get_max_ept_level(void)
return 4;
}
-u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
-{
- u64 eptp = VMX_EPTP_MT_WB;
-
- eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
-
- if (enable_ept_ad_bits &&
- (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
- eptp |= VMX_EPTP_AD_ENABLE_BIT;
- eptp |= root_hpa;
-
- return eptp;
-}
-
void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
unsigned long guest_cr3;
- u64 eptp;
if (enable_ept) {
- eptp = construct_eptp(vcpu, root_hpa, root_level);
- vmcs_write64(EPT_POINTER, eptp);
+ KVM_MMU_WARN_ON(root_to_sp(root_hpa) &&
+ root_level != root_to_sp(root_hpa)->role.level);
+ vmcs_write64(EPT_POINTER, construct_eptp(root_hpa));
hv_track_root_tdp(vcpu, root_hpa);
@@ -6631,15 +6760,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
unexpected_vmexit:
- vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
- exit_reason.full);
dump_vmcs(vcpu);
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
- KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = exit_reason.full;
- vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full);
return 0;
}
@@ -6661,77 +6783,6 @@ int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return ret;
}
-/*
- * Software based L1D cache flush which is used when microcode providing
- * the cache control MSR is not loaded.
- *
- * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
- * flush it is required to read in 64 KiB because the replacement algorithm
- * is not exactly LRU. This could be sized at runtime via topology
- * information but as all relevant affected CPUs have 32KiB L1D cache size
- * there is no point in doing so.
- */
-static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
-{
- int size = PAGE_SIZE << L1D_CACHE_ORDER;
-
- /*
- * This code is only executed when the flush mode is 'cond' or
- * 'always'
- */
- if (static_branch_likely(&vmx_l1d_flush_cond)) {
- bool flush_l1d;
-
- /*
- * Clear the per-vcpu flush bit, it gets set again if the vCPU
- * is reloaded, i.e. if the vCPU is scheduled out or if KVM
- * exits to userspace, or if KVM reaches one of the unsafe
- * VMEXIT handlers, e.g. if KVM calls into the emulator.
- */
- flush_l1d = vcpu->arch.l1tf_flush_l1d;
- vcpu->arch.l1tf_flush_l1d = false;
-
- /*
- * Clear the per-cpu flush bit, it gets set again from
- * the interrupt handlers.
- */
- flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
- kvm_clear_cpu_l1tf_flush_l1d();
-
- if (!flush_l1d)
- return;
- }
-
- vcpu->stat.l1d_flush++;
-
- if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
- native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
- return;
- }
-
- asm volatile(
- /* First ensure the pages are in the TLB */
- "xorl %%eax, %%eax\n"
- ".Lpopulate_tlb:\n\t"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $4096, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lpopulate_tlb\n\t"
- "xorl %%eax, %%eax\n\t"
- "cpuid\n\t"
- /* Now fill the cache */
- "xorl %%eax, %%eax\n"
- ".Lfill_cache:\n"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $64, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lfill_cache\n\t"
- "lfence\n"
- :: [flush_pages] "r" (vmx_l1d_flush_pages),
- [size] "r" (size)
- : "eax", "ebx", "ecx", "edx");
-}
-
void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -7050,10 +7101,19 @@ void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
if (to_vt(vcpu)->emulation_required)
return;
- if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT)
+ switch (vmx_get_exit_reason(vcpu).basic) {
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
- else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI)
+ break;
+ case EXIT_REASON_EXCEPTION_NMI:
handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
+ break;
+ case EXIT_REASON_MCE_DURING_VMENTRY:
+ kvm_machine_check();
+ break;
+ default:
+ break;
+ }
}
/*
@@ -7328,21 +7388,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
guest_state_enter_irqoff();
- /*
- * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW
- * mitigation for MDS is done late in VMentry and is still
- * executed in spite of L1D Flush. This is because an extra VERW
- * should not matter much after the big hammer L1D Flush.
- *
- * cpu_buf_vm_clear is used when system is not vulnerable to MDS/TAA,
- * and is affected by MMIO Stale Data. In such cases mitigation in only
- * needed against an MMIO capable guest.
- */
- if (static_branch_unlikely(&vmx_l1d_should_flush))
- vmx_l1d_flush(vcpu);
- else if (static_branch_unlikely(&cpu_buf_vm_clear) &&
- (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO))
- x86_clear_cpu_buffers();
+ vmx_l1d_flush(vcpu);
vmx_disable_fb_clear(vmx);
@@ -7454,8 +7500,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
- kvm_load_guest_xsave_state(vcpu);
-
pt_guest_enter(vmx);
atomic_switch_perf_msrs(vmx);
@@ -7499,8 +7543,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
pt_guest_exit(vmx);
- kvm_load_host_xsave_state(vcpu);
-
if (is_guest_mode(vcpu)) {
/*
* Track VMLAUNCH/VMRESUME that have made past guest state
@@ -7516,9 +7558,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
if (unlikely(vmx->fail))
return EXIT_FASTPATH_NONE;
- if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
- kvm_machine_check();
-
trace_kvm_exit(vcpu, KVM_ISA_VMX);
if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))
@@ -8679,16 +8718,6 @@ __init int vmx_hardware_setup(void)
return r;
}
-static void vmx_cleanup_l1d_flush(void)
-{
- if (vmx_l1d_flush_pages) {
- free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
- vmx_l1d_flush_pages = NULL;
- }
- /* Restore state so sysfs ignores VMX */
- l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
-}
-
void vmx_exit(void)
{
allow_smaller_maxphyaddr = false;
@@ -8724,14 +8753,8 @@ int __init vmx_init(void)
if (r)
return r;
- /*
- * Must be called after common x86 init so enable_ept is properly set
- * up. Hand the parameter mitigation value in which was stored in
- * the pre module init parser. If no parameter was given, it will
- * contain 'auto' which will be turned into the default 'cond'
- * mitigation mode.
- */
- r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+ /* Must be called after common x86 init so enable_ept is setup. */
+ r = vmx_setup_l1d_flush();
if (r)
goto err_l1d_flush;
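[Editorial aside, not part of the patch: for context on the construct_eptp() rework above, an EPT pointer packs the memory type, the page-walk length (encoded as levels minus one), the accessed/dirty-enable bit, and the root table physical address into one 64-bit value. A standalone sketch of that packing, using local DEMO_EPTP_* stand-ins rather than the kernel's VMX_EPTP_* definitions (bit positions per the Intel SDM), might look like:]

#include <stdint.h>
#include <stdio.h>

/* Local stand-ins for the EPTP field encodings (see the Intel SDM). */
#define DEMO_EPTP_MT_WB		0x6ULL		/* memory type: write-back */
#define DEMO_EPTP_PWL_4		(3ULL << 3)	/* 4-level walk, encoded as levels - 1 */
#define DEMO_EPTP_PWL_5		(4ULL << 3)	/* 5-level walk */
#define DEMO_EPTP_AD_ENABLE	(1ULL << 6)	/* enable accessed/dirty flags */

static uint64_t demo_construct_eptp(uint64_t root_pa, int levels, int ad_enabled)
{
	uint64_t eptp = root_pa | DEMO_EPTP_MT_WB;

	eptp |= (levels == 5) ? DEMO_EPTP_PWL_5 : DEMO_EPTP_PWL_4;
	if (ad_enabled)
		eptp |= DEMO_EPTP_AD_ENABLE;

	return eptp;
}

int main(void)
{
	/* A 4-level walk with A/D bits enabled and a page-aligned root. */
	printf("eptp = 0x%llx\n",
	       (unsigned long long)demo_construct_eptp(0x123456000ULL, 4, 1));
	return 0;
}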
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index ea93121029f9..bc3ed3145d7e 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -369,7 +369,6 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
void ept_save_pdptrs(struct kvm_vcpu *vcpu);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
@@ -681,7 +680,6 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
void free_vmcs(struct vmcs *vmcs);
int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
-void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs);
static inline struct vmcs *alloc_vmcs(bool shadow)
{
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 9697368d65b3..d09abeac2b56 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -73,7 +73,6 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
-void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu);
void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg);
@@ -149,6 +148,7 @@ int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
+int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c9c2aa6f4705..0c6d899d53dd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -159,9 +159,6 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(report_ignored_msrs);
unsigned int min_timer_period_us = 200;
module_param(min_timer_period_us, uint, 0644);
-static bool __read_mostly kvmclock_periodic_sync = true;
-module_param(kvmclock_periodic_sync, bool, 0444);
-
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, 0644);
@@ -212,7 +209,7 @@ struct kvm_user_return_msrs {
u32 __read_mostly kvm_nr_uret_msrs;
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_nr_uret_msrs);
static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
-static struct kvm_user_return_msrs __percpu *user_return_msrs;
+static DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs);
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
@@ -575,24 +572,26 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
vcpu->arch.apf.gfns[i] = ~0;
}
+static void kvm_destroy_user_return_msrs(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ WARN_ON_ONCE(per_cpu(user_return_msrs, cpu).registered);
+
+ kvm_nr_uret_msrs = 0;
+}
+
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
struct kvm_user_return_msrs *msrs
= container_of(urn, struct kvm_user_return_msrs, urn);
struct kvm_user_return_msr_values *values;
- unsigned long flags;
- /*
- * Disabling irqs at this point since the following code could be
- * interrupted and executed through kvm_arch_disable_virtualization_cpu()
- */
- local_irq_save(flags);
- if (msrs->registered) {
- msrs->registered = false;
- user_return_notifier_unregister(urn);
- }
- local_irq_restore(flags);
+ msrs->registered = false;
+ user_return_notifier_unregister(urn);
+
for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
values = &msrs->values[slot];
if (values->host != values->curr) {
@@ -643,7 +642,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_user_return_msr);
static void kvm_user_return_msr_cpu_online(void)
{
- struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
u64 value;
int i;
@@ -665,7 +664,7 @@ static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs)
int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
{
- struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
int err;
value = (value & mask) | (msrs->values[slot].host & ~mask);
@@ -681,24 +680,15 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr);
-void kvm_user_return_msr_update_cache(unsigned int slot, u64 value)
-{
- struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
-
- msrs->values[slot].curr = value;
- kvm_user_return_register_notifier(msrs);
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_user_return_msr_update_cache);
-
u64 kvm_get_user_return_msr(unsigned int slot)
{
- return this_cpu_ptr(user_return_msrs)->values[slot].curr;
+ return this_cpu_ptr(&user_return_msrs)->values[slot].curr;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_user_return_msr);
static void drop_user_return_notifiers(void)
{
- struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
if (msrs->registered)
kvm_on_user_return(&msrs->urn);
@@ -1045,6 +1035,13 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_require_dr);
+static bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
+{
+ u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
+
+ return (vcpu->arch.apf.msr_en_val & mask) == mask;
+}
+
static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
{
return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
@@ -1137,15 +1134,20 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
}
if ((cr0 ^ old_cr0) & X86_CR0_PG) {
- kvm_clear_async_pf_completion_queue(vcpu);
- kvm_async_pf_hash_reset(vcpu);
-
/*
* Clearing CR0.PG is defined to flush the TLB from the guest's
* perspective.
*/
if (!(cr0 & X86_CR0_PG))
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ /*
+ * Check for async #PF completion events when enabling paging,
+ * as the vCPU may have previously encountered async #PFs (it's
+ * entirely legal for the guest to toggle paging on/off without
+ * waiting for the async #PF queue to drain).
+ */
+ else if (kvm_pv_async_pf_enabled(vcpu))
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
}
if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
@@ -1203,20 +1205,27 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw);
-void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
+static void kvm_load_xfeatures(struct kvm_vcpu *vcpu, bool load_guest)
{
if (vcpu->arch.guest_state_protected)
return;
- if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE))
+ return;
- if (vcpu->arch.xcr0 != kvm_host.xcr0)
- xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+ if (vcpu->arch.xcr0 != kvm_host.xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK,
+ load_guest ? vcpu->arch.xcr0 : kvm_host.xcr0);
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
- vcpu->arch.ia32_xss != kvm_host.xss)
- wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss);
- }
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
+ vcpu->arch.ia32_xss != kvm_host.xss)
+ wrmsrq(MSR_IA32_XSS, load_guest ? vcpu->arch.ia32_xss : kvm_host.xss);
+}
+
+static void kvm_load_guest_pkru(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.guest_state_protected)
+ return;
if (cpu_feature_enabled(X86_FEATURE_PKU) &&
vcpu->arch.pkru != vcpu->arch.host_pkru &&
@@ -1224,9 +1233,8 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
wrpkru(vcpu->arch.pkru);
}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_guest_xsave_state);
-void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
+static void kvm_load_host_pkru(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.guest_state_protected)
return;
@@ -1238,19 +1246,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
if (vcpu->arch.pkru != vcpu->arch.host_pkru)
wrpkru(vcpu->arch.host_pkru);
}
-
- if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
-
- if (vcpu->arch.xcr0 != kvm_host.xcr0)
- xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
-
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
- vcpu->arch.ia32_xss != kvm_host.xss)
- wrmsrq(MSR_IA32_XSS, kvm_host.xss);
- }
-
}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_host_xsave_state);
#ifdef CONFIG_X86_64
static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
@@ -3505,27 +3501,17 @@ uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
/*
* kvmclock updates which are isolated to a given vcpu, such as
* vcpu->cpu migration, should not allow system_timestamp from
- * the rest of the vcpus to remain static. Otherwise ntp frequency
- * correction applies to one vcpu's system_timestamp but not
- * the others.
+ * the rest of the vcpus to remain static.
*
* So in those cases, request a kvmclock update for all vcpus.
- * We need to rate-limit these requests though, as they can
- * considerably slow guests that have a large number of vcpus.
- * The time for a remote vcpu to update its kvmclock is bound
- * by the delay we use to rate-limit the updates.
+	 * The worst case for a remote vCPU to update its kvmclock
+	 * is then bounded by the maximum nohz sleep latency.
*/
-
-#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
-
-static void kvmclock_update_fn(struct work_struct *work)
+static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
{
unsigned long i;
- struct delayed_work *dwork = to_delayed_work(work);
- struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
- kvmclock_update_work);
- struct kvm *kvm = container_of(ka, struct kvm, arch);
struct kvm_vcpu *vcpu;
+ struct kvm *kvm = v->kvm;
kvm_for_each_vcpu(i, vcpu, kvm) {
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -3533,29 +3519,6 @@ static void kvmclock_update_fn(struct work_struct *work)
}
}
-static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
-{
- struct kvm *kvm = v->kvm;
-
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
- schedule_delayed_work(&kvm->arch.kvmclock_update_work,
- KVMCLOCK_UPDATE_DELAY);
-}
-
-#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
-
-static void kvmclock_sync_fn(struct work_struct *work)
-{
- struct delayed_work *dwork = to_delayed_work(work);
- struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
- kvmclock_sync_work);
- struct kvm *kvm = container_of(ka, struct kvm, arch);
-
- schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
- schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
- KVMCLOCK_SYNC_PERIOD);
-}
-
/* These helpers are safe iff @msr is known to be an MCx bank MSR. */
static bool is_mci_control_msr(u32 msr)
{
@@ -3650,13 +3613,6 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
-static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
-{
- u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
-
- return (vcpu->arch.apf.msr_en_val & mask) == mask;
-}
-
static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
{
gpa_t gpa = data & ~0x3f;
@@ -4182,7 +4138,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
return 1;
if (data & 0x1) {
- vcpu->arch.apf.pageready_pending = false;
+ /*
+ * Pairs with the smp_mb__after_atomic() in
+ * kvm_arch_async_page_present_queued().
+ */
+ smp_store_mb(vcpu->arch.apf.pageready_pending, false);
+
kvm_check_async_pf_completion(vcpu);
}
break;
@@ -5188,7 +5149,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- vcpu->arch.l1tf_flush_l1d = true;
+ kvm_request_l1tf_flush_l1d();
if (vcpu->scheduled_out && pmu->version && pmu->event_count) {
pmu->need_cleanup = true;
@@ -7239,6 +7200,19 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
return 0;
}
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+
+ if (ioctl == KVM_MEMORY_ENCRYPT_OP &&
+ kvm_x86_ops.vcpu_mem_enc_unlocked_ioctl)
+ return kvm_x86_call(vcpu_mem_enc_unlocked_ioctl)(vcpu, argp);
+
+ return -ENOIOCTLCMD;
+}
+
int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
struct kvm *kvm = filp->private_data;
@@ -7998,7 +7972,7 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
unsigned int bytes, struct x86_exception *exception)
{
/* kvm_write_guest_virt_system can pull in tons of pages. */
- vcpu->arch.l1tf_flush_l1d = true;
+ kvm_request_l1tf_flush_l1d();
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
PFERR_WRITE_MASK, exception);
@@ -8842,6 +8816,14 @@ static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt));
}
+static int emulator_get_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr)
+{
+ if (index != XCR_XFEATURE_ENABLED_MASK)
+ return 1;
+ *xcr = emul_to_vcpu(ctxt)->arch.xcr0;
+ return 0;
+}
+
static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
{
return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
@@ -8914,6 +8896,7 @@ static const struct x86_emulate_ops emulate_ops = {
.is_smm = emulator_is_smm,
.leave_smm = emulator_leave_smm,
.triple_fault = emulator_triple_fault,
+ .get_xcr = emulator_get_xcr,
.set_xcr = emulator_set_xcr,
.get_untagged_addr = emulator_get_untagged_addr,
.is_canonical_addr = emulator_is_canonical_addr,
@@ -9109,6 +9092,18 @@ void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_event_vectoring_exit);
+void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason)
+{
+ vcpu_unimpl(vcpu, "unexpected exit reason 0x%llx\n", exit_reason);
+
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = exit_reason;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_unexpected_reason_exit);
+
static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
{
struct kvm *kvm = vcpu->kvm;
@@ -9337,6 +9332,23 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
return false;
}
+static bool is_soft_int_instruction(struct x86_emulate_ctxt *ctxt,
+ int emulation_type)
+{
+ u8 vector = EMULTYPE_GET_SOFT_INT_VECTOR(emulation_type);
+
+ switch (ctxt->b) {
+ case 0xcc:
+ return vector == BP_VECTOR;
+ case 0xcd:
+ return vector == ctxt->src.val;
+ case 0xce:
+ return vector == OF_VECTOR;
+ default:
+ return false;
+ }
+}
+
/*
* Decode an instruction for emulation. The caller is responsible for handling
* code breakpoints. Note, manually detecting code breakpoints is unnecessary
@@ -9394,7 +9406,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
return handle_emulation_failure(vcpu, emulation_type);
}
- vcpu->arch.l1tf_flush_l1d = true;
+ kvm_request_l1tf_flush_l1d();
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
kvm_clear_exception_queue(vcpu);
@@ -9447,6 +9459,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* injecting single-step #DBs.
*/
if (emulation_type & EMULTYPE_SKIP) {
+ if (emulation_type & EMULTYPE_SKIP_SOFT_INT &&
+ !is_soft_int_instruction(ctxt, emulation_type))
+ return 0;
+
if (ctxt->mode != X86EMUL_MODE_PROT64)
ctxt->eip = (u32)ctxt->_eip;
else
@@ -10031,17 +10047,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
return -ENOMEM;
}
- user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
- if (!user_return_msrs) {
- pr_err("failed to allocate percpu kvm_user_return_msrs\n");
- r = -ENOMEM;
- goto out_free_x86_emulator_cache;
- }
- kvm_nr_uret_msrs = 0;
-
r = kvm_mmu_vendor_module_init();
if (r)
- goto out_free_percpu;
+ goto out_free_x86_emulator_cache;
kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM);
kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P;
@@ -10066,6 +10074,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
rdmsrq(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities);
+ WARN_ON_ONCE(kvm_nr_uret_msrs);
+
r = ops->hardware_setup();
if (r != 0)
goto out_mmu_exit;
@@ -10138,9 +10148,8 @@ out_unwind_ops:
kvm_x86_ops.enable_virtualization_cpu = NULL;
kvm_x86_call(hardware_unsetup)();
out_mmu_exit:
+ kvm_destroy_user_return_msrs();
kvm_mmu_vendor_module_exit();
-out_free_percpu:
- free_percpu(user_return_msrs);
out_free_x86_emulator_cache:
kmem_cache_destroy(x86_emulator_cache);
return r;
@@ -10168,8 +10177,8 @@ void kvm_x86_vendor_exit(void)
cancel_work_sync(&pvclock_gtod_work);
#endif
kvm_x86_call(hardware_unsetup)();
+ kvm_destroy_user_return_msrs();
kvm_mmu_vendor_module_exit();
- free_percpu(user_return_msrs);
kmem_cache_destroy(x86_emulator_cache);
#ifdef CONFIG_KVM_XEN
static_key_deferred_flush(&kvm_xen_enabled);
@@ -11291,6 +11300,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_fpu.xfd_err)
wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
+ kvm_load_xfeatures(vcpu, true);
+
if (unlikely(vcpu->arch.switch_db_regs &&
!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) {
set_debugreg(DR7_FIXED_1, 7);
@@ -11319,6 +11330,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
guest_timing_enter_irqoff();
+ /*
+ * Swap PKRU with hardware breakpoints disabled to minimize the number
+ * of flows where non-KVM code can run with guest state loaded.
+ */
+ kvm_load_guest_pkru(vcpu);
+
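The guest value is loaded as late as possible before entry and the host value restored as early as possible after the exit loop (see the matching kvm_load_host_pkru() hunk below). A generic, hypothetical sketch of that bracketing pattern follows; the helpers model rdpkru()/wrpkru() with a plain variable and are not the kernel's implementation.

#include <stdint.h>

struct vcpu_model {
	uint32_t host_pkru;
	uint32_t guest_pkru;
};

/* Stand-ins for rdpkru()/wrpkru(); real code would use the ISA accessors. */
static uint32_t current_pkru;
static uint32_t read_pkru(void) { return current_pkru; }
static void write_pkru(uint32_t v) { current_pkru = v; }

static void load_guest_pkru(struct vcpu_model *v)
{
	v->host_pkru = read_pkru();
	if (v->guest_pkru != v->host_pkru)
		write_pkru(v->guest_pkru);
}

static void load_host_pkru(struct vcpu_model *v)
{
	v->guest_pkru = read_pkru();	/* the guest may have changed it */
	if (v->guest_pkru != v->host_pkru)
		write_pkru(v->host_pkru);
}

int main(void)
{
	struct vcpu_model v = { .host_pkru = 0x4, .guest_pkru = 0x0 };	/* arbitrary example values */

	write_pkru(v.host_pkru);
	load_guest_pkru(&v);	/* right before entering the guest */
	/* ... guest runs, may rewrite PKRU ... */
	load_host_pkru(&v);	/* right after the exit loop */
	return 0;
}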
for (;;) {
/*
* Assert that vCPU vs. VM APICv state is consistent. An APICv
@@ -11347,6 +11364,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
++vcpu->stat.exits;
}
+ kvm_load_host_pkru(vcpu);
+
/*
* Do this here before restoring debug registers on the host. And
* since we do this before handling the vmexit, a DR access vmexit
@@ -11377,6 +11396,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
+ kvm_load_xfeatures(vcpu, false);
+
/*
* Sync xfd before calling handle_exit_irqoff() which may
* rely on the fact that guest_fpu::xfd is up-to-date (e.g.
@@ -12734,8 +12755,6 @@ fail_mmu_destroy:
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
- struct kvm *kvm = vcpu->kvm;
-
if (mutex_lock_killable(&vcpu->mutex))
return;
vcpu_load(vcpu);
@@ -12746,10 +12765,6 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
vcpu->arch.msr_kvm_poll_control = 1;
mutex_unlock(&vcpu->mutex);
-
- if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
- schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
- KVMCLOCK_SYNC_PERIOD);
}
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -13088,7 +13103,21 @@ int kvm_arch_enable_virtualization_cpu(void)
void kvm_arch_disable_virtualization_cpu(void)
{
kvm_x86_call(disable_virtualization_cpu)();
- drop_user_return_notifiers();
+
+ /*
+ * Leave the user-return notifiers as-is when disabling virtualization
+ * for reboot, i.e. when disabling via IPI function call, and instead
+ * pin kvm.ko (if it's a module) to defend against use-after-free (in
+ * the *very* unlikely scenario module unload is racing with reboot).
+ * On a forced reboot, tasks aren't frozen before shutdown, and so KVM
+ * could be actively modifying user-return MSR state when the IPI to
+ * disable virtualization arrives. Handle the extreme edge case here
+ * instead of trying to account for it in the normal flows.
+ */
+ if (in_task() || WARN_ON_ONCE(!kvm_rebooting))
+ drop_user_return_notifiers();
+ else
+ __module_get(THIS_MODULE);
}
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
@@ -13160,9 +13189,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.hv_root_tdp = INVALID_PAGE;
#endif
- INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
- INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
-
kvm_apicv_init(kvm);
kvm_hv_init_vm(kvm);
kvm_xen_init_vm(kvm);
@@ -13269,9 +13295,6 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
* is unsafe, i.e. will lead to use-after-free. The PIT also needs to
* be stopped before IRQ routing is freed.
*/
- cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
- cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
-
#ifdef CONFIG_KVM_IOAPIC
kvm_free_pit(kvm);
#endif
@@ -13888,7 +13911,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
if ((work->wakeup_all || work->notpresent_injected) &&
kvm_pv_async_pf_enabled(vcpu) &&
!apf_put_user_ready(vcpu, work->arch.token)) {
- vcpu->arch.apf.pageready_pending = true;
+ WRITE_ONCE(vcpu->arch.apf.pageready_pending, true);
kvm_apic_set_irq(vcpu, &irq, NULL);
}
@@ -13899,7 +13922,11 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
{
kvm_make_request(KVM_REQ_APF_READY, vcpu);
- if (!vcpu->arch.apf.pageready_pending)
+
+ /* Pairs with smp_store_mb() in kvm_set_msr_common(). */
+ smp_mb__after_atomic();
+
+ if (!READ_ONCE(vcpu->arch.apf.pageready_pending))
kvm_vcpu_kick(vcpu);
}
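The pairing is the classic store-buffering pattern: this side sets the APF-ready request, executes a full barrier, then tests pageready_pending, while the ACK side in kvm_set_msr_common() clears pageready_pending with smp_store_mb() and, as the pairing implies, re-checks for queued work afterwards, so at least one side always observes the other. Below is a user-space C11 model of the two sides; the names are illustrative and counters stand in for kvm_vcpu_kick() and the request processing.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool apf_ready_request;	/* models KVM_REQ_APF_READY */
static atomic_bool pageready_pending;	/* models vcpu->arch.apf.pageready_pending */
static _Atomic int kicks, completions;

/* Producer side: analogue of kvm_arch_async_page_present_queued(). */
static void queue_page_ready(void)
{
	atomic_store_explicit(&apf_ready_request, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__after_atomic() */
	if (!atomic_load_explicit(&pageready_pending, memory_order_relaxed))
		kicks++;				/* kvm_vcpu_kick() */
}

/* ACK side: models the smp_store_mb() in the MSR write path. */
static void ack_page_ready(void)
{
	atomic_store_explicit(&pageready_pending, false, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_store_mb() */
	if (atomic_load_explicit(&apf_ready_request, memory_order_relaxed))
		completions++;				/* process queued work */
}

int main(void)
{
	ack_page_ready();	/* pending was already clear, nothing queued yet */
	queue_page_ready();	/* pending is clear, so the vCPU gets kicked */
	return 0;
}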
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f3dc77f006f9..fdab0ad49098 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -420,6 +420,20 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
return !(kvm->arch.disabled_quirks & quirk);
}
+static __always_inline void kvm_request_l1tf_flush_l1d(void)
+{
+#if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL)
+ /*
+ * Use a raw write to set the per-CPU flag, as KVM will ensure a flush
+ * even if preemption is currently enabled. If the current vCPU task
+ * is migrated to a different CPU (or userspace runs the vCPU on a
+ * different task) before the next VM-Entry, then kvm_arch_vcpu_load()
+ * will request a flush on the new CPU.
+ */
+ raw_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
+#endif
+}
+
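The helper replaces the open-coded vcpu->arch.l1tf_flush_l1d writes converted earlier in this patch: callers raise a per-CPU "flush L1D before the next VM-Entry" flag and the entry path consumes it, with task migration covered because the vCPU-load path requests a flush on the new CPU. A small user-space model of that raise/consume pattern (single CPU, illustrative names only):

#include <stdbool.h>
#include <stdio.h>

/* Models the per-CPU irq_stat.kvm_cpu_l1tf_flush_l1d flag (one CPU here). */
static bool l1tf_flush_l1d;

static void request_l1tf_flush(void)
{
	/*
	 * Cheap flag write; harmless if the task later migrates, because the
	 * vCPU-load path on the new CPU requests its own flush.
	 */
	l1tf_flush_l1d = true;
}

static void vmenter_l1d_flush(void)
{
	if (l1tf_flush_l1d) {
		l1tf_flush_l1d = false;
		puts("flush L1D before VM-Entry");	/* stands in for the real flush */
	}
}

int main(void)
{
	request_l1tf_flush();	/* e.g. after emulating a guest write */
	vmenter_l1d_flush();
	vmenter_l1d_flush();	/* no second flush until requested again */
	return 0;
}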
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
u64 get_kvmclock_ns(struct kvm *kvm);
@@ -622,8 +636,6 @@ static inline void kvm_machine_check(void)
#endif
}
-void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
-void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
int kvm_spec_ctrl_test_value(u64 value);
int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
struct x86_exception *e);