diff options
| author | Paolo Bonzini <pbonzini@redhat.com> | 2025-11-26 09:36:37 +0100 |
|---|---|---|
| committer | Paolo Bonzini <pbonzini@redhat.com> | 2025-11-26 09:36:37 +0100 |
| commit | de8e8ebb1a7c5e2243fdb0409418484501e3b9b2 (patch) | |
| tree | 6a5e82379ea71f61edab3075cf10177e0ff63eb9 | |
| parent | adc99a6cfcf76d670272dea64bbc2d43ecd12a2f (diff) | |
| parent | 398180f93cf3c7bb0ee3f512b139ad01843f3ddf (diff) | |
Merge tag 'kvm-x86-tdx-6.19' of https://github.com/kvm-x86/linux into HEAD
KVM TDX changes for 6.19:
- Overhaul the TDX code to address systemic races where KVM (acting on behalf
of userspace) could inadvertantly trigger lock contention in the TDX-Module,
which KVM was either working around in weird, ugly ways, or was simply
oblivious to (as proven by Yan tripping several KVM_BUG_ON()s with clever
selftests).
- Fix a bug where KVM could corrupt a vCPU's cpu_list when freeing a vCPU if
creating said vCPU failed partway through.
- Fix a few sparse warnings (bad annotation, 0 != NULL).
- Use struct_size() to simplify copying capabilities to userspace.
| -rw-r--r-- | arch/arm64/kvm/arm.c | 6 | ||||
| -rw-r--r-- | arch/loongarch/kvm/Kconfig | 1 | ||||
| -rw-r--r-- | arch/loongarch/kvm/vcpu.c | 4 | ||||
| -rw-r--r-- | arch/mips/kvm/Kconfig | 1 | ||||
| -rw-r--r-- | arch/mips/kvm/mips.c | 4 | ||||
| -rw-r--r-- | arch/powerpc/kvm/Kconfig | 1 | ||||
| -rw-r--r-- | arch/powerpc/kvm/powerpc.c | 4 | ||||
| -rw-r--r-- | arch/riscv/kvm/Kconfig | 1 | ||||
| -rw-r--r-- | arch/riscv/kvm/vcpu.c | 4 | ||||
| -rw-r--r-- | arch/s390/kvm/Kconfig | 1 | ||||
| -rw-r--r-- | arch/s390/kvm/kvm-s390.c | 4 | ||||
| -rw-r--r-- | arch/x86/include/asm/kvm-x86-ops.h | 1 | ||||
| -rw-r--r-- | arch/x86/include/asm/kvm_host.h | 7 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu.h | 3 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 87 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 50 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx/main.c | 9 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx/tdx.c | 710 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx/tdx.h | 8 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx/x86_ops.h | 1 | ||||
| -rw-r--r-- | arch/x86/kvm/x86.c | 13 | ||||
| -rw-r--r-- | include/linux/kvm_host.h | 14 | ||||
| -rw-r--r-- | virt/kvm/Kconfig | 3 | ||||
| -rw-r--r-- | virt/kvm/kvm_main.c | 6 |
24 files changed, 495 insertions, 448 deletions
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 052bf0d4d0b0..e7140c19771b 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1835,6 +1835,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return r; } +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + return -ENOIOCTLCMD; +} + void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig index ae64bbdf83a7..ed4f724db774 100644 --- a/arch/loongarch/kvm/Kconfig +++ b/arch/loongarch/kvm/Kconfig @@ -25,7 +25,6 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_MSI select HAVE_KVM_READONLY_MEM - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 1245a6b35896..72dc5726259b 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -1473,8 +1473,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) return 0; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { void __user *argp = (void __user *)arg; struct kvm_vcpu *vcpu = filp->private_data; diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index ab57221fa4dd..cc13cc35f208 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM select EXPORT_UASM select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_MMIO select KVM_GENERIC_MMU_NOTIFIER select KVM_GENERIC_HARDWARE_ENABLING diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a75587018f44..b0fb92fda4d4 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -895,8 +895,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 2f2702c867f7..c9a2d50ff1b0 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,7 +20,6 @@ if VIRTUALIZATION config KVM bool select KVM_COMMON - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_VFIO select HAVE_KVM_IRQ_BYPASS diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2ba057171ebe..9a89a6d98f97 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -2028,8 +2028,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return -EINVAL; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index c50328212917..77379f77840a 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -23,7 +23,6 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_MSI - select HAVE_KVM_VCPU_ASYNC_IOCTL select HAVE_KVM_READONLY_MEM select HAVE_KVM_DIRTY_RING_ACQ_REL select KVM_COMMON diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 5ce35aba6069..2739d5b5ec80 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -238,8 +238,8 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index cae908d64550..96d16028e8b7 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -20,7 +20,6 @@ config KVM def_tristate y prompt "Kernel-based Virtual Machine (KVM) support" select HAVE_KVM_CPU_RELAX_INTERCEPT - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_ASYNC_PF select KVM_ASYNC_PF_SYNC select KVM_COMMON diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 16ba04062854..8c4caa5f2fcd 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -5730,8 +5730,8 @@ static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index fdf178443f85..de709fb5bd76 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -128,6 +128,7 @@ KVM_X86_OP(enable_smi_window) KVM_X86_OP_OPTIONAL(dev_get_attr) KVM_X86_OP_OPTIONAL(mem_enc_ioctl) KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl) +KVM_X86_OP_OPTIONAL(vcpu_mem_enc_unlocked_ioctl) KVM_X86_OP_OPTIONAL(mem_enc_register_region) KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 103af57e1060..6d41249e86af 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1843,15 +1843,15 @@ struct kvm_x86_ops { void *external_spt); /* Update the external page table from spte getting set. */ int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, - kvm_pfn_t pfn_for_gfn); + u64 mirror_spte); /* Update external page tables for page table about to be freed. */ int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level, void *external_spt); /* Update external page table from spte getting removed, and flush TLB. */ - int (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, - kvm_pfn_t pfn_for_gfn); + void (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, + u64 mirror_spte); bool (*has_wbinvd_exit)(void); @@ -1909,6 +1909,7 @@ struct kvm_x86_ops { int (*dev_get_attr)(u32 group, u64 attr, u64 *val); int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp); + int (*vcpu_mem_enc_unlocked_ioctl)(struct kvm_vcpu *vcpu, void __user *argp); int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index c272ef269b96..830f46145692 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -255,8 +255,7 @@ extern bool tdp_mmu_enabled; #define tdp_mmu_enabled false #endif -bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa); -int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level); +int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn); static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c15ac5329cf8..02c450686b4a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4924,7 +4924,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return direct_page_fault(vcpu, fault); } -int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level) +static int kvm_tdp_page_prefault(struct kvm_vcpu *vcpu, gpa_t gpa, + u64 error_code, u8 *level) { int r; @@ -4966,7 +4967,6 @@ int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level return -EIO; } } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_map_page); long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range) @@ -5002,7 +5002,7 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, * Shadow paging uses GVA for kvm page fault, so restrict to * two-dimensional paging. */ - r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level); + r = kvm_tdp_page_prefault(vcpu, range->gpa | direct_bits, error_code, &level); if (r < 0) return r; @@ -5014,6 +5014,86 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, return min(range->size, end - range->gpa); } +#ifdef CONFIG_KVM_GUEST_MEMFD +static void kvm_assert_gmem_invalidate_lock_held(struct kvm_memory_slot *slot) +{ +#ifdef CONFIG_PROVE_LOCKING + if (WARN_ON_ONCE(!kvm_slot_has_gmem(slot)) || + WARN_ON_ONCE(!slot->gmem.file) || + WARN_ON_ONCE(!file_count(slot->gmem.file))) + return; + + lockdep_assert_held(&file_inode(slot->gmem.file)->i_mapping->invalidate_lock); +#endif +} + +int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) +{ + struct kvm_page_fault fault = { + .addr = gfn_to_gpa(gfn), + .error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS, + .prefetch = true, + .is_tdp = true, + .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm), + + .max_level = PG_LEVEL_4K, + .req_level = PG_LEVEL_4K, + .goal_level = PG_LEVEL_4K, + .is_private = true, + + .gfn = gfn, + .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn), + .pfn = pfn, + .map_writable = true, + }; + struct kvm *kvm = vcpu->kvm; + int r; + + lockdep_assert_held(&kvm->slots_lock); + + /* + * Mapping a pre-determined private pfn is intended only for use when + * populating a guest_memfd instance. Assert that the slot is backed + * by guest_memfd and that the gmem instance's invalidate_lock is held. + */ + kvm_assert_gmem_invalidate_lock_held(fault.slot); + + if (KVM_BUG_ON(!tdp_mmu_enabled, kvm)) + return -EIO; + + if (kvm_gfn_is_write_tracked(kvm, fault.slot, fault.gfn)) + return -EPERM; + + r = kvm_mmu_reload(vcpu); + if (r) + return r; + + r = mmu_topup_memory_caches(vcpu, false); + if (r) + return r; + + do { + if (signal_pending(current)) + return -EINTR; + + if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu)) + return -EIO; + + cond_resched(); + + guard(read_lock)(&kvm->mmu_lock); + + r = kvm_tdp_mmu_map(vcpu, &fault); + } while (r == RET_PF_RETRY); + + if (r != RET_PF_FIXED) + return -EIO; + + return 0; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_map_private_pfn); +#endif + static void nonpaging_init_context(struct kvm_mmu *context) { context->page_fault = nonpaging_page_fault; @@ -5997,7 +6077,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) out: return r; } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c5734ca5c17d..9c26038f6b77 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -362,9 +362,6 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp) static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte, int level) { - kvm_pfn_t old_pfn = spte_to_pfn(old_spte); - int ret; - /* * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external * PTs are removed in a special order, involving free_external_spt(). @@ -377,9 +374,8 @@ static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte, /* Zapping leaf spte is allowed only when write lock is held. */ lockdep_assert_held_write(&kvm->mmu_lock); - /* Because write lock is held, operation should success. */ - ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn); - KVM_BUG_ON(ret, kvm); + + kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte); } /** @@ -519,7 +515,6 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp bool was_present = is_shadow_present_pte(old_spte); bool is_present = is_shadow_present_pte(new_spte); bool is_leaf = is_present && is_last_spte(new_spte, level); - kvm_pfn_t new_pfn = spte_to_pfn(new_spte); int ret = 0; KVM_BUG_ON(was_present, kvm); @@ -538,7 +533,7 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp * external page table, or leaf. */ if (is_leaf) { - ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn); + ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte); } else { void *external_spt = get_external_spt(gfn, new_spte, level); @@ -1273,6 +1268,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) struct kvm_mmu_page *sp; int ret = RET_PF_RETRY; + KVM_MMU_WARN_ON(!root || root->role.invalid); + kvm_mmu_hugepage_adjust(vcpu, fault); trace_kvm_mmu_spte_requested(fault); @@ -1939,13 +1936,16 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, * * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. */ -static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, - struct kvm_mmu_page *root) +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level) { + struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); struct tdp_iter iter; gfn_t gfn = addr >> PAGE_SHIFT; int leaf = -1; + *root_level = vcpu->arch.mmu->root_role.level; + for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; @@ -1954,36 +1954,6 @@ static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, return leaf; } -int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, - int *root_level) -{ - struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); - *root_level = vcpu->arch.mmu->root_role.level; - - return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root); -} - -bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa) -{ - struct kvm *kvm = vcpu->kvm; - bool is_direct = kvm_is_addr_direct(kvm, gpa); - hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa : - vcpu->arch.mmu->mirror_root_hpa; - u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte; - int leaf; - - lockdep_assert_held(&kvm->mmu_lock); - rcu_read_lock(); - leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root)); - rcu_read_unlock(); - if (leaf < 0) - return false; - - spte = sptes[leaf]; - return is_shadow_present_pte(spte) && is_last_spte(spte, leaf); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_gpa_is_mapped); - /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 0eb2773b2ae2..a46ccd670785 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -831,6 +831,14 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp) return tdx_vcpu_ioctl(vcpu, argp); } +static int vt_vcpu_mem_enc_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp) +{ + if (!is_td_vcpu(vcpu)) + return -EINVAL; + + return tdx_vcpu_unlocked_ioctl(vcpu, argp); +} + static int vt_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) { @@ -1005,6 +1013,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl), .vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl), + .vcpu_mem_enc_unlocked_ioctl = vt_op_tdx_only(vcpu_mem_enc_unlocked_ioctl), .gmem_max_mapping_level = vt_op_tdx_only(gmem_max_mapping_level) }; diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 6d41d2fc8043..f2d8fc3fecaa 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -24,20 +24,33 @@ #undef pr_fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#define pr_tdx_error(__fn, __err) \ - pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err) +#define __TDX_BUG_ON(__err, __f, __kvm, __fmt, __args...) \ +({ \ + struct kvm *_kvm = (__kvm); \ + bool __ret = !!(__err); \ + \ + if (WARN_ON_ONCE(__ret && (!_kvm || !_kvm->vm_bugged))) { \ + if (_kvm) \ + kvm_vm_bugged(_kvm); \ + pr_err_ratelimited("SEAMCALL " __f " failed: 0x%llx" __fmt "\n",\ + __err, __args); \ + } \ + unlikely(__ret); \ +}) + +#define TDX_BUG_ON(__err, __fn, __kvm) \ + __TDX_BUG_ON(__err, #__fn, __kvm, "%s", "") + +#define TDX_BUG_ON_1(__err, __fn, a1, __kvm) \ + __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx", a1) + +#define TDX_BUG_ON_2(__err, __fn, a1, a2, __kvm) \ + __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx", a1, a2) + +#define TDX_BUG_ON_3(__err, __fn, a1, a2, a3, __kvm) \ + __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 ", 0x%llx, " #a3 " 0x%llx", \ + a1, a2, a3) -#define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \ - pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__) - -#define pr_tdx_error_1(__fn, __err, __rcx) \ - __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx) - -#define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \ - __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx) - -#define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \ - __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8) bool enable_tdx __ro_after_init; module_param_named(tdx, enable_tdx, bool, 0444); @@ -281,25 +294,34 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu) vcpu->cpu = -1; } -static void tdx_no_vcpus_enter_start(struct kvm *kvm) -{ - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - - lockdep_assert_held_write(&kvm->mmu_lock); - - WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true); - - kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); -} - -static void tdx_no_vcpus_enter_stop(struct kvm *kvm) -{ - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - - lockdep_assert_held_write(&kvm->mmu_lock); - - WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false); -} +/* + * Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single + * retry (if necessary) after forcing vCPUs to exit and wait for the operation + * to complete. All flows that remove/block S-EPT entries run with mmu_lock + * held for write, i.e. are mutually exclusive with each other, but they aren't + * mutually exclusive with running vCPUs, and so can fail with "operand busy" + * if a vCPU acquires a relevant lock in the TDX-Module, e.g. when doing TDCALL. + * + * Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs. + */ +#define tdh_do_no_vcpus(tdh_func, kvm, args...) \ +({ \ + struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm); \ + u64 __err; \ + \ + lockdep_assert_held_write(&kvm->mmu_lock); \ + \ + __err = tdh_func(args); \ + if (unlikely(tdx_operand_busy(__err))) { \ + WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true); \ + kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); \ + \ + __err = tdh_func(args); \ + \ + WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false); \ + } \ + __err; \ +}) /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */ static int __tdx_reclaim_page(struct page *page) @@ -313,10 +335,9 @@ static int __tdx_reclaim_page(struct page *page) * before the HKID is released and control pages have also been * released at this point, so there is no possibility of contention. */ - if (WARN_ON_ONCE(err)) { - pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8); + if (TDX_BUG_ON_3(err, TDH_PHYMEM_PAGE_RECLAIM, rcx, rdx, r8, NULL)) return -EIO; - } + return 0; } @@ -404,8 +425,8 @@ static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu) return; smp_call_function_single(cpu, tdx_flush_vp, &arg, 1); - if (KVM_BUG_ON(arg.err, vcpu->kvm)) - pr_tdx_error(TDH_VP_FLUSH, arg.err); + + TDX_BUG_ON(arg.err, TDH_VP_FLUSH, vcpu->kvm); } void tdx_disable_virtualization_cpu(void) @@ -464,8 +485,7 @@ static void smp_func_do_phymem_cache_wb(void *unused) } out: - if (WARN_ON_ONCE(err)) - pr_tdx_error(TDH_PHYMEM_CACHE_WB, err); + TDX_BUG_ON(err, TDH_PHYMEM_CACHE_WB, NULL); } void tdx_mmu_release_hkid(struct kvm *kvm) @@ -504,8 +524,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm) err = tdh_mng_vpflushdone(&kvm_tdx->td); if (err == TDX_FLUSHVP_NOT_DONE) goto out; - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error(TDH_MNG_VPFLUSHDONE, err); + if (TDX_BUG_ON(err, TDH_MNG_VPFLUSHDONE, kvm)) { pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n", kvm_tdx->hkid); goto out; @@ -528,8 +547,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm) * tdh_mng_key_freeid() will fail. */ err = tdh_mng_key_freeid(&kvm_tdx->td); - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error(TDH_MNG_KEY_FREEID, err); + if (TDX_BUG_ON(err, TDH_MNG_KEY_FREEID, kvm)) { pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n", kvm_tdx->hkid); } else { @@ -580,10 +598,9 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm) * when it is reclaiming TDCS). */ err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td); - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); + if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm)) return; - } + tdx_quirk_reset_page(kvm_tdx->td.tdr_page); __free_page(kvm_tdx->td.tdr_page); @@ -606,11 +623,8 @@ static int tdx_do_tdh_mng_key_config(void *param) /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */ err = tdh_mng_key_config(&kvm_tdx->td); - - if (KVM_BUG_ON(err, &kvm_tdx->kvm)) { - pr_tdx_error(TDH_MNG_KEY_CONFIG, err); + if (TDX_BUG_ON(err, TDH_MNG_KEY_CONFIG, &kvm_tdx->kvm)) return -EIO; - } return 0; } @@ -824,19 +838,52 @@ void tdx_vcpu_put(struct kvm_vcpu *vcpu) tdx_prepare_switch_to_host(vcpu); } +/* + * Life cycles for a TD and a vCPU: + * 1. KVM_CREATE_VM ioctl. + * TD state is TD_STATE_UNINITIALIZED. + * hkid is not assigned at this stage. + * 2. KVM_TDX_INIT_VM ioctl. + * TD transitions to TD_STATE_INITIALIZED. + * hkid is assigned after this stage. + * 3. KVM_CREATE_VCPU ioctl. (only when TD is TD_STATE_INITIALIZED). + * 3.1 tdx_vcpu_create() transitions vCPU state to VCPU_TD_STATE_UNINITIALIZED. + * 3.2 vcpu_load() and vcpu_put() in kvm_arch_vcpu_create(). + * 3.3 (conditional) if any error encountered after kvm_arch_vcpu_create() + * kvm_arch_vcpu_destroy() --> tdx_vcpu_free(). + * 4. KVM_TDX_INIT_VCPU ioctl. + * tdx_vcpu_init() transitions vCPU state to VCPU_TD_STATE_INITIALIZED. + * vCPU control structures are allocated at this stage. + * 5. kvm_destroy_vm(). + * 5.1 tdx_mmu_release_hkid(): (1) tdh_vp_flush(), disassociates all vCPUs. + * (2) puts hkid to !assigned state. + * 5.2 kvm_destroy_vcpus() --> tdx_vcpu_free(): + * transitions vCPU to VCPU_TD_STATE_UNINITIALIZED state. + * 5.3 tdx_vm_destroy() + * transitions TD to TD_STATE_UNINITIALIZED state. + * + * tdx_vcpu_free() can be invoked only at 3.3 or 5.2. + * - If at 3.3, hkid is still assigned, but the vCPU must be in + * VCPU_TD_STATE_UNINITIALIZED state. + * - if at 5.2, hkid must be !assigned and all vCPUs must be in + * VCPU_TD_STATE_INITIALIZED state and have been dissociated. + */ void tdx_vcpu_free(struct kvm_vcpu *vcpu) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); struct vcpu_tdx *tdx = to_tdx(vcpu); int i; + if (vcpu->cpu != -1) { + KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm); + tdx_flush_vp_on_cpu(vcpu); + return; + } + /* * It is not possible to reclaim pages while hkid is assigned. It might - * be assigned if: - * 1. the TD VM is being destroyed but freeing hkid failed, in which - * case the pages are leaked - * 2. TD VCPU creation failed and this on the error path, in which case - * there is nothing to do anyway + * be assigned if the TD VM is being destroyed but freeing hkid failed, + * in which case the pages are leaked. */ if (is_hkid_assigned(kvm_tdx)) return; @@ -851,7 +898,7 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu) } if (tdx->vp.tdvpr_page) { tdx_reclaim_control_page(tdx->vp.tdvpr_page); - tdx->vp.tdvpr_page = 0; + tdx->vp.tdvpr_page = NULL; tdx->vp.tdvpr_pa = 0; } @@ -1574,137 +1621,79 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); } -static void tdx_unpin(struct kvm *kvm, struct page *page) +static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level, + kvm_pfn_t pfn) { - put_page(page); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + u64 err, entry, level_state; + gpa_t gpa = gfn_to_gpa(gfn); + + lockdep_assert_held(&kvm->slots_lock); + + if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm) || + KVM_BUG_ON(!kvm_tdx->page_add_src, kvm)) + return -EIO; + + err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), + kvm_tdx->page_add_src, &entry, &level_state); + if (unlikely(tdx_operand_busy(err))) + return -EBUSY; + + if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_ADD, entry, level_state, kvm)) + return -EIO; + + return 0; } static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, - enum pg_level level, struct page *page) + enum pg_level level, kvm_pfn_t pfn) { int tdx_level = pg_level_to_tdx_sept_level(level); struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct page *page = pfn_to_page(pfn); gpa_t gpa = gfn_to_gpa(gfn); u64 entry, level_state; u64 err; err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state); - if (unlikely(tdx_operand_busy(err))) { - tdx_unpin(kvm, page); + if (unlikely(tdx_operand_busy(err))) return -EBUSY; - } - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state); - tdx_unpin(kvm, page); + if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_AUG, entry, level_state, kvm)) return -EIO; - } - - return 0; -} -/* - * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the - * callback tdx_gmem_post_populate() then maps pages into private memory. - * through the a seamcall TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the - * private EPT structures for the page to have been built before, which is - * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that - * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD(). - * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there - * are no half-initialized shared EPT pages. - */ -static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn, - enum pg_level level, kvm_pfn_t pfn) -{ - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - - if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm)) - return -EINVAL; - - /* nr_premapped will be decreased when tdh_mem_page_add() is called. */ - atomic64_inc(&kvm_tdx->nr_premapped); return 0; } static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, - enum pg_level level, kvm_pfn_t pfn) + enum pg_level level, u64 mirror_spte) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - struct page *page = pfn_to_page(pfn); + kvm_pfn_t pfn = spte_to_pfn(mirror_spte); /* TODO: handle large pages. */ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) - return -EINVAL; + return -EIO; - /* - * Because guest_memfd doesn't support page migration with - * a_ops->migrate_folio (yet), no callback is triggered for KVM on page - * migration. Until guest_memfd supports page migration, prevent page - * migration. - * TODO: Once guest_memfd introduces callback on page migration, - * implement it and remove get_page/put_page(). - */ - get_page(page); + WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) || + (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK); /* - * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching - * barrier in tdx_td_finalize(). + * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory() + * before kvm_tdx->state. Userspace must not be allowed to pre-fault + * arbitrary memory until the initial memory image is finalized. Pairs + * with the smp_wmb() in tdx_td_finalize(). */ smp_rmb(); - if (likely(kvm_tdx->state == TD_STATE_RUNNABLE)) - return tdx_mem_page_aug(kvm, gfn, level, page); - - return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn); -} - -static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, - enum pg_level level, struct page *page) -{ - int tdx_level = pg_level_to_tdx_sept_level(level); - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - gpa_t gpa = gfn_to_gpa(gfn); - u64 err, entry, level_state; - - /* TODO: handle large pages. */ - if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) - return -EINVAL; - - if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm)) - return -EINVAL; /* - * When zapping private page, write lock is held. So no race condition - * with other vcpu sept operation. - * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs. + * If the TD isn't finalized/runnable, then userspace is initializing + * the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD. */ - err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, - &level_state); - - if (unlikely(tdx_operand_busy(err))) { - /* - * The second retry is expected to succeed after kicking off all - * other vCPUs and prevent them from invoking TDH.VP.ENTER. - */ - tdx_no_vcpus_enter_start(kvm); - err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, - &level_state); - tdx_no_vcpus_enter_stop(kvm); - } - - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state); - return -EIO; - } - - err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); + if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) + return tdx_mem_page_add(kvm, gfn, level, pfn); - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); - return -EIO; - } - tdx_quirk_reset_page(page); - tdx_unpin(kvm, page); - return 0; + return tdx_mem_page_aug(kvm, gfn, level, pfn); } static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, @@ -1720,81 +1709,13 @@ static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, if (unlikely(tdx_operand_busy(err))) return -EBUSY; - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state); + if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm)) return -EIO; - } return 0; } /* - * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is - * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called - * successfully. - * - * Since tdh_mem_sept_add() must have been invoked successfully before a - * non-leaf entry present in the mirrored page table, the SEPT ZAP related - * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead - * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the - * SEPT. - * - * Further check if the returned entry from SEPT walking is with RWX permissions - * to filter out anything unexpected. - * - * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from - * level_state returned from a SEAMCALL error is the same as that passed into - * the SEAMCALL. - */ -static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err, - u64 entry, int level) -{ - if (!err || kvm_tdx->state == TD_STATE_RUNNABLE) - return false; - - if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX)) - return false; - - if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK))) - return false; - - return true; -} - -static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, - enum pg_level level, struct page *page) -{ - int tdx_level = pg_level_to_tdx_sept_level(level); - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level); - u64 err, entry, level_state; - - /* For now large page isn't supported yet. */ - WARN_ON_ONCE(level != PG_LEVEL_4K); - - err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); - - if (unlikely(tdx_operand_busy(err))) { - /* After no vCPUs enter, the second retry is expected to succeed */ - tdx_no_vcpus_enter_start(kvm); - err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); - tdx_no_vcpus_enter_stop(kvm); - } - if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) && - !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) { - atomic64_dec(&kvm_tdx->nr_premapped); - tdx_unpin(kvm, page); - return 0; - } - - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state); - return -EIO; - } - return 1; -} - -/* * Ensure shared and private EPTs to be flushed on all vCPUs. * tdh_mem_track() is the only caller that increases TD epoch. An increase in * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are @@ -1827,18 +1748,15 @@ static void tdx_track(struct kvm *kvm) if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) return; + /* + * The full sequence of TDH.MEM.TRACK and forcing vCPUs out of guest + * mode must be serialized, as TDH.MEM.TRACK will fail if the previous + * tracking epoch hasn't completed. + */ lockdep_assert_held_write(&kvm->mmu_lock); - err = tdh_mem_track(&kvm_tdx->td); - if (unlikely(tdx_operand_busy(err))) { - /* After no vCPUs enter, the second retry is expected to succeed */ - tdx_no_vcpus_enter_start(kvm); - err = tdh_mem_track(&kvm_tdx->td); - tdx_no_vcpus_enter_stop(kvm); - } - - if (KVM_BUG_ON(err, kvm)) - pr_tdx_error(TDH_MEM_TRACK, err); + err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td); + TDX_BUG_ON(err, TDH_MEM_TRACK, kvm); kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); } @@ -1857,7 +1775,7 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, * and slot move/deletion. */ if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm)) - return -EINVAL; + return -EIO; /* * The HKID assigned to this TD was already freed and cache was @@ -1866,11 +1784,16 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, return tdx_reclaim_page(virt_to_page(private_spt)); } -static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, - enum pg_level level, kvm_pfn_t pfn) +static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, u64 mirror_spte) { - struct page *page = pfn_to_page(pfn); - int ret; + struct page *page = pfn_to_page(spte_to_pfn(mirror_spte)); + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn); + u64 err, entry, level_state; + + lockdep_assert_held_write(&kvm->mmu_lock); /* * HKID is released after all private pages have been removed, and set @@ -1878,11 +1801,16 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, * there can't be anything populated in the private EPT. */ if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) - return -EINVAL; + return; - ret = tdx_sept_zap_private_spte(kvm, gfn, level, page); - if (ret <= 0) - return ret; + /* TODO: handle large pages. */ + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) + return; + + err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa, + tdx_level, &entry, &level_state); + if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm)) + return; /* * TDX requires TLB tracking before dropping private page. Do @@ -1890,7 +1818,21 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, */ tdx_track(kvm); - return tdx_sept_drop_private_spte(kvm, gfn, level, page); + /* + * When zapping private page, write lock is held. So no race condition + * with other vcpu sept operation. + * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs. + */ + err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa, + tdx_level, &entry, &level_state); + if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm)) + return; + + err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); + if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm)) + return; + + tdx_quirk_reset_page(page); } void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, @@ -2269,37 +2211,28 @@ static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) if (cmd->flags) return -EINVAL; - caps = kzalloc(sizeof(*caps) + - sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config, - GFP_KERNEL); - if (!caps) - return -ENOMEM; - user_caps = u64_to_user_ptr(cmd->data); - if (get_user(nr_user_entries, &user_caps->cpuid.nent)) { - ret = -EFAULT; - goto out; - } + if (get_user(nr_user_entries, &user_caps->cpuid.nent)) + return -EFAULT; - if (nr_user_entries < td_conf->num_cpuid_config) { - ret = -E2BIG; - goto out; - } + if (nr_user_entries < td_conf->num_cpuid_config) + return -E2BIG; + + caps = kzalloc(struct_size(caps, cpuid.entries, + td_conf->num_cpuid_config), GFP_KERNEL); + if (!caps) + return -ENOMEM; ret = init_kvm_tdx_caps(td_conf, caps); if (ret) goto out; - if (copy_to_user(user_caps, caps, sizeof(*caps))) { + if (copy_to_user(user_caps, caps, struct_size(caps, cpuid.entries, + caps->cpuid.nent))) { ret = -EFAULT; goto out; } - if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries, - caps->cpuid.nent * - sizeof(caps->cpuid.entries[0]))) - ret = -EFAULT; - out: /* kfree() accepts NULL. */ kfree(caps); @@ -2524,8 +2457,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, goto free_packages; } - if (WARN_ON_ONCE(err)) { - pr_tdx_error(TDH_MNG_CREATE, err); + if (TDX_BUG_ON(err, TDH_MNG_CREATE, kvm)) { ret = -EIO; goto free_packages; } @@ -2566,8 +2498,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, ret = -EAGAIN; goto teardown; } - if (WARN_ON_ONCE(err)) { - pr_tdx_error(TDH_MNG_ADDCX, err); + if (TDX_BUG_ON(err, TDH_MNG_ADDCX, kvm)) { ret = -EIO; goto teardown; } @@ -2584,8 +2515,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, *seamcall_err = err; ret = -EINVAL; goto teardown; - } else if (WARN_ON_ONCE(err)) { - pr_tdx_error_1(TDH_MNG_INIT, err, rcx); + } else if (TDX_BUG_ON_1(err, TDH_MNG_INIT, rcx, kvm)) { ret = -EIO; goto teardown; } @@ -2629,7 +2559,7 @@ free_tdcs: free_tdr: if (tdr_page) __free_page(tdr_page); - kvm_tdx->td.tdr_page = 0; + kvm_tdx->td.tdr_page = NULL; free_hkid: tdx_hkid_free(kvm_tdx); @@ -2734,6 +2664,46 @@ err_out: return -EIO; } +typedef void *tdx_vm_state_guard_t; + +static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm) +{ + int r; + + mutex_lock(&kvm->lock); + + if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) { + r = -EBUSY; + goto out_err; + } + + r = kvm_lock_all_vcpus(kvm); + if (r) + goto out_err; + + /* + * Note the unintuitive ordering! vcpu->mutex must be taken outside + * kvm->slots_lock! + */ + mutex_lock(&kvm->slots_lock); + return kvm; + +out_err: + mutex_unlock(&kvm->lock); + return ERR_PTR(r); +} + +static void tdx_release_vm_state_locks(struct kvm *kvm) +{ + mutex_unlock(&kvm->slots_lock); + kvm_unlock_all_vcpus(kvm); + mutex_unlock(&kvm->lock); +} + +DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t, + if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T), + tdx_acquire_vm_state_locks(kvm), struct kvm *kvm); + static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); @@ -2855,24 +2825,14 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - guard(mutex)(&kvm->slots_lock); - if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) return -EINVAL; - /* - * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue - * TDH.MEM.PAGE.ADD(). - */ - if (atomic64_read(&kvm_tdx->nr_premapped)) - return -EINVAL; cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td); if (tdx_operand_busy(cmd->hw_error)) return -EBUSY; - if (KVM_BUG_ON(cmd->hw_error, kvm)) { - pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error); + if (TDX_BUG_ON(cmd->hw_error, TDH_MR_FINALIZE, kvm)) return -EIO; - } kvm_tdx->state = TD_STATE_RUNNABLE; /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */ @@ -2881,27 +2841,38 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) return 0; } -int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) +static int tdx_get_cmd(void __user *argp, struct kvm_tdx_cmd *cmd) { - struct kvm_tdx_cmd tdx_cmd; - int r; - - if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd))) + if (copy_from_user(cmd, argp, sizeof(*cmd))) return -EFAULT; /* - * Userspace should never set hw_error. It is used to fill - * hardware-defined error by the kernel. + * Userspace should never set hw_error. KVM writes hw_error to report + * hardware-defined error back to userspace. */ - if (tdx_cmd.hw_error) + if (cmd->hw_error) return -EINVAL; - mutex_lock(&kvm->lock); + return 0; +} + +int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) +{ + struct kvm_tdx_cmd tdx_cmd; + int r; + + r = tdx_get_cmd(argp, &tdx_cmd); + if (r) + return r; + + if (tdx_cmd.id == KVM_TDX_CAPABILITIES) + return tdx_get_capabilities(&tdx_cmd); + + CLASS(tdx_vm_state_guard, guard)(kvm); + if (IS_ERR(guard)) + return PTR_ERR(guard); switch (tdx_cmd.id) { - case KVM_TDX_CAPABILITIES: - r = tdx_get_capabilities(&tdx_cmd); - break; case KVM_TDX_INIT_VM: r = tdx_td_init(kvm, &tdx_cmd); break; @@ -2909,15 +2880,12 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) r = tdx_td_finalize(kvm, &tdx_cmd); break; default: - r = -EINVAL; - goto out; + return -EINVAL; } if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd))) - r = -EFAULT; + return -EFAULT; -out: - mutex_unlock(&kvm->lock); return r; } @@ -2959,16 +2927,14 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) } err = tdh_vp_create(&kvm_tdx->td, &tdx->vp); - if (KVM_BUG_ON(err, vcpu->kvm)) { + if (TDX_BUG_ON(err, TDH_VP_CREATE, vcpu->kvm)) { ret = -EIO; - pr_tdx_error(TDH_VP_CREATE, err); goto free_tdcx; } for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]); - if (KVM_BUG_ON(err, vcpu->kvm)) { - pr_tdx_error(TDH_VP_ADDCX, err); + if (TDX_BUG_ON(err, TDH_VP_ADDCX, vcpu->kvm)) { /* * Pages already added are reclaimed by the vcpu_free * method, but the rest are freed here. @@ -2981,10 +2947,19 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) } } - err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id); - if (KVM_BUG_ON(err, vcpu->kvm)) { - pr_tdx_error(TDH_VP_INIT, err); - return -EIO; + /* + * tdh_vp_init() can take an exclusive lock of the TDR resource inside + * the TDX-Module. The TDR resource is also taken as shared in several + * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention + * (TDX-Module locks are try-lock implementations with no slow path). + * Take mmu_lock for write to reflect the nature of the lock taken by + * the TDX-Module, and to ensure the no-fail MMU paths succeed, e.g. if + * a concurrent PUNCH_HOLE on guest_memfd triggers removal of SPTEs. + */ + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) { + err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id); + if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm)) + return -EIO; } vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -3003,7 +2978,7 @@ free_tdcx: free_tdvpr: if (tdx->vp.tdvpr_page) __free_page(tdx->vp.tdvpr_page); - tdx->vp.tdvpr_page = 0; + tdx->vp.tdvpr_page = NULL; tdx->vp.tdvpr_pa = 0; return ret; @@ -3041,7 +3016,8 @@ static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_i static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) { - struct kvm_cpuid2 __user *output, *td_cpuid; + struct kvm_cpuid2 __user *output; + struct kvm_cpuid2 *td_cpuid; int r = 0, i = 0, leaf; u32 level; @@ -3154,15 +3130,15 @@ struct tdx_gmem_post_populate_arg { static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, void __user *src, int order, void *_arg) { - u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS; - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); struct tdx_gmem_post_populate_arg *arg = _arg; - struct kvm_vcpu *vcpu = arg->vcpu; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + u64 err, entry, level_state; gpa_t gpa = gfn_to_gpa(gfn); - u8 level = PG_LEVEL_4K; struct page *src_page; int ret, i; - u64 err, entry, level_state; + + if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm)) + return -EIO; /* * Get the source page if it has been faulted in. Return failure if the @@ -3174,49 +3150,29 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, if (ret != 1) return -ENOMEM; - ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level); - if (ret < 0) - goto out; - - /* - * The private mem cannot be zapped after kvm_tdp_map_page() - * because all paths are covered by slots_lock and the - * filemap invalidate lock. Check that they are indeed enough. - */ - if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) { - scoped_guard(read_lock, &kvm->mmu_lock) { - if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) { - ret = -EIO; - goto out; - } - } - } + kvm_tdx->page_add_src = src_page; + ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn); + kvm_tdx->page_add_src = NULL; - ret = 0; - err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), - src_page, &entry, &level_state); - if (err) { - ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO; - goto out; - } + put_page(src_page); - if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) - atomic64_dec(&kvm_tdx->nr_premapped); + if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION)) + return ret; - if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) { - for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { - err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, - &level_state); - if (err) { - ret = -EIO; - break; - } - } + /* + * Note, MR.EXTEND can fail if the S-EPT mapping is somehow removed + * between mapping the pfn and now, but slots_lock prevents memslot + * updates, filemap_invalidate_lock() prevents guest_memfd updates, + * mmu_notifier events can't reach S-EPT entries, and KVM's internal + * zapping flows are mutually exclusive with S-EPT mappings. + */ + for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { + err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, &level_state); + if (TDX_BUG_ON_2(err, TDH_MR_EXTEND, entry, level_state, kvm)) + return -EIO; } -out: - put_page(src_page); - return ret; + return 0; } static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) @@ -3232,8 +3188,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c if (tdx->state != VCPU_TD_STATE_INITIALIZED) return -EINVAL; - guard(mutex)(&kvm->slots_lock); - /* Once TD is finalized, the initial guest memory is fixed. */ if (kvm_tdx->state == TD_STATE_RUNNABLE) return -EINVAL; @@ -3251,7 +3205,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1)) return -EINVAL; - kvm_mmu_reload(vcpu); ret = 0; while (region.nr_pages) { if (signal_pending(current)) { @@ -3288,28 +3241,57 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c return ret; } -int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) +int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { - struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct kvm *kvm = vcpu->kvm; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); struct kvm_tdx_cmd cmd; - int ret; + int r; - if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) - return -EINVAL; + r = tdx_get_cmd(argp, &cmd); + if (r) + return r; - if (copy_from_user(&cmd, argp, sizeof(cmd))) - return -EFAULT; + CLASS(tdx_vm_state_guard, guard)(kvm); + if (IS_ERR(guard)) + return PTR_ERR(guard); - if (cmd.hw_error) + if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) return -EINVAL; + vcpu_load(vcpu); + switch (cmd.id) { + case KVM_TDX_INIT_MEM_REGION: + r = tdx_vcpu_init_mem_region(vcpu, &cmd); + break; case KVM_TDX_INIT_VCPU: - ret = tdx_vcpu_init(vcpu, &cmd); + r = tdx_vcpu_init(vcpu, &cmd); break; - case KVM_TDX_INIT_MEM_REGION: - ret = tdx_vcpu_init_mem_region(vcpu, &cmd); + default: + r = -ENOIOCTLCMD; break; + } + + vcpu_put(vcpu); + + return r; +} + +int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct kvm_tdx_cmd cmd; + int ret; + + if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) + return -EINVAL; + + ret = tdx_get_cmd(argp, &cmd); + if (ret) + return ret; + + switch (cmd.id) { case KVM_TDX_GET_CPUID: ret = tdx_vcpu_get_cpuid(vcpu, &cmd); break; diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index 7f258870dc41..45b5183ccb36 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -36,8 +36,12 @@ struct kvm_tdx { struct tdx_td td; - /* For KVM_TDX_INIT_MEM_REGION. */ - atomic64_t nr_premapped; + /* + * Scratch pointer used to pass the source page to tdx_mem_page_add(). + * Protected by slots_lock, and non-NULL only when mapping a private + * pfn via tdx_gmem_post_populate(). + */ + struct page *page_add_src; /* * Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 9697368d65b3..a7a870919580 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -149,6 +149,7 @@ int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); +int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp); void tdx_flush_tlb_current(struct kvm_vcpu *vcpu); void tdx_flush_tlb_all(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cd55feb4a378..ea66d0047c9b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7200,6 +7200,19 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) return 0; } +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + + if (ioctl == KVM_MEMORY_ENCRYPT_OP && + kvm_x86_ops.vcpu_mem_enc_unlocked_ioctl) + return kvm_x86_call(vcpu_mem_enc_unlocked_ioctl)(vcpu, argp); + + return -ENOIOCTLCMD; +} + int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5bd76cf394fa..d93f75b05ae2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1557,6 +1557,8 @@ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf); int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext); @@ -2437,18 +2439,6 @@ static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_NO_POLL */ -#ifdef CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); -#else -static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, - unsigned long arg) -{ - return -ENOIOCTLCMD; -} -#endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ - void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 5f0015c5dd95..267c7369c765 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -78,9 +78,6 @@ config HAVE_KVM_IRQ_BYPASS tristate select IRQ_BYPASS_MANAGER -config HAVE_KVM_VCPU_ASYNC_IOCTL - bool - config HAVE_KVM_VCPU_RUN_PID_CHANGE bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a5d9a6f221c2..74e1afd67b44 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4434,10 +4434,10 @@ static long kvm_vcpu_ioctl(struct file *filp, return r; /* - * Some architectures have vcpu ioctls that are asynchronous to vcpu - * execution; mutex_lock() would break them. + * Let arch code handle select vCPU ioctls without holding vcpu->mutex, + * e.g. to support ioctls that can run asynchronous to vCPU execution. */ - r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); + r = kvm_arch_vcpu_unlocked_ioctl(filp, ioctl, arg); if (r != -ENOIOCTLCMD) return r; |