author     Paolo Bonzini <pbonzini@redhat.com>   2025-11-26 09:36:37 +0100
committer  Paolo Bonzini <pbonzini@redhat.com>   2025-11-26 09:36:37 +0100
commit     de8e8ebb1a7c5e2243fdb0409418484501e3b9b2 (patch)
tree       6a5e82379ea71f61edab3075cf10177e0ff63eb9
parent     adc99a6cfcf76d670272dea64bbc2d43ecd12a2f (diff)
parent     398180f93cf3c7bb0ee3f512b139ad01843f3ddf (diff)
Merge tag 'kvm-x86-tdx-6.19' of https://github.com/kvm-x86/linux into HEAD
KVM TDX changes for 6.19:

 - Overhaul the TDX code to address systemic races where KVM (acting on
   behalf of userspace) could inadvertently trigger lock contention in the
   TDX-Module, which KVM was either working around in weird, ugly ways, or
   was simply oblivious to (as proven by Yan tripping several KVM_BUG_ON()s
   with clever selftests).

 - Fix a bug where KVM could corrupt a vCPU's cpu_list when freeing a vCPU
   if creating said vCPU failed partway through.

 - Fix a few sparse warnings (bad annotation, 0 != NULL).

 - Use struct_size() to simplify copying capabilities to userspace.
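The last bullet refers to the kernel's struct_size() helper from <linux/overflow.h>. As a rough illustration only (this is not code from the patch; the struct and function names below are hypothetical), the idea is to size both the allocation and the copy_to_user() of a flexible-array struct from the same member count, instead of open-coding sizeof() arithmetic:

#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>

/* Hypothetical stand-in for a capabilities struct with a flexible array. */
struct example_caps {
	u32 nent;		/* number of entries[] actually filled in */
	u32 padding;
	u64 entries[];		/* flexible array member */
};

static int copy_example_caps_to_user(struct example_caps __user *ucaps, u32 nr)
{
	struct example_caps *caps;
	int ret = 0;

	/*
	 * struct_size(caps, entries, nr) evaluates to
	 * sizeof(*caps) + nr * sizeof(caps->entries[0]), saturating on
	 * overflow, so header and array are sized in one expression.
	 */
	caps = kzalloc(struct_size(caps, entries, nr), GFP_KERNEL);
	if (!caps)
		return -ENOMEM;

	caps->nent = nr;
	/* ... fill caps->entries[0..nr-1] ... */

	if (copy_to_user(ucaps, caps, struct_size(caps, entries, caps->nent)))
		ret = -EFAULT;

	kfree(caps);
	return ret;
}

In the patch below, tdx_get_capabilities() applies this pattern to allocate 'caps' and to copy 'caps->cpuid.nent' CPUID entries back to userspace in a single copy_to_user().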
-rw-r--r--arch/arm64/kvm/arm.c6
-rw-r--r--arch/loongarch/kvm/Kconfig1
-rw-r--r--arch/loongarch/kvm/vcpu.c4
-rw-r--r--arch/mips/kvm/Kconfig1
-rw-r--r--arch/mips/kvm/mips.c4
-rw-r--r--arch/powerpc/kvm/Kconfig1
-rw-r--r--arch/powerpc/kvm/powerpc.c4
-rw-r--r--arch/riscv/kvm/Kconfig1
-rw-r--r--arch/riscv/kvm/vcpu.c4
-rw-r--r--arch/s390/kvm/Kconfig1
-rw-r--r--arch/s390/kvm/kvm-s390.c4
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h7
-rw-r--r--arch/x86/kvm/mmu.h3
-rw-r--r--arch/x86/kvm/mmu/mmu.c87
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c50
-rw-r--r--arch/x86/kvm/vmx/main.c9
-rw-r--r--arch/x86/kvm/vmx/tdx.c710
-rw-r--r--arch/x86/kvm/vmx/tdx.h8
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h1
-rw-r--r--arch/x86/kvm/x86.c13
-rw-r--r--include/linux/kvm_host.h14
-rw-r--r--virt/kvm/Kconfig3
-rw-r--r--virt/kvm/kvm_main.c6
24 files changed, 495 insertions, 448 deletions
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 052bf0d4d0b0..e7140c19771b 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1835,6 +1835,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
return r;
}
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ return -ENOIOCTLCMD;
+}
+
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig
index ae64bbdf83a7..ed4f724db774 100644
--- a/arch/loongarch/kvm/Kconfig
+++ b/arch/loongarch/kvm/Kconfig
@@ -25,7 +25,6 @@ config KVM
select HAVE_KVM_IRQCHIP
select HAVE_KVM_MSI
select HAVE_KVM_READONLY_MEM
- select HAVE_KVM_VCPU_ASYNC_IOCTL
select KVM_COMMON
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select KVM_GENERIC_HARDWARE_ENABLING
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 1245a6b35896..72dc5726259b 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -1473,8 +1473,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
return 0;
}
-long kvm_arch_vcpu_async_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
{
void __user *argp = (void __user *)arg;
struct kvm_vcpu *vcpu = filp->private_data;
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index ab57221fa4dd..cc13cc35f208 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -22,7 +22,6 @@ config KVM
select EXPORT_UASM
select KVM_COMMON
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
- select HAVE_KVM_VCPU_ASYNC_IOCTL
select KVM_MMIO
select KVM_GENERIC_MMU_NOTIFIER
select KVM_GENERIC_HARDWARE_ENABLING
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index a75587018f44..b0fb92fda4d4 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -895,8 +895,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
return r;
}
-long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl,
- unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 2f2702c867f7..c9a2d50ff1b0 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,7 +20,6 @@ if VIRTUALIZATION
config KVM
bool
select KVM_COMMON
- select HAVE_KVM_VCPU_ASYNC_IOCTL
select KVM_VFIO
select HAVE_KVM_IRQ_BYPASS
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 2ba057171ebe..9a89a6d98f97 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -2028,8 +2028,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
return -EINVAL;
}
-long kvm_arch_vcpu_async_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig
index c50328212917..77379f77840a 100644
--- a/arch/riscv/kvm/Kconfig
+++ b/arch/riscv/kvm/Kconfig
@@ -23,7 +23,6 @@ config KVM
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_MSI
- select HAVE_KVM_VCPU_ASYNC_IOCTL
select HAVE_KVM_READONLY_MEM
select HAVE_KVM_DIRTY_RING_ACQ_REL
select KVM_COMMON
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index 5ce35aba6069..2739d5b5ec80 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -238,8 +238,8 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
-long kvm_arch_vcpu_async_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index cae908d64550..96d16028e8b7 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -20,7 +20,6 @@ config KVM
def_tristate y
prompt "Kernel-based Virtual Machine (KVM) support"
select HAVE_KVM_CPU_RELAX_INTERCEPT
- select HAVE_KVM_VCPU_ASYNC_IOCTL
select KVM_ASYNC_PF
select KVM_ASYNC_PF_SYNC
select KVM_COMMON
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 16ba04062854..8c4caa5f2fcd 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -5730,8 +5730,8 @@ static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu,
return r;
}
-long kvm_arch_vcpu_async_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index fdf178443f85..de709fb5bd76 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -128,6 +128,7 @@ KVM_X86_OP(enable_smi_window)
KVM_X86_OP_OPTIONAL(dev_get_attr)
KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl)
+KVM_X86_OP_OPTIONAL(vcpu_mem_enc_unlocked_ioctl)
KVM_X86_OP_OPTIONAL(mem_enc_register_region)
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 103af57e1060..6d41249e86af 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1843,15 +1843,15 @@ struct kvm_x86_ops {
void *external_spt);
/* Update the external page table from spte getting set. */
int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
- kvm_pfn_t pfn_for_gfn);
+ u64 mirror_spte);
/* Update external page tables for page table about to be freed. */
int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
void *external_spt);
/* Update external page table from spte getting removed, and flush TLB. */
- int (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
- kvm_pfn_t pfn_for_gfn);
+ void (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ u64 mirror_spte);
bool (*has_wbinvd_exit)(void);
@@ -1909,6 +1909,7 @@ struct kvm_x86_ops {
int (*dev_get_attr)(u32 group, u64 attr, u64 *val);
int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp);
+ int (*vcpu_mem_enc_unlocked_ioctl)(struct kvm_vcpu *vcpu, void __user *argp);
int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index c272ef269b96..830f46145692 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -255,8 +255,7 @@ extern bool tdp_mmu_enabled;
#define tdp_mmu_enabled false
#endif
-bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa);
-int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level);
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn);
static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index c15ac5329cf8..02c450686b4a 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4924,7 +4924,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return direct_page_fault(vcpu, fault);
}
-int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level)
+static int kvm_tdp_page_prefault(struct kvm_vcpu *vcpu, gpa_t gpa,
+ u64 error_code, u8 *level)
{
int r;
@@ -4966,7 +4967,6 @@ int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level
return -EIO;
}
}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_map_page);
long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
struct kvm_pre_fault_memory *range)
@@ -5002,7 +5002,7 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
* Shadow paging uses GVA for kvm page fault, so restrict to
* two-dimensional paging.
*/
- r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level);
+ r = kvm_tdp_page_prefault(vcpu, range->gpa | direct_bits, error_code, &level);
if (r < 0)
return r;
@@ -5014,6 +5014,86 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
return min(range->size, end - range->gpa);
}
+#ifdef CONFIG_KVM_GUEST_MEMFD
+static void kvm_assert_gmem_invalidate_lock_held(struct kvm_memory_slot *slot)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ if (WARN_ON_ONCE(!kvm_slot_has_gmem(slot)) ||
+ WARN_ON_ONCE(!slot->gmem.file) ||
+ WARN_ON_ONCE(!file_count(slot->gmem.file)))
+ return;
+
+ lockdep_assert_held(&file_inode(slot->gmem.file)->i_mapping->invalidate_lock);
+#endif
+}
+
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+{
+ struct kvm_page_fault fault = {
+ .addr = gfn_to_gpa(gfn),
+ .error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS,
+ .prefetch = true,
+ .is_tdp = true,
+ .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm),
+
+ .max_level = PG_LEVEL_4K,
+ .req_level = PG_LEVEL_4K,
+ .goal_level = PG_LEVEL_4K,
+ .is_private = true,
+
+ .gfn = gfn,
+ .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
+ .pfn = pfn,
+ .map_writable = true,
+ };
+ struct kvm *kvm = vcpu->kvm;
+ int r;
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * Mapping a pre-determined private pfn is intended only for use when
+ * populating a guest_memfd instance. Assert that the slot is backed
+ * by guest_memfd and that the gmem instance's invalidate_lock is held.
+ */
+ kvm_assert_gmem_invalidate_lock_held(fault.slot);
+
+ if (KVM_BUG_ON(!tdp_mmu_enabled, kvm))
+ return -EIO;
+
+ if (kvm_gfn_is_write_tracked(kvm, fault.slot, fault.gfn))
+ return -EPERM;
+
+ r = kvm_mmu_reload(vcpu);
+ if (r)
+ return r;
+
+ r = mmu_topup_memory_caches(vcpu, false);
+ if (r)
+ return r;
+
+ do {
+ if (signal_pending(current))
+ return -EINTR;
+
+ if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu))
+ return -EIO;
+
+ cond_resched();
+
+ guard(read_lock)(&kvm->mmu_lock);
+
+ r = kvm_tdp_mmu_map(vcpu, &fault);
+ } while (r == RET_PF_RETRY);
+
+ if (r != RET_PF_FIXED)
+ return -EIO;
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_map_private_pfn);
+#endif
+
static void nonpaging_init_context(struct kvm_mmu *context)
{
context->page_fault = nonpaging_page_fault;
@@ -5997,7 +6077,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
out:
return r;
}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c5734ca5c17d..9c26038f6b77 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -362,9 +362,6 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
int level)
{
- kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
- int ret;
-
/*
* External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
* PTs are removed in a special order, involving free_external_spt().
@@ -377,9 +374,8 @@ static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
/* Zapping leaf spte is allowed only when write lock is held. */
lockdep_assert_held_write(&kvm->mmu_lock);
- /* Because write lock is held, operation should success. */
- ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn);
- KVM_BUG_ON(ret, kvm);
+
+ kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte);
}
/**
@@ -519,7 +515,6 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp
bool was_present = is_shadow_present_pte(old_spte);
bool is_present = is_shadow_present_pte(new_spte);
bool is_leaf = is_present && is_last_spte(new_spte, level);
- kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
int ret = 0;
KVM_BUG_ON(was_present, kvm);
@@ -538,7 +533,7 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp
* external page table, or leaf.
*/
if (is_leaf) {
- ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn);
+ ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte);
} else {
void *external_spt = get_external_spt(gfn, new_spte, level);
@@ -1273,6 +1268,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
struct kvm_mmu_page *sp;
int ret = RET_PF_RETRY;
+ KVM_MMU_WARN_ON(!root || root->role.invalid);
+
kvm_mmu_hugepage_adjust(vcpu, fault);
trace_kvm_mmu_spte_requested(fault);
@@ -1939,13 +1936,16 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
*
* Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
*/
-static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
- struct kvm_mmu_page *root)
+int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ int *root_level)
{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
struct tdp_iter iter;
gfn_t gfn = addr >> PAGE_SHIFT;
int leaf = -1;
+ *root_level = vcpu->arch.mmu->root_role.level;
+
for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
@@ -1954,36 +1954,6 @@ static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
return leaf;
}
-int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
- int *root_level)
-{
- struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
- *root_level = vcpu->arch.mmu->root_role.level;
-
- return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root);
-}
-
-bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa)
-{
- struct kvm *kvm = vcpu->kvm;
- bool is_direct = kvm_is_addr_direct(kvm, gpa);
- hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa :
- vcpu->arch.mmu->mirror_root_hpa;
- u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte;
- int leaf;
-
- lockdep_assert_held(&kvm->mmu_lock);
- rcu_read_lock();
- leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root));
- rcu_read_unlock();
- if (leaf < 0)
- return false;
-
- spte = sptes[leaf];
- return is_shadow_present_pte(spte) && is_last_spte(spte, leaf);
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_gpa_is_mapped);
-
/*
* Returns the last level spte pointer of the shadow page walk for the given
* gpa, and sets *spte to the spte value. This spte may be non-preset. If no
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 0eb2773b2ae2..a46ccd670785 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -831,6 +831,14 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
return tdx_vcpu_ioctl(vcpu, argp);
}
+static int vt_vcpu_mem_enc_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ if (!is_td_vcpu(vcpu))
+ return -EINVAL;
+
+ return tdx_vcpu_unlocked_ioctl(vcpu, argp);
+}
+
static int vt_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
bool is_private)
{
@@ -1005,6 +1013,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl),
.vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl),
+ .vcpu_mem_enc_unlocked_ioctl = vt_op_tdx_only(vcpu_mem_enc_unlocked_ioctl),
.gmem_max_mapping_level = vt_op_tdx_only(gmem_max_mapping_level)
};
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 6d41d2fc8043..f2d8fc3fecaa 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -24,20 +24,33 @@
#undef pr_fmt
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define pr_tdx_error(__fn, __err) \
- pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err)
+#define __TDX_BUG_ON(__err, __f, __kvm, __fmt, __args...) \
+({ \
+ struct kvm *_kvm = (__kvm); \
+ bool __ret = !!(__err); \
+ \
+ if (WARN_ON_ONCE(__ret && (!_kvm || !_kvm->vm_bugged))) { \
+ if (_kvm) \
+ kvm_vm_bugged(_kvm); \
+ pr_err_ratelimited("SEAMCALL " __f " failed: 0x%llx" __fmt "\n",\
+ __err, __args); \
+ } \
+ unlikely(__ret); \
+})
+
+#define TDX_BUG_ON(__err, __fn, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, "%s", "")
+
+#define TDX_BUG_ON_1(__err, __fn, a1, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx", a1)
+
+#define TDX_BUG_ON_2(__err, __fn, a1, a2, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx", a1, a2)
+
+#define TDX_BUG_ON_3(__err, __fn, a1, a2, a3, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 ", 0x%llx, " #a3 " 0x%llx", \
+ a1, a2, a3)
-#define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \
- pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__)
-
-#define pr_tdx_error_1(__fn, __err, __rcx) \
- __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx)
-
-#define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \
- __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx)
-
-#define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \
- __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8)
bool enable_tdx __ro_after_init;
module_param_named(tdx, enable_tdx, bool, 0444);
@@ -281,25 +294,34 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
vcpu->cpu = -1;
}
-static void tdx_no_vcpus_enter_start(struct kvm *kvm)
-{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
-
- kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
-}
-
-static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
-{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
-}
+/*
+ * Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single
+ * retry (if necessary) after forcing vCPUs to exit and wait for the operation
+ * to complete. All flows that remove/block S-EPT entries run with mmu_lock
+ * held for write, i.e. are mutually exclusive with each other, but they aren't
+ * mutually exclusive with running vCPUs, and so can fail with "operand busy"
+ * if a vCPU acquires a relevant lock in the TDX-Module, e.g. when doing TDCALL.
+ *
+ * Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs.
+ */
+#define tdh_do_no_vcpus(tdh_func, kvm, args...) \
+({ \
+ struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm); \
+ u64 __err; \
+ \
+ lockdep_assert_held_write(&kvm->mmu_lock); \
+ \
+ __err = tdh_func(args); \
+ if (unlikely(tdx_operand_busy(__err))) { \
+ WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true); \
+ kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); \
+ \
+ __err = tdh_func(args); \
+ \
+ WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false); \
+ } \
+ __err; \
+})
/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
static int __tdx_reclaim_page(struct page *page)
@@ -313,10 +335,9 @@ static int __tdx_reclaim_page(struct page *page)
* before the HKID is released and control pages have also been
* released at this point, so there is no possibility of contention.
*/
- if (WARN_ON_ONCE(err)) {
- pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
+ if (TDX_BUG_ON_3(err, TDH_PHYMEM_PAGE_RECLAIM, rcx, rdx, r8, NULL))
return -EIO;
- }
+
return 0;
}
@@ -404,8 +425,8 @@ static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
return;
smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
- if (KVM_BUG_ON(arg.err, vcpu->kvm))
- pr_tdx_error(TDH_VP_FLUSH, arg.err);
+
+ TDX_BUG_ON(arg.err, TDH_VP_FLUSH, vcpu->kvm);
}
void tdx_disable_virtualization_cpu(void)
@@ -464,8 +485,7 @@ static void smp_func_do_phymem_cache_wb(void *unused)
}
out:
- if (WARN_ON_ONCE(err))
- pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
+ TDX_BUG_ON(err, TDH_PHYMEM_CACHE_WB, NULL);
}
void tdx_mmu_release_hkid(struct kvm *kvm)
@@ -504,8 +524,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
err = tdh_mng_vpflushdone(&kvm_tdx->td);
if (err == TDX_FLUSHVP_NOT_DONE)
goto out;
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
+ if (TDX_BUG_ON(err, TDH_MNG_VPFLUSHDONE, kvm)) {
pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
kvm_tdx->hkid);
goto out;
@@ -528,8 +547,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
* tdh_mng_key_freeid() will fail.
*/
err = tdh_mng_key_freeid(&kvm_tdx->td);
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error(TDH_MNG_KEY_FREEID, err);
+ if (TDX_BUG_ON(err, TDH_MNG_KEY_FREEID, kvm)) {
pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
kvm_tdx->hkid);
} else {
@@ -580,10 +598,9 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm)
* when it is reclaiming TDCS).
*/
err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
+ if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
return;
- }
+
tdx_quirk_reset_page(kvm_tdx->td.tdr_page);
__free_page(kvm_tdx->td.tdr_page);
@@ -606,11 +623,8 @@ static int tdx_do_tdh_mng_key_config(void *param)
/* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
err = tdh_mng_key_config(&kvm_tdx->td);
-
- if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
- pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
+ if (TDX_BUG_ON(err, TDH_MNG_KEY_CONFIG, &kvm_tdx->kvm))
return -EIO;
- }
return 0;
}
@@ -824,19 +838,52 @@ void tdx_vcpu_put(struct kvm_vcpu *vcpu)
tdx_prepare_switch_to_host(vcpu);
}
+/*
+ * Life cycles for a TD and a vCPU:
+ * 1. KVM_CREATE_VM ioctl.
+ * TD state is TD_STATE_UNINITIALIZED.
+ * hkid is not assigned at this stage.
+ * 2. KVM_TDX_INIT_VM ioctl.
+ * TD transitions to TD_STATE_INITIALIZED.
+ * hkid is assigned after this stage.
+ * 3. KVM_CREATE_VCPU ioctl. (only when TD is TD_STATE_INITIALIZED).
+ * 3.1 tdx_vcpu_create() transitions vCPU state to VCPU_TD_STATE_UNINITIALIZED.
+ * 3.2 vcpu_load() and vcpu_put() in kvm_arch_vcpu_create().
+ * 3.3 (conditional) if any error encountered after kvm_arch_vcpu_create()
+ * kvm_arch_vcpu_destroy() --> tdx_vcpu_free().
+ * 4. KVM_TDX_INIT_VCPU ioctl.
+ * tdx_vcpu_init() transitions vCPU state to VCPU_TD_STATE_INITIALIZED.
+ * vCPU control structures are allocated at this stage.
+ * 5. kvm_destroy_vm().
+ * 5.1 tdx_mmu_release_hkid(): (1) tdh_vp_flush(), disassociates all vCPUs.
+ * (2) puts hkid to !assigned state.
+ * 5.2 kvm_destroy_vcpus() --> tdx_vcpu_free():
+ * transitions vCPU to VCPU_TD_STATE_UNINITIALIZED state.
+ * 5.3 tdx_vm_destroy()
+ * transitions TD to TD_STATE_UNINITIALIZED state.
+ *
+ * tdx_vcpu_free() can be invoked only at 3.3 or 5.2.
+ * - If at 3.3, hkid is still assigned, but the vCPU must be in
+ * VCPU_TD_STATE_UNINITIALIZED state.
+ * - if at 5.2, hkid must be !assigned and all vCPUs must be in
+ * VCPU_TD_STATE_INITIALIZED state and have been dissociated.
+ */
void tdx_vcpu_free(struct kvm_vcpu *vcpu)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
struct vcpu_tdx *tdx = to_tdx(vcpu);
int i;
+ if (vcpu->cpu != -1) {
+ KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm);
+ tdx_flush_vp_on_cpu(vcpu);
+ return;
+ }
+
/*
* It is not possible to reclaim pages while hkid is assigned. It might
- * be assigned if:
- * 1. the TD VM is being destroyed but freeing hkid failed, in which
- * case the pages are leaked
- * 2. TD VCPU creation failed and this on the error path, in which case
- * there is nothing to do anyway
+ * be assigned if the TD VM is being destroyed but freeing hkid failed,
+ * in which case the pages are leaked.
*/
if (is_hkid_assigned(kvm_tdx))
return;
@@ -851,7 +898,7 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
}
if (tdx->vp.tdvpr_page) {
tdx_reclaim_control_page(tdx->vp.tdvpr_page);
- tdx->vp.tdvpr_page = 0;
+ tdx->vp.tdvpr_page = NULL;
tdx->vp.tdvpr_pa = 0;
}
@@ -1574,137 +1621,79 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
}
-static void tdx_unpin(struct kvm *kvm, struct page *page)
+static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ kvm_pfn_t pfn)
{
- put_page(page);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err, entry, level_state;
+ gpa_t gpa = gfn_to_gpa(gfn);
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm) ||
+ KVM_BUG_ON(!kvm_tdx->page_add_src, kvm))
+ return -EIO;
+
+ err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
+ kvm_tdx->page_add_src, &entry, &level_state);
+ if (unlikely(tdx_operand_busy(err)))
+ return -EBUSY;
+
+ if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_ADD, entry, level_state, kvm))
+ return -EIO;
+
+ return 0;
}
static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, struct page *page)
+ enum pg_level level, kvm_pfn_t pfn)
{
int tdx_level = pg_level_to_tdx_sept_level(level);
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct page *page = pfn_to_page(pfn);
gpa_t gpa = gfn_to_gpa(gfn);
u64 entry, level_state;
u64 err;
err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
- if (unlikely(tdx_operand_busy(err))) {
- tdx_unpin(kvm, page);
+ if (unlikely(tdx_operand_busy(err)))
return -EBUSY;
- }
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
- tdx_unpin(kvm, page);
+ if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_AUG, entry, level_state, kvm))
return -EIO;
- }
-
- return 0;
-}
-/*
- * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
- * callback tdx_gmem_post_populate() then maps pages into private memory.
- * through the a seamcall TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the
- * private EPT structures for the page to have been built before, which is
- * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
- * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
- * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
- * are no half-initialized shared EPT pages.
- */
-static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
-{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
- if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
- return -EINVAL;
-
- /* nr_premapped will be decreased when tdh_mem_page_add() is called. */
- atomic64_inc(&kvm_tdx->nr_premapped);
return 0;
}
static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
+ enum pg_level level, u64 mirror_spte)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- struct page *page = pfn_to_page(pfn);
+ kvm_pfn_t pfn = spte_to_pfn(mirror_spte);
/* TODO: handle large pages. */
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
- return -EINVAL;
+ return -EIO;
- /*
- * Because guest_memfd doesn't support page migration with
- * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
- * migration. Until guest_memfd supports page migration, prevent page
- * migration.
- * TODO: Once guest_memfd introduces callback on page migration,
- * implement it and remove get_page/put_page().
- */
- get_page(page);
+ WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) ||
+ (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
/*
- * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
- * barrier in tdx_td_finalize().
+ * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
+ * before kvm_tdx->state. Userspace must not be allowed to pre-fault
+ * arbitrary memory until the initial memory image is finalized. Pairs
+ * with the smp_wmb() in tdx_td_finalize().
*/
smp_rmb();
- if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
- return tdx_mem_page_aug(kvm, gfn, level, page);
-
- return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
-}
-
-static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, struct page *page)
-{
- int tdx_level = pg_level_to_tdx_sept_level(level);
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- gpa_t gpa = gfn_to_gpa(gfn);
- u64 err, entry, level_state;
-
- /* TODO: handle large pages. */
- if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
- return -EINVAL;
-
- if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
- return -EINVAL;
/*
- * When zapping private page, write lock is held. So no race condition
- * with other vcpu sept operation.
- * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
+ * If the TD isn't finalized/runnable, then userspace is initializing
+ * the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD.
*/
- err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
- &level_state);
-
- if (unlikely(tdx_operand_busy(err))) {
- /*
- * The second retry is expected to succeed after kicking off all
- * other vCPUs and prevent them from invoking TDH.VP.ENTER.
- */
- tdx_no_vcpus_enter_start(kvm);
- err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
- &level_state);
- tdx_no_vcpus_enter_stop(kvm);
- }
-
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
- return -EIO;
- }
-
- err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
+ if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
+ return tdx_mem_page_add(kvm, gfn, level, pfn);
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
- return -EIO;
- }
- tdx_quirk_reset_page(page);
- tdx_unpin(kvm, page);
- return 0;
+ return tdx_mem_page_aug(kvm, gfn, level, pfn);
}
static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
@@ -1720,81 +1709,13 @@ static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
if (unlikely(tdx_operand_busy(err)))
return -EBUSY;
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
+ if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
return -EIO;
- }
return 0;
}
/*
- * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is
- * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called
- * successfully.
- *
- * Since tdh_mem_sept_add() must have been invoked successfully before a
- * non-leaf entry present in the mirrored page table, the SEPT ZAP related
- * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead
- * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the
- * SEPT.
- *
- * Further check if the returned entry from SEPT walking is with RWX permissions
- * to filter out anything unexpected.
- *
- * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from
- * level_state returned from a SEAMCALL error is the same as that passed into
- * the SEAMCALL.
- */
-static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
- u64 entry, int level)
-{
- if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
- return false;
-
- if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
- return false;
-
- if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
- return false;
-
- return true;
-}
-
-static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, struct page *page)
-{
- int tdx_level = pg_level_to_tdx_sept_level(level);
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
- u64 err, entry, level_state;
-
- /* For now large page isn't supported yet. */
- WARN_ON_ONCE(level != PG_LEVEL_4K);
-
- err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
-
- if (unlikely(tdx_operand_busy(err))) {
- /* After no vCPUs enter, the second retry is expected to succeed */
- tdx_no_vcpus_enter_start(kvm);
- err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
- tdx_no_vcpus_enter_stop(kvm);
- }
- if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
- !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
- atomic64_dec(&kvm_tdx->nr_premapped);
- tdx_unpin(kvm, page);
- return 0;
- }
-
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
- return -EIO;
- }
- return 1;
-}
-
-/*
* Ensure shared and private EPTs to be flushed on all vCPUs.
* tdh_mem_track() is the only caller that increases TD epoch. An increase in
* the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
@@ -1827,18 +1748,15 @@ static void tdx_track(struct kvm *kvm)
if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
return;
+ /*
+ * The full sequence of TDH.MEM.TRACK and forcing vCPUs out of guest
+ * mode must be serialized, as TDH.MEM.TRACK will fail if the previous
+ * tracking epoch hasn't completed.
+ */
lockdep_assert_held_write(&kvm->mmu_lock);
- err = tdh_mem_track(&kvm_tdx->td);
- if (unlikely(tdx_operand_busy(err))) {
- /* After no vCPUs enter, the second retry is expected to succeed */
- tdx_no_vcpus_enter_start(kvm);
- err = tdh_mem_track(&kvm_tdx->td);
- tdx_no_vcpus_enter_stop(kvm);
- }
-
- if (KVM_BUG_ON(err, kvm))
- pr_tdx_error(TDH_MEM_TRACK, err);
+ err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td);
+ TDX_BUG_ON(err, TDH_MEM_TRACK, kvm);
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}
@@ -1857,7 +1775,7 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
* and slot move/deletion.
*/
if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
- return -EINVAL;
+ return -EIO;
/*
* The HKID assigned to this TD was already freed and cache was
@@ -1866,11 +1784,16 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
return tdx_reclaim_page(virt_to_page(private_spt));
}
-static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
+static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, u64 mirror_spte)
{
- struct page *page = pfn_to_page(pfn);
- int ret;
+ struct page *page = pfn_to_page(spte_to_pfn(mirror_spte));
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u64 err, entry, level_state;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
/*
* HKID is released after all private pages have been removed, and set
@@ -1878,11 +1801,16 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
* there can't be anything populated in the private EPT.
*/
if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
- return -EINVAL;
+ return;
- ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
- if (ret <= 0)
- return ret;
+ /* TODO: handle large pages. */
+ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+ return;
+
+ err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa,
+ tdx_level, &entry, &level_state);
+ if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm))
+ return;
/*
* TDX requires TLB tracking before dropping private page. Do
@@ -1890,7 +1818,21 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
*/
tdx_track(kvm);
- return tdx_sept_drop_private_spte(kvm, gfn, level, page);
+ /*
+ * When zapping private page, write lock is held. So no race condition
+ * with other vcpu sept operation.
+ * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
+ */
+ err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa,
+ tdx_level, &entry, &level_state);
+ if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm))
+ return;
+
+ err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
+ if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
+ return;
+
+ tdx_quirk_reset_page(page);
}
void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
@@ -2269,37 +2211,28 @@ static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
if (cmd->flags)
return -EINVAL;
- caps = kzalloc(sizeof(*caps) +
- sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
- GFP_KERNEL);
- if (!caps)
- return -ENOMEM;
-
user_caps = u64_to_user_ptr(cmd->data);
- if (get_user(nr_user_entries, &user_caps->cpuid.nent)) {
- ret = -EFAULT;
- goto out;
- }
+ if (get_user(nr_user_entries, &user_caps->cpuid.nent))
+ return -EFAULT;
- if (nr_user_entries < td_conf->num_cpuid_config) {
- ret = -E2BIG;
- goto out;
- }
+ if (nr_user_entries < td_conf->num_cpuid_config)
+ return -E2BIG;
+
+ caps = kzalloc(struct_size(caps, cpuid.entries,
+ td_conf->num_cpuid_config), GFP_KERNEL);
+ if (!caps)
+ return -ENOMEM;
ret = init_kvm_tdx_caps(td_conf, caps);
if (ret)
goto out;
- if (copy_to_user(user_caps, caps, sizeof(*caps))) {
+ if (copy_to_user(user_caps, caps, struct_size(caps, cpuid.entries,
+ caps->cpuid.nent))) {
ret = -EFAULT;
goto out;
}
- if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
- caps->cpuid.nent *
- sizeof(caps->cpuid.entries[0])))
- ret = -EFAULT;
-
out:
/* kfree() accepts NULL. */
kfree(caps);
@@ -2524,8 +2457,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
goto free_packages;
}
- if (WARN_ON_ONCE(err)) {
- pr_tdx_error(TDH_MNG_CREATE, err);
+ if (TDX_BUG_ON(err, TDH_MNG_CREATE, kvm)) {
ret = -EIO;
goto free_packages;
}
@@ -2566,8 +2498,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
ret = -EAGAIN;
goto teardown;
}
- if (WARN_ON_ONCE(err)) {
- pr_tdx_error(TDH_MNG_ADDCX, err);
+ if (TDX_BUG_ON(err, TDH_MNG_ADDCX, kvm)) {
ret = -EIO;
goto teardown;
}
@@ -2584,8 +2515,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
*seamcall_err = err;
ret = -EINVAL;
goto teardown;
- } else if (WARN_ON_ONCE(err)) {
- pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
+ } else if (TDX_BUG_ON_1(err, TDH_MNG_INIT, rcx, kvm)) {
ret = -EIO;
goto teardown;
}
@@ -2629,7 +2559,7 @@ free_tdcs:
free_tdr:
if (tdr_page)
__free_page(tdr_page);
- kvm_tdx->td.tdr_page = 0;
+ kvm_tdx->td.tdr_page = NULL;
free_hkid:
tdx_hkid_free(kvm_tdx);
@@ -2734,6 +2664,46 @@ err_out:
return -EIO;
}
+typedef void *tdx_vm_state_guard_t;
+
+static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm)
+{
+ int r;
+
+ mutex_lock(&kvm->lock);
+
+ if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) {
+ r = -EBUSY;
+ goto out_err;
+ }
+
+ r = kvm_lock_all_vcpus(kvm);
+ if (r)
+ goto out_err;
+
+ /*
+ * Note the unintuitive ordering! vcpu->mutex must be taken outside
+ * kvm->slots_lock!
+ */
+ mutex_lock(&kvm->slots_lock);
+ return kvm;
+
+out_err:
+ mutex_unlock(&kvm->lock);
+ return ERR_PTR(r);
+}
+
+static void tdx_release_vm_state_locks(struct kvm *kvm)
+{
+ mutex_unlock(&kvm->slots_lock);
+ kvm_unlock_all_vcpus(kvm);
+ mutex_unlock(&kvm->lock);
+}
+
+DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t,
+ if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T),
+ tdx_acquire_vm_state_locks(kvm), struct kvm *kvm);
+
static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
@@ -2855,24 +2825,14 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- guard(mutex)(&kvm->slots_lock);
-
if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
return -EINVAL;
- /*
- * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
- * TDH.MEM.PAGE.ADD().
- */
- if (atomic64_read(&kvm_tdx->nr_premapped))
- return -EINVAL;
cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
if (tdx_operand_busy(cmd->hw_error))
return -EBUSY;
- if (KVM_BUG_ON(cmd->hw_error, kvm)) {
- pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error);
+ if (TDX_BUG_ON(cmd->hw_error, TDH_MR_FINALIZE, kvm))
return -EIO;
- }
kvm_tdx->state = TD_STATE_RUNNABLE;
/* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
@@ -2881,27 +2841,38 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
return 0;
}
-int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
+static int tdx_get_cmd(void __user *argp, struct kvm_tdx_cmd *cmd)
{
- struct kvm_tdx_cmd tdx_cmd;
- int r;
-
- if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd)))
+ if (copy_from_user(cmd, argp, sizeof(*cmd)))
return -EFAULT;
/*
- * Userspace should never set hw_error. It is used to fill
- * hardware-defined error by the kernel.
+ * Userspace should never set hw_error. KVM writes hw_error to report
+ * hardware-defined error back to userspace.
*/
- if (tdx_cmd.hw_error)
+ if (cmd->hw_error)
return -EINVAL;
- mutex_lock(&kvm->lock);
+ return 0;
+}
+
+int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_tdx_cmd tdx_cmd;
+ int r;
+
+ r = tdx_get_cmd(argp, &tdx_cmd);
+ if (r)
+ return r;
+
+ if (tdx_cmd.id == KVM_TDX_CAPABILITIES)
+ return tdx_get_capabilities(&tdx_cmd);
+
+ CLASS(tdx_vm_state_guard, guard)(kvm);
+ if (IS_ERR(guard))
+ return PTR_ERR(guard);
switch (tdx_cmd.id) {
- case KVM_TDX_CAPABILITIES:
- r = tdx_get_capabilities(&tdx_cmd);
- break;
case KVM_TDX_INIT_VM:
r = tdx_td_init(kvm, &tdx_cmd);
break;
@@ -2909,15 +2880,12 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
r = tdx_td_finalize(kvm, &tdx_cmd);
break;
default:
- r = -EINVAL;
- goto out;
+ return -EINVAL;
}
if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
- r = -EFAULT;
+ return -EFAULT;
-out:
- mutex_unlock(&kvm->lock);
return r;
}
@@ -2959,16 +2927,14 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
}
err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
- if (KVM_BUG_ON(err, vcpu->kvm)) {
+ if (TDX_BUG_ON(err, TDH_VP_CREATE, vcpu->kvm)) {
ret = -EIO;
- pr_tdx_error(TDH_VP_CREATE, err);
goto free_tdcx;
}
for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
- if (KVM_BUG_ON(err, vcpu->kvm)) {
- pr_tdx_error(TDH_VP_ADDCX, err);
+ if (TDX_BUG_ON(err, TDH_VP_ADDCX, vcpu->kvm)) {
/*
* Pages already added are reclaimed by the vcpu_free
* method, but the rest are freed here.
@@ -2981,10 +2947,19 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
}
}
- err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
- if (KVM_BUG_ON(err, vcpu->kvm)) {
- pr_tdx_error(TDH_VP_INIT, err);
- return -EIO;
+ /*
+ * tdh_vp_init() can take an exclusive lock of the TDR resource inside
+ * the TDX-Module. The TDR resource is also taken as shared in several
+ * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention
+ * (TDX-Module locks are try-lock implementations with no slow path).
+ * Take mmu_lock for write to reflect the nature of the lock taken by
+ * the TDX-Module, and to ensure the no-fail MMU paths succeed, e.g. if
+ * a concurrent PUNCH_HOLE on guest_memfd triggers removal of SPTEs.
+ */
+ scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
+ err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
+ if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
+ return -EIO;
}
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -3003,7 +2978,7 @@ free_tdcx:
free_tdvpr:
if (tdx->vp.tdvpr_page)
__free_page(tdx->vp.tdvpr_page);
- tdx->vp.tdvpr_page = 0;
+ tdx->vp.tdvpr_page = NULL;
tdx->vp.tdvpr_pa = 0;
return ret;
@@ -3041,7 +3016,8 @@ static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_i
static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
{
- struct kvm_cpuid2 __user *output, *td_cpuid;
+ struct kvm_cpuid2 __user *output;
+ struct kvm_cpuid2 *td_cpuid;
int r = 0, i = 0, leaf;
u32 level;
@@ -3154,15 +3130,15 @@ struct tdx_gmem_post_populate_arg {
static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
void __user *src, int order, void *_arg)
{
- u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
struct tdx_gmem_post_populate_arg *arg = _arg;
- struct kvm_vcpu *vcpu = arg->vcpu;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err, entry, level_state;
gpa_t gpa = gfn_to_gpa(gfn);
- u8 level = PG_LEVEL_4K;
struct page *src_page;
int ret, i;
- u64 err, entry, level_state;
+
+ if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm))
+ return -EIO;
/*
* Get the source page if it has been faulted in. Return failure if the
@@ -3174,49 +3150,29 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
if (ret != 1)
return -ENOMEM;
- ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
- if (ret < 0)
- goto out;
-
- /*
- * The private mem cannot be zapped after kvm_tdp_map_page()
- * because all paths are covered by slots_lock and the
- * filemap invalidate lock. Check that they are indeed enough.
- */
- if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
- scoped_guard(read_lock, &kvm->mmu_lock) {
- if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
- ret = -EIO;
- goto out;
- }
- }
- }
+ kvm_tdx->page_add_src = src_page;
+ ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
+ kvm_tdx->page_add_src = NULL;
- ret = 0;
- err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
- src_page, &entry, &level_state);
- if (err) {
- ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO;
- goto out;
- }
+ put_page(src_page);
- if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
- atomic64_dec(&kvm_tdx->nr_premapped);
+ if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION))
+ return ret;
- if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
- for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
- err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry,
- &level_state);
- if (err) {
- ret = -EIO;
- break;
- }
- }
+ /*
+ * Note, MR.EXTEND can fail if the S-EPT mapping is somehow removed
+ * between mapping the pfn and now, but slots_lock prevents memslot
+ * updates, filemap_invalidate_lock() prevents guest_memfd updates,
+ * mmu_notifier events can't reach S-EPT entries, and KVM's internal
+ * zapping flows are mutually exclusive with S-EPT mappings.
+ */
+ for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
+ err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, &level_state);
+ if (TDX_BUG_ON_2(err, TDH_MR_EXTEND, entry, level_state, kvm))
+ return -EIO;
}
-out:
- put_page(src_page);
- return ret;
+ return 0;
}
static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
@@ -3232,8 +3188,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
if (tdx->state != VCPU_TD_STATE_INITIALIZED)
return -EINVAL;
- guard(mutex)(&kvm->slots_lock);
-
/* Once TD is finalized, the initial guest memory is fixed. */
if (kvm_tdx->state == TD_STATE_RUNNABLE)
return -EINVAL;
@@ -3251,7 +3205,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
!vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
return -EINVAL;
- kvm_mmu_reload(vcpu);
ret = 0;
while (region.nr_pages) {
if (signal_pending(current)) {
@@ -3288,28 +3241,57 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
return ret;
}
-int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
struct kvm_tdx_cmd cmd;
- int ret;
+ int r;
- if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
- return -EINVAL;
+ r = tdx_get_cmd(argp, &cmd);
+ if (r)
+ return r;
- if (copy_from_user(&cmd, argp, sizeof(cmd)))
- return -EFAULT;
+ CLASS(tdx_vm_state_guard, guard)(kvm);
+ if (IS_ERR(guard))
+ return PTR_ERR(guard);
- if (cmd.hw_error)
+ if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
return -EINVAL;
+ vcpu_load(vcpu);
+
switch (cmd.id) {
+ case KVM_TDX_INIT_MEM_REGION:
+ r = tdx_vcpu_init_mem_region(vcpu, &cmd);
+ break;
case KVM_TDX_INIT_VCPU:
- ret = tdx_vcpu_init(vcpu, &cmd);
+ r = tdx_vcpu_init(vcpu, &cmd);
break;
- case KVM_TDX_INIT_MEM_REGION:
- ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
+ default:
+ r = -ENOIOCTLCMD;
break;
+ }
+
+ vcpu_put(vcpu);
+
+ return r;
+}
+
+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct kvm_tdx_cmd cmd;
+ int ret;
+
+ if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
+ return -EINVAL;
+
+ ret = tdx_get_cmd(argp, &cmd);
+ if (ret)
+ return ret;
+
+ switch (cmd.id) {
case KVM_TDX_GET_CPUID:
ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
break;
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 7f258870dc41..45b5183ccb36 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -36,8 +36,12 @@ struct kvm_tdx {
struct tdx_td td;
- /* For KVM_TDX_INIT_MEM_REGION. */
- atomic64_t nr_premapped;
+ /*
+ * Scratch pointer used to pass the source page to tdx_mem_page_add().
+ * Protected by slots_lock, and non-NULL only when mapping a private
+ * pfn via tdx_gmem_post_populate().
+ */
+ struct page *page_add_src;
/*
* Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 9697368d65b3..a7a870919580 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -149,6 +149,7 @@ int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
+int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cd55feb4a378..ea66d0047c9b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7200,6 +7200,19 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
return 0;
}
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+
+ if (ioctl == KVM_MEMORY_ENCRYPT_OP &&
+ kvm_x86_ops.vcpu_mem_enc_unlocked_ioctl)
+ return kvm_x86_call(vcpu_mem_enc_unlocked_ioctl)(vcpu, argp);
+
+ return -ENOIOCTLCMD;
+}
+
int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
struct kvm *kvm = filp->private_data;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5bd76cf394fa..d93f75b05ae2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1557,6 +1557,8 @@ long kvm_arch_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg);
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg);
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg);
vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf);
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext);
@@ -2437,18 +2439,6 @@ static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
}
#endif /* CONFIG_HAVE_KVM_NO_POLL */
-#ifdef CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL
-long kvm_arch_vcpu_async_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg);
-#else
-static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
- unsigned int ioctl,
- unsigned long arg)
-{
- return -ENOIOCTLCMD;
-}
-#endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */
-
void kvm_arch_guest_memory_reclaimed(struct kvm *kvm);
#ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 5f0015c5dd95..267c7369c765 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -78,9 +78,6 @@ config HAVE_KVM_IRQ_BYPASS
tristate
select IRQ_BYPASS_MANAGER
-config HAVE_KVM_VCPU_ASYNC_IOCTL
- bool
-
config HAVE_KVM_VCPU_RUN_PID_CHANGE
bool
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a5d9a6f221c2..74e1afd67b44 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4434,10 +4434,10 @@ static long kvm_vcpu_ioctl(struct file *filp,
return r;
/*
- * Some architectures have vcpu ioctls that are asynchronous to vcpu
- * execution; mutex_lock() would break them.
+ * Let arch code handle select vCPU ioctls without holding vcpu->mutex,
+ * e.g. to support ioctls that can run asynchronous to vCPU execution.
*/
- r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
+ r = kvm_arch_vcpu_unlocked_ioctl(filp, ioctl, arg);
if (r != -ENOIOCTLCMD)
return r;