From 6e2b2358b3ef870d24109083d2e314d04fc72de4 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Wed, 9 Oct 2024 08:04:52 -0700
Subject: KVM: Grab vcpu->mutex across installing the vCPU's fd and bumping
 online_vcpus

During vCPU creation, acquire vcpu->mutex prior to exposing the vCPU to
userspace, and hold the mutex until online_vcpus is bumped, i.e. until
the vCPU is fully online from KVM's perspective.

To ensure asynchronous vCPU ioctls also wait for the vCPU to come
online, explicitly check online_vcpus at the start of kvm_vcpu_ioctl(),
and take the vCPU's mutex to wait if necessary (having to wait for any
ioctl should be exceedingly rare, i.e. not worth optimizing).

Reported-by: Will Deacon
Reported-by: Michal Luczaj
Link: https://lore.kernel.org/all/20240730155646.1687-1-will@kernel.org
Acked-by: Will Deacon
Link: https://lore.kernel.org/r/20241009150455.1057573-4-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 virt/kvm/kvm_main.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

(limited to 'virt/kvm/kvm_main.c')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index de2c11dae231..fb117af473e5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4132,7 +4132,14 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 	if (r)
 		goto unlock_vcpu_destroy;
 
-	/* Now it's all set up, let userspace reach it */
+	/*
+	 * Now it's all set up, let userspace reach it. Grab the vCPU's mutex
+	 * so that userspace can't invoke vCPU ioctl()s until the vCPU is fully
+	 * visible (per online_vcpus), e.g. so that KVM doesn't get tricked
+	 * into a NULL-pointer dereference because KVM thinks the _current_
+	 * vCPU doesn't exist.
+	 */
+	mutex_lock(&vcpu->mutex);
 	kvm_get_kvm(kvm);
 	r = create_vcpu_fd(vcpu);
 	if (r < 0)
@@ -4149,6 +4156,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 	 */
 	smp_wmb();
 	atomic_inc(&kvm->online_vcpus);
+	mutex_unlock(&vcpu->mutex);
 	mutex_unlock(&kvm->lock);
 
 	kvm_arch_vcpu_postcreate(vcpu);
@@ -4156,6 +4164,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 	return r;
 
 kvm_put_xa_release:
+	mutex_unlock(&vcpu->mutex);
 	kvm_put_kvm_no_destroy(kvm);
 	xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
 unlock_vcpu_destroy:
@@ -4282,6 +4291,33 @@ static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
 }
 #endif
 
+static int kvm_wait_for_vcpu_online(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+
+	/*
+	 * In practice, this happy path will always be taken, as a well-behaved
+	 * VMM will never invoke a vCPU ioctl() before KVM_CREATE_VCPU returns.
+	 */
+	if (likely(vcpu->vcpu_idx < atomic_read(&kvm->online_vcpus)))
+		return 0;
+
+	/*
+	 * Acquire and release the vCPU's mutex to wait for vCPU creation to
+	 * complete (kvm_vm_ioctl_create_vcpu() holds the mutex until the vCPU
+	 * is fully online).
+	 */
+	if (mutex_lock_killable(&vcpu->mutex))
+		return -EINTR;
+
+	mutex_unlock(&vcpu->mutex);
+
+	if (WARN_ON_ONCE(!kvm_get_vcpu(kvm, vcpu->vcpu_idx)))
+		return -EIO;
+
+	return 0;
+}
+
 static long kvm_vcpu_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -4297,6 +4333,15 @@ static long kvm_vcpu_ioctl(struct file *filp,
 	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
 		return -EINVAL;
 
+	/*
+	 * Wait for the vCPU to be online before handling the ioctl(), as KVM
+	 * assumes the vCPU is reachable via vcpu_array, i.e. may dereference
+	 * a NULL pointer if userspace invokes an ioctl() before KVM is ready.
+	 */
+	r = kvm_wait_for_vcpu_online(vcpu);
+	if (r)
+		return r;
+
 	/*
 	 * Some architectures have vcpu ioctls that are asynchronous to vcpu
 	 * execution; mutex_lock() would break them.
-- 
cgit v1.2.3


From d0831edcd87ee4cbf1b8cc5669d9d07c71577477 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Wed, 9 Oct 2024 08:04:53 -0700
Subject: Revert "KVM: Fix vcpu_array[0] races"

Now that KVM loads from vcpu_array if and only if the target index is
valid with respect to online_vcpus, i.e. now that it is safe to erase a
not-fully-onlined vCPU entry, revert to storing into vcpu_array before
success is guaranteed.

If xa_store() fails, which _should_ be impossible, then putting the
vCPU's reference to 'struct kvm' results in a refcounting bug as the
vCPU fd has been installed and owns the vCPU's reference.

This was found by inspection, but forcing the xa_store() to fail
confirms the problem:

| Unable to handle kernel paging request at virtual address ffff800080ecd960
| Call trace:
|  _raw_spin_lock_irq+0x2c/0x70
|  kvm_irqfd_release+0x24/0xa0
|  kvm_vm_release+0x1c/0x38
|  __fput+0x88/0x2ec
|  ____fput+0x10/0x1c
|  task_work_run+0xb0/0xd4
|  do_exit+0x210/0x854
|  do_group_exit+0x70/0x98
|  get_signal+0x6b0/0x73c
|  do_signal+0xa4/0x11e8
|  do_notify_resume+0x60/0x12c
|  el0_svc+0x64/0x68
|  el0t_64_sync_handler+0x84/0xfc
|  el0t_64_sync+0x190/0x194
| Code: b9000909 d503201f 2a1f03e1 52800028 (88e17c08)

Practically speaking, this is a non-issue as xa_store() can't fail,
absent a nasty kernel bug. But the code is visually jarring and
technically broken.

This reverts commit afb2acb2e3a32e4d56f7fbd819769b98ed1b7520.

Cc: Paolo Bonzini
Cc: Michal Luczaj
Cc: Alexander Potapenko
Cc: Marc Zyngier
Reported-by: Will Deacon
Acked-by: Will Deacon
Link: https://lore.kernel.org/r/20241009150455.1057573-5-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 virt/kvm/kvm_main.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'virt/kvm/kvm_main.c')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fb117af473e5..2e5d3b85b46e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4128,7 +4128,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 	}
 
 	vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
-	r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
+	r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT);
+	BUG_ON(r == -EBUSY);
 	if (r)
 		goto unlock_vcpu_destroy;
 
@@ -4143,12 +4144,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 	kvm_get_kvm(kvm);
 	r = create_vcpu_fd(vcpu);
 	if (r < 0)
-		goto kvm_put_xa_release;
-
-	if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
-		r = -EINVAL;
-		goto kvm_put_xa_release;
-	}
+		goto kvm_put_xa_erase;
 
 	/*
 	 * Pairs with smp_rmb() in kvm_get_vcpu.  Store the vcpu
@@ -4163,10 +4159,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 	kvm_create_vcpu_debugfs(vcpu);
 	return r;
 
-kvm_put_xa_release:
+kvm_put_xa_erase:
 	mutex_unlock(&vcpu->mutex);
 	kvm_put_kvm_no_destroy(kvm);
-	xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
+	xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx);
 unlock_vcpu_destroy:
 	mutex_unlock(&kvm->lock);
 	kvm_dirty_ring_free(&vcpu->dirty_ring);
-- 
cgit v1.2.3


From e53dc37f5a06f0be5b89fac230fcd4008f646d50 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Wed, 9 Oct 2024 08:04:54 -0700
Subject: KVM: Don't BUG() the kernel if xa_insert() fails with -EBUSY

WARN once instead of triggering a BUG if xa_insert() fails because it
encountered an existing entry. While KVM guarantees there should be no
existing entry, there's no reason to BUG the kernel, as KVM needs to
gracefully handle failure anyways.

Reviewed-by: Pankaj Gupta
Acked-by: Will Deacon
Link: https://lore.kernel.org/r/20241009150455.1057573-6-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 virt/kvm/kvm_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'virt/kvm/kvm_main.c')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2e5d3b85b46e..b855b27a36b6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4129,7 +4129,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 
 	vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
 	r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT);
-	BUG_ON(r == -EBUSY);
+	WARN_ON_ONCE(r == -EBUSY);
 	if (r)
 		goto unlock_vcpu_destroy;
 
-- 
cgit v1.2.3


From 01528db67f28d5919f7b0a68900dc212165218e2 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Wed, 9 Oct 2024 08:04:55 -0700
Subject: KVM: Drop hack that "manually" informs lockdep of kvm->lock vs.
 vcpu->mutex

Now that KVM takes vcpu->mutex inside kvm->lock when creating a vCPU,
drop the hack to manually inform lockdep of the kvm->lock => vcpu->mutex
ordering.

This effectively reverts commit 42a90008f890 ("KVM: Ensure lockdep knows
about kvm->lock vs. vcpu->mutex ordering rule").

Cc: Oliver Upton
Acked-by: Will Deacon
Link: https://lore.kernel.org/r/20241009150455.1057573-7-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 virt/kvm/kvm_main.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'virt/kvm/kvm_main.c')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b855b27a36b6..9d54473d18e3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4116,12 +4116,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 
 	mutex_lock(&kvm->lock);
 
-#ifdef CONFIG_LOCKDEP
-	/* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */
-	mutex_lock(&vcpu->mutex);
-	mutex_unlock(&vcpu->mutex);
-#endif
-
 	if (kvm_get_vcpu_by_id(kvm, id)) {
 		r = -EEXIST;
 		goto unlock_vcpu_destroy;
@@ -4138,7 +4132,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 	 * so that userspace can't invoke vCPU ioctl()s until the vCPU is fully
 	 * visible (per online_vcpus), e.g. so that KVM doesn't get tricked
 	 * into a NULL-pointer dereference because KVM thinks the _current_
-	 * vCPU doesn't exist.
+	 * vCPU doesn't exist. As a bonus, taking vcpu->mutex ensures lockdep
+	 * knows it's taken *inside* kvm->lock.
 	 */
 	mutex_lock(&vcpu->mutex);
 	kvm_get_kvm(kvm);
-- 
cgit v1.2.3
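
[Editor's illustration, not part of the series] The first patch exists because
create_vcpu_fd() installs the vCPU's fd into the caller's fd table before
online_vcpus is bumped, so a multi-threaded VMM can in principle invoke vCPU
ioctl()s on a vCPU that KVM does not yet consider online. The userspace sketch
below shows that window under stated assumptions: the sequential fd-number
guess, the hammer()/guessed_fd names, the use of KVM_GET_MP_STATE as the
probing ioctl(), and the omitted error handling are all illustrative choices,
not details taken from the patches above.

/*
 * Minimal sketch of the KVM_CREATE_VCPU vs. vCPU-ioctl() race window.
 * Build with: cc -O2 -o race race.c -lpthread (requires /dev/kvm).
 */
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static volatile int guessed_fd;

static void *hammer(void *arg)
{
	struct kvm_mp_state mp;
	int i;

	/*
	 * Before the fix, an early ioctl() could observe a vCPU that is
	 * reachable via its fd but not yet counted in online_vcpus.
	 */
	for (i = 0; i < 1000000; i++)
		ioctl(guessed_fd, KVM_GET_MP_STATE, &mp);
	return NULL;
}

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	pthread_t t;

	guessed_fd = vm + 1;	/* assumption: the next fd goes to the vCPU */
	pthread_create(&t, NULL, hammer, NULL);

	/*
	 * With the series applied, racing ioctl()s block on vcpu->mutex in
	 * kvm_wait_for_vcpu_online() until this creation fully completes.
	 */
	int vcpu_fd = ioctl(vm, KVM_CREATE_VCPU, 0);

	pthread_join(t, NULL);
	printf("vcpu fd %d, guessed %d\n", vcpu_fd, guessed_fd);
	return 0;
}

With the series applied, the probing thread's ioctl()s either miss entirely
(-EBADF, fd not yet installed) or wait in kvm_wait_for_vcpu_online(); they can
no longer reach KVM internals for a half-created vCPU.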