From 7ba66191440d9aadfef4b6a5defe3357994d9340 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 11 Mar 2025 08:51:19 +0900 Subject: [PATCH] kvm: defer huge page recovery vhost task to later BugLink: https://bugs.launchpad.net/bugs/2101915 commit 931656b9e2ff7029aee0b36e17780621948a6ac1 upstream. Some libraries want to ensure they are single threaded before forking, so making the kernel's kvm huge page recovery process a vhost task of the user process breaks those. The minijail library used by crosvm is one such affected application. Defer the task to after the first VM_RUN call, which occurs after the parent process has forked all its jailed processes. This needs to happen only once for the kvm instance, so introduce some general-purpose infrastructure for that, too. It's similar in concept to pthread_once; except it is actually usable, because the callback takes a parameter. Cc: Sean Christopherson Cc: Paolo Bonzini Tested-by: Alyssa Ross Signed-off-by: Keith Busch Message-ID: <20250123153543.2769928-1-kbusch@meta.com> [Move call_once API to include/linux. - Paolo] Cc: stable@vger.kernel.org Fixes: d96c77bd4eeb ("KVM: x86: switch hugepage recovery thread to vhost_task") Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 27f651d28feed4b1bc6d8256c74d92f4d6302e96 linux-6.12.y) [koichiroden: follow-up fix from v6.12.14] Signed-off-by: Koichiro Den Signed-off-by: Stefan Bader --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/mmu/mmu.c | 18 +++++++++---- arch/x86/kvm/x86.c | 7 ++++- include/linux/call_once.h | 45 +++++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 6 deletions(-) create mode 100644 include/linux/call_once.h diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 45ff20d9bb62..021d2adcb933 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -1424,6 +1425,7 @@ struct kvm_arch { struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter; struct vhost_task *nx_huge_page_recovery_thread; u64 nx_huge_page_last; + struct once nx_once; #ifdef CONFIG_X86_64 /* The number of TDP MMU pages across all roots. */ diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c8e40f1bb5ad..9dc8960624d2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7269,20 +7269,28 @@ static bool kvm_nx_huge_page_recovery_worker(void *data) return true; } -int kvm_mmu_post_init_vm(struct kvm *kvm) +static void kvm_mmu_start_lpage_recovery(struct once *once) { - if (nx_hugepage_mitigation_hard_disabled) - return 0; + struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once); + struct kvm *kvm = container_of(ka, struct kvm, arch); kvm->arch.nx_huge_page_last = get_jiffies_64(); kvm->arch.nx_huge_page_recovery_thread = vhost_task_create( kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill, kvm, "kvm-nx-lpage-recovery"); + if (kvm->arch.nx_huge_page_recovery_thread) + vhost_task_start(kvm->arch.nx_huge_page_recovery_thread); +} + +int kvm_mmu_post_init_vm(struct kvm *kvm) +{ + if (nx_hugepage_mitigation_hard_disabled) + return 0; + + call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); if (!kvm->arch.nx_huge_page_recovery_thread) return -ENOMEM; - - vhost_task_start(kvm->arch.nx_huge_page_recovery_thread); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 898b191954eb..e9b22cee10c8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11314,6 +11314,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; int r; + r = kvm_mmu_post_init_vm(vcpu->kvm); + if (r) + return r; + vcpu_load(vcpu); kvm_sigset_activate(vcpu); kvm_run->flags = 0; @@ -12585,7 +12589,8 @@ out: int kvm_arch_post_init_vm(struct kvm *kvm) { - return kvm_mmu_post_init_vm(kvm); + once_init(&kvm->arch.nx_once); + return 0; } static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) diff --git a/include/linux/call_once.h b/include/linux/call_once.h new file mode 100644 index 000000000000..6261aa0b3fb0 --- /dev/null +++ b/include/linux/call_once.h @@ -0,0 +1,45 @@ +#ifndef _LINUX_CALL_ONCE_H +#define _LINUX_CALL_ONCE_H + +#include +#include + +#define ONCE_NOT_STARTED 0 +#define ONCE_RUNNING 1 +#define ONCE_COMPLETED 2 + +struct once { + atomic_t state; + struct mutex lock; +}; + +static inline void __once_init(struct once *once, const char *name, + struct lock_class_key *key) +{ + atomic_set(&once->state, ONCE_NOT_STARTED); + __mutex_init(&once->lock, name, key); +} + +#define once_init(once) \ +do { \ + static struct lock_class_key __key; \ + __once_init((once), #once, &__key); \ +} while (0) + +static inline void call_once(struct once *once, void (*cb)(struct once *)) +{ + /* Pairs with atomic_set_release() below. */ + if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) + return; + + guard(mutex)(&once->lock); + WARN_ON(atomic_read(&once->state) == ONCE_RUNNING); + if (atomic_read(&once->state) != ONCE_NOT_STARTED) + return; + + atomic_set(&once->state, ONCE_RUNNING); + cb(once); + atomic_set_release(&once->state, ONCE_COMPLETED); +} + +#endif /* _LINUX_CALL_ONCE_H */