diff options
author | Nikolay Kuratov <kniv@yandex-team.ru> | 2024-12-08 11:38:30 +0300 |
---|---|---|
committer | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2024-12-14 20:04:14 +0100 |
commit | 547b7f8fdebc77c0ff50c66d7fa2038d73a59ab6 (patch) | |
tree | ecb594dcdbff20bd6d06ea6d3ed8ae02d500d800 | |
parent | 7f91d9206220409a94f146c68796b7dc765b5bf1 (diff) |
KVM: x86/mmu: Ensure that kvm_release_pfn_clean() takes exact pfn from kvm_faultin_pfn()
Since 5.16 and prior to 6.13 KVM can't be used with FSDAX
guest memory (PMD pages). To reproduce the issue you need to reserve
guest memory with `memmap=` cmdline, create and mount FS in DAX mode
(tested both XFS and ext4), see doc link below. ndctl command for test:
ndctl create-namespace -v -e namespace1.0 --map=dev --mode=fsdax -a 2M
Then pass memory object to qemu like:
-m 8G -object memory-backend-file,id=ram0,size=8G,\
mem-path=/mnt/pmem/guestmem,share=on,prealloc=on,dump=off,align=2097152 \
-numa node,memdev=ram0,cpus=0-1
QEMU fails to run guest with error: kvm run failed Bad address
and there are two warnings in dmesg:
WARN_ON_ONCE(!page_count(page)) in kvm_is_zone_device_page() and
WARN_ON_ONCE(folio_ref_count(folio) <= 0) in try_grab_folio() (v6.6.63)
It looks like in the past assumption was made that pfn won't change from
faultin_pfn() to release_pfn_clean(), e.g. see
commit 4cd071d13c5c ("KVM: x86/mmu: Move calls to thp_adjust() down a level")
But kvm_page_fault structure made pfn part of mutable state, so
now release_pfn_clean() can take hugepage-adjusted pfn.
And it works for all cases (/dev/shm, hugetlb, devdax) except fsdax.
Apparently in fsdax mode faultin-pfn and adjusted-pfn may refer to
different folios, so we're getting get_page/put_page imbalance.
To solve this preserve faultin pfn in separate local variable
and pass it in kvm_release_pfn_clean().
Patch tested for all mentioned guest memory backends with tdp_mmu={0,1}.
No bug in upstream as it was solved fundamentally by
commit 8dd861cc07e2 ("KVM: x86/mmu: Put refcounted pages instead of blindly releasing pfns")
and related patch series.
Link: https://nvdimm.docs.kernel.org/2mib_fs_dax.html
Fixes: 2f6305dd5676 ("KVM: MMU: change kvm_tdp_mmu_map() arguments to kvm_page_fault")
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Nikolay Kuratov <kniv@yandex-team.ru>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 10 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 5 |
2 files changed, 12 insertions, 3 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3e353ed1f767..1b4438e24814 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4580,6 +4580,7 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { + kvm_pfn_t orig_pfn; int r; /* Dummy roots are used only for shadowing bad guest roots. */ @@ -4601,6 +4602,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r != RET_PF_CONTINUE) return r; + orig_pfn = fault->pfn; + r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); @@ -4615,7 +4618,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault out_unlock: write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); + kvm_release_pfn_clean(orig_pfn); return r; } @@ -4675,6 +4678,7 @@ EXPORT_SYMBOL_GPL(kvm_handle_page_fault); static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { + kvm_pfn_t orig_pfn; int r; if (page_fault_handle_page_track(vcpu, fault)) @@ -4692,6 +4696,8 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, if (r != RET_PF_CONTINUE) return r; + orig_pfn = fault->pfn; + r = RET_PF_RETRY; read_lock(&vcpu->kvm->mmu_lock); @@ -4702,7 +4708,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, out_unlock: read_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); + kvm_release_pfn_clean(orig_pfn); return r; } #endif diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index ae7d39ff2d07..b08017683920 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -778,6 +778,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct guest_walker walker; + kvm_pfn_t orig_pfn; int r; WARN_ON_ONCE(fault->is_tdp); @@ -836,6 +837,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault walker.pte_access &= ~ACC_EXEC_MASK; } + orig_pfn = fault->pfn; + r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); @@ -849,7 +852,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault out_unlock: write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); + kvm_release_pfn_clean(orig_pfn); return r; } |