From 037bc38b298c9a8de64f84b253c0b472228bbb10 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:03 -0700 Subject: KVM: Drop KVM_ERR_PTR_BAD_PAGE and instead return NULL to indicate an error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove KVM_ERR_PTR_BAD_PAGE and instead return NULL, as "bad page" is just a leftover bit of weirdness from days of old when KVM stuffed a "bad" page into the guest instead of actually handling missing pages. See commit cea7bb21280e ("KVM: MMU: Make gfn_to_page() always safe"). Reviewed-by: Alex Bennée Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-2-seanjc@google.com> --- virt/kvm/kvm_main.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 141db5b79cd4..5fc95504affe 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3066,19 +3066,14 @@ EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); */ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { - struct page *page; kvm_pfn_t pfn; pfn = gfn_to_pfn(kvm, gfn); if (is_error_noslot_pfn(pfn)) - return KVM_ERR_PTR_BAD_PAGE; - - page = kvm_pfn_to_refcounted_page(pfn); - if (!page) - return KVM_ERR_PTR_BAD_PAGE; + return NULL; - return page; + return kvm_pfn_to_refcounted_page(pfn); } EXPORT_SYMBOL_GPL(gfn_to_page); @@ -3172,7 +3167,8 @@ static void kvm_set_page_accessed(struct page *page) void kvm_release_page_clean(struct page *page) { - WARN_ON(is_error_page(page)); + if (WARN_ON(!page)) + return; kvm_set_page_accessed(page); put_page(page); @@ -3196,7 +3192,8 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); void kvm_release_page_dirty(struct page *page) { - WARN_ON(is_error_page(page)); + if (WARN_ON(!page)) + return; kvm_set_page_dirty(page); kvm_release_page_clean(page); -- cgit v1.2.3 From 85e88b2bbaacb4135499fdb219f78ac38ef6e8d4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:04 -0700 Subject: KVM: Allow calling kvm_release_page_{clean,dirty}() on a NULL page pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow passing a NULL @page to kvm_release_page_{clean,dirty}(), there's no tangible benefit to forcing the callers to pre-check @page, and it ends up generating a lot of duplicate boilerplate code. Reviewed-by: Alex Bennée Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-3-seanjc@google.com> --- virt/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5fc95504affe..e7561ca96a09 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3167,7 +3167,7 @@ static void kvm_set_page_accessed(struct page *page) void kvm_release_page_clean(struct page *page) { - if (WARN_ON(!page)) + if (!page) return; kvm_set_page_accessed(page); @@ -3192,7 +3192,7 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); void kvm_release_page_dirty(struct page *page) { - if (WARN_ON(!page)) + if (!page) return; kvm_set_page_dirty(page); -- cgit v1.2.3 From 6419bc52072b928acb764766968c672d5fede802 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:13 -0700 Subject: KVM: Rename gfn_to_page_many_atomic() to kvm_prefetch_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename gfn_to_page_many_atomic() to kvm_prefetch_pages() to try and communicate its true purpose, as the "atomic" aspect is essentially a side effect of the fact that x86 uses the API while holding mmu_lock. E.g. even if mmu_lock weren't held, KVM wouldn't want to fault-in pages, as the goal is to opportunistically grab surrounding pages that have already been accessed and/or dirtied by the host, and to do so quickly. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-12-seanjc@google.com> --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- include/linux/kvm_host.h | 4 ++-- virt/kvm/kvm_main.c | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'virt') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 993eeba32487..37c2f8d11e05 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2965,7 +2965,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, if (!slot) return -1; - ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); + ret = kvm_prefetch_pages(slot, gfn, pages, end - start); if (ret <= 0) return -1; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 36b2607280f0..143b7e9f26dc 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -549,7 +549,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (!slot) return false; - if (gfn_to_page_many_atomic(slot, gfn, &page, 1) != 1) + if (kvm_prefetch_pages(slot, gfn, &page, 1) != 1) return false; mmu_set_spte(vcpu, slot, spte, pte_access, gfn, page_to_pfn(page), NULL); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8de9acb0b35e..6a3976c1a218 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1207,8 +1207,8 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm); void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); -int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, - struct page **pages, int nr_pages); +int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, + struct page **pages, int nr_pages); struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e7561ca96a09..aa7ae0f0f90e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3041,8 +3041,8 @@ kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_pfn); -int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, - struct page **pages, int nr_pages) +int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, + struct page **pages, int nr_pages) { unsigned long addr; gfn_t entry = 0; @@ -3056,7 +3056,7 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); } -EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); +EXPORT_SYMBOL_GPL(kvm_prefetch_pages); /* * Do not use this helper unless you are absolutely certain the gfn _must_ be -- cgit v1.2.3 From e2d2ca71ac03c748dbc44e0dd7dc1557befb1ab6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:14 -0700 Subject: KVM: Drop @atomic param from gfn=>pfn and hva=>pfn APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop @atomic from the myriad "to_pfn" APIs now that all callers pass "false", and remove a comment blurb about KVM running only the "GUP fast" part in atomic context. No functional change intended. Reviewed-by: Alex Bennée Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-13-seanjc@google.com> --- Documentation/virt/kvm/locking.rst | 4 ++-- arch/arm64/kvm/mmu.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- arch/x86/kvm/mmu/mmu.c | 12 ++++++------ include/linux/kvm_host.h | 3 +-- virt/kvm/kvm_main.c | 33 ++++++++------------------------- virt/kvm/kvm_mm.h | 4 ++-- virt/kvm/pfncache.c | 2 +- 9 files changed, 23 insertions(+), 41 deletions(-) (limited to 'virt') diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 693090bfc66d..f463ac42ac7a 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -135,8 +135,8 @@ We dirty-log for gfn1, that means gfn2 is lost in dirty-bitmap. For direct sp, we can easily avoid it since the spte of direct sp is fixed to gfn. For indirect sp, we disabled fast page fault for simplicity. -A solution for indirect sp could be to pin the gfn, for example via -gfn_to_pfn_memslot_atomic, before the cmpxchg. After the pinning: +A solution for indirect sp could be to pin the gfn before the cmpxchg. After +the pinning: - We have held the refcount of pfn; that means the pfn can not be freed and be reused for another gfn. diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0f7658aefa1a..9fbc79fad292 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1570,7 +1570,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, mmu_seq = vcpu->kvm->mmu_invalidate_seq; mmap_read_unlock(current->mm); - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, write_fault, &writable, NULL); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 1b51b1c4713b..8cd02ca4b1b8 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -613,7 +613,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, write_ok = true; } else { /* Call KVM generic code to do the slow-path check */ - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, writing, &write_ok, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 408d98f8a514..26a969e935e3 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -852,7 +852,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, unsigned long pfn; /* Call KVM generic code to do the slow-path check */ - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, writing, upgrade_p, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 37c2f8d11e05..e5e0bf7593e7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4387,9 +4387,9 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return kvm_faultin_pfn_private(vcpu, fault); async = false; - fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false, - &async, fault->write, - &fault->map_writable, &fault->hva); + fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, &async, + fault->write, &fault->map_writable, + &fault->hva); if (!async) return RET_PF_CONTINUE; /* *pfn has correct page already */ @@ -4409,9 +4409,9 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * to wait for IO. Note, gup always bails if it is unable to quickly * get a page and a fatal signal, i.e. SIGKILL, is pending. */ - fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true, - NULL, fault->write, - &fault->map_writable, &fault->hva); + fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, true, NULL, + fault->write, &fault->map_writable, + &fault->hva); return RET_PF_CONTINUE; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6a3976c1a218..32e23e05a8c3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1232,9 +1232,8 @@ kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn); -kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool atomic, bool interruptible, bool *async, + bool interruptible, bool *async, bool write_fault, bool *writable, hva_t *hva); void kvm_release_pfn_clean(kvm_pfn_t pfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index aa7ae0f0f90e..c7506eb23086 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2756,8 +2756,7 @@ static inline int check_user_page_hwpoison(unsigned long addr) /* * The fast path to get the writable pfn which will be stored in @pfn, - * true indicates success, otherwise false is returned. It's also the - * only part that runs if we can in atomic context. + * true indicates success, otherwise false is returned. */ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, bool *writable, kvm_pfn_t *pfn) @@ -2922,7 +2921,6 @@ out: /* * Pin guest page in memory and return its pfn. * @addr: host virtual address which maps memory to the guest - * @atomic: whether this function is forbidden from sleeping * @interruptible: whether the process can be interrupted by non-fatal signals * @async: whether this function need to wait IO complete if the * host page is not in the memory @@ -2934,22 +2932,16 @@ out: * 2): @write_fault = false && @writable, @writable will tell the caller * whether the mapping is writable. */ -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible, - bool *async, bool write_fault, bool *writable) +kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async, + bool write_fault, bool *writable) { struct vm_area_struct *vma; kvm_pfn_t pfn; int npages, r; - /* we can do it either atomically or asynchronously, not both */ - BUG_ON(atomic && async); - if (hva_to_pfn_fast(addr, write_fault, writable, &pfn)) return pfn; - if (atomic) - return KVM_PFN_ERR_FAULT; - npages = hva_to_pfn_slow(addr, async, write_fault, interruptible, writable, &pfn); if (npages == 1) @@ -2986,7 +2978,7 @@ exit: } kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool atomic, bool interruptible, bool *async, + bool interruptible, bool *async, bool write_fault, bool *writable, hva_t *hva) { unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); @@ -3008,33 +3000,24 @@ kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, writable = NULL; } - return hva_to_pfn(addr, atomic, interruptible, async, write_fault, - writable); + return hva_to_pfn(addr, interruptible, async, write_fault, writable); } EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable) { - return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false, - NULL, write_fault, writable, NULL); + return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, + write_fault, writable, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true, - NULL, NULL); + return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); -kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn) -{ - return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true, - NULL, NULL); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); - kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) { return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index 715f19669d01..a3fa86f60d6c 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -20,8 +20,8 @@ #define KVM_MMU_UNLOCK(kvm) spin_unlock(&(kvm)->mmu_lock) #endif /* KVM_HAVE_MMU_RWLOCK */ -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible, - bool *async, bool write_fault, bool *writable); +kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async, + bool write_fault, bool *writable); #ifdef CONFIG_HAVE_KVM_PFNCACHE void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index f0039efb9e1e..58c706a610e5 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -198,7 +198,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) } /* We always request a writeable mapping */ - new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL); + new_pfn = hva_to_pfn(gpc->uhva, false, NULL, true, NULL); if (is_error_noslot_pfn(new_pfn)) goto out_error; -- cgit v1.2.3 From eec1e5db464eba928273b27246122d157fd0eff8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:15 -0700 Subject: KVM: Annotate that all paths in hva_to_pfn() might sleep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that hva_to_pfn() no longer supports being called in atomic context, move the might_sleep() annotation from hva_to_pfn_slow() to hva_to_pfn(). Reviewed-by: Alex Bennée Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-14-seanjc@google.com> --- virt/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c7506eb23086..c60b542c7241 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2804,8 +2804,6 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, struct page *page; int npages; - might_sleep(); - if (writable) *writable = write_fault; @@ -2939,6 +2937,8 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async, kvm_pfn_t pfn; int npages, r; + might_sleep(); + if (hva_to_pfn_fast(addr, write_fault, writable, &pfn)) return pfn; -- cgit v1.2.3 From d1331a44694abbdb309cd3eb3e3d400134c670cf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:16 -0700 Subject: KVM: Return ERR_SIGPENDING from hva_to_pfn() if GUP returns -EGAIN Treat an -EAGAIN return from GUP the same as -EINTR and immediately report to the caller that a signal is pending. GUP only returns -EAGAIN if the _initial_ mmap_read_lock_killable() fails, which in turn onnly fails if a signal is pending Note, rwsem_down_read_slowpath() actually returns -EINTR, so GUP is really just making life harder than it needs to be. And the call to mmap_read_lock_killable() in the retry path returns its -errno verbatim, i.e. GUP (and thus KVM) is already handling locking failure this way, but only some of the time. Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-15-seanjc@google.com> --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c60b542c7241..88db55f9a8b6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2946,7 +2946,7 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async, writable, &pfn); if (npages == 1) return pfn; - if (npages == -EINTR) + if (npages == -EINTR || npages == -EAGAIN) return KVM_PFN_ERR_SIGPENDING; mmap_read_lock(current->mm); -- cgit v1.2.3 From b176f4b41775f8b2a7c642f87ccd5e3f405e5191 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:17 -0700 Subject: KVM: Drop extra GUP (via check_user_page_hwpoison()) to detect poisoned page Remove check_user_page_hwpoison() as it's effectively dead code. Prior to commit 234b239bea39 ("kvm: Faults which trigger IO release the mmap_sem"), hva_to_pfn_slow() wasn't actually a slow path in all cases, i.e. would do get_user_pages_fast() without ever doing slow GUP with FOLL_HWPOISON. Now that hva_to_pfn_slow() is a straight shot to get_user_pages_unlocked(), and unconditionally passes FOLL_HWPOISON, it is impossible for hva_to_pfn() to get an -errno that needs to be morphed to -EHWPOISON. There are essentially four cases in KVM: - npages == 0, then FOLL_NOWAIT, a.k.a. @async, must be true, and thus check_user_page_hwpoison() will not be called - npages == 1 || npages == -EHWPOISON, all good - npages == -EINTR || npages == -EAGAIN, bail early, all good - everything else, including -EFAULT, can go down the vma_lookup() path, as npages < 0 means KVM went through hva_to_pfn_slow() which passes FOLL_HWPOISON Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-16-seanjc@google.com> --- virt/kvm/kvm_main.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 88db55f9a8b6..f5b7fd653341 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2746,14 +2746,6 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w return gfn_to_hva_memslot_prot(slot, gfn, writable); } -static inline int check_user_page_hwpoison(unsigned long addr) -{ - int rc, flags = FOLL_HWPOISON | FOLL_WRITE; - - rc = get_user_pages(addr, 1, flags, NULL); - return rc == -EHWPOISON; -} - /* * The fast path to get the writable pfn which will be stored in @pfn, * true indicates success, otherwise false is returned. @@ -2948,14 +2940,10 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async, return pfn; if (npages == -EINTR || npages == -EAGAIN) return KVM_PFN_ERR_SIGPENDING; + if (npages == -EHWPOISON) + return KVM_PFN_ERR_HWPOISON; mmap_read_lock(current->mm); - if (npages == -EHWPOISON || - (!async && check_user_page_hwpoison(addr))) { - pfn = KVM_PFN_ERR_HWPOISON; - goto exit; - } - retry: vma = vma_lookup(current->mm, addr); @@ -2972,7 +2960,6 @@ retry: *async = true; pfn = KVM_PFN_ERR_FAULT; } -exit: mmap_read_unlock(current->mm); return pfn; } -- cgit v1.2.3 From 6769d1bcd3509ad2d2ee04da122c465a11a165b4 Mon Sep 17 00:00:00 2001 From: David Stevens Date: Thu, 10 Oct 2024 11:23:18 -0700 Subject: KVM: Replace "async" pointer in gfn=>pfn with "no_wait" and error code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a pfn error code to communicate that hva_to_pfn() failed because I/O was needed and disallowed, and convert @async to a constant @no_wait boolean. This will allow eliminating the @no_wait param by having callers pass in FOLL_NOWAIT along with other FOLL_* flags. Tested-by: Alex Bennée Signed-off-by: David Stevens Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-17-seanjc@google.com> --- arch/x86/kvm/mmu/mmu.c | 18 +++++++++++------- include/linux/kvm_host.h | 3 ++- virt/kvm/kvm_main.c | 27 ++++++++++++++------------- virt/kvm/kvm_mm.h | 2 +- virt/kvm/pfncache.c | 4 ++-- 5 files changed, 30 insertions(+), 24 deletions(-) (limited to 'virt') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e5e0bf7593e7..cae81209f9ed 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4381,17 +4381,21 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool async; - if (fault->is_private) return kvm_faultin_pfn_private(vcpu, fault); - async = false; - fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, &async, + fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true, fault->write, &fault->map_writable, &fault->hva); - if (!async) - return RET_PF_CONTINUE; /* *pfn has correct page already */ + + /* + * If resolving the page failed because I/O is needed to fault-in the + * page, then either set up an asynchronous #PF to do the I/O, or if + * doing an async #PF isn't possible, retry with I/O allowed. All + * other failures are terminal, i.e. retrying won't help. + */ + if (fault->pfn != KVM_PFN_ERR_NEEDS_IO) + return RET_PF_CONTINUE; if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(fault->addr, fault->gfn); @@ -4409,7 +4413,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * to wait for IO. Note, gup always bails if it is unable to quickly * get a page and a fatal signal, i.e. SIGKILL, is pending. */ - fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, true, NULL, + fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, true, true, fault->write, &fault->map_writable, &fault->hva); return RET_PF_CONTINUE; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 32e23e05a8c3..dc15a9a64408 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -97,6 +97,7 @@ #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1) #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2) #define KVM_PFN_ERR_SIGPENDING (KVM_PFN_ERR_MASK + 3) +#define KVM_PFN_ERR_NEEDS_IO (KVM_PFN_ERR_MASK + 4) /* * error pfns indicate that the gfn is in slot but faild to @@ -1233,7 +1234,7 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool interruptible, bool *async, + bool interruptible, bool no_wait, bool write_fault, bool *writable, hva_t *hva); void kvm_release_pfn_clean(kvm_pfn_t pfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f5b7fd653341..d7a72278c033 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2778,7 +2778,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, * The slow path to get the pfn of the specified host virtual address, * 1 indicates success, -errno is returned if error is detected. */ -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, +static int hva_to_pfn_slow(unsigned long addr, bool no_wait, bool write_fault, bool interruptible, bool *writable, kvm_pfn_t *pfn) { /* @@ -2801,7 +2801,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, if (write_fault) flags |= FOLL_WRITE; - if (async) + if (no_wait) flags |= FOLL_NOWAIT; if (interruptible) flags |= FOLL_INTERRUPTIBLE; @@ -2912,8 +2912,8 @@ out: * Pin guest page in memory and return its pfn. * @addr: host virtual address which maps memory to the guest * @interruptible: whether the process can be interrupted by non-fatal signals - * @async: whether this function need to wait IO complete if the - * host page is not in the memory + * @no_wait: whether or not this function need to wait IO complete if the + * host page is not in the memory * @write_fault: whether we should get a writable host page * @writable: whether it allows to map a writable host page for !@write_fault * @@ -2922,7 +2922,7 @@ out: * 2): @write_fault = false && @writable, @writable will tell the caller * whether the mapping is writable. */ -kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async, +kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool no_wait, bool write_fault, bool *writable) { struct vm_area_struct *vma; @@ -2934,7 +2934,7 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async, if (hva_to_pfn_fast(addr, write_fault, writable, &pfn)) return pfn; - npages = hva_to_pfn_slow(addr, async, write_fault, interruptible, + npages = hva_to_pfn_slow(addr, no_wait, write_fault, interruptible, writable, &pfn); if (npages == 1) return pfn; @@ -2956,16 +2956,17 @@ retry: if (r < 0) pfn = KVM_PFN_ERR_FAULT; } else { - if (async && vma_is_valid(vma, write_fault)) - *async = true; - pfn = KVM_PFN_ERR_FAULT; + if (no_wait && vma_is_valid(vma, write_fault)) + pfn = KVM_PFN_ERR_NEEDS_IO; + else + pfn = KVM_PFN_ERR_FAULT; } mmap_read_unlock(current->mm); return pfn; } kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool interruptible, bool *async, + bool interruptible, bool no_wait, bool write_fault, bool *writable, hva_t *hva) { unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); @@ -2987,21 +2988,21 @@ kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, writable = NULL; } - return hva_to_pfn(addr, interruptible, async, write_fault, writable); + return hva_to_pfn(addr, interruptible, no_wait, write_fault, writable); } EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable) { - return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, + return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false, write_fault, writable, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL); + return __gfn_to_pfn_memslot(slot, gfn, false, false, true, NULL, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index a3fa86f60d6c..51f3fee4ca3f 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -20,7 +20,7 @@ #define KVM_MMU_UNLOCK(kvm) spin_unlock(&(kvm)->mmu_lock) #endif /* KVM_HAVE_MMU_RWLOCK */ -kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async, +kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool no_wait, bool write_fault, bool *writable); #ifdef CONFIG_HAVE_KVM_PFNCACHE diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 58c706a610e5..32dc61f48c81 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -197,8 +197,8 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) cond_resched(); } - /* We always request a writeable mapping */ - new_pfn = hva_to_pfn(gpc->uhva, false, NULL, true, NULL); + /* We always request a writable mapping */ + new_pfn = hva_to_pfn(gpc->uhva, false, false, true, NULL); if (is_error_noslot_pfn(new_pfn)) goto out_error; -- cgit v1.2.3 From cccefb0a0d3b4f7b41b1921538087dd7031876ac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:20 -0700 Subject: KVM: Drop unused "hva" pointer from __gfn_to_pfn_memslot() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop @hva from __gfn_to_pfn_memslot() now that all callers pass NULL. No functional change intended. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-19-seanjc@google.com> --- arch/arm64/kvm/mmu.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- arch/x86/kvm/mmu/mmu.c | 6 ++---- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 9 +++------ 6 files changed, 9 insertions(+), 14 deletions(-) (limited to 'virt') diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 9fbc79fad292..246c820379ec 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1571,7 +1571,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, mmap_read_unlock(current->mm); pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - write_fault, &writable, NULL); + write_fault, &writable); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); return 0; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 8cd02ca4b1b8..2f1d58984b41 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -614,7 +614,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, } else { /* Call KVM generic code to do the slow-path check */ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - writing, &write_ok, NULL); + writing, &write_ok); if (is_error_noslot_pfn(pfn)) return -EFAULT; page = NULL; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 26a969e935e3..8304b6f8fe45 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -853,7 +853,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, /* Call KVM generic code to do the slow-path check */ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - writing, upgrade_p, NULL); + writing, upgrade_p); if (is_error_noslot_pfn(pfn)) return -EFAULT; page = NULL; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d4e21071b78e..688202ac50c2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4384,8 +4384,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return kvm_faultin_pfn_private(vcpu, fault); fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true, - fault->write, &fault->map_writable, - NULL); + fault->write, &fault->map_writable); /* * If resolving the page failed because I/O is needed to fault-in the @@ -4413,8 +4412,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * get a page and a fatal signal, i.e. SIGKILL, is pending. */ fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, true, true, - fault->write, &fault->map_writable, - NULL); + fault->write, &fault->map_writable); return RET_PF_CONTINUE; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index dc15a9a64408..2c9eb472f059 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1235,7 +1235,7 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, bool interruptible, bool no_wait, - bool write_fault, bool *writable, hva_t *hva); + bool write_fault, bool *writable); void kvm_release_pfn_clean(kvm_pfn_t pfn); void kvm_release_pfn_dirty(kvm_pfn_t pfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d7a72278c033..9de915a56bd5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2967,13 +2967,10 @@ retry: kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, bool interruptible, bool no_wait, - bool write_fault, bool *writable, hva_t *hva) + bool write_fault, bool *writable) { unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); - if (hva) - *hva = addr; - if (kvm_is_error_hva(addr)) { if (writable) *writable = false; @@ -2996,13 +2993,13 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable) { return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false, - write_fault, writable, NULL); + write_fault, writable); } EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, false, false, true, NULL, NULL); + return __gfn_to_pfn_memslot(slot, gfn, false, false, true, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); -- cgit v1.2.3 From c0461f20630b457f7fdb80ec8bb03156f2d6fc84 Mon Sep 17 00:00:00 2001 From: David Stevens Date: Thu, 10 Oct 2024 11:23:21 -0700 Subject: KVM: Introduce kvm_follow_pfn() to eventually replace "gfn_to_pfn" APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce kvm_follow_pfn() to eventually supplant the various "gfn_to_pfn" APIs, albeit by adding more wrappers. The primary motivation of the new helper is to pass a structure instead of an ever changing set of parameters, e.g. so that tweaking the behavior, inputs, and/or outputs of the "to pfn" helpers doesn't require churning half of KVM. In the more distant future, the APIs exposed to arch code could also follow suit, e.g. by adding something akin to x86's "struct kvm_page_fault" when faulting in guest memory. But for now, the goal is purely to clean up KVM's "internal" MMU code. As part of the conversion, replace the write_fault, interruptible, and no-wait boolean flags with FOLL_WRITE, FOLL_INTERRUPTIBLE, and FOLL_NOWAIT respectively. Collecting the various FOLL_* flags into a single field will again ease the pain of passing new flags. Tested-by: Alex Bennée Signed-off-by: David Stevens Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-20-seanjc@google.com> --- virt/kvm/kvm_main.c | 158 +++++++++++++++++++++++++++------------------------- virt/kvm/kvm_mm.h | 20 ++++++- virt/kvm/pfncache.c | 9 ++- 3 files changed, 107 insertions(+), 80 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9de915a56bd5..8fd99c250219 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2750,8 +2750,7 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w * The fast path to get the writable pfn which will be stored in @pfn, * true indicates success, otherwise false is returned. */ -static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, - bool *writable, kvm_pfn_t *pfn) +static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { struct page *page[1]; @@ -2760,14 +2759,13 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, * or the caller allows to map a writable pfn for a read fault * request. */ - if (!(write_fault || writable)) + if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable)) return false; - if (get_user_page_fast_only(addr, FOLL_WRITE, page)) { + if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, page)) { *pfn = page_to_pfn(page[0]); - - if (writable) - *writable = true; + if (kfp->map_writable) + *kfp->map_writable = true; return true; } @@ -2778,8 +2776,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, * The slow path to get the pfn of the specified host virtual address, * 1 indicates success, -errno is returned if error is detected. */ -static int hva_to_pfn_slow(unsigned long addr, bool no_wait, bool write_fault, - bool interruptible, bool *writable, kvm_pfn_t *pfn) +static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { /* * When a VCPU accesses a page that is not mapped into the secondary @@ -2792,34 +2789,30 @@ static int hva_to_pfn_slow(unsigned long addr, bool no_wait, bool write_fault, * Note that get_user_page_fast_only() and FOLL_WRITE for now * implicitly honor NUMA hinting faults and don't need this flag. */ - unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT; - struct page *page; + unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT | kfp->flags; + struct page *page, *wpage; int npages; - if (writable) - *writable = write_fault; - - if (write_fault) - flags |= FOLL_WRITE; - if (no_wait) - flags |= FOLL_NOWAIT; - if (interruptible) - flags |= FOLL_INTERRUPTIBLE; - - npages = get_user_pages_unlocked(addr, 1, &page, flags); + npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags); if (npages != 1) return npages; - /* map read fault as writable if possible */ - if (unlikely(!write_fault) && writable) { - struct page *wpage; + if (!kfp->map_writable) + goto out; - if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) { - *writable = true; - put_page(page); - page = wpage; - } + if (kfp->flags & FOLL_WRITE) { + *kfp->map_writable = true; + goto out; } + + /* map read fault as writable if possible */ + if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) { + *kfp->map_writable = true; + put_page(page); + page = wpage; + } + +out: *pfn = page_to_pfn(page); return npages; } @@ -2846,10 +2839,10 @@ static int kvm_try_get_pfn(kvm_pfn_t pfn) } static int hva_to_pfn_remapped(struct vm_area_struct *vma, - unsigned long addr, bool write_fault, - bool *writable, kvm_pfn_t *p_pfn) + struct kvm_follow_pfn *kfp, kvm_pfn_t *p_pfn) { - struct follow_pfnmap_args args = { .vma = vma, .address = addr }; + struct follow_pfnmap_args args = { .vma = vma, .address = kfp->hva }; + bool write_fault = kfp->flags & FOLL_WRITE; kvm_pfn_t pfn; int r; @@ -2860,7 +2853,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, * not call the fault handler, so do it here. */ bool unlocked = false; - r = fixup_user_fault(current->mm, addr, + r = fixup_user_fault(current->mm, kfp->hva, (write_fault ? FAULT_FLAG_WRITE : 0), &unlocked); if (unlocked) @@ -2878,8 +2871,8 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, goto out; } - if (writable) - *writable = args.writable; + if (kfp->map_writable) + *kfp->map_writable = args.writable; pfn = args.pfn; /* @@ -2908,22 +2901,7 @@ out: return r; } -/* - * Pin guest page in memory and return its pfn. - * @addr: host virtual address which maps memory to the guest - * @interruptible: whether the process can be interrupted by non-fatal signals - * @no_wait: whether or not this function need to wait IO complete if the - * host page is not in the memory - * @write_fault: whether we should get a writable host page - * @writable: whether it allows to map a writable host page for !@write_fault - * - * The function will map a writable host page for these two cases: - * 1): @write_fault = true - * 2): @write_fault = false && @writable, @writable will tell the caller - * whether the mapping is writable. - */ -kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool no_wait, - bool write_fault, bool *writable) +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp) { struct vm_area_struct *vma; kvm_pfn_t pfn; @@ -2931,11 +2909,10 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool no_wait, might_sleep(); - if (hva_to_pfn_fast(addr, write_fault, writable, &pfn)) + if (hva_to_pfn_fast(kfp, &pfn)) return pfn; - npages = hva_to_pfn_slow(addr, no_wait, write_fault, interruptible, - writable, &pfn); + npages = hva_to_pfn_slow(kfp, &pfn); if (npages == 1) return pfn; if (npages == -EINTR || npages == -EAGAIN) @@ -2945,18 +2922,19 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool no_wait, mmap_read_lock(current->mm); retry: - vma = vma_lookup(current->mm, addr); + vma = vma_lookup(current->mm, kfp->hva); if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { - r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn); + r = hva_to_pfn_remapped(vma, kfp, &pfn); if (r == -EAGAIN) goto retry; if (r < 0) pfn = KVM_PFN_ERR_FAULT; } else { - if (no_wait && vma_is_valid(vma, write_fault)) + if ((kfp->flags & FOLL_NOWAIT) && + vma_is_valid(vma, kfp->flags & FOLL_WRITE)) pfn = KVM_PFN_ERR_NEEDS_IO; else pfn = KVM_PFN_ERR_FAULT; @@ -2965,41 +2943,69 @@ retry: return pfn; } -kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool interruptible, bool no_wait, - bool write_fault, bool *writable) +static kvm_pfn_t kvm_follow_pfn(struct kvm_follow_pfn *kfp) { - unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); + kfp->hva = __gfn_to_hva_many(kfp->slot, kfp->gfn, NULL, + kfp->flags & FOLL_WRITE); - if (kvm_is_error_hva(addr)) { - if (writable) - *writable = false; + if (kfp->hva == KVM_HVA_ERR_RO_BAD) + return KVM_PFN_ERR_RO_FAULT; - return addr == KVM_HVA_ERR_RO_BAD ? KVM_PFN_ERR_RO_FAULT : - KVM_PFN_NOSLOT; - } + if (kvm_is_error_hva(kfp->hva)) + return KVM_PFN_NOSLOT; - /* Do not map writable pfn in the readonly memslot. */ - if (writable && memslot_is_readonly(slot)) { - *writable = false; - writable = NULL; + if (memslot_is_readonly(kfp->slot) && kfp->map_writable) { + *kfp->map_writable = false; + kfp->map_writable = NULL; } - return hva_to_pfn(addr, interruptible, no_wait, write_fault, writable); + return hva_to_pfn(kfp); +} + +kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, + bool interruptible, bool no_wait, + bool write_fault, bool *writable) +{ + struct kvm_follow_pfn kfp = { + .slot = slot, + .gfn = gfn, + .map_writable = writable, + }; + + if (write_fault) + kfp.flags |= FOLL_WRITE; + if (no_wait) + kfp.flags |= FOLL_NOWAIT; + if (interruptible) + kfp.flags |= FOLL_INTERRUPTIBLE; + + return kvm_follow_pfn(&kfp); } EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable) { - return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false, - write_fault, writable); + struct kvm_follow_pfn kfp = { + .slot = gfn_to_memslot(kvm, gfn), + .gfn = gfn, + .flags = write_fault ? FOLL_WRITE : 0, + .map_writable = writable, + }; + + return kvm_follow_pfn(&kfp); } EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, false, false, true, NULL); + struct kvm_follow_pfn kfp = { + .slot = slot, + .gfn = gfn, + .flags = FOLL_WRITE, + }; + + return kvm_follow_pfn(&kfp); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index 51f3fee4ca3f..d5a215958f06 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -20,8 +20,24 @@ #define KVM_MMU_UNLOCK(kvm) spin_unlock(&(kvm)->mmu_lock) #endif /* KVM_HAVE_MMU_RWLOCK */ -kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool no_wait, - bool write_fault, bool *writable); + +struct kvm_follow_pfn { + const struct kvm_memory_slot *slot; + const gfn_t gfn; + + unsigned long hva; + + /* FOLL_* flags modifying lookup behavior, e.g. FOLL_WRITE. */ + unsigned int flags; + + /* + * If non-NULL, try to get a writable mapping even for a read fault. + * Set to true if a writable mapping was obtained. + */ + bool *map_writable; +}; + +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp); #ifdef CONFIG_HAVE_KVM_PFNCACHE void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 32dc61f48c81..067daf9ad6ef 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -159,6 +159,12 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT; void *new_khva = NULL; unsigned long mmu_seq; + struct kvm_follow_pfn kfp = { + .slot = gpc->memslot, + .gfn = gpa_to_gfn(gpc->gpa), + .flags = FOLL_WRITE, + .hva = gpc->uhva, + }; lockdep_assert_held(&gpc->refresh_lock); @@ -197,8 +203,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) cond_resched(); } - /* We always request a writable mapping */ - new_pfn = hva_to_pfn(gpc->uhva, false, false, true, NULL); + new_pfn = hva_to_pfn(&kfp); if (is_error_noslot_pfn(new_pfn)) goto out_error; -- cgit v1.2.3 From 0b139b877b1462ccdecb5146493c68a0b94ccdc8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:22 -0700 Subject: KVM: Remove pointless sanity check on @map param to kvm_vcpu_(un)map() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop kvm_vcpu_{,un}map()'s useless checks on @map being non-NULL. The map is 100% kernel controlled, any caller that passes a NULL pointer is broken and needs to be fixed, i.e. a crash due to a NULL pointer dereference is desirable (though obviously not as desirable as not having a bug in the first place). Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-21-seanjc@google.com> --- virt/kvm/kvm_main.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8fd99c250219..90e2a1f1de60 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3065,9 +3065,6 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) void *hva = NULL; struct page *page = KVM_UNMAPPED_PAGE; - if (!map) - return -EINVAL; - pfn = gfn_to_pfn(vcpu->kvm, gfn); if (is_error_noslot_pfn(pfn)) return -EINVAL; @@ -3095,9 +3092,6 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_map); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) { - if (!map) - return; - if (!map->hva) return; -- cgit v1.2.3 From 5488499f9c645d6c1ffe151be196df38ee5a56b4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:23 -0700 Subject: KVM: Explicitly initialize all fields at the start of kvm_vcpu_map() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Explicitly initialize the entire kvm_host_map structure when mapping a pfn, as some callers declare their struct on the stack, i.e. don't zero-initialize the struct, which makes the map->hva in kvm_vcpu_unmap() *very* suspect. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-22-seanjc@google.com> --- virt/kvm/kvm_main.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 90e2a1f1de60..4074f49eb3f1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3061,32 +3061,24 @@ void kvm_release_pfn(kvm_pfn_t pfn, bool dirty) int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) { - kvm_pfn_t pfn; - void *hva = NULL; - struct page *page = KVM_UNMAPPED_PAGE; + map->page = KVM_UNMAPPED_PAGE; + map->hva = NULL; + map->gfn = gfn; - pfn = gfn_to_pfn(vcpu->kvm, gfn); - if (is_error_noslot_pfn(pfn)) + map->pfn = gfn_to_pfn(vcpu->kvm, gfn); + if (is_error_noslot_pfn(map->pfn)) return -EINVAL; - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - hva = kmap(page); + if (pfn_valid(map->pfn)) { + map->page = pfn_to_page(map->pfn); + map->hva = kmap(map->page); #ifdef CONFIG_HAS_IOMEM } else { - hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); + map->hva = memremap(pfn_to_hpa(map->pfn), PAGE_SIZE, MEMREMAP_WB); #endif } - if (!hva) - return -EFAULT; - - map->page = page; - map->hva = hva; - map->pfn = pfn; - map->gfn = gfn; - - return 0; + return map->hva ? 0 : -EFAULT; } EXPORT_SYMBOL_GPL(kvm_vcpu_map); -- cgit v1.2.3 From ef7db98e477f4b379fac3131bf94c33774c4a211 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:24 -0700 Subject: KVM: Use NULL for struct page pointer to indicate mremapped memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop yet another unnecessary magic page value from KVM, as there's zero reason to use a poisoned pointer to indicate "no page". If KVM uses a NULL page pointer, the kernel will explode just as quickly as if KVM uses a poisoned pointer. Never mind the fact that such usage would be a blatant and egregious KVM bug. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-23-seanjc@google.com> --- include/linux/kvm_host.h | 4 ---- virt/kvm/kvm_main.c | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'virt') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c9eb472f059..cd6f5cc1930f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -273,16 +273,12 @@ enum { READING_SHADOW_PAGE_TABLES, }; -#define KVM_UNMAPPED_PAGE ((void *) 0x500 + POISON_POINTER_DELTA) - struct kvm_host_map { /* * Only valid if the 'pfn' is managed by the host kernel (i.e. There is * a 'struct page' for it. When using mem= kernel parameter some memory * can be used as guest memory but they are not managed by host * kernel). - * If 'pfn' is not managed by the host kernel, this field is - * initialized to KVM_UNMAPPED_PAGE. */ struct page *page; void *hva; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4074f49eb3f1..c20386a8aa3e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3061,7 +3061,7 @@ void kvm_release_pfn(kvm_pfn_t pfn, bool dirty) int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) { - map->page = KVM_UNMAPPED_PAGE; + map->page = NULL; map->hva = NULL; map->gfn = gfn; @@ -3087,7 +3087,7 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) if (!map->hva) return; - if (map->page != KVM_UNMAPPED_PAGE) + if (map->page) kunmap(map->page); #ifdef CONFIG_HAS_IOMEM else -- cgit v1.2.3 From 12fac89950996e556a99eb16f0359307ed4c2566 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:28 -0700 Subject: KVM: Use plain "struct page" pointer instead of single-entry array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use a single pointer instead of a single-entry array for the struct page pointer in hva_to_pfn_fast(). Using an array makes the code unnecessarily annoying to read and update. No functional change intended. Reviewed-by: Alex Bennée Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-27-seanjc@google.com> --- virt/kvm/kvm_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c20386a8aa3e..32f0858da7f2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2752,7 +2752,7 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w */ static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { - struct page *page[1]; + struct page *page; /* * Fast pin a writable pfn only if it is a write fault request @@ -2762,8 +2762,8 @@ static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable)) return false; - if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, page)) { - *pfn = page_to_pfn(page[0]); + if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page)) { + *pfn = page_to_pfn(page); if (kfp->map_writable) *kfp->map_writable = true; return true; -- cgit v1.2.3 From 3dd48ecfac7fdbcba397a6378ab93c3a9b3cb802 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:29 -0700 Subject: KVM: Provide refcounted page as output field in struct kvm_follow_pfn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add kvm_follow_pfn.refcounted_page as an output for the "to pfn" APIs to "return" the struct page that is associated with the returned pfn (if KVM acquired a reference to the page). This will eventually allow removing KVM's hacky kvm_pfn_to_refcounted_page() code, which is error prone and can't detect pfns that are valid, but aren't (currently) refcounted. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-28-seanjc@google.com> --- virt/kvm/kvm_main.c | 99 +++++++++++++++++++++++++---------------------------- virt/kvm/kvm_mm.h | 9 +++++ 2 files changed, 56 insertions(+), 52 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 32f0858da7f2..b3b4b374dddc 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2746,6 +2746,46 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w return gfn_to_hva_memslot_prot(slot, gfn, writable); } +static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, + struct follow_pfnmap_args *map, bool writable) +{ + kvm_pfn_t pfn; + + WARN_ON_ONCE(!!page == !!map); + + if (kfp->map_writable) + *kfp->map_writable = writable; + + /* + * FIXME: Remove this once KVM no longer blindly calls put_page() on + * every pfn that points at a struct page. + * + * Get a reference for follow_pte() pfns if they happen to point at a + * struct page, as KVM will ultimately call kvm_release_pfn_clean() on + * the returned pfn, i.e. KVM expects to have a reference. + * + * Certain IO or PFNMAP mappings can be backed with valid struct pages, + * but be allocated without refcounting, e.g. tail pages of + * non-compound higher order allocations. Grabbing and putting a + * reference to such pages would cause KVM to prematurely free a page + * it doesn't own (KVM gets and puts the one and only reference). + * Don't allow those pages until the FIXME is resolved. + */ + if (map) { + pfn = map->pfn; + page = kvm_pfn_to_refcounted_page(pfn); + if (page && !get_page_unless_zero(page)) + return KVM_PFN_ERR_FAULT; + } else { + pfn = page_to_pfn(page); + } + + if (kfp->refcounted_page) + *kfp->refcounted_page = page; + + return pfn; +} + /* * The fast path to get the writable pfn which will be stored in @pfn, * true indicates success, otherwise false is returned. @@ -2763,9 +2803,7 @@ static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) return false; if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page)) { - *pfn = page_to_pfn(page); - if (kfp->map_writable) - *kfp->map_writable = true; + *pfn = kvm_resolve_pfn(kfp, page, NULL, true); return true; } @@ -2797,23 +2835,15 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) if (npages != 1) return npages; - if (!kfp->map_writable) - goto out; - - if (kfp->flags & FOLL_WRITE) { - *kfp->map_writable = true; - goto out; - } - /* map read fault as writable if possible */ - if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) { - *kfp->map_writable = true; + if (!(flags & FOLL_WRITE) && kfp->map_writable && + get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) { put_page(page); page = wpage; + flags |= FOLL_WRITE; } -out: - *pfn = page_to_pfn(page); + *pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE); return npages; } @@ -2828,22 +2858,11 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) return true; } -static int kvm_try_get_pfn(kvm_pfn_t pfn) -{ - struct page *page = kvm_pfn_to_refcounted_page(pfn); - - if (!page) - return 1; - - return get_page_unless_zero(page); -} - static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn *kfp, kvm_pfn_t *p_pfn) { struct follow_pfnmap_args args = { .vma = vma, .address = kfp->hva }; bool write_fault = kfp->flags & FOLL_WRITE; - kvm_pfn_t pfn; int r; r = follow_pfnmap_start(&args); @@ -2867,37 +2886,13 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, } if (write_fault && !args.writable) { - pfn = KVM_PFN_ERR_RO_FAULT; + *p_pfn = KVM_PFN_ERR_RO_FAULT; goto out; } - if (kfp->map_writable) - *kfp->map_writable = args.writable; - pfn = args.pfn; - - /* - * Get a reference here because callers of *hva_to_pfn* and - * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the - * returned pfn. This is only needed if the VMA has VM_MIXEDMAP - * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will - * simply do nothing for reserved pfns. - * - * Whoever called remap_pfn_range is also going to call e.g. - * unmap_mapping_range before the underlying pages are freed, - * causing a call to our MMU notifier. - * - * Certain IO or PFNMAP mappings can be backed with valid - * struct pages, but be allocated without refcounting e.g., - * tail pages of non-compound higher order allocations, which - * would then underflow the refcount when the caller does the - * required put_page. Don't allow those pages here. - */ - if (!kvm_try_get_pfn(pfn)) - r = -EFAULT; + *p_pfn = kvm_resolve_pfn(kfp, NULL, &args, args.writable); out: follow_pfnmap_end(&args); - *p_pfn = pfn; - return r; } diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index d5a215958f06..d3ac1ba8ba66 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -35,6 +35,15 @@ struct kvm_follow_pfn { * Set to true if a writable mapping was obtained. */ bool *map_writable; + + /* + * Optional output. Set to a valid "struct page" if the returned pfn + * is for a refcounted or pinned struct page, NULL if the returned pfn + * has no struct page or if the struct page is not being refcounted + * (e.g. tail pages of non-compound higher order allocations from + * IO/PFNMAP mappings). + */ + struct page **refcounted_page; }; kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp); -- cgit v1.2.3 From 775e3ff7bf4919285da0456d0ebbfa2874ee7572 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:30 -0700 Subject: KVM: Move kvm_{set,release}_page_{clean,dirty}() helpers up in kvm_main.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hoist the kvm_{set,release}_page_{clean,dirty}() APIs further up in kvm_main.c so that they can be used by the kvm_follow_pfn family of APIs. No functional change intended. Reviewed-by: Alex Bennée Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-29-seanjc@google.com> --- virt/kvm/kvm_main.c | 82 ++++++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 41 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b3b4b374dddc..64888257e301 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2746,6 +2746,47 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w return gfn_to_hva_memslot_prot(slot, gfn, writable); } +static bool kvm_is_ad_tracked_page(struct page *page) +{ + /* + * Per page-flags.h, pages tagged PG_reserved "should in general not be + * touched (e.g. set dirty) except by its owner". + */ + return !PageReserved(page); +} + +static void kvm_set_page_dirty(struct page *page) +{ + if (kvm_is_ad_tracked_page(page)) + SetPageDirty(page); +} + +static void kvm_set_page_accessed(struct page *page) +{ + if (kvm_is_ad_tracked_page(page)) + mark_page_accessed(page); +} + +void kvm_release_page_clean(struct page *page) +{ + if (!page) + return; + + kvm_set_page_accessed(page); + put_page(page); +} +EXPORT_SYMBOL_GPL(kvm_release_page_clean); + +void kvm_release_page_dirty(struct page *page) +{ + if (!page) + return; + + kvm_set_page_dirty(page); + kvm_release_page_clean(page); +} +EXPORT_SYMBOL_GPL(kvm_release_page_dirty); + static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, struct follow_pfnmap_args *map, bool writable) { @@ -3099,37 +3140,6 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); -static bool kvm_is_ad_tracked_page(struct page *page) -{ - /* - * Per page-flags.h, pages tagged PG_reserved "should in general not be - * touched (e.g. set dirty) except by its owner". - */ - return !PageReserved(page); -} - -static void kvm_set_page_dirty(struct page *page) -{ - if (kvm_is_ad_tracked_page(page)) - SetPageDirty(page); -} - -static void kvm_set_page_accessed(struct page *page) -{ - if (kvm_is_ad_tracked_page(page)) - mark_page_accessed(page); -} - -void kvm_release_page_clean(struct page *page) -{ - if (!page) - return; - - kvm_set_page_accessed(page); - put_page(page); -} -EXPORT_SYMBOL_GPL(kvm_release_page_clean); - void kvm_release_pfn_clean(kvm_pfn_t pfn) { struct page *page; @@ -3145,16 +3155,6 @@ void kvm_release_pfn_clean(kvm_pfn_t pfn) } EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); -void kvm_release_page_dirty(struct page *page) -{ - if (!page) - return; - - kvm_set_page_dirty(page); - kvm_release_page_clean(page); -} -EXPORT_SYMBOL_GPL(kvm_release_page_dirty); - void kvm_release_pfn_dirty(kvm_pfn_t pfn) { struct page *page; -- cgit v1.2.3 From 3154ddcb6a9016f314a2f5f1293808ad07c5fab9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:31 -0700 Subject: KVM: pfncache: Precisely track refcounted pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Track refcounted struct page memory using kvm_follow_pfn.refcounted_page instead of relying on kvm_release_pfn_clean() to correctly detect that the pfn is associated with a struct page. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-30-seanjc@google.com> --- virt/kvm/pfncache.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'virt') diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 067daf9ad6ef..728d2c1b488a 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -159,11 +159,14 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT; void *new_khva = NULL; unsigned long mmu_seq; + struct page *page; + struct kvm_follow_pfn kfp = { .slot = gpc->memslot, .gfn = gpa_to_gfn(gpc->gpa), .flags = FOLL_WRITE, .hva = gpc->uhva, + .refcounted_page = &page, }; lockdep_assert_held(&gpc->refresh_lock); @@ -198,7 +201,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) if (new_khva != old_khva) gpc_unmap(new_pfn, new_khva); - kvm_release_pfn_clean(new_pfn); + kvm_release_page_unused(page); cond_resched(); } @@ -218,7 +221,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) new_khva = gpc_map(new_pfn); if (!new_khva) { - kvm_release_pfn_clean(new_pfn); + kvm_release_page_unused(page); goto out_error; } @@ -236,11 +239,11 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) gpc->khva = new_khva + offset_in_page(gpc->uhva); /* - * Put the reference to the _new_ pfn. The pfn is now tracked by the + * Put the reference to the _new_ page. The page is now tracked by the * cache and can be safely migrated, swapped, etc... as the cache will * invalidate any mappings in response to relevant mmu_notifier events. */ - kvm_release_pfn_clean(new_pfn); + kvm_release_page_clean(page); return 0; -- cgit v1.2.3 From 2ff072ba7ad2deb1c3b2d231faa0ac3a25e2451b Mon Sep 17 00:00:00 2001 From: David Stevens Date: Thu, 10 Oct 2024 11:23:32 -0700 Subject: KVM: Migrate kvm_vcpu_map() to kvm_follow_pfn() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrate kvm_vcpu_map() to kvm_follow_pfn(), and have it track whether or not the map holds a refcounted struct page. Precisely tracking struct page references will eventually allow removing kvm_pfn_to_refcounted_page() and its various wrappers. Signed-off-by: David Stevens [sean: use a pointer instead of a boolean] Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-31-seanjc@google.com> --- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 26 ++++++++++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) (limited to 'virt') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cd6f5cc1930f..35e1beb017dd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -280,6 +280,7 @@ struct kvm_host_map { * can be used as guest memory but they are not managed by host * kernel). */ + struct page *refcounted_page; struct page *page; void *hva; kvm_pfn_t pfn; @@ -1238,7 +1239,6 @@ void kvm_release_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_accessed(kvm_pfn_t pfn); -void kvm_release_pfn(kvm_pfn_t pfn, bool dirty); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 64888257e301..842a5d5f3120 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3087,21 +3087,21 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_page); -void kvm_release_pfn(kvm_pfn_t pfn, bool dirty) -{ - if (dirty) - kvm_release_pfn_dirty(pfn); - else - kvm_release_pfn_clean(pfn); -} - int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) { + struct kvm_follow_pfn kfp = { + .slot = gfn_to_memslot(vcpu->kvm, gfn), + .gfn = gfn, + .flags = FOLL_WRITE, + .refcounted_page = &map->refcounted_page, + }; + + map->refcounted_page = NULL; map->page = NULL; map->hva = NULL; map->gfn = gfn; - map->pfn = gfn_to_pfn(vcpu->kvm, gfn); + map->pfn = kvm_follow_pfn(&kfp); if (is_error_noslot_pfn(map->pfn)) return -EINVAL; @@ -3133,10 +3133,16 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) if (dirty) kvm_vcpu_mark_page_dirty(vcpu, map->gfn); - kvm_release_pfn(map->pfn, dirty); + if (map->refcounted_page) { + if (dirty) + kvm_release_page_dirty(map->refcounted_page); + else + kvm_release_page_clean(map->refcounted_page); + } map->hva = NULL; map->page = NULL; + map->refcounted_page = NULL; } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); -- cgit v1.2.3 From 2bcb52a3602bf4cbc55d8fb4da00c930f83d7789 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:33 -0700 Subject: KVM: Pin (as in FOLL_PIN) pages during kvm_vcpu_map() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pin, as in FOLL_PIN, pages when mapping them for direct access by KVM. As per Documentation/core-api/pin_user_pages.rst, writing to a page that was gotten via FOLL_GET is explicitly disallowed. Correct (uses FOLL_PIN calls): pin_user_pages() write to the data within the pages unpin_user_pages() INCORRECT (uses FOLL_GET calls): get_user_pages() write to the data within the pages put_page() Unfortunately, FOLL_PIN is a "private" flag, and so kvm_follow_pfn must use a one-off bool instead of being able to piggyback the "flags" field. Link: https://lwn.net/Articles/930667 Link: https://lore.kernel.org/all/cover.1683044162.git.lstoakes@gmail.com Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-32-seanjc@google.com> --- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 54 ++++++++++++++++++++++++++++++++++-------------- virt/kvm/kvm_mm.h | 7 +++++++ 3 files changed, 47 insertions(+), 16 deletions(-) (limited to 'virt') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 35e1beb017dd..b4c541fa5a1f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -280,7 +280,7 @@ struct kvm_host_map { * can be used as guest memory but they are not managed by host * kernel). */ - struct page *refcounted_page; + struct page *pinned_page; struct page *page; void *hva; kvm_pfn_t pfn; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 842a5d5f3120..0d59f47f099e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2814,9 +2814,12 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, */ if (map) { pfn = map->pfn; - page = kvm_pfn_to_refcounted_page(pfn); - if (page && !get_page_unless_zero(page)) - return KVM_PFN_ERR_FAULT; + + if (!kfp->pin) { + page = kvm_pfn_to_refcounted_page(pfn); + if (page && !get_page_unless_zero(page)) + return KVM_PFN_ERR_FAULT; + } } else { pfn = page_to_pfn(page); } @@ -2834,16 +2837,24 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { struct page *page; + bool r; /* - * Fast pin a writable pfn only if it is a write fault request - * or the caller allows to map a writable pfn for a read fault - * request. + * Try the fast-only path when the caller wants to pin/get the page for + * writing. If the caller only wants to read the page, KVM must go + * down the full, slow path in order to avoid racing an operation that + * breaks Copy-on-Write (CoW), e.g. so that KVM doesn't end up pointing + * at the old, read-only page while mm/ points at a new, writable page. */ if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable)) return false; - if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page)) { + if (kfp->pin) + r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1; + else + r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page); + + if (r) { *pfn = kvm_resolve_pfn(kfp, page, NULL, true); return true; } @@ -2872,10 +2883,21 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) struct page *page, *wpage; int npages; - npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags); + if (kfp->pin) + npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags); + else + npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags); if (npages != 1) return npages; + /* + * Pinning is mutually exclusive with opportunistically mapping a read + * fault as writable, as KVM should never pin pages when mapping memory + * into the guest (pinning is only for direct accesses from KVM). + */ + if (WARN_ON_ONCE(kfp->map_writable && kfp->pin)) + goto out; + /* map read fault as writable if possible */ if (!(flags & FOLL_WRITE) && kfp->map_writable && get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) { @@ -2884,6 +2906,7 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) flags |= FOLL_WRITE; } +out: *pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE); return npages; } @@ -3093,10 +3116,11 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) .slot = gfn_to_memslot(vcpu->kvm, gfn), .gfn = gfn, .flags = FOLL_WRITE, - .refcounted_page = &map->refcounted_page, + .refcounted_page = &map->pinned_page, + .pin = true, }; - map->refcounted_page = NULL; + map->pinned_page = NULL; map->page = NULL; map->hva = NULL; map->gfn = gfn; @@ -3133,16 +3157,16 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) if (dirty) kvm_vcpu_mark_page_dirty(vcpu, map->gfn); - if (map->refcounted_page) { + if (map->pinned_page) { if (dirty) - kvm_release_page_dirty(map->refcounted_page); - else - kvm_release_page_clean(map->refcounted_page); + kvm_set_page_dirty(map->pinned_page); + kvm_set_page_accessed(map->pinned_page); + unpin_user_page(map->pinned_page); } map->hva = NULL; map->page = NULL; - map->refcounted_page = NULL; + map->pinned_page = NULL; } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index d3ac1ba8ba66..acef3f5c582a 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -30,6 +30,13 @@ struct kvm_follow_pfn { /* FOLL_* flags modifying lookup behavior, e.g. FOLL_WRITE. */ unsigned int flags; + /* + * Pin the page (effectively FOLL_PIN, which is an mm/ internal flag). + * The page *must* be pinned if KVM will write to the page via a kernel + * mapping, e.g. via kmap(), mremap(), etc. + */ + bool pin; + /* * If non-NULL, try to get a writable mapping even for a read fault. * Set to true if a writable mapping was obtained. -- cgit v1.2.3 From 365e319208442a0807a96e9ea4d0b1fa338f1929 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:35 -0700 Subject: KVM: Pass in write/dirty to kvm_vcpu_map(), not kvm_vcpu_unmap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that all kvm_vcpu_{,un}map() users pass "true" for @dirty, have them pass "true" as a @writable param to kvm_vcpu_map(), and thus create a read-only mapping when possible. Note, creating read-only mappings can be theoretically slower, as they don't play nice with fast GUP due to the need to break CoW before mapping the underlying PFN. But practically speaking, creating a mapping isn't a super hot path, and getting a writable mapping for reading is weird and confusing. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-34-seanjc@google.com> --- arch/x86/kvm/svm/nested.c | 4 ++-- arch/x86/kvm/svm/sev.c | 2 +- arch/x86/kvm/svm/svm.c | 8 ++++---- arch/x86/kvm/vmx/nested.c | 16 ++++++++-------- include/linux/kvm_host.h | 20 ++++++++++++++++++-- virt/kvm/kvm_main.c | 12 +++++++----- 6 files changed, 40 insertions(+), 22 deletions(-) (limited to 'virt') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index cf84103ce38b..b708bdf7eaff 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -926,7 +926,7 @@ out_exit_err: nested_svm_vmexit(svm); out: - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } @@ -1130,7 +1130,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.exit_int_info_err, KVM_ISA_SVM); - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); nested_svm_transition_tlb_flush(vcpu); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0b851ef937f2..4557ff3804ae 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3468,7 +3468,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) sev_es_sync_to_ghcb(svm); - kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true); + kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map); svm->sev_es.ghcb = NULL; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9df3e1e5ae81..c1e29307826b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2299,7 +2299,7 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) svm_copy_vmloadsave_state(vmcb12, svm->vmcb); } - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } @@ -4714,7 +4714,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) svm_copy_vmrun_state(map_save.hva + 0x400, &svm->vmcb01.ptr->save); - kvm_vcpu_unmap(vcpu, &map_save, true); + kvm_vcpu_unmap(vcpu, &map_save); return 0; } @@ -4774,9 +4774,9 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) svm->nested.nested_run_pending = 1; unmap_save: - kvm_vcpu_unmap(vcpu, &map_save, true); + kvm_vcpu_unmap(vcpu, &map_save); unmap_map: - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ff83b56fe2fa..259fe445e695 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -231,7 +231,7 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); - kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); + kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map); vmx->nested.hv_evmcs = NULL; vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; @@ -318,9 +318,9 @@ static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, true); - kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); - kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map); + kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map); + kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map); vmx->nested.pi_desc = NULL; } @@ -624,7 +624,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, int msr; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; - struct kvm_host_map msr_bitmap_map; + struct kvm_host_map map; /* Nothing to do if the MSR bitmap is not in use. */ if (!cpu_has_vmx_msr_bitmap() || @@ -647,10 +647,10 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, return true; } - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &msr_bitmap_map)) + if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map)) return false; - msr_bitmap_l1 = (unsigned long *)msr_bitmap_map.hva; + msr_bitmap_l1 = (unsigned long *)map.hva; /* * To keep the control flow simple, pay eight 8-byte writes (sixteen @@ -714,7 +714,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_FLUSH_CMD, MSR_TYPE_W); - kvm_vcpu_unmap(vcpu, &msr_bitmap_map, false); + kvm_vcpu_unmap(vcpu, &map); vmx->nested.force_msr_bitmap_recalc = false; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b4c541fa5a1f..101dbf2be1ce 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -285,6 +285,7 @@ struct kvm_host_map { void *hva; kvm_pfn_t pfn; kvm_pfn_t gfn; + bool writable; }; /* @@ -1311,8 +1312,23 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); -int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map); -void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); + +int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map, + bool writable); +void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map); + +static inline int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, + struct kvm_host_map *map) +{ + return __kvm_vcpu_map(vcpu, gpa, map, true); +} + +static inline int kvm_vcpu_map_readonly(struct kvm_vcpu *vcpu, gpa_t gpa, + struct kvm_host_map *map) +{ + return __kvm_vcpu_map(vcpu, gpa, map, false); +} + unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0d59f47f099e..baa741c2b81c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3110,7 +3110,8 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_page); -int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) +int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, + bool writable) { struct kvm_follow_pfn kfp = { .slot = gfn_to_memslot(vcpu->kvm, gfn), @@ -3124,6 +3125,7 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) map->page = NULL; map->hva = NULL; map->gfn = gfn; + map->writable = writable; map->pfn = kvm_follow_pfn(&kfp); if (is_error_noslot_pfn(map->pfn)) @@ -3140,9 +3142,9 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) return map->hva ? 0 : -EFAULT; } -EXPORT_SYMBOL_GPL(kvm_vcpu_map); +EXPORT_SYMBOL_GPL(__kvm_vcpu_map); -void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) +void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map) { if (!map->hva) return; @@ -3154,11 +3156,11 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) memunmap(map->hva); #endif - if (dirty) + if (map->writable) kvm_vcpu_mark_page_dirty(vcpu, map->gfn); if (map->pinned_page) { - if (dirty) + if (map->writable) kvm_set_page_dirty(map->pinned_page); kvm_set_page_accessed(map->pinned_page); unpin_user_page(map->pinned_page); -- cgit v1.2.3 From 2e5fdf60a9a69971b5e642d32e09bd6b0223f47c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:36 -0700 Subject: KVM: Get writable mapping for __kvm_vcpu_map() only when necessary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When creating a memory map for read, don't request a writable pfn from the primary MMU. While creating read-only mappings can be theoretically slower, as they don't play nice with fast GUP due to the need to break CoW before mapping the underlying PFN, practically speaking, creating a mapping isn't a super hot path, and getting a writable mapping for reading is weird and confusing. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-35-seanjc@google.com> --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index baa741c2b81c..49ac13872e09 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3116,7 +3116,7 @@ int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, struct kvm_follow_pfn kfp = { .slot = gfn_to_memslot(vcpu->kvm, gfn), .gfn = gfn, - .flags = FOLL_WRITE, + .flags = writable ? FOLL_WRITE : 0, .refcounted_page = &map->pinned_page, .pin = true, }; -- cgit v1.2.3 From 68e51d0a437b09dcef621b4cc8e4fe02605bf5c4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:37 -0700 Subject: KVM: Disallow direct access (w/o mmu_notifier) to unpinned pfn by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an off-by-default module param to control whether or not KVM is allowed to map memory that isn't pinned, i.e. that KVM can't guarantee won't be freed while it is mapped into KVM and/or the guest. Don't remove the functionality entirely, as there are use cases where mapping unpinned memory is safe (as defined by the platform owner), e.g. when memory is hidden from the kernel and managed by userspace, in which case userspace is already fully trusted to not muck with guest memory mappings. But for more typical setups, mapping unpinned memory is wildly unsafe, and unnecessary. The APIs are used exclusively by x86's nested virtualization support, and there is no known (or sane) use case for mapping PFN-mapped memory a KVM guest _and_ letting the guest use it for virtualization structures. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-36-seanjc@google.com> --- virt/kvm/kvm_main.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 49ac13872e09..becf640e369c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -94,6 +94,13 @@ unsigned int halt_poll_ns_shrink = 2; module_param(halt_poll_ns_shrink, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); +/* + * Allow direct access (from KVM or the CPU) without MMU notifier protection + * to unpinned pages. + */ +static bool allow_unsafe_mappings; +module_param(allow_unsafe_mappings, bool, 0444); + /* * Ordering of locks: * @@ -2811,6 +2818,9 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, * reference to such pages would cause KVM to prematurely free a page * it doesn't own (KVM gets and puts the one and only reference). * Don't allow those pages until the FIXME is resolved. + * + * Don't grab a reference for pins, callers that pin pages are required + * to check refcounted_page, i.e. must not blindly release the pfn. */ if (map) { pfn = map->pfn; @@ -2929,6 +2939,14 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, bool write_fault = kfp->flags & FOLL_WRITE; int r; + /* + * Remapped memory cannot be pinned in any meaningful sense. Bail if + * the caller wants to pin the page, i.e. access the page outside of + * MMU notifier protection, and unsafe umappings are disallowed. + */ + if (kfp->pin && !allow_unsafe_mappings) + return -EINVAL; + r = follow_pfnmap_start(&args); if (r) { /* -- cgit v1.2.3 From 1c7b627e930624dd64ee906df554c8f2bad628ff Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:45 -0700 Subject: KVM: Add kvm_faultin_pfn() to specifically service guest page faults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new dedicated API, kvm_faultin_pfn(), for servicing guest page faults, i.e. for getting pages/pfns that will be mapped into the guest via an mmu_notifier-protected KVM MMU. Keep struct kvm_follow_pfn buried in internal code, as having __kvm_faultin_pfn() take "out" params is actually cleaner for several architectures, e.g. it allows the caller to have its own "page fault" structure without having to marshal data to/from kvm_follow_pfn. Long term, common KVM would ideally provide a kvm_page_fault structure, a la x86's struct of the same name. But all architectures need to be converted to a common API before that can happen. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-44-seanjc@google.com> --- include/linux/kvm_host.h | 12 ++++++++++++ virt/kvm/kvm_main.c | 22 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'virt') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8d35a36e7707..a63b0325d3e2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1231,6 +1231,18 @@ static inline void kvm_release_page_unused(struct page *page) void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); +kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, + unsigned int foll, bool *writable, + struct page **refcounted_page); + +static inline kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, + bool write, bool *writable, + struct page **refcounted_page) +{ + return __kvm_faultin_pfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, + write ? FOLL_WRITE : 0, writable, refcounted_page); +} + kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index becf640e369c..f79745d6500c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3092,6 +3092,28 @@ kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_pfn); +kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, + unsigned int foll, bool *writable, + struct page **refcounted_page) +{ + struct kvm_follow_pfn kfp = { + .slot = slot, + .gfn = gfn, + .flags = foll, + .map_writable = writable, + .refcounted_page = refcounted_page, + }; + + if (WARN_ON_ONCE(!writable || !refcounted_page)) + return KVM_PFN_ERR_FAULT; + + *writable = false; + *refcounted_page = NULL; + + return kvm_follow_pfn(&kfp); +} +EXPORT_SYMBOL_GPL(__kvm_faultin_pfn); + int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, struct page **pages, int nr_pages) { -- cgit v1.2.3 From 4af18dc6a9204464db76d9771d1f40e2b46bf6ae Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:47 -0700 Subject: KVM: guest_memfd: Pass index, not gfn, to __kvm_gmem_get_pfn() Refactor guest_memfd usage of __kvm_gmem_get_pfn() to pass the index into the guest_memfd file instead of the gfn, i.e. resolve the index based on the slot+gfn in the caller instead of in __kvm_gmem_get_pfn(). This will allow kvm_gmem_get_pfn() to retrieve and return the specific "struct page", which requires the index into the folio, without a redoing the index calculation multiple times (which isn't costly, just hard to follow). Opportunistically add a kvm_gmem_get_index() helper to make the copy+pasted code easier to understand. Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-46-seanjc@google.com> --- virt/kvm/guest_memfd.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'virt') diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 8f079a61a56d..8a878e57c5d4 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -302,6 +302,11 @@ static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot) return get_file_active(&slot->gmem.file); } +static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) +{ + return gfn - slot->base_gfn + slot->gmem.pgoff; +} + static struct file_operations kvm_gmem_fops = { .open = generic_file_open, .release = kvm_gmem_release, @@ -551,12 +556,11 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) } /* Returns a locked folio on success. */ -static struct folio * -__kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, bool *is_prepared, - int *max_order) +static struct folio *__kvm_gmem_get_pfn(struct file *file, + struct kvm_memory_slot *slot, + pgoff_t index, kvm_pfn_t *pfn, + bool *is_prepared, int *max_order) { - pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; struct kvm_gmem *gmem = file->private_data; struct folio *folio; @@ -592,6 +596,7 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, int *max_order) { + pgoff_t index = kvm_gmem_get_index(slot, gfn); struct file *file = kvm_gmem_get_file(slot); struct folio *folio; bool is_prepared = false; @@ -600,7 +605,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, if (!file) return -EFAULT; - folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, &is_prepared, max_order); + folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order); if (IS_ERR(folio)) { r = PTR_ERR(folio); goto out; @@ -648,6 +653,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long for (i = 0; i < npages; i += (1 << max_order)) { struct folio *folio; gfn_t gfn = start_gfn + i; + pgoff_t index = kvm_gmem_get_index(slot, gfn); bool is_prepared = false; kvm_pfn_t pfn; @@ -656,7 +662,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long break; } - folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &is_prepared, &max_order); + folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order); if (IS_ERR(folio)) { ret = PTR_ERR(folio); break; -- cgit v1.2.3 From 1fbee5b01a0fd27db571eed757682a7c20045107 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:48 -0700 Subject: KVM: guest_memfd: Provide "struct page" as output from kvm_gmem_get_pfn() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide the "struct page" associated with a guest_memfd pfn as an output from __kvm_gmem_get_pfn() so that KVM guest page fault handlers can directly put the page instead of having to rely on kvm_pfn_to_refcounted_page(). Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-47-seanjc@google.com> --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/svm/sev.c | 10 ++++++---- include/linux/kvm_host.h | 6 ++++-- virt/kvm/guest_memfd.c | 8 ++++++-- 4 files changed, 17 insertions(+), 9 deletions(-) (limited to 'virt') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2bea2d20c571..c657c3c449c8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4407,7 +4407,7 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, } r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn, - &max_order); + &fault->refcounted_page, &max_order); if (r) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return r; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 4557ff3804ae..c6c852485900 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3849,6 +3849,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) { gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); struct kvm_memory_slot *slot; + struct page *page; kvm_pfn_t pfn; slot = gfn_to_memslot(vcpu->kvm, gfn); @@ -3859,7 +3860,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) * The new VMSA will be private memory guest memory, so * retrieve the PFN from the gmem backend. */ - if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, NULL)) + if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL)) return -EINVAL; /* @@ -3888,7 +3889,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) * changes then care should be taken to ensure * svm->sev_es.vmsa is pinned through some other means. */ - kvm_release_pfn_clean(pfn); + kvm_release_page_clean(page); } /* @@ -4688,6 +4689,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) struct kvm_memory_slot *slot; struct kvm *kvm = vcpu->kvm; int order, rmp_level, ret; + struct page *page; bool assigned; kvm_pfn_t pfn; gfn_t gfn; @@ -4714,7 +4716,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) return; } - ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &order); + ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order); if (ret) { pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n", gpa); @@ -4772,7 +4774,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) out: trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret); out_no_trace: - put_page(pfn_to_page(pfn)); + kvm_release_page_unused(page); } static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a63b0325d3e2..6efdc00b4254 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2487,11 +2487,13 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) #ifdef CONFIG_KVM_PRIVATE_MEM int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, int *max_order); + gfn_t gfn, kvm_pfn_t *pfn, struct page **page, + int *max_order); #else static inline int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, - kvm_pfn_t *pfn, int *max_order) + kvm_pfn_t *pfn, struct page **page, + int *max_order) { KVM_BUG_ON(1, kvm); return -EIO; diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 8a878e57c5d4..47a9f68f7b24 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -594,7 +594,8 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file, } int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, int *max_order) + gfn_t gfn, kvm_pfn_t *pfn, struct page **page, + int *max_order) { pgoff_t index = kvm_gmem_get_index(slot, gfn); struct file *file = kvm_gmem_get_file(slot); @@ -615,7 +616,10 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); folio_unlock(folio); - if (r < 0) + + if (!r) + *page = folio_file_page(folio, index); + else folio_put(folio); out: -- cgit v1.2.3 From ce6bf70346891f75e400f93193773fcbf72c27fc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:17 -0700 Subject: KVM: Convert gfn_to_page() to use kvm_follow_pfn() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert gfn_to_page() to the new kvm_follow_pfn() internal API, which will eventually allow removing gfn_to_pfn() and kvm_pfn_to_refcounted_page(). Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-76-seanjc@google.com> --- virt/kvm/kvm_main.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f79745d6500c..5a424598610f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3139,14 +3139,16 @@ EXPORT_SYMBOL_GPL(kvm_prefetch_pages); */ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { - kvm_pfn_t pfn; - - pfn = gfn_to_pfn(kvm, gfn); - - if (is_error_noslot_pfn(pfn)) - return NULL; + struct page *refcounted_page = NULL; + struct kvm_follow_pfn kfp = { + .slot = gfn_to_memslot(kvm, gfn), + .gfn = gfn, + .flags = FOLL_WRITE, + .refcounted_page = &refcounted_page, + }; - return kvm_pfn_to_refcounted_page(pfn); + (void)kvm_follow_pfn(&kfp); + return refcounted_page; } EXPORT_SYMBOL_GPL(gfn_to_page); -- cgit v1.2.3 From f42e289a2095f61755e6ca5fd1370d441bf589d5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:18 -0700 Subject: KVM: Add support for read-only usage of gfn_to_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rework gfn_to_page() to support read-only accesses so that it can be used by arm64 to get MTE tags out of guest memory. Opportunistically rewrite the comment to be even more stern about using gfn_to_page(), as there are very few scenarios where requiring a struct page is actually the right thing to do (though there are such scenarios). Add a FIXME to call out that KVM probably should be pinning pages, not just getting pages. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-77-seanjc@google.com> --- include/linux/kvm_host.h | 7 ++++++- virt/kvm/kvm_main.c | 15 ++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'virt') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3e06393e5f1e..96cf9e5660c3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1213,7 +1213,12 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, struct page **pages, int nr_pages); -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); +struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write); +static inline struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) +{ + return __gfn_to_page(kvm, gfn, true); +} + unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable); unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5a424598610f..1c11a05a97af 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3132,25 +3132,26 @@ int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, EXPORT_SYMBOL_GPL(kvm_prefetch_pages); /* - * Do not use this helper unless you are absolutely certain the gfn _must_ be - * backed by 'struct page'. A valid example is if the backing memslot is - * controlled by KVM. Note, if the returned page is valid, it's refcount has - * been elevated by gfn_to_pfn(). + * Don't use this API unless you are absolutely, positively certain that KVM + * needs to get a struct page, e.g. to pin the page for firmware DMA. + * + * FIXME: Users of this API likely need to FOLL_PIN the page, not just elevate + * its refcount. */ -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) +struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write) { struct page *refcounted_page = NULL; struct kvm_follow_pfn kfp = { .slot = gfn_to_memslot(kvm, gfn), .gfn = gfn, - .flags = FOLL_WRITE, + .flags = write ? FOLL_WRITE : 0, .refcounted_page = &refcounted_page, }; (void)kvm_follow_pfn(&kfp); return refcounted_page; } -EXPORT_SYMBOL_GPL(gfn_to_page); +EXPORT_SYMBOL_GPL(__gfn_to_page); int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, bool writable) -- cgit v1.2.3 From 06cdaff80e50e3fb74e5e3101e1d5d7aa8b68da6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:21 -0700 Subject: KVM: Drop gfn_to_pfn() APIs now that all users are gone MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop gfn_to_pfn() and all its variants now that all users are gone. No functional change intended. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-80-seanjc@google.com> --- include/linux/kvm_host.h | 8 -------- virt/kvm/kvm_main.c | 53 ------------------------------------------------ 2 files changed, 61 deletions(-) (limited to 'virt') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 96cf9e5660c3..4a1eaa40a215 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1274,14 +1274,6 @@ static inline kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, write ? FOLL_WRITE : 0, writable, refcounted_page); } -kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); -kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, - bool *writable); -kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn); -kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool interruptible, bool no_wait, - bool write_fault, bool *writable); - void kvm_release_pfn_clean(kvm_pfn_t pfn); void kvm_release_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_dirty(kvm_pfn_t pfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1c11a05a97af..b2c8d429442d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3039,59 +3039,6 @@ static kvm_pfn_t kvm_follow_pfn(struct kvm_follow_pfn *kfp) return hva_to_pfn(kfp); } -kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool interruptible, bool no_wait, - bool write_fault, bool *writable) -{ - struct kvm_follow_pfn kfp = { - .slot = slot, - .gfn = gfn, - .map_writable = writable, - }; - - if (write_fault) - kfp.flags |= FOLL_WRITE; - if (no_wait) - kfp.flags |= FOLL_NOWAIT; - if (interruptible) - kfp.flags |= FOLL_INTERRUPTIBLE; - - return kvm_follow_pfn(&kfp); -} -EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); - -kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, - bool *writable) -{ - struct kvm_follow_pfn kfp = { - .slot = gfn_to_memslot(kvm, gfn), - .gfn = gfn, - .flags = write_fault ? FOLL_WRITE : 0, - .map_writable = writable, - }; - - return kvm_follow_pfn(&kfp); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); - -kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) -{ - struct kvm_follow_pfn kfp = { - .slot = slot, - .gfn = gfn, - .flags = FOLL_WRITE, - }; - - return kvm_follow_pfn(&kfp); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); - -kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) -{ - return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn); - kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, unsigned int foll, bool *writable, struct page **refcounted_page) -- cgit v1.2.3 From 31fccdd21263f12e8b701ad90a78e31e789cb780 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:23 -0700 Subject: KVM: Make kvm_follow_pfn.refcounted_page a required field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the legacy gfn_to_pfn() APIs are gone, and all callers of hva_to_pfn() pass in a refcounted_page pointer, make it a required field to ensure all future usage in KVM plays nice. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-82-seanjc@google.com> --- virt/kvm/kvm_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b2c8d429442d..a483da96f4be 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2834,8 +2834,7 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, pfn = page_to_pfn(page); } - if (kfp->refcounted_page) - *kfp->refcounted_page = page; + *kfp->refcounted_page = page; return pfn; } @@ -2986,6 +2985,9 @@ kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp) might_sleep(); + if (WARN_ON_ONCE(!kfp->refcounted_page)) + return KVM_PFN_ERR_FAULT; + if (hva_to_pfn_fast(kfp, &pfn)) return pfn; -- cgit v1.2.3 From 93b7da404f5b0b02a4211bbb784889f001d27953 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:26 -0700 Subject: KVM: Drop APIs that manipulate "struct page" via pfns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove all kvm_{release,set}_pfn_*() APIs now that all users are gone. No functional change intended. Reviewed-by: Alex Bennée Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-85-seanjc@google.com> --- include/linux/kvm_host.h | 5 ----- virt/kvm/kvm_main.c | 55 ------------------------------------------------ 2 files changed, 60 deletions(-) (limited to 'virt') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4a1eaa40a215..d045f8310a48 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1274,11 +1274,6 @@ static inline kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, write ? FOLL_WRITE : 0, writable, refcounted_page); } -void kvm_release_pfn_clean(kvm_pfn_t pfn); -void kvm_release_pfn_dirty(kvm_pfn_t pfn); -void kvm_set_pfn_dirty(kvm_pfn_t pfn); -void kvm_set_pfn_accessed(kvm_pfn_t pfn); - int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a483da96f4be..396ca14f18f3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3164,61 +3164,6 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map) } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); -void kvm_release_pfn_clean(kvm_pfn_t pfn) -{ - struct page *page; - - if (is_error_noslot_pfn(pfn)) - return; - - page = kvm_pfn_to_refcounted_page(pfn); - if (!page) - return; - - kvm_release_page_clean(page); -} -EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); - -void kvm_release_pfn_dirty(kvm_pfn_t pfn) -{ - struct page *page; - - if (is_error_noslot_pfn(pfn)) - return; - - page = kvm_pfn_to_refcounted_page(pfn); - if (!page) - return; - - kvm_release_page_dirty(page); -} -EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); - -/* - * Note, checking for an error/noslot pfn is the caller's responsibility when - * directly marking a page dirty/accessed. Unlike the "release" helpers, the - * "set" helpers are not to be used when the pfn might point at garbage. - */ -void kvm_set_pfn_dirty(kvm_pfn_t pfn) -{ - if (WARN_ON(is_error_noslot_pfn(pfn))) - return; - - if (pfn_valid(pfn)) - kvm_set_page_dirty(pfn_to_page(pfn)); -} -EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); - -void kvm_set_pfn_accessed(kvm_pfn_t pfn) -{ - if (WARN_ON(is_error_noslot_pfn(pfn))) - return; - - if (pfn_valid(pfn)) - kvm_set_page_accessed(pfn_to_page(pfn)); -} -EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); - static int next_segment(unsigned long len, int offset) { if (len > PAGE_SIZE - offset) -- cgit v1.2.3 From 8b15c3764c05ed8766709711d2054d96349dee8e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:27 -0700 Subject: KVM: Don't grab reference on VM_MIXEDMAP pfns that have a "struct page" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that KVM no longer relies on an ugly heuristic to find its struct page references, i.e. now that KVM can't get false positives on VM_MIXEDMAP pfns, remove KVM's hack to elevate the refcount for pfns that happen to have a valid struct page. In addition to removing a long-standing wart in KVM, this allows KVM to map non-refcounted struct page memory into the guest, e.g. for exposing GPU TTM buffers to KVM guests. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-86-seanjc@google.com> --- include/linux/kvm_host.h | 3 -- virt/kvm/kvm_main.c | 75 ++---------------------------------------------- 2 files changed, 2 insertions(+), 76 deletions(-) (limited to 'virt') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d045f8310a48..02f0206fd2dc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1730,9 +1730,6 @@ void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); -struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn); -bool kvm_is_zone_device_page(struct page *page); - struct kvm_irq_ack_notifier { struct hlist_node link; unsigned gsi; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 396ca14f18f3..b1b10dc408a0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -160,52 +160,6 @@ __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { } -bool kvm_is_zone_device_page(struct page *page) -{ - /* - * The metadata used by is_zone_device_page() to determine whether or - * not a page is ZONE_DEVICE is guaranteed to be valid if and only if - * the device has been pinned, e.g. by get_user_pages(). WARN if the - * page_count() is zero to help detect bad usage of this helper. - */ - if (WARN_ON_ONCE(!page_count(page))) - return false; - - return is_zone_device_page(page); -} - -/* - * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted - * page, NULL otherwise. Note, the list of refcounted PG_reserved page types - * is likely incomplete, it has been compiled purely through people wanting to - * back guest with a certain type of memory and encountering issues. - */ -struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn) -{ - struct page *page; - - if (!pfn_valid(pfn)) - return NULL; - - page = pfn_to_page(pfn); - if (!PageReserved(page)) - return page; - - /* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */ - if (is_zero_pfn(pfn)) - return page; - - /* - * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting - * perspective they are "normal" pages, albeit with slightly different - * usage rules. - */ - if (kvm_is_zone_device_page(page)) - return page; - - return NULL; -} - /* * Switches to specified vcpu, until a matching vcpu_put() */ @@ -2804,35 +2758,10 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, if (kfp->map_writable) *kfp->map_writable = writable; - /* - * FIXME: Remove this once KVM no longer blindly calls put_page() on - * every pfn that points at a struct page. - * - * Get a reference for follow_pte() pfns if they happen to point at a - * struct page, as KVM will ultimately call kvm_release_pfn_clean() on - * the returned pfn, i.e. KVM expects to have a reference. - * - * Certain IO or PFNMAP mappings can be backed with valid struct pages, - * but be allocated without refcounting, e.g. tail pages of - * non-compound higher order allocations. Grabbing and putting a - * reference to such pages would cause KVM to prematurely free a page - * it doesn't own (KVM gets and puts the one and only reference). - * Don't allow those pages until the FIXME is resolved. - * - * Don't grab a reference for pins, callers that pin pages are required - * to check refcounted_page, i.e. must not blindly release the pfn. - */ - if (map) { + if (map) pfn = map->pfn; - - if (!kfp->pin) { - page = kvm_pfn_to_refcounted_page(pfn); - if (page && !get_page_unless_zero(page)) - return KVM_PFN_ERR_FAULT; - } - } else { + else pfn = page_to_pfn(page); - } *kfp->refcounted_page = page; -- cgit v1.2.3