summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/mmu
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/mmu')
-rw-r--r--arch/x86/kvm/mmu/mmu.c197
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h6
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h3
-rw-r--r--arch/x86/kvm/mmu/spte.c10
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c51
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h3
6 files changed, 179 insertions, 91 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 56c80588efa0..667d66cf76d5 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -110,7 +110,7 @@ static bool __ro_after_init tdp_mmu_allowed;
#ifdef CONFIG_X86_64
bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444);
-EXPORT_SYMBOL_GPL(tdp_mmu_enabled);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(tdp_mmu_enabled);
#endif
static int max_huge_page_level __read_mostly;
@@ -776,7 +776,8 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
}
-void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ enum kvm_mmu_type mmu_type)
{
/*
* If it's possible to replace the shadow page with an NX huge page,
@@ -790,8 +791,9 @@ void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
return;
++kvm->stat.nx_lpage_splits;
+ ++kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
list_add_tail(&sp->possible_nx_huge_page_link,
- &kvm->arch.possible_nx_huge_pages);
+ &kvm->arch.possible_nx_huge_pages[mmu_type].pages);
}
static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
@@ -800,7 +802,7 @@ static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
sp->nx_huge_page_disallowed = true;
if (nx_huge_page_possible)
- track_possible_nx_huge_page(kvm, sp);
+ track_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
}
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -819,12 +821,14 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
-void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ enum kvm_mmu_type mmu_type)
{
if (list_empty(&sp->possible_nx_huge_page_link))
return;
--kvm->stat.nx_lpage_splits;
+ --kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
list_del_init(&sp->possible_nx_huge_page_link);
}
@@ -832,7 +836,7 @@ static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
sp->nx_huge_page_disallowed = false;
- untrack_possible_nx_huge_page(kvm, sp);
+ untrack_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
}
static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
@@ -3861,7 +3865,7 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
write_unlock(&kvm->mmu_lock);
}
}
-EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_free_roots);
void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
{
@@ -3888,7 +3892,7 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
kvm_mmu_free_roots(kvm, mmu, roots_to_free);
}
-EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_free_guest_mode_roots);
static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
u8 level)
@@ -4663,10 +4667,16 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
/*
* Retry the page fault if the gfn hit a memslot that is being deleted
* or moved. This ensures any existing SPTEs for the old memslot will
- * be zapped before KVM inserts a new MMIO SPTE for the gfn.
+ * be zapped before KVM inserts a new MMIO SPTE for the gfn. Punt the
+ * error to userspace if this is a prefault, as KVM's prefaulting ABI
+ * doesn't provide the same forward progress guarantees as KVM_RUN.
*/
- if (slot->flags & KVM_MEMSLOT_INVALID)
+ if (slot->flags & KVM_MEMSLOT_INVALID) {
+ if (fault->prefetch)
+ return -EAGAIN;
+
return RET_PF_RETRY;
+ }
if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) {
/*
@@ -4866,7 +4876,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
return r;
}
-EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_page_fault);
#ifdef CONFIG_X86_64
static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
@@ -4956,7 +4966,7 @@ int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level
return -EIO;
}
}
-EXPORT_SYMBOL_GPL(kvm_tdp_map_page);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_map_page);
long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
struct kvm_pre_fault_memory *range)
@@ -5152,7 +5162,7 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
__clear_sp_write_flooding_count(sp);
}
}
-EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_new_pgd);
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
unsigned int access)
@@ -5798,7 +5808,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
kvm_mmu_new_pgd(vcpu, nested_cr3);
}
-EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_npt_mmu);
static union kvm_cpu_role
kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
@@ -5852,7 +5862,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
kvm_mmu_new_pgd(vcpu, new_eptp);
}
-EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_ept_mmu);
static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
union kvm_cpu_role cpu_role)
@@ -5917,7 +5927,7 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu)
else
init_kvm_softmmu(vcpu, cpu_role);
}
-EXPORT_SYMBOL_GPL(kvm_init_mmu);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_mmu);
void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
@@ -5953,7 +5963,7 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
kvm_mmu_unload(vcpu);
kvm_init_mmu(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_reset_context);
int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
@@ -5987,7 +5997,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
out:
return r;
}
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
@@ -6049,7 +6059,7 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
}
-EXPORT_SYMBOL_GPL(kvm_mmu_free_obsolete_roots);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_free_obsolete_roots);
static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
int *bytes)
@@ -6375,7 +6385,7 @@ emulate:
return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
insn_len);
}
-EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_page_fault);
void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg)
{
@@ -6391,7 +6401,7 @@ void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg)
pr_cont(", spte[%d] = 0x%llx", level, sptes[level]);
pr_cont("\n");
}
-EXPORT_SYMBOL_GPL(kvm_mmu_print_sptes);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_print_sptes);
static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
u64 addr, hpa_t root_hpa)
@@ -6457,7 +6467,7 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
__kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa);
}
}
-EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_invalidate_addr);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
@@ -6474,7 +6484,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL);
++vcpu->stat.invlpg;
}
-EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_invlpg);
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
@@ -6527,7 +6537,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
else
max_huge_page_level = PG_LEVEL_2M;
}
-EXPORT_SYMBOL_GPL(kvm_configure_mmu);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_configure_mmu);
static void free_mmu_pages(struct kvm_mmu *mmu)
{
@@ -6751,11 +6761,12 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
int kvm_mmu_init_vm(struct kvm *kvm)
{
- int r;
+ int r, i;
kvm->arch.shadow_mmio_value = shadow_mmio_value;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
- INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
+ for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
+ INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages[i].pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
if (tdp_mmu_enabled) {
@@ -7193,7 +7204,7 @@ restart:
return need_tlb_flush;
}
-EXPORT_SYMBOL_GPL(kvm_zap_gfn_range);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_zap_gfn_range);
static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
@@ -7596,19 +7607,64 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
return err;
}
-static void kvm_recover_nx_huge_pages(struct kvm *kvm)
+static unsigned long nx_huge_pages_to_zap(struct kvm *kvm,
+ enum kvm_mmu_type mmu_type)
+{
+ unsigned long pages = READ_ONCE(kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages);
+ unsigned int ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+
+ return ratio ? DIV_ROUND_UP(pages, ratio) : 0;
+}
+
+static bool kvm_mmu_sp_dirty_logging_enabled(struct kvm *kvm,
+ struct kvm_mmu_page *sp)
{
- unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
struct kvm_memory_slot *slot;
- int rcu_idx;
+
+ /*
+ * Skip the memslot lookup if dirty tracking can't possibly be enabled,
+ * as memslot lookups are relatively expensive.
+ *
+ * If a memslot update is in progress, reading an incorrect value of
+ * kvm->nr_memslots_dirty_logging is not a problem: if it is becoming
+ * zero, KVM will do an unnecessary memslot lookup; if it is becoming
+ * nonzero, the page will be zapped unnecessarily. Either way, this
+ * only affects efficiency in racy situations, and not correctness.
+ */
+ if (!atomic_read(&kvm->nr_memslots_dirty_logging))
+ return false;
+
+ slot = __gfn_to_memslot(kvm_memslots_for_spte_role(kvm, sp->role), sp->gfn);
+ if (WARN_ON_ONCE(!slot))
+ return false;
+
+ return kvm_slot_dirty_track_enabled(slot);
+}
+
+static void kvm_recover_nx_huge_pages(struct kvm *kvm,
+ const enum kvm_mmu_type mmu_type)
+{
+#ifdef CONFIG_X86_64
+ const bool is_tdp_mmu = mmu_type == KVM_TDP_MMU;
+ spinlock_t *tdp_mmu_pages_lock = &kvm->arch.tdp_mmu_pages_lock;
+#else
+ const bool is_tdp_mmu = false;
+ spinlock_t *tdp_mmu_pages_lock = NULL;
+#endif
+ unsigned long to_zap = nx_huge_pages_to_zap(kvm, mmu_type);
+ struct list_head *nx_huge_pages;
struct kvm_mmu_page *sp;
- unsigned int ratio;
LIST_HEAD(invalid_list);
bool flush = false;
- ulong to_zap;
+ int rcu_idx;
+
+ nx_huge_pages = &kvm->arch.possible_nx_huge_pages[mmu_type].pages;
rcu_idx = srcu_read_lock(&kvm->srcu);
- write_lock(&kvm->mmu_lock);
+ if (is_tdp_mmu)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
/*
* Zapping TDP MMU shadow pages, including the remote TLB flush, must
@@ -7617,11 +7673,15 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
*/
rcu_read_lock();
- ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
- to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
for ( ; to_zap; --to_zap) {
- if (list_empty(&kvm->arch.possible_nx_huge_pages))
+ if (is_tdp_mmu)
+ spin_lock(tdp_mmu_pages_lock);
+
+ if (list_empty(nx_huge_pages)) {
+ if (is_tdp_mmu)
+ spin_unlock(tdp_mmu_pages_lock);
break;
+ }
/*
* We use a separate list instead of just using active_mmu_pages
@@ -7630,56 +7690,44 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
* the total number of shadow pages. And because the TDP MMU
* doesn't use active_mmu_pages.
*/
- sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
+ sp = list_first_entry(nx_huge_pages,
struct kvm_mmu_page,
possible_nx_huge_page_link);
WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
WARN_ON_ONCE(!sp->role.direct);
+ unaccount_nx_huge_page(kvm, sp);
+
+ if (is_tdp_mmu)
+ spin_unlock(tdp_mmu_pages_lock);
+
/*
- * Unaccount and do not attempt to recover any NX Huge Pages
- * that are being dirty tracked, as they would just be faulted
- * back in as 4KiB pages. The NX Huge Pages in this slot will be
- * recovered, along with all the other huge pages in the slot,
- * when dirty logging is disabled.
- *
- * Since gfn_to_memslot() is relatively expensive, it helps to
- * skip it if it the test cannot possibly return true. On the
- * other hand, if any memslot has logging enabled, chances are
- * good that all of them do, in which case unaccount_nx_huge_page()
- * is much cheaper than zapping the page.
- *
- * If a memslot update is in progress, reading an incorrect value
- * of kvm->nr_memslots_dirty_logging is not a problem: if it is
- * becoming zero, gfn_to_memslot() will be done unnecessarily; if
- * it is becoming nonzero, the page will be zapped unnecessarily.
- * Either way, this only affects efficiency in racy situations,
- * and not correctness.
+ * Do not attempt to recover any NX Huge Pages that are being
+ * dirty tracked, as they would just be faulted back in as 4KiB
+ * pages. The NX Huge Pages in this slot will be recovered,
+ * along with all the other huge pages in the slot, when dirty
+ * logging is disabled.
*/
- slot = NULL;
- if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
- struct kvm_memslots *slots;
+ if (!kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) {
+ if (is_tdp_mmu)
+ flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp);
+ else
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
- slots = kvm_memslots_for_spte_role(kvm, sp->role);
- slot = __gfn_to_memslot(slots, sp->gfn);
- WARN_ON_ONCE(!slot);
}
- if (slot && kvm_slot_dirty_track_enabled(slot))
- unaccount_nx_huge_page(kvm, sp);
- else if (is_tdp_mmu_page(sp))
- flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
- else
- kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
WARN_ON_ONCE(sp->nx_huge_page_disallowed);
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
rcu_read_unlock();
- cond_resched_rwlock_write(&kvm->mmu_lock);
- flush = false;
+ if (is_tdp_mmu)
+ cond_resched_rwlock_read(&kvm->mmu_lock);
+ else
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+ flush = false;
rcu_read_lock();
}
}
@@ -7687,7 +7735,10 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
rcu_read_unlock();
- write_unlock(&kvm->mmu_lock);
+ if (is_tdp_mmu)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, rcu_idx);
}
@@ -7698,9 +7749,10 @@ static void kvm_nx_huge_page_recovery_worker_kill(void *data)
static bool kvm_nx_huge_page_recovery_worker(void *data)
{
struct kvm *kvm = data;
+ long remaining_time;
bool enabled;
uint period;
- long remaining_time;
+ int i;
enabled = calc_nx_huge_pages_recovery_period(&period);
if (!enabled)
@@ -7715,7 +7767,8 @@ static bool kvm_nx_huge_page_recovery_worker(void *data)
}
__set_current_state(TASK_RUNNING);
- kvm_recover_nx_huge_pages(kvm);
+ for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
+ kvm_recover_nx_huge_pages(kvm, i);
kvm->arch.nx_huge_page_last = get_jiffies_64();
return true;
}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index b776be783a2f..ed5c01df21ba 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -416,7 +416,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
-void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
-void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ enum kvm_mmu_type mmu_type);
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ enum kvm_mmu_type mmu_type);
#endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index f35a830ce469..764e3015d021 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -51,6 +51,9 @@
{ PFERR_PRESENT_MASK, "P" }, \
{ PFERR_WRITE_MASK, "W" }, \
{ PFERR_USER_MASK, "U" }, \
+ { PFERR_PK_MASK, "PK" }, \
+ { PFERR_SS_MASK, "SS" }, \
+ { PFERR_SGX_MASK, "SGX" }, \
{ PFERR_RSVD_MASK, "RSVD" }, \
{ PFERR_FETCH_MASK, "F" }
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index df31039b5d63..37647afde7d3 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -22,7 +22,7 @@
bool __read_mostly enable_mmio_caching = true;
static bool __ro_after_init allow_mmio_caching;
module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
-EXPORT_SYMBOL_GPL(enable_mmio_caching);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mmio_caching);
bool __read_mostly kvm_ad_enabled;
@@ -470,13 +470,13 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
shadow_mmio_mask = mmio_mask;
shadow_mmio_access_mask = access_mask;
}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_mmio_spte_mask);
void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value)
{
kvm->arch.shadow_mmio_value = mmio_value;
}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_value);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_mmio_spte_value);
void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask)
{
@@ -487,7 +487,7 @@ void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask)
shadow_me_value = me_value;
shadow_me_mask = me_mask;
}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_me_spte_mask);
void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
{
@@ -513,7 +513,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0);
}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_ept_masks);
void kvm_mmu_reset_all_pte_masks(void)
{
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 740cb06accdb..c5734ca5c17d 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -355,7 +355,7 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
sp->nx_huge_page_disallowed = false;
- untrack_possible_nx_huge_page(kvm, sp);
+ untrack_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
@@ -925,23 +925,52 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_unlock();
}
-bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
+ struct kvm_mmu_page *sp)
{
- u64 old_spte;
+ struct tdp_iter iter = {
+ .old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
+ .sptep = sp->ptep,
+ .level = sp->role.level + 1,
+ .gfn = sp->gfn,
+ .as_id = kvm_mmu_page_as_id(sp),
+ };
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+
+ if (WARN_ON_ONCE(!is_tdp_mmu_page(sp)))
+ return false;
/*
- * This helper intentionally doesn't allow zapping a root shadow page,
- * which doesn't have a parent page table and thus no associated entry.
+ * Root shadow pages don't have a parent page table and thus no
+ * associated entry, but they can never be possible NX huge pages.
*/
if (WARN_ON_ONCE(!sp->ptep))
return false;
- old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
- if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
+ /*
+ * Since mmu_lock is held in read mode, it's possible another task has
+ * already modified the SPTE. Zap the SPTE if and only if the SPTE
+ * points at the SP's page table, as checking shadow-present isn't
+ * sufficient, e.g. the SPTE could be replaced by a leaf SPTE, or even
+ * another SP. Note, spte_to_child_pt() also checks that the SPTE is
+ * shadow-present, i.e. guards against zapping a frozen SPTE.
+ */
+ if ((tdp_ptep_t)sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
return false;
- tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
- SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
+ /*
+ * If a different task modified the SPTE, then it should be impossible
+ * for the SPTE to still be used for the to-be-zapped SP. Non-leaf
+ * SPTEs don't have Dirty bits, KVM always sets the Accessed bit when
+ * creating non-leaf SPTEs, and all other bits are immutable for non-
+ * leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are
+ * zapping and replacement.
+ */
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
+ WARN_ON_ONCE((tdp_ptep_t)sp->spt == spte_to_child_pt(iter.old_spte, iter.level));
+ return false;
+ }
return true;
}
@@ -1303,7 +1332,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
fault->req_level >= iter.level) {
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
if (sp->nx_huge_page_disallowed)
- track_possible_nx_huge_page(kvm, sp);
+ track_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
}
@@ -1953,7 +1982,7 @@ bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa)
spte = sptes[leaf];
return is_shadow_present_pte(spte) && is_last_spte(spte, leaf);
}
-EXPORT_SYMBOL_GPL(kvm_tdp_mmu_gpa_is_mapped);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_gpa_is_mapped);
/*
* Returns the last level spte pointer of the shadow page walk for the given
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 52acf99d40a0..bd62977c9199 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -64,7 +64,8 @@ static inline struct kvm_mmu_page *tdp_mmu_get_root(struct kvm_vcpu *vcpu,
}
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
-bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
+bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
+ struct kvm_mmu_page *sp);
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
enum kvm_tdp_mmu_root_types root_types);