diff options
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r-- | arch/x86/kvm/x86.c | 206 |
1 files changed, 121 insertions, 85 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2a20ce60152e..efc7a82ab140 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -156,9 +156,9 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); /* * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables - * adaptive tuning starting from default advancment of 1000ns. '0' disables + * adaptive tuning starting from default advancement of 1000ns. '0' disables * advancement entirely. Any other value is used as-is and disables adaptive - * tuning, i.e. allows priveleged userspace to set an exact advancement time. + * tuning, i.e. allows privileged userspace to set an exact advancement time. */ static int __read_mostly lapic_timer_advance_ns = -1; module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR); @@ -271,8 +271,7 @@ static struct kmem_cache *x86_emulator_cache; * When called, it means the previous get/set msr reached an invalid msr. * Return true if we want to ignore/silent this failed msr access. */ -static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, - u64 data, bool write) +static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) { const char *op = write ? "wrmsr" : "rdmsr"; @@ -1288,7 +1287,7 @@ static const u32 emulated_msrs_all[] = { MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, MSR_IA32_TSC_ADJUST, - MSR_IA32_TSCDEADLINE, + MSR_IA32_TSC_DEADLINE, MSR_IA32_ARCH_CAPABILITIES, MSR_IA32_PERF_CAPABILITIES, MSR_IA32_MISC_ENABLE, @@ -1373,7 +1372,7 @@ static u64 kvm_get_arch_capabilities(void) /* * If nx_huge_pages is enabled, KVM's shadow paging will ensure that * the nested hypervisor runs with NX huge pages. If it is not, - * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other + * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other * L1 guests, so it need not worry about its own (L2) guests. */ data |= ARCH_CAP_PSCHANGE_MC_NO; @@ -1445,7 +1444,7 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) if (r == KVM_MSR_RET_INVALID) { /* Unconditionally clear the output for simplicity */ *data = 0; - if (kvm_msr_ignored_check(vcpu, index, 0, false)) + if (kvm_msr_ignored_check(index, 0, false)) r = 0; } @@ -1526,35 +1525,44 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) { + struct kvm_x86_msr_filter *msr_filter; + struct msr_bitmap_range *ranges; struct kvm *kvm = vcpu->kvm; - struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; - u32 count = kvm->arch.msr_filter.count; - u32 i; - bool r = kvm->arch.msr_filter.default_allow; + bool allowed; int idx; + u32 i; - /* MSR filtering not set up or x2APIC enabled, allow everything */ - if (!count || (index >= 0x800 && index <= 0x8ff)) + /* x2APIC MSRs do not support filtering. */ + if (index >= 0x800 && index <= 0x8ff) return true; - /* Prevent collision with set_msr_filter */ idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < count; i++) { + msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu); + if (!msr_filter) { + allowed = true; + goto out; + } + + allowed = msr_filter->default_allow; + ranges = msr_filter->ranges; + + for (i = 0; i < msr_filter->count; i++) { u32 start = ranges[i].base; u32 end = start + ranges[i].nmsrs; u32 flags = ranges[i].flags; unsigned long *bitmap = ranges[i].bitmap; if ((index >= start) && (index < end) && (flags & type)) { - r = !!test_bit(index - start, bitmap); + allowed = !!test_bit(index - start, bitmap); break; } } +out: srcu_read_unlock(&kvm->srcu, idx); - return r; + return allowed; } EXPORT_SYMBOL_GPL(kvm_msr_allowed); @@ -1611,7 +1619,7 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, int ret = __kvm_set_msr(vcpu, index, data, host_initiated); if (ret == KVM_MSR_RET_INVALID) - if (kvm_msr_ignored_check(vcpu, index, data, true)) + if (kvm_msr_ignored_check(index, data, true)) ret = 0; return ret; @@ -1649,7 +1657,7 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, if (ret == KVM_MSR_RET_INVALID) { /* Unconditionally clear *data for simplicity */ *data = 0; - if (kvm_msr_ignored_check(vcpu, index, 0, false)) + if (kvm_msr_ignored_check(index, 0, false)) ret = 0; } @@ -1841,7 +1849,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) ret = EXIT_FASTPATH_EXIT_HANDLED; } break; - case MSR_IA32_TSCDEADLINE: + case MSR_IA32_TSC_DEADLINE: data = kvm_read_edx_eax(vcpu); if (!handle_fastpath_set_tscdeadline(vcpu, data)) { kvm_skip_emulated_instruction(vcpu); @@ -2320,7 +2328,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) kvm_vcpu_write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - spin_lock(&kvm->arch.pvclock_gtod_sync_lock); + spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags); if (!matched) { kvm->arch.nr_vcpus_matched_tsc = 0; } else if (!already_matched) { @@ -2328,7 +2336,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) } kvm_track_tsc_matching(vcpu); - spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags); } static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, @@ -2550,11 +2558,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) int i; struct kvm_vcpu *vcpu; struct kvm_arch *ka = &kvm->arch; + unsigned long flags; + + kvm_hv_invalidate_tsc_page(kvm); - spin_lock(&ka->pvclock_gtod_sync_lock); kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); pvclock_update_vm_gtod_copy(kvm); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -2562,8 +2575,6 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); - - spin_unlock(&ka->pvclock_gtod_sync_lock); #endif } @@ -2571,17 +2582,18 @@ u64 get_kvmclock_ns(struct kvm *kvm) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; + unsigned long flags; u64 ret; - spin_lock(&ka->pvclock_gtod_sync_lock); + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); if (!ka->use_master_clock) { - spin_unlock(&ka->pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); return get_kvmclock_base_ns() + ka->kvmclock_offset; } hv_clock.tsc_timestamp = ka->master_cycle_now; hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - spin_unlock(&ka->pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* both __this_cpu_read() and rdtsc() should be on the same cpu */ get_cpu(); @@ -2675,13 +2687,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * If the host uses TSC clock, then passthrough TSC as stable * to the guest. */ - spin_lock(&ka->pvclock_gtod_sync_lock); + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); use_master_clock = ka->use_master_clock; if (use_master_clock) { host_tsc = ka->master_cycle_now; kernel_ns = ka->master_kernel_ns; } - spin_unlock(&ka->pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); @@ -3075,7 +3087,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_set_apic_base(vcpu, msr_info); case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_write(vcpu, msr, data); - case MSR_IA32_TSCDEADLINE: + case MSR_IA32_TSC_DEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); break; case MSR_IA32_TSC_ADJUST: @@ -3437,7 +3449,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); - case MSR_IA32_TSCDEADLINE: + case MSR_IA32_TSC_DEADLINE: msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); break; case MSR_IA32_TSC_ADJUST: @@ -4013,7 +4025,6 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { struct kvm_host_map map; struct kvm_steal_time *st; - int idx; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -4021,15 +4032,9 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (vcpu->arch.st.preempted) return; - /* - * Take the srcu lock as memslots will be accessed to check the gfn - * cache generation against the memslots generation. - */ - idx = srcu_read_lock(&vcpu->kvm->srcu); - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, &vcpu->arch.st.cache, true)) - goto out; + return; st = map.hva + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); @@ -4037,20 +4042,25 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); - -out: - srcu_read_unlock(&vcpu->kvm->srcu, idx); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + int idx; + if (vcpu->preempted && !vcpu->arch.guest_state_protected) vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); + /* + * Take the srcu lock as memslots will be accessed to check the gfn + * cache generation against the memslots generation. + */ + idx = srcu_read_lock(&vcpu->kvm->srcu); if (kvm_xen_msr_enabled(vcpu->kvm)) kvm_xen_runstate_set_preempted(vcpu); else kvm_steal_time_set_preempted(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); static_call(kvm_x86_vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); @@ -5352,25 +5362,34 @@ split_irqchip_unlock: return r; } -static void kvm_clear_msr_filter(struct kvm *kvm) +static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow) +{ + struct kvm_x86_msr_filter *msr_filter; + + msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT); + if (!msr_filter) + return NULL; + + msr_filter->default_allow = default_allow; + return msr_filter; +} + +static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) { u32 i; - u32 count = kvm->arch.msr_filter.count; - struct msr_bitmap_range ranges[16]; - mutex_lock(&kvm->lock); - kvm->arch.msr_filter.count = 0; - memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0])); - mutex_unlock(&kvm->lock); - synchronize_srcu(&kvm->srcu); + if (!msr_filter) + return; + + for (i = 0; i < msr_filter->count; i++) + kfree(msr_filter->ranges[i].bitmap); - for (i = 0; i < count; i++) - kfree(ranges[i].bitmap); + kfree(msr_filter); } -static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range) +static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, + struct kvm_msr_filter_range *user_range) { - struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; struct msr_bitmap_range range; unsigned long *bitmap = NULL; size_t bitmap_size; @@ -5404,11 +5423,9 @@ static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user goto err; } - /* Everything ok, add this range identifier to our global pool */ - ranges[kvm->arch.msr_filter.count] = range; - /* Make sure we filled the array before we tell anyone to walk it */ - smp_wmb(); - kvm->arch.msr_filter.count++; + /* Everything ok, add this range identifier. */ + msr_filter->ranges[msr_filter->count] = range; + msr_filter->count++; return 0; err: @@ -5419,10 +5436,11 @@ err: static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) { struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_x86_msr_filter *new_filter, *old_filter; struct kvm_msr_filter filter; bool default_allow; - int r = 0; bool empty = true; + int r = 0; u32 i; if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) @@ -5435,25 +5453,32 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) if (empty && !default_allow) return -EINVAL; - kvm_clear_msr_filter(kvm); + new_filter = kvm_alloc_msr_filter(default_allow); + if (!new_filter) + return -ENOMEM; - kvm->arch.msr_filter.default_allow = default_allow; - - /* - * Protect from concurrent calls to this function that could trigger - * a TOCTOU violation on kvm->arch.msr_filter.count. - */ - mutex_lock(&kvm->lock); for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { - r = kvm_add_msr_filter(kvm, &filter.ranges[i]); - if (r) - break; + r = kvm_add_msr_filter(new_filter, &filter.ranges[i]); + if (r) { + kvm_free_msr_filter(new_filter); + return r; + } } + mutex_lock(&kvm->lock); + + /* The per-VM filter is protected by kvm->lock... */ + old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1); + + rcu_assign_pointer(kvm->arch.msr_filter, new_filter); + synchronize_srcu(&kvm->srcu); + + kvm_free_msr_filter(old_filter); + kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); mutex_unlock(&kvm->lock); - return r; + return 0; } long kvm_arch_vm_ioctl(struct file *filp, @@ -5700,6 +5725,7 @@ set_pit2_out: } #endif case KVM_SET_CLOCK: { + struct kvm_arch *ka = &kvm->arch; struct kvm_clock_data user_ns; u64 now_ns; @@ -5718,8 +5744,22 @@ set_pit2_out: * pvclock_update_vm_gtod_copy(). */ kvm_gen_update_masterclock(kvm); - now_ns = get_kvmclock_ns(kvm); - kvm->arch.kvmclock_offset += user_ns.clock - now_ns; + + /* + * This pairs with kvm_guest_time_update(): when masterclock is + * in use, we use master_kernel_ns + kvmclock_offset to set + * unsigned 'system_time' so if we use get_kvmclock_ns() (which + * is slightly ahead) here we risk going negative on unsigned + * 'system_time' when 'user_ns.clock' is very small. + */ + spin_lock_irq(&ka->pvclock_gtod_sync_lock); + if (kvm->arch.use_master_clock) + now_ns = ka->master_kernel_ns; + else + now_ns = get_kvmclock_base_ns(); + ka->kvmclock_offset = user_ns.clock - now_ns; + spin_unlock_irq(&ka->pvclock_gtod_sync_lock); + kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); break; } @@ -6603,7 +6643,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) int cpu = get_cpu(); cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, + on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask, wbinvd_ipi, NULL, 1); put_cpu(); cpumask_clear(vcpu->arch.wbinvd_dirty_mask); @@ -7698,6 +7738,7 @@ static void kvm_hyperv_tsc_notifier(void) struct kvm *kvm; struct kvm_vcpu *vcpu; int cpu; + unsigned long flags; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) @@ -7713,17 +7754,15 @@ static void kvm_hyperv_tsc_notifier(void) list_for_each_entry(kvm, &vm_list, vm_list) { struct kvm_arch *ka = &kvm->arch; - spin_lock(&ka->pvclock_gtod_sync_lock); - + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); pvclock_update_vm_gtod_copy(kvm); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); kvm_for_each_vcpu(cpu, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); kvm_for_each_vcpu(cpu, vcpu, kvm) kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); - - spin_unlock(&ka->pvclock_gtod_sync_lock); } mutex_unlock(&kvm_lock); } @@ -10601,7 +10640,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, return (void __user *)hva; } else { if (!slot || !slot->npages) - return 0; + return NULL; old_npages = slot->npages; hva = slot->userspace_addr; @@ -10634,8 +10673,6 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { - u32 i; - if (current->mm == kvm->mm) { /* * Free memory regions allocated on behalf of userspace, @@ -10651,8 +10688,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); } static_call_cond(kvm_x86_vm_destroy)(kvm); - for (i = 0; i < kvm->arch.msr_filter.count; i++) - kfree(kvm->arch.msr_filter.ranges[i].bitmap); + kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); |