author		Linus Torvalds <torvalds@linux-foundation.org>	2025-10-06 12:37:34 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2025-10-06 12:37:34 -0700
commit		256e3417065b2721f77bcd37331796b59483ef3b (patch)
tree		616ede12953ce1dfc226208a864b5a7ce63f8d2f /arch/x86/kvm/emulate.c
parent		fb5bc347311b1d78dc608c91c2d68327b0a1d1d4 (diff)
parent		6b36119b94d0b2bb8cea9d512017efafd461d6ac (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull x86 kvm updates from Paolo Bonzini:

 "Generic:

   - Rework almost all of KVM's exports to expose symbols only to KVM's
     x86 vendor modules (kvm-{amd,intel}.ko and PPC's kvm-{pr,hv}.ko)

  x86:

   - Rework almost all of KVM x86's exports to expose symbols only to
     KVM's vendor modules, i.e. to kvm-{amd,intel}.ko

   - Add support for virtualizing Control-flow Enforcement Technology
     (CET) on Intel (Shadow Stacks and Indirect Branch Tracking) and AMD
     (Shadow Stacks). It is worth noting that while SHSTK and IBT can be
     enabled separately in CPUID, it is not really possible to
     virtualize them separately. Therefore, Intel processors will really
     allow both SHSTK and IBT under the hood if either is made visible
     in the guest's CPUID. The alternative would be to intercept
     XSAVES/XRSTORS, which is not feasible for performance reasons

   - Fix a variety of fuzzing WARNs, all caused by checking L1
     intercepts when completing userspace I/O. KVM has already committed
     to allowing L2 to perform I/O at that point

   - Emulate PERF_CNTR_GLOBAL_STATUS_SET for PerfMonV2 guests, as the
     MSR is supposed to exist for v2 PMUs

   - Allow Centaur CPUID leaves (base 0xC000_0000) for Zhaoxin CPUs

   - Add support for the immediate forms of RDMSR and WRMSRNS, sans full
     emulator support (KVM should never need to emulate the MSRs outside
     of forced emulation and other contrived testing scenarios)

   - Clean up the MSR APIs in preparation for CET and FRED
     virtualization, as well as mediated vPMU support

   - Clean up a pile of PMU code in anticipation of adding support for
     mediated vPMUs

   - Reject in-kernel IOAPIC/PIT for TDX VMs, as KVM can't obtain EOI
     vmexits needed to faithfully emulate an I/O APIC for such guests

   - Many cleanups and minor fixes

   - Recover possible NX huge pages within the TDP MMU under read lock
     to reduce guest jitter when restoring NX huge pages

   - Return -EAGAIN during prefault if userspace concurrently
     deletes/moves the relevant memslot, to fix an issue where
     prefaulting could deadlock with the memslot update

  x86 (AMD):

   - Enable AVIC by default for Zen4+ if x2AVIC (and other prereqs) is
     supported

   - Require a minimum GHCB version of 2 when starting SEV-SNP guests
     via KVM_SEV_INIT2 so that invalid GHCB versions result in immediate
     errors instead of latent guest failures

   - Add support for SEV-SNP's CipherText Hiding, an opt-in feature that
     prevents unauthorized CPU accesses from reading the ciphertext of
     SNP guest private memory, e.g. to attempt an offline attack. This
     feature splits the shared SEV-ES/SEV-SNP ASID space into separate
     ranges for SEV-ES and SEV-SNP guests; therefore, a new module
     parameter is needed to control the number of ASIDs that can be used
     for VMs with CipherText Hiding vs. how many can be used to run
     SEV-ES guests

   - Add support for Secure TSC for SEV-SNP guests, which prevents the
     untrusted host from tampering with the guest's TSC frequency, while
     still allowing the VMM to configure the guest's TSC frequency prior
     to launch

   - Validate the XCR0 provided by the guest (via the GHCB) to avoid
     bugs resulting from bogus XCR0 values

   - Save an SEV guest's policy if and only if LAUNCH_START fully
     succeeds to avoid leaving behind stale state (thankfully not
     consumed in KVM)

   - Explicitly reject non-positive effective lengths during SNP's
     LAUNCH_UPDATE instead of subtly relying on guest_memfd to deal with
     them

   - Reload the pre-VMRUN TSC_AUX on #VMEXIT for SEV-ES guests, not the
     host's desired TSC_AUX, to fix a bug where KVM was keeping a
     different vCPU's TSC_AUX in the host MSR until return to userspace

  x86 (Intel):

   - Preparation for FRED support

   - Don't retry in TDX's anti-zero-step mitigation if the target
     memslot is invalid, i.e. is being deleted or moved, to fix a
     deadlock scenario similar to the aforementioned prefaulting case

   - Misc bugfixes and minor cleanups"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (142 commits)
  KVM: x86: Export KVM-internal symbols for sub-modules only
  KVM: x86: Drop pointless exports of kvm_arch_xxx() hooks
  KVM: x86: Move kvm_intr_is_single_vcpu() to lapic.c
  KVM: Export KVM-internal symbols for sub-modules only
  KVM: s390/vfio-ap: Use kvm_is_gpa_in_memslot() instead of open coded equivalent
  KVM: VMX: Make CR4.CET a guest owned bit
  KVM: selftests: Verify MSRs are (not) in save/restore list when (un)supported
  KVM: selftests: Add coverage for KVM-defined registers in MSRs test
  KVM: selftests: Add KVM_{G,S}ET_ONE_REG coverage to MSRs test
  KVM: selftests: Extend MSRs test to validate vCPUs without supported features
  KVM: selftests: Add support for MSR_IA32_{S,U}_CET to MSRs test
  KVM: selftests: Add an MSR test to exercise guest/host and read/write
  KVM: x86: Define AMD's #HV, #VC, and #SX exception vectors
  KVM: x86: Define Control Protection Exception (#CP) vector
  KVM: x86: Add human friendly formatting for #XM, and #VE
  KVM: SVM: Enable shadow stack virtualization for SVM
  KVM: SEV: Synchronize MSR_IA32_XSS from the GHCB when it's valid
  KVM: SVM: Pass through shadow stack MSRs as appropriate
  KVM: SVM: Update dump_vmcb with shadow stack save area additions
  KVM: nSVM: Save/load CET Shadow Stack state to/from vmcb12/vmcb02
  ...
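The -EAGAIN prefault change is visible to userspace: a VMM that prefaults guest memory must now tolerate transient failures while a memslot is being deleted or moved. A minimal userspace sketch, assuming the standard KVM_PRE_FAULT_MEMORY vCPU ioctl and a plain retry policy (a real VMM would bound retries and re-check its memslot layout):

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch only: retry while a concurrent memslot update races with us. */
static int prefault_range(int vcpu_fd, __u64 gpa, __u64 size)
{
	struct kvm_pre_fault_memory range = { .gpa = gpa, .size = size };
	int ret;

	do {
		ret = ioctl(vcpu_fd, KVM_PRE_FAULT_MEMORY, &range);
	} while (ret < 0 && errno == EAGAIN);

	return ret;
}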
Diffstat (limited to 'arch/x86/kvm/emulate.c')
-rw-r--r--	arch/x86/kvm/emulate.c	163
1 file changed, 143 insertions(+), 20 deletions(-)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1349e278cd2a..59f93f68718a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -178,6 +178,7 @@
#define IncSP       ((u64)1 << 54) /* SP is incremented before ModRM calc */
#define TwoMemOp    ((u64)1 << 55) /* Instruction has two memory operand */
#define IsBranch    ((u64)1 << 56) /* Instruction is considered a branch. */
+#define ShadowStack ((u64)1 << 57) /* Instruction affects Shadow Stacks. */

#define DstXacc     (DstAccLo | SrcAccHi | SrcWrite)
@@ -1553,6 +1554,37 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
	return linear_write_system(ctxt, addr, desc, sizeof(*desc));
}

+static bool emulator_is_ssp_invalid(struct x86_emulate_ctxt *ctxt, u8 cpl)
+{
+	const u32 MSR_IA32_X_CET = cpl == 3 ? MSR_IA32_U_CET : MSR_IA32_S_CET;
+	u64 efer = 0, cet = 0, ssp = 0;
+
+	if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_CET))
+		return false;
+
+	if (ctxt->ops->get_msr(ctxt, MSR_EFER, &efer))
+		return true;
+
+	/* SSP is guaranteed to be valid if the vCPU was already in 32-bit mode. */
+	if (!(efer & EFER_LMA))
+		return false;
+
+	if (ctxt->ops->get_msr(ctxt, MSR_IA32_X_CET, &cet))
+		return true;
+
+	if (!(cet & CET_SHSTK_EN))
+		return false;
+
+	if (ctxt->ops->get_msr(ctxt, MSR_KVM_INTERNAL_GUEST_SSP, &ssp))
+		return true;
+
+	/*
+	 * On transfer from 64-bit mode to compatibility mode, SSP[63:32] must
+	 * be 0, i.e. SSP must be a 32-bit value outside of 64-bit mode.
+	 */
+	return ssp >> 32;
+}
+
static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
				     u16 selector, int seg, u8 cpl,
				     enum x86_transfer_type transfer,
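The final test above is the whole point of the helper: after a far transfer that leaves 64-bit mode, the shadow-stack pointer must be expressible in 32 bits. A stand-alone distillation (hypothetical helper name, not part of the patch):

#include <stdbool.h>
#include <stdint.h>

/* SSP[63:32] must be clear before dropping from 64-bit mode to
 * compatibility mode; otherwise the transfer faults. */
static bool ssp_invalid_for_compat(uint64_t ssp)
{
	return (ssp >> 32) != 0;
}

/* ssp_invalid_for_compat(0x00000000fffff000ULL) -> false (transfer OK)
 * ssp_invalid_for_compat(0x0000000100000000ULL) -> true  (faults)     */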
@@ -1693,6 +1725,10 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
			if (efer & EFER_LMA)
				goto exception;
		}
+		if (!seg_desc.l && emulator_is_ssp_invalid(ctxt, cpl)) {
+			err_code = 0;
+			goto exception;
+		}
		/* CS(RPL) <- CPL */
		selector = (selector & 0xfffc) | cpl;
@@ -4068,8 +4104,8 @@ static const struct opcode group4[] = {
static const struct opcode group5[] = {
	F(DstMem | SrcNone | Lock, em_inc),
	F(DstMem | SrcNone | Lock, em_dec),
-	I(SrcMem | NearBranch | IsBranch, em_call_near_abs),
-	I(SrcMemFAddr | ImplicitOps | IsBranch, em_call_far),
+	I(SrcMem | NearBranch | IsBranch | ShadowStack, em_call_near_abs),
+	I(SrcMemFAddr | ImplicitOps | IsBranch | ShadowStack, em_call_far),
	I(SrcMem | NearBranch | IsBranch, em_jmp_abs),
	I(SrcMemFAddr | ImplicitOps | IsBranch, em_jmp_far),
	I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined),
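Note the asymmetry in the rows above: the CALL entries gain ShadowStack while the JMP entries do not, because only CALL-class transfers push a return address onto the shadow stack. A toy model of that distinction (illustrative only, not kernel code):

#include <stdint.h>

struct shstk {
	uint64_t slot[64];
	unsigned int top;
};

/* CALL pushes the return address onto the shadow stack... */
static void model_call(struct shstk *ss, uint64_t ret_rip)
{
	ss->slot[ss->top++] = ret_rip;
}

/* ...while JMP transfers control without touching it. */
static void model_jmp(struct shstk *ss)
{
	(void)ss;
}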
@@ -4304,7 +4340,7 @@ static const struct opcode opcode_table[256] = {
	DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
	/* 0x98 - 0x9F */
	D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
-	I(SrcImmFAddr | No64 | IsBranch, em_call_far), N,
+	I(SrcImmFAddr | No64 | IsBranch | ShadowStack, em_call_far), N,
	II(ImplicitOps | Stack, em_pushf, pushf),
	II(ImplicitOps | Stack, em_popf, popf),
	I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf),
@@ -4324,19 +4360,19 @@ static const struct opcode opcode_table[256] = {
	X8(I(DstReg | SrcImm64 | Mov, em_mov)),
	/* 0xC0 - 0xC7 */
	G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2),
-	I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch, em_ret_near_imm),
-	I(ImplicitOps | NearBranch | IsBranch, em_ret),
+	I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch | ShadowStack, em_ret_near_imm),
+	I(ImplicitOps | NearBranch | IsBranch | ShadowStack, em_ret),
	I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
	I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
	G(ByteOp, group11), G(0, group11),
	/* 0xC8 - 0xCF */
-	I(Stack | SrcImmU16 | Src2ImmByte | IsBranch, em_enter),
-	I(Stack | IsBranch, em_leave),
-	I(ImplicitOps | SrcImmU16 | IsBranch, em_ret_far_imm),
-	I(ImplicitOps | IsBranch, em_ret_far),
-	D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch, intn),
+	I(Stack | SrcImmU16 | Src2ImmByte, em_enter),
+	I(Stack, em_leave),
+	I(ImplicitOps | SrcImmU16 | IsBranch | ShadowStack, em_ret_far_imm),
+	I(ImplicitOps | IsBranch | ShadowStack, em_ret_far),
+	D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch | ShadowStack, intn),
	D(ImplicitOps | No64 | IsBranch),
-	II(ImplicitOps | IsBranch, em_iret, iret),
+	II(ImplicitOps | IsBranch | ShadowStack, em_iret, iret),
	/* 0xD0 - 0xD7 */
	G(Src2One | ByteOp, group2), G(Src2One, group2),
	G(Src2CL | ByteOp, group2), G(Src2CL, group2),
@@ -4352,7 +4388,7 @@ static const struct opcode opcode_table[256] = {
	I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in),
	I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
	/* 0xE8 - 0xEF */
-	I(SrcImm | NearBranch | IsBranch, em_call),
+	I(SrcImm | NearBranch | IsBranch | ShadowStack, em_call),
	D(SrcImm | ImplicitOps | NearBranch | IsBranch),
	I(SrcImmFAddr | No64 | IsBranch, em_jmp_far),
	D(SrcImmByte | ImplicitOps | NearBranch | IsBranch),
@@ -4371,7 +4407,7 @@ static const struct opcode opcode_table[256] = {
static const struct opcode twobyte_table[256] = {
	/* 0x00 - 0x0F */
	G(0, group6), GD(0, &group7), N, N,
-	N, I(ImplicitOps | EmulateOnUD | IsBranch, em_syscall),
+	N, I(ImplicitOps | EmulateOnUD | IsBranch | ShadowStack, em_syscall),
	II(ImplicitOps | Priv, em_clts, clts), N,
	DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
	N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
@@ -4402,8 +4438,8 @@ static const struct opcode twobyte_table[256] = {
	IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
	II(ImplicitOps | Priv, em_rdmsr, rdmsr),
	IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
-	I(ImplicitOps | EmulateOnUD | IsBranch, em_sysenter),
-	I(ImplicitOps | Priv | EmulateOnUD | IsBranch, em_sysexit),
+	I(ImplicitOps | EmulateOnUD | IsBranch | ShadowStack, em_sysenter),
+	I(ImplicitOps | Priv | EmulateOnUD | IsBranch | ShadowStack, em_sysexit),
	N, N,
	N, N, N, N, N, N, N, N,
	/* 0x40 - 0x4F */
@@ -4514,6 +4550,60 @@ static const struct opcode opcode_map_0f_38[256] = {
#undef I2bvIP
#undef I6ALU
+static bool is_shstk_instruction(struct x86_emulate_ctxt *ctxt)
+{
+	return ctxt->d & ShadowStack;
+}
+
+static bool is_ibt_instruction(struct x86_emulate_ctxt *ctxt)
+{
+	u64 flags = ctxt->d;
+
+	if (!(flags & IsBranch))
+		return false;
+
+	/*
+	 * All far JMPs and CALLs (including SYSCALL, SYSENTER, and INTn) are
+	 * indirect and thus affect IBT state.  All far RETs (including SYSEXIT
+	 * and IRET) are protected via Shadow Stacks and thus don't affect IBT
+	 * state.  IRET #GPs when returning to virtual-8086 and IBT or SHSTK is
+	 * enabled, but that should be handled by IRET emulation (in the very
+	 * unlikely scenario that KVM adds support for fully emulating IRET).
+	 */
+	if (!(flags & NearBranch))
+		return ctxt->execute != em_iret &&
+		       ctxt->execute != em_ret_far &&
+		       ctxt->execute != em_ret_far_imm &&
+		       ctxt->execute != em_sysexit;
+
+	switch (flags & SrcMask) {
+	case SrcReg:
+	case SrcMem:
+	case SrcMem16:
+	case SrcMem32:
+		return true;
+	case SrcMemFAddr:
+	case SrcImmFAddr:
+		/* Far branches should be handled above. */
+		WARN_ON_ONCE(1);
+		return true;
+	case SrcNone:
+	case SrcImm:
+	case SrcImmByte:
+		/*
+		 * Note, ImmU16 is used only for the stack adjustment operand on ENTER
+		 * and RET instructions.  ENTER isn't a branch and RET FAR is handled
+		 * by the NearBranch check above.  RET itself isn't an indirect branch.
+		 */
+	case SrcImmU16:
+		return false;
+	default:
+		WARN_ONCE(1, "Unexpected Src operand '%llx' on branch",
+			  flags & SrcMask);
+		return false;
+	}
+}
+
static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
{
	unsigned size;
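For near branches, the switch above reduces to "register or memory source means indirect". A condensed restatement under that assumption (the far-RET filtering shown earlier in the function is omitted):

#include <stdbool.h>

enum src_kind { SRC_NONE, SRC_REG, SRC_MEM, SRC_IMM, SRC_IMM_U16 };

/* Near branches: only indirect targets (register/memory) interact
 * with IBT; direct and RET-class branches do not. */
static bool near_branch_is_ibt_tracked(enum src_kind src)
{
	return src == SRC_REG || src == SRC_MEM;
}

/* "jmp rax"   -> SRC_REG  -> true  (IBT-tracked)
 * "jmp rel32" -> SRC_IMM  -> false (direct branch)
 * "ret"       -> SRC_NONE -> false (shadow-stack protected, not IBT) */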
@@ -4943,6 +5033,40 @@ done_prefixes:
	ctxt->execute = opcode.u.execute;

+	/*
+	 * Reject emulation if KVM might need to emulate shadow stack updates
+	 * and/or indirect branch tracking enforcement, which the emulator
+	 * doesn't support.
+	 */
+	if ((is_ibt_instruction(ctxt) || is_shstk_instruction(ctxt)) &&
+	    ctxt->ops->get_cr(ctxt, 4) & X86_CR4_CET) {
+		u64 u_cet = 0, s_cet = 0;
+
+		/*
+		 * Check both User and Supervisor on far transfers as inter-
+		 * privilege level transfers are impacted by CET at the target
+		 * privilege level, and that is not known at this time.  The
+		 * expectation is that the guest will not require emulation of
+		 * any CET-affected instructions at any privilege level.
+		 */
+		if (!(ctxt->d & NearBranch))
+			u_cet = s_cet = CET_SHSTK_EN | CET_ENDBR_EN;
+		else if (ctxt->ops->cpl(ctxt) == 3)
+			u_cet = CET_SHSTK_EN | CET_ENDBR_EN;
+		else
+			s_cet = CET_SHSTK_EN | CET_ENDBR_EN;
+
+		if ((u_cet && ctxt->ops->get_msr(ctxt, MSR_IA32_U_CET, &u_cet)) ||
+		    (s_cet && ctxt->ops->get_msr(ctxt, MSR_IA32_S_CET, &s_cet)))
+			return EMULATION_FAILED;
+
+		if ((u_cet | s_cet) & CET_SHSTK_EN && is_shstk_instruction(ctxt))
+			return EMULATION_FAILED;
+
+		if ((u_cet | s_cet) & CET_ENDBR_EN && is_ibt_instruction(ctxt))
+			return EMULATION_FAILED;
+	}
+
	if (unlikely(emulation_type & EMULTYPE_TRAP_UD) &&
	    likely(!(ctxt->d & EmulateOnUD)))
		return EMULATION_FAILED;
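The gate boils down to a small decision matrix: far transfers consult both CET MSRs because the target privilege level is not yet known, while near transfers consult only the current mode's MSR. A condensed model of the logic (MSR read failures omitted; the assumption that CET_SHSTK_EN is bit 0 and CET_ENDBR_EN is bit 2 follows the SDM layout):

#include <stdbool.h>
#include <stdint.h>

#define CET_SHSTK_EN (1ULL << 0)
#define CET_ENDBR_EN (1ULL << 2)

static bool cet_blocks_emulation(bool far_transfer, unsigned int cpl,
				 uint64_t u_cet, uint64_t s_cet,
				 bool shstk_insn, bool ibt_insn)
{
	/* Far transfer: target CPL unknown, so honor both MSRs. */
	uint64_t cet = far_transfer ? (u_cet | s_cet)
				    : (cpl == 3 ? u_cet : s_cet);

	return ((cet & CET_SHSTK_EN) && shstk_insn) ||
	       ((cet & CET_ENDBR_EN) && ibt_insn);
}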
@@ -5107,12 +5231,11 @@ void init_decode_cache(struct x86_emulate_ctxt *ctxt)
	ctxt->mem_read.end = 0;
}

-int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts)
{
	const struct x86_emulate_ops *ops = ctxt->ops;
	int rc = X86EMUL_CONTINUE;
	int saved_dst_type = ctxt->dst.type;
-	bool is_guest_mode = ctxt->ops->is_guest_mode(ctxt);

	ctxt->mem_read.pos = 0;
@@ -5160,7 +5283,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
				fetch_possible_mmx_operand(&ctxt->dst);
		}

-		if (unlikely(is_guest_mode) && ctxt->intercept) {
+		if (unlikely(check_intercepts) && ctxt->intercept) {
			rc = emulator_check_intercept(ctxt, ctxt->intercept,
						      X86_ICPT_PRE_EXCEPT);
			if (rc != X86EMUL_CONTINUE)
@@ -5189,7 +5312,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
				goto done;
		}

-		if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) {
+		if (unlikely(check_intercepts) && (ctxt->d & Intercept)) {
			rc = emulator_check_intercept(ctxt, ctxt->intercept,
						      X86_ICPT_POST_EXCEPT);
			if (rc != X86EMUL_CONTINUE)
@@ -5243,7 +5366,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
special_insn:

-	if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) {
+	if (unlikely(check_intercepts) && (ctxt->d & Intercept)) {
		rc = emulator_check_intercept(ctxt, ctxt->intercept,
					      X86_ICPT_POST_MEMACCESS);
		if (rc != X86EMUL_CONTINUE)