Diffstat (limited to 'arch/arm64/kvm')
39 files changed, 1706 insertions, 762 deletions
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 713248f240e0..bff62e75d681 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -37,6 +37,7 @@ menuconfig KVM select HAVE_KVM_VCPU_RUN_PID_CHANGE select SCHED_INFO select GUEST_PERF_EVENTS if PERF_EVENTS + select KVM_GUEST_MEMFD help Support hosting virtualized guest machines. diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 888f7c7abf54..fa79744290f3 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -170,10 +170,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) return ret; - ret = pkvm_init_host_vm(kvm); - if (ret) - goto err_unshare_kvm; - if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) { ret = -ENOMEM; goto err_unshare_kvm; @@ -184,6 +180,16 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) goto err_free_cpumask; + if (is_protected_kvm_enabled()) { + /* + * If any failures occur after this is successful, make sure to + * call __pkvm_unreserve_vm to unreserve the VM in hyp. + */ + ret = pkvm_init_host_vm(kvm); + if (ret) + goto err_free_cpumask; + } + kvm_vgic_early_init(kvm); kvm_timer_init_vm(kvm); @@ -2113,8 +2119,10 @@ static void cpu_hyp_init_features(void) { cpu_set_hyp_vector(); - if (is_kernel_in_hyp_mode()) + if (is_kernel_in_hyp_mode()) { kvm_timer_init_vhe(); + kvm_debug_init_vhe(); + } if (vgic_present) kvm_vgic_init_cpu_hardware(); @@ -2315,8 +2323,9 @@ static int __init init_subsystems(void) } if (kvm_mode == KVM_MODE_NV && - !(vgic_present && kvm_vgic_global_state.type == VGIC_V3)) { - kvm_err("NV support requires GICv3, giving up\n"); + !(vgic_present && (kvm_vgic_global_state.type == VGIC_V3 || + kvm_vgic_global_state.has_gcie_v3_compat))) { + kvm_err("NV support requires GICv3 or GICv5 with legacy support, giving up\n"); err = -EINVAL; goto out; } @@ -2408,12 +2417,12 @@ static u64 get_hyp_id_aa64pfr0_el1(void) */ u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); - val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3)); + val &= ~(ID_AA64PFR0_EL1_CSV2 | + ID_AA64PFR0_EL1_CSV3); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), + val |= FIELD_PREP(ID_AA64PFR0_EL1_CSV2, arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), + val |= FIELD_PREP(ID_AA64PFR0_EL1_CSV3, arm64_get_meltdown_state() == SPECTRE_UNAFFECTED); return val; diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 0e5610533949..20bb9af125b1 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -28,9 +28,57 @@ static int get_ia_size(struct s1_walk_info *wi) /* Return true if the IPA is out of the OA range */ static bool check_output_size(u64 ipa, struct s1_walk_info *wi) { + if (wi->pa52bit) + return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits)); return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits)); } +static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr) +{ + switch (BIT(wi->pgshift)) { + case SZ_64K: + default: /* IMPDEF: treat any other value as 64k */ + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52)) + return false; + return ((wi->regime == TR_EL2 ? 
+ FIELD_GET(TCR_EL2_PS_MASK, tcr) : + FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110); + case SZ_16K: + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT)) + return false; + break; + case SZ_4K: + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT)) + return false; + break; + } + + return (tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS)); +} + +static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc) +{ + u64 addr; + + if (!wi->pa52bit) + return desc & GENMASK_ULL(47, wi->pgshift); + + switch (BIT(wi->pgshift)) { + case SZ_4K: + case SZ_16K: + addr = desc & GENMASK_ULL(49, wi->pgshift); + addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50; + break; + case SZ_64K: + default: /* IMPDEF: treat any other value as 64k */ + addr = desc & GENMASK_ULL(47, wi->pgshift); + addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48; + break; + } + + return addr; +} + /* Return the translation regime that applies to an AT instruction */ static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op) { @@ -50,21 +98,26 @@ static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 o } } +static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + if (regime == TR_EL10) { + if (vcpu_has_nv(vcpu) && + !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En)) + return 0; + + return vcpu_read_sys_reg(vcpu, TCR2_EL1); + } + + return vcpu_read_sys_reg(vcpu, TCR2_EL2); +} + static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) { if (!kvm_has_s1pie(vcpu->kvm)) return false; - switch (regime) { - case TR_EL2: - case TR_EL20: - return vcpu_read_sys_reg(vcpu, TCR2_EL2) & TCR2_EL2_PIE; - case TR_EL10: - return (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) && - (__vcpu_sys_reg(vcpu, TCR2_EL1) & TCR2_EL1_PIE); - default: - BUG(); - } + /* Abuse TCR2_EL1_PIE and use it for EL2 as well */ + return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE; } static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) @@ -76,23 +129,11 @@ static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) return; } - switch (wi->regime) { - case TR_EL2: - case TR_EL20: - val = vcpu_read_sys_reg(vcpu, TCR2_EL2); - wi->poe = val & TCR2_EL2_POE; - wi->e0poe = (wi->regime == TR_EL20) && (val & TCR2_EL2_E0POE); - break; - case TR_EL10: - if (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) { - wi->poe = wi->e0poe = false; - return; - } + val = effective_tcr2(vcpu, wi->regime); - val = __vcpu_sys_reg(vcpu, TCR2_EL1); - wi->poe = val & TCR2_EL1_POE; - wi->e0poe = val & TCR2_EL1_E0POE; - } + /* Abuse TCR2_EL1_* for EL2 */ + wi->poe = val & TCR2_EL1_POE; + wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE); } static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, @@ -102,14 +143,16 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, unsigned int stride, x; bool va55, tbi, lva; - hcr = __vcpu_sys_reg(vcpu, HCR_EL2); - va55 = va & BIT(55); - if (wi->regime == TR_EL2 && va55) - goto addrsz; - - wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); + if (vcpu_has_nv(vcpu)) { + hcr = __vcpu_sys_reg(vcpu, HCR_EL2); + wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); + } else { + WARN_ON_ONCE(wi->regime != TR_EL10); + wi->s2 = false; + hcr = 0; + } switch (wi->regime) { case TR_EL10: @@ -131,6 +174,46 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, BUG(); } + /* Someone was silly enough to encode TG0/TG1 differently */ + if (va55 && 
wi->regime != TR_EL2) { + wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG1_MASK, tcr); + + switch (tg << TCR_TG1_SHIFT) { + case TCR_TG1_4K: + wi->pgshift = 12; break; + case TCR_TG1_16K: + wi->pgshift = 14; break; + case TCR_TG1_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } else { + wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG0_MASK, tcr); + + switch (tg << TCR_TG0_SHIFT) { + case TCR_TG0_4K: + wi->pgshift = 12; break; + case TCR_TG0_16K: + wi->pgshift = 14; break; + case TCR_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } + + wi->pa52bit = has_52bit_pa(vcpu, wi, tcr); + + ia_bits = get_ia_size(wi); + + /* AArch64.S1StartLevel() */ + stride = wi->pgshift - 3; + wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); + + if (wi->regime == TR_EL2 && va55) + goto addrsz; + tbi = (wi->regime == TR_EL2 ? FIELD_GET(TCR_EL2_TBI, tcr) : (va55 ? @@ -140,6 +223,12 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, if (!tbi && (u64)sign_extend64(va, 55) != va) goto addrsz; + wi->sh = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_SH0_MASK, tcr) : + (va55 ? + FIELD_GET(TCR_SH1_MASK, tcr) : + FIELD_GET(TCR_SH0_MASK, tcr))); + va = (u64)sign_extend64(va, 55); /* Let's put the MMU disabled case aside immediately */ @@ -194,53 +283,20 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, /* R_BVXDG */ wi->hpd |= (wi->poe || wi->e0poe); - /* Someone was silly enough to encode TG0/TG1 differently */ - if (va55) { - wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); - tg = FIELD_GET(TCR_TG1_MASK, tcr); - - switch (tg << TCR_TG1_SHIFT) { - case TCR_TG1_4K: - wi->pgshift = 12; break; - case TCR_TG1_16K: - wi->pgshift = 14; break; - case TCR_TG1_64K: - default: /* IMPDEF: treat any other value as 64k */ - wi->pgshift = 16; break; - } - } else { - wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); - tg = FIELD_GET(TCR_TG0_MASK, tcr); - - switch (tg << TCR_TG0_SHIFT) { - case TCR_TG0_4K: - wi->pgshift = 12; break; - case TCR_TG0_16K: - wi->pgshift = 14; break; - case TCR_TG0_64K: - default: /* IMPDEF: treat any other value as 64k */ - wi->pgshift = 16; break; - } - } - /* R_PLCGL, R_YXNYW */ if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) { if (wi->txsz > 39) - goto transfault_l0; + goto transfault; } else { if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47)) - goto transfault_l0; + goto transfault; } /* R_GTJBY, R_SXWGM */ switch (BIT(wi->pgshift)) { case SZ_4K: - lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT); - lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); - break; case SZ_16K: - lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT); - lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); + lva = wi->pa52bit; break; case SZ_64K: lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52); @@ -248,38 +304,42 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, } if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16)) - goto transfault_l0; - - ia_bits = get_ia_size(wi); + goto transfault; /* R_YYVYV, I_THCZK */ if ((!va55 && va > GENMASK(ia_bits - 1, 0)) || (va55 && va < GENMASK(63, ia_bits))) - goto transfault_l0; + goto transfault; /* I_ZFSYQ */ if (wi->regime != TR_EL2 && (tcr & (va55 ? 
TCR_EPD1_MASK : TCR_EPD0_MASK))) - goto transfault_l0; + goto transfault; /* R_BNDVG and following statements */ if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) && wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) - goto transfault_l0; - - /* AArch64.S1StartLevel() */ - stride = wi->pgshift - 3; - wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); + goto transfault; ps = (wi->regime == TR_EL2 ? FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr)); - wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps)); + wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit)); /* Compute minimal alignment */ x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift); wi->baddr = ttbr & TTBRx_EL1_BADDR; + if (wi->pa52bit) { + /* + * Force the alignment on 64 bytes for top-level tables + * smaller than 8 entries, since TTBR.BADDR[5:2] are used to + * store bits [51:48] of the first level of lookup. + */ + x = max(x, 6); + + wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48; + } /* R_VPBBF */ if (check_output_size(wi->baddr, wi)) @@ -289,12 +349,17 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, return 0; -addrsz: /* Address Size Fault level 0 */ +addrsz: + /* + * Address Size Fault level 0 to indicate it comes from TTBR. + * yes, this is an oddity. + */ fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false); return -EFAULT; -transfault_l0: /* Translation Fault level 0 */ - fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false); +transfault: + /* Translation Fault on start level */ + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false); return -EFAULT; } @@ -339,6 +404,17 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, ipa = kvm_s2_trans_output(&s2_trans); } + if (wi->filter) { + ret = wi->filter->fn(&(struct s1_walk_context) + { + .wi = wi, + .table_ipa = baddr, + .level = level, + }, wi->filter->priv); + if (ret) + return ret; + } + ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); if (ret) { fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false); @@ -369,7 +445,7 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc); } - baddr = desc & GENMASK_ULL(47, wi->pgshift); + baddr = desc_to_oa(wi, desc); /* Check for out-of-range OA */ if (check_output_size(baddr, wi)) @@ -386,11 +462,11 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, switch (BIT(wi->pgshift)) { case SZ_4K: - valid_block = level == 1 || level == 2; + valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0); break; case SZ_16K: case SZ_64K: - valid_block = level == 2; + valid_block = level == 2 || (wi->pa52bit && level == 1); break; } @@ -398,7 +474,8 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, goto transfault; } - if (check_output_size(desc & GENMASK(47, va_bottom), wi)) + baddr = desc_to_oa(wi, desc); + if (check_output_size(baddr & GENMASK(52, va_bottom), wi)) goto addrsz; if (!(desc & PTE_AF)) { @@ -411,7 +488,7 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, wr->failed = false; wr->level = level; wr->desc = desc; - wr->pa = desc & GENMASK(47, va_bottom); + wr->pa = baddr & GENMASK(52, va_bottom); wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG); @@ -640,21 +717,36 @@ static u8 combine_s1_s2_attr(u8 s1, u8 s2) #define ATTR_OSH 0b10 #define ATTR_ISH 0b11 -static u8 compute_sh(u8 attr, u64 desc) +static u8 compute_final_sh(u8 attr, u8 sh) { - u8 sh; 
- /* Any form of device, as well as NC has SH[1:0]=0b10 */ if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC)) return ATTR_OSH; - sh = FIELD_GET(PTE_SHARED, desc); if (sh == ATTR_RSV) /* Reserved, mapped to NSH */ sh = ATTR_NSH; return sh; } +static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr, + u8 attr) +{ + u8 sh; + + /* + * non-52bit and LPA have their basic shareability described in the + * descriptor. LPA2 gets it from the corresponding field in TCR, + * conveniently recorded in the walk info. + */ + if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K) + sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc); + else + sh = wi->sh; + + return compute_final_sh(attr, sh); +} + static u8 combine_sh(u8 s1_sh, u8 s2_sh) { if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH) @@ -668,7 +760,7 @@ static u8 combine_sh(u8 s1_sh, u8 s2_sh) static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, struct kvm_s2_trans *tr) { - u8 s1_parattr, s2_memattr, final_attr; + u8 s1_parattr, s2_memattr, final_attr, s2_sh; u64 par; /* If S2 has failed to translate, report the damage */ @@ -741,17 +833,19 @@ static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, !MEMATTR_IS_DEVICE(final_attr)) final_attr = MEMATTR(NC, NC); + s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc); + par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr); par |= tr->output & GENMASK(47, 12); par |= FIELD_PREP(SYS_PAR_EL1_SH, combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par), - compute_sh(final_attr, tr->desc))); + compute_final_sh(final_attr, s2_sh))); return par; } -static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, - enum trans_regime regime) +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr) { u64 par; @@ -764,9 +858,9 @@ static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, } else if (wr->level == S1_MMU_DISABLED) { /* MMU off or HCR_EL2.DC == 1 */ par = SYS_PAR_EL1_NSE; - par |= wr->pa & GENMASK_ULL(47, 12); + par |= wr->pa & SYS_PAR_EL1_PA; - if (regime == TR_EL10 && + if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { par |= FIELD_PREP(SYS_PAR_EL1_ATTR, MEMATTR(WbRaWa, WbRaWa)); @@ -781,14 +875,14 @@ static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, par = SYS_PAR_EL1_NSE; - mair = (regime == TR_EL10 ? + mair = (wi->regime == TR_EL10 ? vcpu_read_sys_reg(vcpu, MAIR_EL1) : vcpu_read_sys_reg(vcpu, MAIR_EL2)); mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8; mair &= 0xff; - sctlr = (regime == TR_EL10 ? + sctlr = (wi->regime == TR_EL10 ? 
vcpu_read_sys_reg(vcpu, SCTLR_EL1) : vcpu_read_sys_reg(vcpu, SCTLR_EL2)); @@ -797,9 +891,9 @@ static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, mair = MEMATTR(NC, NC); par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair); - par |= wr->pa & GENMASK_ULL(47, 12); + par |= wr->pa & SYS_PAR_EL1_PA; - sh = compute_sh(mair, wr->desc); + sh = compute_s1_sh(wi, wr, mair); par |= FIELD_PREP(SYS_PAR_EL1_SH, sh); } @@ -873,7 +967,7 @@ static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu, wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN); break; case TR_EL10: - wxn = (__vcpu_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN); + wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN); break; } @@ -1186,7 +1280,7 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false); compute_par: - return compute_par_s1(vcpu, &wr, wi.regime); + return compute_par_s1(vcpu, &wi, &wr); } /* @@ -1202,7 +1296,7 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { struct mmu_config config; struct kvm_s2_mmu *mmu; - bool fail; + bool fail, mmu_cs; u64 par; par = SYS_PAR_EL1_F; @@ -1218,8 +1312,13 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already * the right one (as we trapped from vEL2). If not, save the * full MMU context. + * + * We are also guaranteed to be in the correct context if + * we're not in a nested VM. */ - if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) + mmu_cs = (vcpu_has_nv(vcpu) && + !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))); + if (!mmu_cs) goto skip_mmu_switch; /* @@ -1287,7 +1386,7 @@ skip_mmu_switch: write_sysreg_hcr(HCR_HOST_VHE_FLAGS); - if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) + if (mmu_cs) __mmu_config_restore(&config); return par; @@ -1420,10 +1519,10 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) return; /* - * If we only have a single stage of translation (E2H=0 or - * TGE=1), exit early. Same thing if {VM,DC}=={0,0}. + * If we only have a single stage of translation (EL2&0), exit + * early. Same thing if {VM,DC}=={0,0}. */ - if (!vcpu_el2_e2h_is_set(vcpu) || vcpu_el2_tge_is_set(vcpu) || + if (compute_translation_regime(vcpu, op) == TR_EL20 || !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC))) return; @@ -1470,3 +1569,68 @@ int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, return 0; } + +struct desc_match { + u64 ipa; + int level; +}; + +static int match_s1_desc(struct s1_walk_context *ctxt, void *priv) +{ + struct desc_match *dm = priv; + u64 ipa = dm->ipa; + + /* Use S1 granule alignment */ + ipa &= GENMASK(51, ctxt->wi->pgshift); + + /* Not the IPA we're looking for? Continue. */ + if (ipa != ctxt->table_ipa) + return 0; + + /* Note the level and interrupt the walk */ + dm->level = ctxt->level; + return -EINTR; +} + +int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level) +{ + struct desc_match dm = { + .ipa = ipa, + }; + struct s1_walk_info wi = { + .filter = &(struct s1_walk_filter){ + .fn = match_s1_desc, + .priv = &dm, + }, + .regime = TR_EL10, + .as_el0 = false, + .pan = false, + }; + struct s1_walk_result wr = {}; + int ret; + + ret = setup_s1_walk(vcpu, &wi, &wr, va); + if (ret) + return ret; + + /* We really expect the S1 MMU to be on here... 
*/ + if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) { + *level = 0; + return 0; + } + + /* Walk the guest's PT, looking for a match along the way */ + ret = walk_s1(vcpu, &wi, &wr, va); + switch (ret) { + case -EINTR: + /* We interrupted the walk on a match, return the level */ + *level = dm.level; + return 0; + case 0: + /* The walk completed, we failed to find the entry */ + return -ENOENT; + default: + /* Any other error... */ + return ret; + } +} diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index da66c4a14775..fbd8944a3dea 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -7,12 +7,22 @@ #include <linux/kvm_host.h> #include <asm/sysreg.h> +/* + * Describes the dependencies between a set of bits (or the negation + * of a set of RES0 bits) and a feature. The flags indicate how the + * data is interpreted. + */ struct reg_bits_to_feat_map { - u64 bits; + union { + u64 bits; + u64 *res0p; + }; #define NEVER_FGU BIT(0) /* Can trap, but never UNDEF */ #define CALL_FUNC BIT(1) /* Needs to evaluate tons of crap */ #define FIXED_VALUE BIT(2) /* RAZ/WI or RAO/WI in KVM */ +#define RES0_POINTER BIT(3) /* Pointer to RES0 value instead of bits */ + unsigned long flags; union { @@ -28,9 +38,27 @@ struct reg_bits_to_feat_map { }; }; -#define __NEEDS_FEAT_3(m, f, id, fld, lim) \ +/* + * Describes the dependencies for a given register: + * + * @feat_map describes the dependency for the whole register. If the + * features the register depends on are not present, the whole + * register is effectively RES0. + * + * @bit_feat_map describes the dependencies for a set of bits in that + * register. If the features these bits depend on are not present, the + * bits are effectively RES0. + */ +struct reg_feat_map_desc { + const char *name; + const struct reg_bits_to_feat_map feat_map; + const struct reg_bits_to_feat_map *bit_feat_map; + const unsigned int bit_feat_map_sz; +}; + +#define __NEEDS_FEAT_3(m, f, w, id, fld, lim) \ { \ - .bits = (m), \ + .w = (m), \ .flags = (f), \ .regidx = IDREG_IDX(SYS_ ## id), \ .shift = id ##_## fld ## _SHIFT, \ @@ -39,28 +67,63 @@ struct reg_bits_to_feat_map { .lo_lim = id ##_## fld ##_## lim \ } -#define __NEEDS_FEAT_2(m, f, fun, dummy) \ +#define __NEEDS_FEAT_2(m, f, w, fun, dummy) \ { \ - .bits = (m), \ + .w = (m), \ .flags = (f) | CALL_FUNC, \ .fval = (fun), \ } -#define __NEEDS_FEAT_1(m, f, fun) \ +#define __NEEDS_FEAT_1(m, f, w, fun) \ { \ - .bits = (m), \ + .w = (m), \ .flags = (f) | CALL_FUNC, \ .match = (fun), \ } +#define __NEEDS_FEAT_FLAG(m, f, w, ...) \ + CONCATENATE(__NEEDS_FEAT_, COUNT_ARGS(__VA_ARGS__))(m, f, w, __VA_ARGS__) + #define NEEDS_FEAT_FLAG(m, f, ...) \ - CONCATENATE(__NEEDS_FEAT_, COUNT_ARGS(__VA_ARGS__))(m, f, __VA_ARGS__) + __NEEDS_FEAT_FLAG(m, f, bits, __VA_ARGS__) #define NEEDS_FEAT_FIXED(m, ...) \ - NEEDS_FEAT_FLAG(m, FIXED_VALUE, __VA_ARGS__, 0) + __NEEDS_FEAT_FLAG(m, FIXED_VALUE, bits, __VA_ARGS__, 0) +#define NEEDS_FEAT_RES0(p, ...) \ + __NEEDS_FEAT_FLAG(p, RES0_POINTER, res0p, __VA_ARGS__) + +/* + * Declare the dependency between a set of bits and a set of features, + * generating a struct reg_bit_to_feat_map. + */ #define NEEDS_FEAT(m, ...) NEEDS_FEAT_FLAG(m, 0, __VA_ARGS__) +/* + * Declare the dependency between a non-FGT register, a set of + * feature, and the set of individual bits it contains. This generates + * a struct reg_feat_map_desc. 
+ */ +#define DECLARE_FEAT_MAP(n, r, m, f) \ + struct reg_feat_map_desc n = { \ + .name = #r, \ + .feat_map = NEEDS_FEAT(~r##_RES0, f), \ + .bit_feat_map = m, \ + .bit_feat_map_sz = ARRAY_SIZE(m), \ + } + +/* + * Specialised version of the above for FGT registers that have their + * RES0 masks described as struct fgt_masks. + */ +#define DECLARE_FEAT_MAP_FGT(n, msk, m, f) \ + struct reg_feat_map_desc n = { \ + .name = #msk, \ + .feat_map = NEEDS_FEAT_RES0(&msk.res0, f),\ + .bit_feat_map = m, \ + .bit_feat_map_sz = ARRAY_SIZE(m), \ + } + #define FEAT_SPE ID_AA64DFR0_EL1, PMSVer, IMP #define FEAT_SPE_FnE ID_AA64DFR0_EL1, PMSVer, V1P2 #define FEAT_BRBE ID_AA64DFR0_EL1, BRBE, IMP @@ -73,6 +136,7 @@ struct reg_bits_to_feat_map { #define FEAT_AA32EL0 ID_AA64PFR0_EL1, EL0, AARCH32 #define FEAT_AA32EL1 ID_AA64PFR0_EL1, EL1, AARCH32 #define FEAT_AA64EL1 ID_AA64PFR0_EL1, EL1, IMP +#define FEAT_AA64EL2 ID_AA64PFR0_EL1, EL2, IMP #define FEAT_AA64EL3 ID_AA64PFR0_EL1, EL3, IMP #define FEAT_AIE ID_AA64MMFR3_EL1, AIE, IMP #define FEAT_S2POE ID_AA64MMFR3_EL1, S2POE, IMP @@ -131,7 +195,6 @@ struct reg_bits_to_feat_map { #define FEAT_SPMU ID_AA64DFR1_EL1, SPMU, IMP #define FEAT_SPE_nVM ID_AA64DFR2_EL1, SPE_nVM, IMP #define FEAT_STEP2 ID_AA64DFR2_EL1, STEP, IMP -#define FEAT_SYSREG128 ID_AA64ISAR2_EL1, SYSREG_128, IMP #define FEAT_CPA2 ID_AA64ISAR3_EL1, CPA, CPA2 #define FEAT_ASID2 ID_AA64MMFR4_EL1, ASID2, IMP #define FEAT_MEC ID_AA64MMFR3_EL1, MEC, IMP @@ -143,7 +206,6 @@ struct reg_bits_to_feat_map { #define FEAT_LSMAOC ID_AA64MMFR2_EL1, LSM, IMP #define FEAT_MixedEnd ID_AA64MMFR0_EL1, BIGEND, IMP #define FEAT_MixedEndEL0 ID_AA64MMFR0_EL1, BIGENDEL0, IMP -#define FEAT_MTE2 ID_AA64PFR1_EL1, MTE, MTE2 #define FEAT_MTE_ASYNC ID_AA64PFR1_EL1, MTE_frac, ASYNC #define FEAT_MTE_STORE_ONLY ID_AA64PFR2_EL1, MTESTOREONLY, IMP #define FEAT_PAN ID_AA64MMFR1_EL1, PAN, IMP @@ -151,7 +213,9 @@ struct reg_bits_to_feat_map { #define FEAT_SSBS ID_AA64PFR1_EL1, SSBS, IMP #define FEAT_TIDCP1 ID_AA64MMFR1_EL1, TIDCP1, IMP #define FEAT_FGT ID_AA64MMFR0_EL1, FGT, IMP +#define FEAT_FGT2 ID_AA64MMFR0_EL1, FGT, FGT2 #define FEAT_MTPMU ID_AA64DFR0_EL1, MTPMU, IMP +#define FEAT_HCX ID_AA64MMFR1_EL1, HCX, IMP static bool not_feat_aa64el3(struct kvm *kvm) { @@ -397,6 +461,10 @@ static const struct reg_bits_to_feat_map hfgrtr_feat_map[] = { NEVER_FGU, FEAT_AA64EL1), }; + +static const DECLARE_FEAT_MAP_FGT(hfgrtr_desc, hfgrtr_masks, + hfgrtr_feat_map, FEAT_FGT); + static const struct reg_bits_to_feat_map hfgwtr_feat_map[] = { NEEDS_FEAT(HFGWTR_EL2_nAMAIR2_EL1 | HFGWTR_EL2_nMAIR2_EL1, @@ -461,6 +529,9 @@ static const struct reg_bits_to_feat_map hfgwtr_feat_map[] = { NEVER_FGU, FEAT_AA64EL1), }; +static const DECLARE_FEAT_MAP_FGT(hfgwtr_desc, hfgwtr_masks, + hfgwtr_feat_map, FEAT_FGT); + static const struct reg_bits_to_feat_map hdfgrtr_feat_map[] = { NEEDS_FEAT(HDFGRTR_EL2_PMBIDR_EL1 | HDFGRTR_EL2_PMSLATFR_EL1 | @@ -528,6 +599,9 @@ static const struct reg_bits_to_feat_map hdfgrtr_feat_map[] = { NEVER_FGU, FEAT_AA64EL1) }; +static const DECLARE_FEAT_MAP_FGT(hdfgrtr_desc, hdfgrtr_masks, + hdfgrtr_feat_map, FEAT_FGT); + static const struct reg_bits_to_feat_map hdfgwtr_feat_map[] = { NEEDS_FEAT(HDFGWTR_EL2_PMSLATFR_EL1 | HDFGWTR_EL2_PMSIRR_EL1 | @@ -588,6 +662,8 @@ static const struct reg_bits_to_feat_map hdfgwtr_feat_map[] = { NEEDS_FEAT(HDFGWTR_EL2_TRFCR_EL1, FEAT_TRF), }; +static const DECLARE_FEAT_MAP_FGT(hdfgwtr_desc, hdfgwtr_masks, + hdfgwtr_feat_map, FEAT_FGT); static const struct reg_bits_to_feat_map hfgitr_feat_map[] = { 
NEEDS_FEAT(HFGITR_EL2_PSBCSYNC, FEAT_SPEv1p5), @@ -662,6 +738,9 @@ static const struct reg_bits_to_feat_map hfgitr_feat_map[] = { NEVER_FGU, FEAT_AA64EL1), }; +static const DECLARE_FEAT_MAP_FGT(hfgitr_desc, hfgitr_masks, + hfgitr_feat_map, FEAT_FGT); + static const struct reg_bits_to_feat_map hafgrtr_feat_map[] = { NEEDS_FEAT(HAFGRTR_EL2_AMEVTYPER115_EL0 | HAFGRTR_EL2_AMEVTYPER114_EL0 | @@ -704,11 +783,17 @@ static const struct reg_bits_to_feat_map hafgrtr_feat_map[] = { FEAT_AMUv1), }; +static const DECLARE_FEAT_MAP_FGT(hafgrtr_desc, hafgrtr_masks, + hafgrtr_feat_map, FEAT_FGT); + static const struct reg_bits_to_feat_map hfgitr2_feat_map[] = { NEEDS_FEAT(HFGITR2_EL2_nDCCIVAPS, FEAT_PoPS), NEEDS_FEAT(HFGITR2_EL2_TSBCSYNC, FEAT_TRBEv1p1) }; +static const DECLARE_FEAT_MAP_FGT(hfgitr2_desc, hfgitr2_masks, + hfgitr2_feat_map, FEAT_FGT2); + static const struct reg_bits_to_feat_map hfgrtr2_feat_map[] = { NEEDS_FEAT(HFGRTR2_EL2_nPFAR_EL1, FEAT_PFAR), NEEDS_FEAT(HFGRTR2_EL2_nERXGSR_EL1, FEAT_RASv2), @@ -728,6 +813,9 @@ static const struct reg_bits_to_feat_map hfgrtr2_feat_map[] = { NEEDS_FEAT(HFGRTR2_EL2_nRCWSMASK_EL1, FEAT_THE), }; +static const DECLARE_FEAT_MAP_FGT(hfgrtr2_desc, hfgrtr2_masks, + hfgrtr2_feat_map, FEAT_FGT2); + static const struct reg_bits_to_feat_map hfgwtr2_feat_map[] = { NEEDS_FEAT(HFGWTR2_EL2_nPFAR_EL1, FEAT_PFAR), NEEDS_FEAT(HFGWTR2_EL2_nACTLRALIAS_EL1 | @@ -746,6 +834,9 @@ static const struct reg_bits_to_feat_map hfgwtr2_feat_map[] = { NEEDS_FEAT(HFGWTR2_EL2_nRCWSMASK_EL1, FEAT_THE), }; +static const DECLARE_FEAT_MAP_FGT(hfgwtr2_desc, hfgwtr2_masks, + hfgwtr2_feat_map, FEAT_FGT2); + static const struct reg_bits_to_feat_map hdfgrtr2_feat_map[] = { NEEDS_FEAT(HDFGRTR2_EL2_nMDSELR_EL1, FEAT_Debugv8p9), NEEDS_FEAT(HDFGRTR2_EL2_nPMECR_EL1, feat_ebep_pmuv3_ss), @@ -776,6 +867,9 @@ static const struct reg_bits_to_feat_map hdfgrtr2_feat_map[] = { NEEDS_FEAT(HDFGRTR2_EL2_nTRBMPAM_EL1, feat_trbe_mpam), }; +static const DECLARE_FEAT_MAP_FGT(hdfgrtr2_desc, hdfgrtr2_masks, + hdfgrtr2_feat_map, FEAT_FGT2); + static const struct reg_bits_to_feat_map hdfgwtr2_feat_map[] = { NEEDS_FEAT(HDFGWTR2_EL2_nMDSELR_EL1, FEAT_Debugv8p9), NEEDS_FEAT(HDFGWTR2_EL2_nPMECR_EL1, feat_ebep_pmuv3_ss), @@ -804,6 +898,10 @@ static const struct reg_bits_to_feat_map hdfgwtr2_feat_map[] = { NEEDS_FEAT(HDFGWTR2_EL2_nTRBMPAM_EL1, feat_trbe_mpam), }; +static const DECLARE_FEAT_MAP_FGT(hdfgwtr2_desc, hdfgwtr2_masks, + hdfgwtr2_feat_map, FEAT_FGT2); + + static const struct reg_bits_to_feat_map hcrx_feat_map[] = { NEEDS_FEAT(HCRX_EL2_PACMEn, feat_pauth_lr), NEEDS_FEAT(HCRX_EL2_EnFPM, FEAT_FPMR), @@ -833,6 +931,10 @@ static const struct reg_bits_to_feat_map hcrx_feat_map[] = { NEEDS_FEAT(HCRX_EL2_EnAS0, FEAT_LS64_ACCDATA), }; + +static const DECLARE_FEAT_MAP(hcrx_desc, __HCRX_EL2, + hcrx_feat_map, FEAT_HCX); + static const struct reg_bits_to_feat_map hcr_feat_map[] = { NEEDS_FEAT(HCR_EL2_TID0, FEAT_AA32EL0), NEEDS_FEAT_FIXED(HCR_EL2_RW, compute_hcr_rw), @@ -904,6 +1006,9 @@ static const struct reg_bits_to_feat_map hcr_feat_map[] = { NEEDS_FEAT_FIXED(HCR_EL2_E2H, compute_hcr_e2h), }; +static const DECLARE_FEAT_MAP(hcr_desc, HCR_EL2, + hcr_feat_map, FEAT_AA64EL2); + static const struct reg_bits_to_feat_map sctlr2_feat_map[] = { NEEDS_FEAT(SCTLR2_EL1_NMEA | SCTLR2_EL1_EASE, @@ -921,6 +1026,9 @@ static const struct reg_bits_to_feat_map sctlr2_feat_map[] = { FEAT_CPA2), }; +static const DECLARE_FEAT_MAP(sctlr2_desc, SCTLR2_EL1, + sctlr2_feat_map, FEAT_SCTLR2); + static const struct reg_bits_to_feat_map tcr2_el2_feat_map[] = 
{ NEEDS_FEAT(TCR2_EL2_FNG1 | TCR2_EL2_FNG0 | @@ -943,6 +1051,9 @@ static const struct reg_bits_to_feat_map tcr2_el2_feat_map[] = { NEEDS_FEAT(TCR2_EL2_PIE, FEAT_S1PIE), }; +static const DECLARE_FEAT_MAP(tcr2_el2_desc, TCR2_EL2, + tcr2_el2_feat_map, FEAT_TCR2); + static const struct reg_bits_to_feat_map sctlr_el1_feat_map[] = { NEEDS_FEAT(SCTLR_EL1_CP15BEN | SCTLR_EL1_ITD | @@ -1017,6 +1128,9 @@ static const struct reg_bits_to_feat_map sctlr_el1_feat_map[] = { FEAT_AA64EL1), }; +static const DECLARE_FEAT_MAP(sctlr_el1_desc, SCTLR_EL1, + sctlr_el1_feat_map, FEAT_AA64EL1); + static const struct reg_bits_to_feat_map mdcr_el2_feat_map[] = { NEEDS_FEAT(MDCR_EL2_EBWE, FEAT_Debugv8p9), NEEDS_FEAT(MDCR_EL2_TDOSA, FEAT_DoubleLock), @@ -1048,6 +1162,9 @@ static const struct reg_bits_to_feat_map mdcr_el2_feat_map[] = { FEAT_AA64EL1), }; +static const DECLARE_FEAT_MAP(mdcr_el2_desc, MDCR_EL2, + mdcr_el2_feat_map, FEAT_AA64EL2); + static void __init check_feat_map(const struct reg_bits_to_feat_map *map, int map_size, u64 res0, const char *str) { @@ -1061,32 +1178,36 @@ static void __init check_feat_map(const struct reg_bits_to_feat_map *map, str, mask ^ ~res0); } +static u64 reg_feat_map_bits(const struct reg_bits_to_feat_map *map) +{ + return map->flags & RES0_POINTER ? ~(*map->res0p) : map->bits; +} + +static void __init check_reg_desc(const struct reg_feat_map_desc *r) +{ + check_feat_map(r->bit_feat_map, r->bit_feat_map_sz, + ~reg_feat_map_bits(&r->feat_map), r->name); +} + void __init check_feature_map(void) { - check_feat_map(hfgrtr_feat_map, ARRAY_SIZE(hfgrtr_feat_map), - hfgrtr_masks.res0, hfgrtr_masks.str); - check_feat_map(hfgwtr_feat_map, ARRAY_SIZE(hfgwtr_feat_map), - hfgwtr_masks.res0, hfgwtr_masks.str); - check_feat_map(hfgitr_feat_map, ARRAY_SIZE(hfgitr_feat_map), - hfgitr_masks.res0, hfgitr_masks.str); - check_feat_map(hdfgrtr_feat_map, ARRAY_SIZE(hdfgrtr_feat_map), - hdfgrtr_masks.res0, hdfgrtr_masks.str); - check_feat_map(hdfgwtr_feat_map, ARRAY_SIZE(hdfgwtr_feat_map), - hdfgwtr_masks.res0, hdfgwtr_masks.str); - check_feat_map(hafgrtr_feat_map, ARRAY_SIZE(hafgrtr_feat_map), - hafgrtr_masks.res0, hafgrtr_masks.str); - check_feat_map(hcrx_feat_map, ARRAY_SIZE(hcrx_feat_map), - __HCRX_EL2_RES0, "HCRX_EL2"); - check_feat_map(hcr_feat_map, ARRAY_SIZE(hcr_feat_map), - HCR_EL2_RES0, "HCR_EL2"); - check_feat_map(sctlr2_feat_map, ARRAY_SIZE(sctlr2_feat_map), - SCTLR2_EL1_RES0, "SCTLR2_EL1"); - check_feat_map(tcr2_el2_feat_map, ARRAY_SIZE(tcr2_el2_feat_map), - TCR2_EL2_RES0, "TCR2_EL2"); - check_feat_map(sctlr_el1_feat_map, ARRAY_SIZE(sctlr_el1_feat_map), - SCTLR_EL1_RES0, "SCTLR_EL1"); - check_feat_map(mdcr_el2_feat_map, ARRAY_SIZE(mdcr_el2_feat_map), - MDCR_EL2_RES0, "MDCR_EL2"); + check_reg_desc(&hfgrtr_desc); + check_reg_desc(&hfgwtr_desc); + check_reg_desc(&hfgitr_desc); + check_reg_desc(&hdfgrtr_desc); + check_reg_desc(&hdfgwtr_desc); + check_reg_desc(&hafgrtr_desc); + check_reg_desc(&hfgrtr2_desc); + check_reg_desc(&hfgwtr2_desc); + check_reg_desc(&hfgitr2_desc); + check_reg_desc(&hdfgrtr2_desc); + check_reg_desc(&hdfgwtr2_desc); + check_reg_desc(&hcrx_desc); + check_reg_desc(&hcr_desc); + check_reg_desc(&sctlr2_desc); + check_reg_desc(&tcr2_el2_desc); + check_reg_desc(&sctlr_el1_desc); + check_reg_desc(&mdcr_el2_desc); } static bool idreg_feat_match(struct kvm *kvm, const struct reg_bits_to_feat_map *map) @@ -1129,7 +1250,7 @@ static u64 __compute_fixed_bits(struct kvm *kvm, match = idreg_feat_match(kvm, &map[i]); if (!match || (map[i].flags & FIXED_VALUE)) - val |= map[i].bits; + val |= 
reg_feat_map_bits(&map[i]); } return val; @@ -1145,15 +1266,36 @@ static u64 compute_res0_bits(struct kvm *kvm, require, exclude | FIXED_VALUE); } -static u64 compute_fixed_bits(struct kvm *kvm, - const struct reg_bits_to_feat_map *map, - int map_size, - u64 *fixed_bits, - unsigned long require, - unsigned long exclude) +static u64 compute_reg_res0_bits(struct kvm *kvm, + const struct reg_feat_map_desc *r, + unsigned long require, unsigned long exclude) + +{ + u64 res0; + + res0 = compute_res0_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz, + require, exclude); + + /* + * If computing FGUs, don't take RES0 or register existence + * into account -- we're not computing bits for the register + * itself. + */ + if (!(exclude & NEVER_FGU)) { + res0 |= compute_res0_bits(kvm, &r->feat_map, 1, require, exclude); + res0 |= ~reg_feat_map_bits(&r->feat_map); + } + + return res0; +} + +static u64 compute_reg_fixed_bits(struct kvm *kvm, + const struct reg_feat_map_desc *r, + u64 *fixed_bits, unsigned long require, + unsigned long exclude) { - return __compute_fixed_bits(kvm, map, map_size, fixed_bits, - require | FIXED_VALUE, exclude); + return __compute_fixed_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz, + fixed_bits, require | FIXED_VALUE, exclude); } void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt) @@ -1162,51 +1304,40 @@ void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt) switch (fgt) { case HFGRTR_GROUP: - val |= compute_res0_bits(kvm, hfgrtr_feat_map, - ARRAY_SIZE(hfgrtr_feat_map), - 0, NEVER_FGU); - val |= compute_res0_bits(kvm, hfgwtr_feat_map, - ARRAY_SIZE(hfgwtr_feat_map), - 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hfgrtr_desc, + 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hfgwtr_desc, + 0, NEVER_FGU); break; case HFGITR_GROUP: - val |= compute_res0_bits(kvm, hfgitr_feat_map, - ARRAY_SIZE(hfgitr_feat_map), - 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hfgitr_desc, + 0, NEVER_FGU); break; case HDFGRTR_GROUP: - val |= compute_res0_bits(kvm, hdfgrtr_feat_map, - ARRAY_SIZE(hdfgrtr_feat_map), - 0, NEVER_FGU); - val |= compute_res0_bits(kvm, hdfgwtr_feat_map, - ARRAY_SIZE(hdfgwtr_feat_map), - 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hdfgrtr_desc, + 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hdfgwtr_desc, + 0, NEVER_FGU); break; case HAFGRTR_GROUP: - val |= compute_res0_bits(kvm, hafgrtr_feat_map, - ARRAY_SIZE(hafgrtr_feat_map), - 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hafgrtr_desc, + 0, NEVER_FGU); break; case HFGRTR2_GROUP: - val |= compute_res0_bits(kvm, hfgrtr2_feat_map, - ARRAY_SIZE(hfgrtr2_feat_map), - 0, NEVER_FGU); - val |= compute_res0_bits(kvm, hfgwtr2_feat_map, - ARRAY_SIZE(hfgwtr2_feat_map), - 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hfgrtr2_desc, + 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hfgwtr2_desc, + 0, NEVER_FGU); break; case HFGITR2_GROUP: - val |= compute_res0_bits(kvm, hfgitr2_feat_map, - ARRAY_SIZE(hfgitr2_feat_map), - 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hfgitr2_desc, + 0, NEVER_FGU); break; case HDFGRTR2_GROUP: - val |= compute_res0_bits(kvm, hdfgrtr2_feat_map, - ARRAY_SIZE(hdfgrtr2_feat_map), - 0, NEVER_FGU); - val |= compute_res0_bits(kvm, hdfgwtr2_feat_map, - ARRAY_SIZE(hdfgwtr2_feat_map), - 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hdfgrtr2_desc, + 0, NEVER_FGU); + val |= compute_reg_res0_bits(kvm, &hdfgwtr2_desc, + 0, NEVER_FGU); break; default: BUG(); @@ -1221,109 +1352,74 @@ void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 
*r switch (reg) { case HFGRTR_EL2: - *res0 = compute_res0_bits(kvm, hfgrtr_feat_map, - ARRAY_SIZE(hfgrtr_feat_map), 0, 0); - *res0 |= hfgrtr_masks.res0; + *res0 = compute_reg_res0_bits(kvm, &hfgrtr_desc, 0, 0); *res1 = HFGRTR_EL2_RES1; break; case HFGWTR_EL2: - *res0 = compute_res0_bits(kvm, hfgwtr_feat_map, - ARRAY_SIZE(hfgwtr_feat_map), 0, 0); - *res0 |= hfgwtr_masks.res0; + *res0 = compute_reg_res0_bits(kvm, &hfgwtr_desc, 0, 0); *res1 = HFGWTR_EL2_RES1; break; case HFGITR_EL2: - *res0 = compute_res0_bits(kvm, hfgitr_feat_map, - ARRAY_SIZE(hfgitr_feat_map), 0, 0); - *res0 |= hfgitr_masks.res0; + *res0 = compute_reg_res0_bits(kvm, &hfgitr_desc, 0, 0); *res1 = HFGITR_EL2_RES1; break; case HDFGRTR_EL2: - *res0 = compute_res0_bits(kvm, hdfgrtr_feat_map, - ARRAY_SIZE(hdfgrtr_feat_map), 0, 0); - *res0 |= hdfgrtr_masks.res0; + *res0 = compute_reg_res0_bits(kvm, &hdfgrtr_desc, 0, 0); *res1 = HDFGRTR_EL2_RES1; break; case HDFGWTR_EL2: - *res0 = compute_res0_bits(kvm, hdfgwtr_feat_map, - ARRAY_SIZE(hdfgwtr_feat_map), 0, 0); - *res0 |= hdfgwtr_masks.res0; + *res0 = compute_reg_res0_bits(kvm, &hdfgwtr_desc, 0, 0); *res1 = HDFGWTR_EL2_RES1; break; case HAFGRTR_EL2: - *res0 = compute_res0_bits(kvm, hafgrtr_feat_map, - ARRAY_SIZE(hafgrtr_feat_map), 0, 0); - *res0 |= hafgrtr_masks.res0; + *res0 = compute_reg_res0_bits(kvm, &hafgrtr_desc, 0, 0); *res1 = HAFGRTR_EL2_RES1; break; case HFGRTR2_EL2: - *res0 = compute_res0_bits(kvm, hfgrtr2_feat_map, - ARRAY_SIZE(hfgrtr2_feat_map), 0, 0); - *res0 |= hfgrtr2_masks.res0; + *res0 = compute_reg_res0_bits(kvm, &hfgrtr2_desc, 0, 0); *res1 = HFGRTR2_EL2_RES1; break; case HFGWTR2_EL2: - *res0 = compute_res0_bits(kvm, hfgwtr2_feat_map, - ARRAY_SIZE(hfgwtr2_feat_map), 0, 0); - *res0 |= hfgwtr2_masks.res0; + *res0 = compute_reg_res0_bits(kvm, &hfgwtr2_desc, 0, 0); *res1 = HFGWTR2_EL2_RES1; break; case HFGITR2_EL2: - *res0 = compute_res0_bits(kvm, hfgitr2_feat_map, - ARRAY_SIZE(hfgitr2_feat_map), 0, 0); - *res0 |= hfgitr2_masks.res0; + *res0 = compute_reg_res0_bits(kvm, &hfgitr2_desc, 0, 0); *res1 = HFGITR2_EL2_RES1; break; case HDFGRTR2_EL2: - *res0 = compute_res0_bits(kvm, hdfgrtr2_feat_map, - ARRAY_SIZE(hdfgrtr2_feat_map), 0, 0); - *res0 |= hdfgrtr2_masks.res0; + *res0 = compute_reg_res0_bits(kvm, &hdfgrtr2_desc, 0, 0); *res1 = HDFGRTR2_EL2_RES1; break; case HDFGWTR2_EL2: - *res0 = compute_res0_bits(kvm, hdfgwtr2_feat_map, - ARRAY_SIZE(hdfgwtr2_feat_map), 0, 0); - *res0 |= hdfgwtr2_masks.res0; + *res0 = compute_reg_res0_bits(kvm, &hdfgwtr2_desc, 0, 0); *res1 = HDFGWTR2_EL2_RES1; break; case HCRX_EL2: - *res0 = compute_res0_bits(kvm, hcrx_feat_map, - ARRAY_SIZE(hcrx_feat_map), 0, 0); - *res0 |= __HCRX_EL2_RES0; + *res0 = compute_reg_res0_bits(kvm, &hcrx_desc, 0, 0); *res1 = __HCRX_EL2_RES1; break; case HCR_EL2: - mask = compute_fixed_bits(kvm, hcr_feat_map, - ARRAY_SIZE(hcr_feat_map), &fixed, - 0, 0); - *res0 = compute_res0_bits(kvm, hcr_feat_map, - ARRAY_SIZE(hcr_feat_map), 0, 0); - *res0 |= HCR_EL2_RES0 | (mask & ~fixed); + mask = compute_reg_fixed_bits(kvm, &hcr_desc, &fixed, 0, 0); + *res0 = compute_reg_res0_bits(kvm, &hcr_desc, 0, 0); + *res0 |= (mask & ~fixed); *res1 = HCR_EL2_RES1 | (mask & fixed); break; case SCTLR2_EL1: case SCTLR2_EL2: - *res0 = compute_res0_bits(kvm, sctlr2_feat_map, - ARRAY_SIZE(sctlr2_feat_map), 0, 0); - *res0 |= SCTLR2_EL1_RES0; + *res0 = compute_reg_res0_bits(kvm, &sctlr2_desc, 0, 0); *res1 = SCTLR2_EL1_RES1; break; case TCR2_EL2: - *res0 = compute_res0_bits(kvm, tcr2_el2_feat_map, - ARRAY_SIZE(tcr2_el2_feat_map), 0, 0); - *res0 |= 
TCR2_EL2_RES0; + *res0 = compute_reg_res0_bits(kvm, &tcr2_el2_desc, 0, 0); *res1 = TCR2_EL2_RES1; break; case SCTLR_EL1: - *res0 = compute_res0_bits(kvm, sctlr_el1_feat_map, - ARRAY_SIZE(sctlr_el1_feat_map), 0, 0); - *res0 |= SCTLR_EL1_RES0; + *res0 = compute_reg_res0_bits(kvm, &sctlr_el1_desc, 0, 0); *res1 = SCTLR_EL1_RES1; break; case MDCR_EL2: - *res0 = compute_res0_bits(kvm, mdcr_el2_feat_map, - ARRAY_SIZE(mdcr_el2_feat_map), 0, 0); - *res0 |= MDCR_EL2_RES0; + *res0 = compute_reg_res0_bits(kvm, &mdcr_el2_desc, 0, 0); *res1 = MDCR_EL2_RES1; break; default: diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 381382c19fe4..3515a273eaa2 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -56,6 +56,9 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) if (!kvm_guest_owns_debug_regs(vcpu)) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; + if (vcpu_has_nv(vcpu)) + kvm_nested_setup_mdcr_el2(vcpu); + /* Write MDCR_EL2 directly if we're already at EL2 */ if (has_vhe()) write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); @@ -96,6 +99,13 @@ void kvm_init_host_debug_data(void) } } +void kvm_debug_init_vhe(void) +{ + /* Clear PMSCR_EL1.E{0,1}SPE which reset to UNKNOWN values. */ + if (SYS_FIELD_GET(ID_AA64DFR0_EL1, PMSVer, read_sysreg(id_aa64dfr0_el1))) + write_sysreg_el1(0, SYS_PMSCR); +} + /* * Configures the 'external' MDSCR_EL1 value for the guest, i.e. when the host * has taken over MDSCR_EL1. @@ -138,6 +148,9 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) /* Must be called before kvm_vcpu_load_vhe() */ KVM_BUG_ON(vcpu_get_flag(vcpu, SYSREGS_ON_CPU), vcpu->kvm); + if (has_vhe()) + *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); + /* * Determine which of the possible debug states we're in: * @@ -184,6 +197,9 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu) { + if (has_vhe()) + write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); + if (likely(!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) return; @@ -230,29 +246,29 @@ void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val) preempt_enable(); } -void kvm_enable_trbe(void) +static bool skip_trbe_access(bool skip_condition) { - if (has_vhe() || is_protected_kvm_enabled() || - WARN_ON_ONCE(preemptible())) - return; + return (WARN_ON_ONCE(preemptible()) || skip_condition || + is_protected_kvm_enabled() || !is_kvm_arm_initialised()); +} - host_data_set_flag(TRBE_ENABLED); +void kvm_enable_trbe(void) +{ + if (!skip_trbe_access(has_vhe())) + host_data_set_flag(TRBE_ENABLED); } EXPORT_SYMBOL_GPL(kvm_enable_trbe); void kvm_disable_trbe(void) { - if (has_vhe() || is_protected_kvm_enabled() || - WARN_ON_ONCE(preemptible())) - return; - - host_data_clear_flag(TRBE_ENABLED); + if (!skip_trbe_access(has_vhe())) + host_data_clear_flag(TRBE_ENABLED); } EXPORT_SYMBOL_GPL(kvm_disable_trbe); void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest) { - if (is_protected_kvm_enabled() || WARN_ON_ONCE(preemptible())) + if (skip_trbe_access(false)) return; if (has_vhe()) { diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 90cb4b7ae0ff..834f13fb1fb7 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -1185,6 +1185,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_PMSIRR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_PMSLATFR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_PMSNEVFR_EL1, CGT_MDCR_TPMS), + SR_TRAP(SYS_PMSDSFR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_TRFCR_EL1, 
CGT_MDCR_TTRF), SR_TRAP(SYS_TRBBASER_EL1, CGT_MDCR_E2TB), SR_TRAP(SYS_TRBLIMITR_EL1, CGT_MDCR_E2TB), @@ -2833,7 +2834,7 @@ int kvm_inject_nested_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr) iabt ? ESR_ELx_EC_IABT_LOW : ESR_ELx_EC_DABT_LOW); esr |= ESR_ELx_FSC_EXTABT | ESR_ELx_IL; - vcpu_write_sys_reg(vcpu, FAR_EL2, addr); + vcpu_write_sys_reg(vcpu, addr, FAR_EL2); if (__vcpu_sys_reg(vcpu, SCTLR2_EL2) & SCTLR2_EL1_EASE) return kvm_inject_nested(vcpu, esr, except_type_serror); diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index a598072f36d2..bca8c80e11da 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -545,7 +545,7 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); else print_nvhe_hyp_panic("BUG", panic_addr); - } else if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) { + } else if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr)) { kvm_nvhe_report_cfi_failure(panic_addr); } else if (IS_ENABLED(CONFIG_UBSAN_KVM_EL2) && ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && @@ -559,6 +559,9 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, /* Dump the nVHE hypervisor backtrace */ kvm_nvhe_dump_backtrace(hyp_offset); + /* Dump the faulting instruction */ + dump_kernel_instr(panic_addr + kaslr_offset()); + /* * Hyp has panicked and we're going to handle that by panicking the * kernel. The kernel offset will be revealed in the panic so we're diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index 95d186e0bf54..bef40ddb16db 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -22,36 +22,28 @@ static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) { - u64 val; - - if (unlikely(vcpu_has_nv(vcpu))) + if (has_vhe()) return vcpu_read_sys_reg(vcpu, reg); - else if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) && - __vcpu_read_sys_reg_from_cpu(reg, &val)) - return val; return __vcpu_sys_reg(vcpu, reg); } static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) { - if (unlikely(vcpu_has_nv(vcpu))) + if (has_vhe()) vcpu_write_sys_reg(vcpu, val, reg); - else if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU) || - !__vcpu_write_sys_reg_to_cpu(val, reg)) + else __vcpu_assign_sys_reg(vcpu, reg, val); } static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode, u64 val) { - if (unlikely(vcpu_has_nv(vcpu))) { + if (has_vhe()) { if (target_mode == PSR_MODE_EL1h) vcpu_write_sys_reg(vcpu, val, SPSR_EL1); else vcpu_write_sys_reg(vcpu, val, SPSR_EL2); - } else if (has_vhe()) { - write_sysreg_el1(val, SYS_SPSR); } else { __vcpu_assign_sys_reg(vcpu, SPSR_EL1, val); } @@ -59,7 +51,7 @@ static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode, static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val) { - if (has_vhe()) + if (has_vhe() && vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) write_sysreg(val, spsr_abt); else vcpu->arch.ctxt.spsr_abt = val; @@ -67,7 +59,7 @@ static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val) static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val) { - if (has_vhe()) + if (has_vhe() && vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) write_sysreg(val, spsr_und); else vcpu->arch.ctxt.spsr_und = val; diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 84ec4e100fbb..b6682202edf3 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ 
-431,9 +431,6 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) vcpu_set_flag(vcpu, PMUSERENR_ON_CPU); } - *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); - write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); - if (cpus_have_final_cap(ARM64_HAS_HCX)) { u64 hcrx = vcpu->arch.hcrx_el2; if (is_nested_ctxt(vcpu)) { @@ -454,8 +451,6 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); - write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); - write_sysreg(0, hstr_el2); if (system_supports_pmuv3()) { write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0); diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index ce31d3b73603..184ad7a39950 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -29,7 +29,7 @@ struct pkvm_hyp_vcpu { }; /* - * Holds the relevant data for running a protected vm. + * Holds the relevant data for running a vm in protected mode. */ struct pkvm_hyp_vm { struct kvm kvm; @@ -67,6 +67,8 @@ static inline bool pkvm_hyp_vm_is_protected(struct pkvm_hyp_vm *hyp_vm) void pkvm_hyp_vm_table_init(void *tbl); +int __pkvm_reserve_vm(void); +void __pkvm_unreserve_vm(pkvm_handle_t handle); int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, unsigned long pgd_hva); int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h index 1e6d995968a1..ba5382c12787 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h +++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h @@ -12,7 +12,8 @@ #include <asm/kvm_host.h> #define cpu_reg(ctxt, r) (ctxt)->regs.regs[r] -#define DECLARE_REG(type, name, ctxt, reg) \ +#define DECLARE_REG(type, name, ctxt, reg) \ + __always_unused int ___check_reg_ ## reg; \ type name = (type)cpu_reg(ctxt, (reg)) #endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 0b0a68b663d4..a244ec25f8c5 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -27,6 +27,7 @@ hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o +hyp-obj-y += ../../../kernel/smccc-call.o hyp-obj-$(CONFIG_LIST_HARDENED) += list_debug.o hyp-obj-y += $(lib-objs) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 3369dd0c4009..4e16f9b96f63 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -71,36 +71,68 @@ static u32 hyp_ffa_version; static bool has_version_negotiated; static hyp_spinlock_t version_lock; -static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) +static void ffa_to_smccc_error(struct arm_smccc_1_2_regs *res, u64 ffa_errno) { - *res = (struct arm_smccc_res) { + *res = (struct arm_smccc_1_2_regs) { .a0 = FFA_ERROR, .a2 = ffa_errno, }; } -static void ffa_to_smccc_res_prop(struct arm_smccc_res *res, int ret, u64 prop) +static void ffa_to_smccc_res_prop(struct arm_smccc_1_2_regs *res, int ret, u64 prop) { if (ret == FFA_RET_SUCCESS) { - *res = (struct arm_smccc_res) { .a0 = FFA_SUCCESS, - .a2 = prop }; + *res = (struct arm_smccc_1_2_regs) { 
.a0 = FFA_SUCCESS, + .a2 = prop }; } else { ffa_to_smccc_error(res, ret); } } -static void ffa_to_smccc_res(struct arm_smccc_res *res, int ret) +static void ffa_to_smccc_res(struct arm_smccc_1_2_regs *res, int ret) { ffa_to_smccc_res_prop(res, ret, 0); } static void ffa_set_retval(struct kvm_cpu_context *ctxt, - struct arm_smccc_res *res) + struct arm_smccc_1_2_regs *res) { cpu_reg(ctxt, 0) = res->a0; cpu_reg(ctxt, 1) = res->a1; cpu_reg(ctxt, 2) = res->a2; cpu_reg(ctxt, 3) = res->a3; + cpu_reg(ctxt, 4) = res->a4; + cpu_reg(ctxt, 5) = res->a5; + cpu_reg(ctxt, 6) = res->a6; + cpu_reg(ctxt, 7) = res->a7; + + /* + * DEN0028C 2.6: SMC32/HVC32 call from aarch64 must preserve x8-x30. + * + * In FF-A 1.2, we cannot rely on the function ID sent by the caller to + * detect 32-bit calls because the CPU cycle management interfaces (e.g. + * FFA_MSG_WAIT, FFA_RUN) are 32-bit only but can have 64-bit responses. + * + * FFA-1.3 introduces 64-bit variants of the CPU cycle management + * interfaces. Moreover, FF-A 1.3 clarifies that SMC32 direct requests + * complete with SMC32 direct reponses which *should* allow us use the + * function ID sent by the caller to determine whether to return x8-x17. + * + * Note that we also cannot rely on function IDs in the response. + * + * Given the above, assume SMC64 and send back x0-x17 unconditionally + * as the passthrough code (__kvm_hyp_host_forward_smc) does the same. + */ + cpu_reg(ctxt, 8) = res->a8; + cpu_reg(ctxt, 9) = res->a9; + cpu_reg(ctxt, 10) = res->a10; + cpu_reg(ctxt, 11) = res->a11; + cpu_reg(ctxt, 12) = res->a12; + cpu_reg(ctxt, 13) = res->a13; + cpu_reg(ctxt, 14) = res->a14; + cpu_reg(ctxt, 15) = res->a15; + cpu_reg(ctxt, 16) = res->a16; + cpu_reg(ctxt, 17) = res->a17; } static bool is_ffa_call(u64 func_id) @@ -113,82 +145,92 @@ static bool is_ffa_call(u64 func_id) static int ffa_map_hyp_buffers(u64 ffa_page_count) { - struct arm_smccc_res res; + struct arm_smccc_1_2_regs res; - arm_smccc_1_1_smc(FFA_FN64_RXTX_MAP, - hyp_virt_to_phys(hyp_buffers.tx), - hyp_virt_to_phys(hyp_buffers.rx), - ffa_page_count, - 0, 0, 0, 0, - &res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_FN64_RXTX_MAP, + .a1 = hyp_virt_to_phys(hyp_buffers.tx), + .a2 = hyp_virt_to_phys(hyp_buffers.rx), + .a3 = ffa_page_count, + }, &res); return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; } static int ffa_unmap_hyp_buffers(void) { - struct arm_smccc_res res; + struct arm_smccc_1_2_regs res; - arm_smccc_1_1_smc(FFA_RXTX_UNMAP, - HOST_FFA_ID, - 0, 0, 0, 0, 0, 0, - &res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_RXTX_UNMAP, + .a1 = HOST_FFA_ID, + }, &res); return res.a0 == FFA_SUCCESS ? 
FFA_RET_SUCCESS : res.a2; } -static void ffa_mem_frag_tx(struct arm_smccc_res *res, u32 handle_lo, +static void ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, u32 handle_lo, u32 handle_hi, u32 fraglen, u32 endpoint_id) { - arm_smccc_1_1_smc(FFA_MEM_FRAG_TX, - handle_lo, handle_hi, fraglen, endpoint_id, - 0, 0, 0, - res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_MEM_FRAG_TX, + .a1 = handle_lo, + .a2 = handle_hi, + .a3 = fraglen, + .a4 = endpoint_id, + }, res); } -static void ffa_mem_frag_rx(struct arm_smccc_res *res, u32 handle_lo, +static void ffa_mem_frag_rx(struct arm_smccc_1_2_regs *res, u32 handle_lo, u32 handle_hi, u32 fragoff) { - arm_smccc_1_1_smc(FFA_MEM_FRAG_RX, - handle_lo, handle_hi, fragoff, HOST_FFA_ID, - 0, 0, 0, - res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_MEM_FRAG_RX, + .a1 = handle_lo, + .a2 = handle_hi, + .a3 = fragoff, + .a4 = HOST_FFA_ID, + }, res); } -static void ffa_mem_xfer(struct arm_smccc_res *res, u64 func_id, u32 len, +static void ffa_mem_xfer(struct arm_smccc_1_2_regs *res, u64 func_id, u32 len, u32 fraglen) { - arm_smccc_1_1_smc(func_id, len, fraglen, - 0, 0, 0, 0, 0, - res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = func_id, + .a1 = len, + .a2 = fraglen, + }, res); } -static void ffa_mem_reclaim(struct arm_smccc_res *res, u32 handle_lo, +static void ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, u32 handle_lo, u32 handle_hi, u32 flags) { - arm_smccc_1_1_smc(FFA_MEM_RECLAIM, - handle_lo, handle_hi, flags, - 0, 0, 0, 0, - res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_MEM_RECLAIM, + .a1 = handle_lo, + .a2 = handle_hi, + .a3 = flags, + }, res); } -static void ffa_retrieve_req(struct arm_smccc_res *res, u32 len) +static void ffa_retrieve_req(struct arm_smccc_1_2_regs *res, u32 len) { - arm_smccc_1_1_smc(FFA_FN64_MEM_RETRIEVE_REQ, - len, len, - 0, 0, 0, 0, 0, - res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_FN64_MEM_RETRIEVE_REQ, + .a1 = len, + .a2 = len, + }, res); } -static void ffa_rx_release(struct arm_smccc_res *res) +static void ffa_rx_release(struct arm_smccc_1_2_regs *res) { - arm_smccc_1_1_smc(FFA_RX_RELEASE, - 0, 0, - 0, 0, 0, 0, 0, - res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_RX_RELEASE, + }, res); } -static void do_ffa_rxtx_map(struct arm_smccc_res *res, +static void do_ffa_rxtx_map(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(phys_addr_t, tx, ctxt, 1); @@ -267,7 +309,7 @@ err_unmap: goto out_unlock; } -static void do_ffa_rxtx_unmap(struct arm_smccc_res *res, +static void do_ffa_rxtx_unmap(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, id, ctxt, 1); @@ -368,7 +410,7 @@ static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, return ret; } -static void do_ffa_mem_frag_tx(struct arm_smccc_res *res, +static void do_ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, handle_lo, ctxt, 1); @@ -427,7 +469,7 @@ out: } static void __do_ffa_mem_xfer(const u64 func_id, - struct arm_smccc_res *res, + struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, len, ctxt, 1); @@ -521,7 +563,7 @@ err_unshare: __do_ffa_mem_xfer((fid), (res), (ctxt)); \ } while (0); -static void do_ffa_mem_reclaim(struct arm_smccc_res *res, +static void do_ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, handle_lo, ctxt, 1); @@ -628,13 +670,26 @@ static 
bool ffa_call_supported(u64 func_id) case FFA_RXTX_MAP: case FFA_MEM_DONATE: case FFA_MEM_RETRIEVE_REQ: + /* Optional notification interfaces added in FF-A 1.1 */ + case FFA_NOTIFICATION_BITMAP_CREATE: + case FFA_NOTIFICATION_BITMAP_DESTROY: + case FFA_NOTIFICATION_BIND: + case FFA_NOTIFICATION_UNBIND: + case FFA_NOTIFICATION_SET: + case FFA_NOTIFICATION_GET: + case FFA_NOTIFICATION_INFO_GET: + /* Optional interfaces added in FF-A 1.2 */ + case FFA_MSG_SEND_DIRECT_REQ2: /* Optional per 7.5.1 */ + case FFA_MSG_SEND_DIRECT_RESP2: /* Optional per 7.5.1 */ + case FFA_CONSOLE_LOG: /* Optional per 13.1: not in Table 13.1 */ + case FFA_PARTITION_INFO_GET_REGS: /* Optional for virtual instances per 13.1 */ return false; } return true; } -static bool do_ffa_features(struct arm_smccc_res *res, +static bool do_ffa_features(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, id, ctxt, 1); @@ -666,21 +721,25 @@ out_handled: static int hyp_ffa_post_init(void) { size_t min_rxtx_sz; - struct arm_smccc_res res; + struct arm_smccc_1_2_regs res; - arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){ + .a0 = FFA_ID_GET, + }, &res); if (res.a0 != FFA_SUCCESS) return -EOPNOTSUPP; if (res.a2 != HOST_FFA_ID) return -EINVAL; - arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP, - 0, 0, 0, 0, 0, 0, &res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){ + .a0 = FFA_FEATURES, + .a1 = FFA_FN64_RXTX_MAP, + }, &res); if (res.a0 != FFA_SUCCESS) return -EOPNOTSUPP; - switch (res.a2) { + switch (res.a2 & FFA_FEAT_RXTX_MIN_SZ_MASK) { case FFA_FEAT_RXTX_MIN_SZ_4K: min_rxtx_sz = SZ_4K; break; @@ -700,7 +759,7 @@ static int hyp_ffa_post_init(void) return 0; } -static void do_ffa_version(struct arm_smccc_res *res, +static void do_ffa_version(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, ffa_req_version, ctxt, 1); @@ -712,7 +771,10 @@ static void do_ffa_version(struct arm_smccc_res *res, hyp_spin_lock(&version_lock); if (has_version_negotiated) { - res->a0 = hyp_ffa_version; + if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) + res->a0 = FFA_RET_NOT_SUPPORTED; + else + res->a0 = hyp_ffa_version; goto unlock; } @@ -721,9 +783,10 @@ static void do_ffa_version(struct arm_smccc_res *res, * first if TEE supports it. 
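	 * If the SPMD rejects the downgrade with FFA_RET_NOT_SUPPORTED, that
	 * error is propagated back to the caller and the version negotiated
	 * at EL2 is left unchanged.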
*/ if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) { - arm_smccc_1_1_smc(FFA_VERSION, ffa_req_version, 0, - 0, 0, 0, 0, 0, - res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_VERSION, + .a1 = ffa_req_version, + }, res); if (res->a0 == FFA_RET_NOT_SUPPORTED) goto unlock; @@ -740,7 +803,7 @@ unlock: hyp_spin_unlock(&version_lock); } -static void do_ffa_part_get(struct arm_smccc_res *res, +static void do_ffa_part_get(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, uuid0, ctxt, 1); @@ -756,9 +819,14 @@ static void do_ffa_part_get(struct arm_smccc_res *res, goto out_unlock; } - arm_smccc_1_1_smc(FFA_PARTITION_INFO_GET, uuid0, uuid1, - uuid2, uuid3, flags, 0, 0, - res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_PARTITION_INFO_GET, + .a1 = uuid0, + .a2 = uuid1, + .a3 = uuid2, + .a4 = uuid3, + .a5 = flags, + }, res); if (res->a0 != FFA_SUCCESS) goto out_unlock; @@ -791,7 +859,7 @@ out_unlock: bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id) { - struct arm_smccc_res res; + struct arm_smccc_1_2_regs res; /* * There's no way we can tell what a non-standard SMC call might @@ -860,13 +928,16 @@ out_handled: int hyp_ffa_init(void *pages) { - struct arm_smccc_res res; + struct arm_smccc_1_2_regs res; void *tx, *rx; if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2) return 0; - arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_1, 0, 0, 0, 0, 0, 0, &res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_VERSION, + .a1 = FFA_VERSION_1_2, + }, &res); if (res.a0 == FFA_RET_NOT_SUPPORTED) return 0; @@ -886,10 +957,10 @@ int hyp_ffa_init(void *pages) if (FFA_MAJOR_VERSION(res.a0) != 1) return -EOPNOTSUPP; - if (FFA_MINOR_VERSION(res.a0) < FFA_MINOR_VERSION(FFA_VERSION_1_1)) + if (FFA_MINOR_VERSION(res.a0) < FFA_MINOR_VERSION(FFA_VERSION_1_2)) hyp_ffa_version = res.a0; else - hyp_ffa_version = FFA_VERSION_1_1; + hyp_ffa_version = FFA_VERSION_1_2; tx = pages; pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 3206b2c07f82..29430c031095 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -546,6 +546,18 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } +static void handle___pkvm_reserve_vm(struct kvm_cpu_context *host_ctxt) +{ + cpu_reg(host_ctxt, 1) = __pkvm_reserve_vm(); +} + +static void handle___pkvm_unreserve_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + + __pkvm_unreserve_vm(handle); +} + static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1); @@ -606,6 +618,8 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_timer_set_cntvoff), HANDLE_FUNC(__vgic_v3_save_vmcr_aprs), HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), + HANDLE_FUNC(__pkvm_reserve_vm), + HANDLE_FUNC(__pkvm_unreserve_vm), HANDLE_FUNC(__pkvm_init_vm), HANDLE_FUNC(__pkvm_init_vcpu), HANDLE_FUNC(__pkvm_teardown_vm), diff --git a/arch/arm64/kvm/hyp/nvhe/list_debug.c b/arch/arm64/kvm/hyp/nvhe/list_debug.c index 46a2d4f2b3c6..baa6260f88dc 100644 --- a/arch/arm64/kvm/hyp/nvhe/list_debug.c +++ b/arch/arm64/kvm/hyp/nvhe/list_debug.c @@ -17,7 +17,7 @@ static inline __must_check bool nvhe_check_data_corruption(bool v) bool corruption = unlikely(condition); \ if (corruption) { \ if 
(IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ - BUG_ON(1); \ + BUG(); \ } else \ WARN_ON(1); \ } \ diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 8957734d6183..ddc8beb55eee 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1010,9 +1010,12 @@ static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ip return ret; if (!kvm_pte_valid(pte)) return -ENOENT; - if (kvm_granule_size(level) != size) + if (size && kvm_granule_size(level) != size) return -E2BIG; + if (!size) + size = kvm_granule_size(level); + state = guest_get_page_state(pte, ipa); if (state != PKVM_PAGE_SHARED_BORROWED) return -EPERM; @@ -1100,7 +1103,7 @@ int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_ if (prot & ~KVM_PGTABLE_PROT_RWX) return -EINVAL; - assert_host_shared_guest(vm, ipa, PAGE_SIZE); + assert_host_shared_guest(vm, ipa, 0); guest_lock_component(vm); ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0); guest_unlock_component(vm); @@ -1156,7 +1159,7 @@ int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu) if (pkvm_hyp_vm_is_protected(vm)) return -EPERM; - assert_host_shared_guest(vm, ipa, PAGE_SIZE); + assert_host_shared_guest(vm, ipa, 0); guest_lock_component(vm); kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); guest_unlock_component(vm); diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 338505cb0171..05774aed09cb 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -23,8 +23,8 @@ unsigned int kvm_arm_vmid_bits; unsigned int kvm_host_sve_max_vl; /* - * The currently loaded hyp vCPU for each physical CPU. Used only when - * protected KVM is enabled, but for both protected and non-protected VMs. + * The currently loaded hyp vCPU for each physical CPU. Used in protected mode + * for both protected and non-protected VMs. */ static DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, loaded_hyp_vcpu); @@ -135,7 +135,7 @@ static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - /* Protected KVM does not support AArch32 guests. */ + /* No AArch32 support for protected guests. */ if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL0, AARCH32) || kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL1, AARCH32)) return -EINVAL; @@ -192,6 +192,11 @@ static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) */ #define HANDLE_OFFSET 0x1000 +/* + * Marks a reserved but not yet used entry in the VM table. + */ +#define RESERVED_ENTRY ((void *)0xa110ca7ed) + static unsigned int vm_handle_to_idx(pkvm_handle_t handle) { return handle - HANDLE_OFFSET; @@ -210,8 +215,8 @@ static pkvm_handle_t idx_to_vm_handle(unsigned int idx) DEFINE_HYP_SPINLOCK(vm_table_lock); /* - * The table of VM entries for protected VMs in hyp. - * Allocated at hyp initialization and setup. + * A table that tracks all VMs in protected mode. + * Allocated during hyp initialization and setup. */ static struct pkvm_hyp_vm **vm_table; @@ -231,6 +236,10 @@ static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle) if (unlikely(idx >= KVM_MAX_PVMS)) return NULL; + /* A reserved entry doesn't represent an initialized VM. 
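+	 * Lifecycle of a vm_table[] slot (sketch):
+	 *   NULL -> RESERVED_ENTRY (__pkvm_reserve_vm)
+	 *        -> hyp_vm pointer (__pkvm_init_vm)
+	 *        -> NULL again on teardown/unreserve.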
*/ + if (unlikely(vm_table[idx] == RESERVED_ENTRY)) + return NULL; + return vm_table[idx]; } @@ -401,14 +410,26 @@ static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[], } static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, - unsigned int nr_vcpus) + unsigned int nr_vcpus, pkvm_handle_t handle) { + struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu; + int idx = vm_handle_to_idx(handle); + + hyp_vm->kvm.arch.pkvm.handle = handle; + hyp_vm->host_kvm = host_kvm; hyp_vm->kvm.created_vcpus = nr_vcpus; - hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; - hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled); + hyp_vm->kvm.arch.pkvm.is_protected = READ_ONCE(host_kvm->arch.pkvm.is_protected); + hyp_vm->kvm.arch.pkvm.is_created = true; hyp_vm->kvm.arch.flags = 0; pkvm_init_features_from_host(hyp_vm, host_kvm); + + /* VMID 0 is reserved for the host */ + atomic64_set(&mmu->vmid.id, idx + 1); + + mmu->vtcr = host_mmu.arch.mmu.vtcr; + mmu->arch = &hyp_vm->kvm.arch; + mmu->pgt = &hyp_vm->pgt; } static int pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu) @@ -480,7 +501,7 @@ done: return ret; } -static int find_free_vm_table_entry(struct kvm *host_kvm) +static int find_free_vm_table_entry(void) { int i; @@ -493,15 +514,13 @@ static int find_free_vm_table_entry(struct kvm *host_kvm) } /* - * Allocate a VM table entry and insert a pointer to the new vm. + * Reserve a VM table entry. * - * Return a unique handle to the protected VM on success, + * Return a unique handle to the VM on success, * negative error code on failure. */ -static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm, - struct pkvm_hyp_vm *hyp_vm) +static int allocate_vm_table_entry(void) { - struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu; int idx; hyp_assert_lock_held(&vm_table_lock); @@ -514,20 +533,57 @@ static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm, if (unlikely(!vm_table)) return -EINVAL; - idx = find_free_vm_table_entry(host_kvm); - if (idx < 0) + idx = find_free_vm_table_entry(); + if (unlikely(idx < 0)) return idx; - hyp_vm->kvm.arch.pkvm.handle = idx_to_vm_handle(idx); + vm_table[idx] = RESERVED_ENTRY; - /* VMID 0 is reserved for the host */ - atomic64_set(&mmu->vmid.id, idx + 1); + return idx; +} - mmu->arch = &hyp_vm->kvm.arch; - mmu->pgt = &hyp_vm->pgt; +static int __insert_vm_table_entry(pkvm_handle_t handle, + struct pkvm_hyp_vm *hyp_vm) +{ + unsigned int idx; + + hyp_assert_lock_held(&vm_table_lock); + + /* + * Initializing protected state might have failed, yet a malicious + * host could trigger this function. Thus, ensure that 'vm_table' + * exists. + */ + if (unlikely(!vm_table)) + return -EINVAL; + + idx = vm_handle_to_idx(handle); + if (unlikely(idx >= KVM_MAX_PVMS)) + return -EINVAL; + + if (unlikely(vm_table[idx] != RESERVED_ENTRY)) + return -EINVAL; vm_table[idx] = hyp_vm; - return hyp_vm->kvm.arch.pkvm.handle; + + return 0; +} + +/* + * Insert a pointer to the initialized VM into the VM table. + * + * Return 0 on success, or negative error code on failure. + */ +static int insert_vm_table_entry(pkvm_handle_t handle, + struct pkvm_hyp_vm *hyp_vm) +{ + int ret; + + hyp_spin_lock(&vm_table_lock); + ret = __insert_vm_table_entry(handle, hyp_vm); + hyp_spin_unlock(&vm_table_lock); + + return ret; } /* @@ -594,10 +650,45 @@ static void unmap_donated_memory_noclear(void *va, size_t size) } /* - * Initialize the hypervisor copy of the protected VM state using the - * memory donated by the host. 
+ * Reserves an entry in the hypervisor for a new VM in protected mode. * - * Unmaps the donated memory from the host at stage 2. + * Return a unique handle to the VM on success, negative error code on failure. + */ +int __pkvm_reserve_vm(void) +{ + int ret; + + hyp_spin_lock(&vm_table_lock); + ret = allocate_vm_table_entry(); + hyp_spin_unlock(&vm_table_lock); + + if (ret < 0) + return ret; + + return idx_to_vm_handle(ret); +} + +/* + * Removes a reserved entry, but only if is hasn't been used yet. + * Otherwise, the VM needs to be destroyed. + */ +void __pkvm_unreserve_vm(pkvm_handle_t handle) +{ + unsigned int idx = vm_handle_to_idx(handle); + + if (unlikely(!vm_table)) + return; + + hyp_spin_lock(&vm_table_lock); + if (likely(idx < KVM_MAX_PVMS && vm_table[idx] == RESERVED_ENTRY)) + remove_vm_table_entry(handle); + hyp_spin_unlock(&vm_table_lock); +} + +/* + * Initialize the hypervisor copy of the VM state using host-donated memory. + * + * Unmap the donated memory from the host at stage 2. * * host_kvm: A pointer to the host's struct kvm. * vm_hva: The host va of the area being donated for the VM state. @@ -606,8 +697,7 @@ static void unmap_donated_memory_noclear(void *va, size_t size) * the VM. Must be page aligned. Its size is implied by the VM's * VTCR. * - * Return a unique handle to the protected VM on success, - * negative error code on failure. + * Return 0 success, negative error code on failure. */ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, unsigned long pgd_hva) @@ -615,6 +705,7 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, struct pkvm_hyp_vm *hyp_vm = NULL; size_t vm_size, pgd_size; unsigned int nr_vcpus; + pkvm_handle_t handle; void *pgd = NULL; int ret; @@ -628,6 +719,12 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, goto err_unpin_kvm; } + handle = READ_ONCE(host_kvm->arch.pkvm.handle); + if (unlikely(handle < HANDLE_OFFSET)) { + ret = -EINVAL; + goto err_unpin_kvm; + } + vm_size = pkvm_get_hyp_vm_size(nr_vcpus); pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.mmu.vtcr); @@ -641,24 +738,19 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, if (!pgd) goto err_remove_mappings; - init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus); - - hyp_spin_lock(&vm_table_lock); - ret = insert_vm_table_entry(host_kvm, hyp_vm); - if (ret < 0) - goto err_unlock; + init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus, handle); ret = kvm_guest_prepare_stage2(hyp_vm, pgd); if (ret) - goto err_remove_vm_table_entry; - hyp_spin_unlock(&vm_table_lock); + goto err_remove_mappings; - return hyp_vm->kvm.arch.pkvm.handle; + /* Must be called last since this publishes the VM. */ + ret = insert_vm_table_entry(handle, hyp_vm); + if (ret) + goto err_remove_mappings; + + return 0; -err_remove_vm_table_entry: - remove_vm_table_entry(hyp_vm->kvm.arch.pkvm.handle); -err_unlock: - hyp_spin_unlock(&vm_table_lock); err_remove_mappings: unmap_donated_memory(hyp_vm, vm_size); unmap_donated_memory(pgd, pgd_size); @@ -668,10 +760,9 @@ err_unpin_kvm: } /* - * Initialize the hypervisor copy of the protected vCPU state using the - * memory donated by the host. + * Initialize the hypervisor copy of the vCPU state using host-donated memory. * - * handle: The handle for the protected vm. + * handle: The hypervisor handle for the vm. * host_vcpu: A pointer to the corresponding host vcpu. * vcpu_hva: The host va of the area being donated for the vcpu state. * Must be page aligned. 
The size of the area must be equal to diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index a48d3f5a5afb..90bd014e952f 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -192,6 +192,7 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, enum pkvm_page_state state; struct hyp_page *page; phys_addr_t phys; + enum kvm_pgtable_prot prot; if (!kvm_pte_valid(ctx->old)) return 0; @@ -210,11 +211,18 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, * configured in the hypervisor stage-1, and make sure to propagate them * to the hyp_vmemmap state. */ - state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(ctx->old)); + prot = kvm_pgtable_hyp_pte_prot(ctx->old); + state = pkvm_getstate(prot); switch (state) { case PKVM_PAGE_OWNED: set_hyp_state(page, PKVM_PAGE_OWNED); - return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); + /* hyp text is RO in the host stage-2 to be inspected on panic. */ + if (prot == PAGE_HYP_EXEC) { + set_host_state(page, PKVM_NOPAGE); + return host_stage2_idmap_locked(phys, PAGE_SIZE, KVM_PGTABLE_PROT_R); + } else { + return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); + } case PKVM_PAGE_SHARED_OWNED: set_hyp_state(page, PKVM_PAGE_SHARED_OWNED); set_host_state(page, PKVM_PAGE_SHARED_BORROWED); diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index ccd575d5f6de..d3b9ec8a7c28 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -50,6 +50,10 @@ extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); static void __activate_traps(struct kvm_vcpu *vcpu) { ___activate_traps(vcpu, vcpu->arch.hcr_el2); + + *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); + write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); + __activate_traps_common(vcpu); __activate_cptr_traps(vcpu); @@ -93,6 +97,8 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) isb(); } + write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); + __deactivate_traps_common(vcpu); write_sysreg_hcr(this_cpu_ptr(&kvm_init_params)->hcr_el2); diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 1ddd9ed3cbb3..82da9b03692d 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -253,6 +253,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); + __vcpu_assign_sys_reg(vcpu, VBAR_EL1, read_sysreg_el1(SYS_VBAR)); kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); @@ -372,6 +373,9 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Debug and Trace Registers are restricted. 
*/ + /* Group 1 ID registers */ + HOST_HANDLED(SYS_REVIDR_EL1), + /* AArch64 mappings of the AArch32 ID registers */ /* CRm=1 */ AARCH32(SYS_ID_PFR0_EL1), @@ -460,6 +464,7 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { HOST_HANDLED(SYS_CCSIDR_EL1), HOST_HANDLED(SYS_CLIDR_EL1), + HOST_HANDLED(SYS_AIDR_EL1), HOST_HANDLED(SYS_CSSELR_EL1), HOST_HANDLED(SYS_CTR_EL0), diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c index 87a54375bd6e..78579b31a420 100644 --- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c +++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c @@ -20,7 +20,7 @@ static bool __is_be(struct kvm_vcpu *vcpu) if (vcpu_mode_is_32bit(vcpu)) return !!(read_sysreg_el2(SYS_SPSR) & PSR_AA32_E_BIT); - return !!(read_sysreg(SCTLR_EL1) & SCTLR_ELx_EE); + return !!(read_sysreg_el1(SYS_SCTLR) & SCTLR_ELx_EE); } /* diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index d81275790e69..acd909b7f225 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -295,12 +295,8 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) } } - /* - * GICv5 BET0 FEAT_GCIE_LEGACY doesn't include ICC_SRE_EL2. This is due - * to be relaxed in a future spec release, at which point this in - * condition can be dropped. - */ - if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) { + /* Only disable SRE if the host implements the GICv2 interface */ + if (static_branch_unlikely(&vgic_v3_has_v2_compat)) { /* * Prevent the guest from touching the ICC_SRE_EL1 system * register. Note that this may not have any effect, as @@ -329,19 +325,16 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); } - /* - * Can be dropped in the future when GICv5 spec is relaxed. See comment - * above. - */ - if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) { + /* Only restore SRE if the host implements the GICv2 interface */ + if (static_branch_unlikely(&vgic_v3_has_v2_compat)) { val = read_gicreg(ICC_SRE_EL2); write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); - } - if (!cpu_if->vgic_sre) { - /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ - isb(); - write_gicreg(1, ICC_SRE_EL1); + if (!cpu_if->vgic_sre) { + /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ + isb(); + write_gicreg(1, ICC_SRE_EL1); + } } /* diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index e482181c6632..9984c492305a 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -43,8 +43,11 @@ DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); * * - API/APK: they are already accounted for by vcpu_load(), and can * only take effect across a load/put cycle (such as ERET) + * + * - FIEN: no way we let a guest have access to the RAS "Common Fault + * Injection" thing, whatever that does */ -#define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK) +#define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK | HCR_FIEN) static u64 __compute_hcr(struct kvm_vcpu *vcpu) { @@ -92,6 +95,13 @@ static u64 __compute_hcr(struct kvm_vcpu *vcpu) /* Force NV2 in case the guest is forgetful... */ guest_hcr |= HCR_NV2; } + + /* + * Exclude the guest's TWED configuration if it hasn't set TWE + * to avoid potentially delaying traps for the host. 
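+		 * The TWED delay only matters to an L1 hypervisor that traps
+		 * WFE in the first place.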
+ */ + if (!(guest_hcr & HCR_TWE)) + guest_hcr &= ~(HCR_EL2_TWEDEn | HCR_EL2_TWEDEL); } BUG_ON(host_data_test_flag(VCPU_IN_HYP_CONTEXT) && diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 6745f38b64f9..dfcd66c65517 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -106,7 +106,30 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr { unsigned long cpsr = *vcpu_cpsr(vcpu); bool is_aarch32 = vcpu_mode_is_32bit(vcpu); - u64 esr = 0; + u64 esr = 0, fsc; + int level; + + /* + * If injecting an abort from a failed S1PTW, rewalk the S1 PTs to + * find the failing level. If we can't find it, assume the error was + * transient and restart without changing the state. + */ + if (kvm_vcpu_abt_iss1tw(vcpu)) { + u64 hpfar = kvm_vcpu_get_fault_ipa(vcpu); + int ret; + + if (hpfar == INVALID_GPA) + return; + + ret = __kvm_find_s1_desc_level(vcpu, addr, hpfar, &level); + if (ret) + return; + + WARN_ON_ONCE(level < -1 || level > 3); + fsc = ESR_ELx_FSC_SEA_TTW(level); + } else { + fsc = ESR_ELx_FSC_EXTABT; + } /* This delight is brought to you by FEAT_DoubleFault2. */ if (effective_sctlr2_ease(vcpu)) @@ -133,7 +156,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr if (!is_iabt) esr |= ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT; - esr |= ESR_ELx_FSC_EXTABT; + esr |= fsc; vcpu_write_sys_reg(vcpu, addr, exception_far_elx(vcpu)); vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu)); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 1c78864767c5..7cc964af8d30 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -4,19 +4,20 @@ * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ +#include <linux/acpi.h> #include <linux/mman.h> #include <linux/kvm_host.h> #include <linux/io.h> #include <linux/hugetlb.h> #include <linux/sched/signal.h> #include <trace/events/kvm.h> +#include <asm/acpi.h> #include <asm/pgalloc.h> #include <asm/cacheflush.h> #include <asm/kvm_arm.h> #include <asm/kvm_mmu.h> #include <asm/kvm_pgtable.h> #include <asm/kvm_pkvm.h> -#include <asm/kvm_ras.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/virt.h> @@ -1073,6 +1074,10 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) mmu->pgt = NULL; free_percpu(mmu->last_vcpu_ran); } + + if (kvm_is_nested_s2_mmu(kvm, mmu)) + kvm_init_nested_s2_mmu(mmu); + write_unlock(&kvm->mmu_lock); if (pgt) { @@ -1426,11 +1431,8 @@ static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) * able to see the page's tags and therefore they must be initialised first. If * PG_mte_tagged is set, tags have already been initialised. * - * The race in the test/set of the PG_mte_tagged flag is handled by: - * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs - * racing to santise the same page - * - mmap_lock protects between a VM faulting a page in and the VMM performing - * an mprotect() to add VM_MTE + * Must be called with kvm->mmu_lock held to ensure the memory remains mapped + * while the tags are zeroed. 
*/ static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, unsigned long size) @@ -1477,13 +1479,132 @@ static bool kvm_vma_is_cacheable(struct vm_area_struct *vma) } } +static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache, + void **memcache) +{ + int min_pages; + + if (!is_protected_kvm_enabled()) + *memcache = &vcpu->arch.mmu_page_cache; + else + *memcache = &vcpu->arch.pkvm_memcache; + + if (!topup_memcache) + return 0; + + min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); + + if (!is_protected_kvm_enabled()) + return kvm_mmu_topup_memory_cache(*memcache, min_pages); + + return topup_hyp_memcache(*memcache, min_pages); +} + +/* + * Potentially reduce shadow S2 permissions to match the guest's own S2. For + * exec faults, we'd only reach this point if the guest actually allowed it (see + * kvm_s2_handle_perm_fault). + * + * Also encode the level of the original translation in the SW bits of the leaf + * entry as a proxy for the span of that translation. This will be retrieved on + * TLB invalidation from the guest and used to limit the invalidation scope if a + * TTL hint or a range isn't provided. + */ +static void adjust_nested_fault_perms(struct kvm_s2_trans *nested, + enum kvm_pgtable_prot *prot, + bool *writable) +{ + *writable &= kvm_s2_trans_writable(nested); + if (!kvm_s2_trans_readable(nested)) + *prot &= ~KVM_PGTABLE_PROT_R; + + *prot |= kvm_encode_nested_level(nested); +} + +#define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED) + +static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + struct kvm_s2_trans *nested, + struct kvm_memory_slot *memslot, bool is_perm) +{ + bool write_fault, exec_fault, writable; + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; + struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt; + unsigned long mmu_seq; + struct page *page; + struct kvm *kvm = vcpu->kvm; + void *memcache; + kvm_pfn_t pfn; + gfn_t gfn; + int ret; + + ret = prepare_mmu_memcache(vcpu, true, &memcache); + if (ret) + return ret; + + if (nested) + gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT; + else + gfn = fault_ipa >> PAGE_SHIFT; + + write_fault = kvm_is_write_fault(vcpu); + exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); + + VM_WARN_ON_ONCE(write_fault && exec_fault); + + mmu_seq = kvm->mmu_invalidate_seq; + /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ + smp_rmb(); + + ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL); + if (ret) { + kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE, + write_fault, exec_fault, false); + return ret; + } + + writable = !(memslot->flags & KVM_MEM_READONLY); + + if (nested) + adjust_nested_fault_perms(nested, &prot, &writable); + + if (writable) + prot |= KVM_PGTABLE_PROT_W; + + if (exec_fault || + (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) && + (!nested || kvm_s2_trans_executable(nested)))) + prot |= KVM_PGTABLE_PROT_X; + + kvm_fault_lock(kvm); + if (mmu_invalidate_retry(kvm, mmu_seq)) { + ret = -EAGAIN; + goto out_unlock; + } + + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE, + __pfn_to_phys(pfn), prot, + memcache, flags); + +out_unlock: + kvm_release_faultin_page(kvm, page, !!ret, writable); + kvm_fault_unlock(kvm); + + if (writable && !ret) + mark_page_dirty_in_slot(kvm, memslot, gfn); + + return ret != -EAGAIN ? 
ret : 0; +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_s2_trans *nested, struct kvm_memory_slot *memslot, unsigned long hva, bool fault_is_perm) { int ret = 0; - bool write_fault, writable, force_pte = false; + bool topup_memcache; + bool write_fault, writable; bool exec_fault, mte_allowed, is_vma_cacheable; bool s2_force_noncacheable = false, vfio_allow_any_uc = false; unsigned long mmu_seq; @@ -1495,28 +1616,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, gfn_t gfn; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); + bool force_pte = logging_active; long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; struct page *page; vm_flags_t vm_flags; - enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS; if (fault_is_perm) fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); write_fault = kvm_is_write_fault(vcpu); exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); - VM_BUG_ON(write_fault && exec_fault); - - if (fault_is_perm && !write_fault && !exec_fault) { - kvm_err("Unexpected L2 read permission error\n"); - return -EFAULT; - } - - if (!is_protected_kvm_enabled()) - memcache = &vcpu->arch.mmu_page_cache; - else - memcache = &vcpu->arch.pkvm_memcache; + VM_WARN_ON_ONCE(write_fault && exec_fault); /* * Permission faults just need to update the existing leaf entry, @@ -1524,17 +1636,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * only exception to this is when dirty logging is enabled at runtime * and a write fault needs to collapse a block entry into a table. */ - if (!fault_is_perm || (logging_active && write_fault)) { - int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); - - if (!is_protected_kvm_enabled()) - ret = kvm_mmu_topup_memory_cache(memcache, min_pages); - else - ret = topup_hyp_memcache(memcache, min_pages); - - if (ret) - return ret; - } + topup_memcache = !fault_is_perm || (logging_active && write_fault); + ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache); + if (ret) + return ret; /* * Let's check if we will get back a huge page backed by hugetlbfs, or @@ -1548,16 +1653,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } - /* - * logging_active is guaranteed to never be true for VM_PFNMAP - * memslots. - */ - if (logging_active) { - force_pte = true; + if (force_pte) vma_shift = PAGE_SHIFT; - } else { + else vma_shift = get_vma_page_shift(vma, hva); - } switch (vma_shift) { #ifndef __PAGETABLE_PMD_FOLDED @@ -1609,7 +1708,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, max_map_size = PAGE_SIZE; force_pte = (max_map_size == PAGE_SIZE); - vma_pagesize = min(vma_pagesize, (long)max_map_size); + vma_pagesize = min_t(long, vma_pagesize, max_map_size); } /* @@ -1642,7 +1741,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs * with the smp_wmb() in kvm_mmu_invalidate_end(). */ - mmu_seq = vcpu->kvm->mmu_invalidate_seq; + mmu_seq = kvm->mmu_invalidate_seq; mmap_read_unlock(current->mm); pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, @@ -1673,7 +1772,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * cache maintenance. 
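			 * On failure, only set 'ret' here: the reference taken by
			 * __kvm_faultin_pfn() still needs to be dropped via
			 * kvm_release_page_unused() below.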
*/ if (!kvm_supports_cacheable_pfnmap()) - return -EFAULT; + ret = -EFAULT; } else { /* * If the page was identified as device early by looking at @@ -1696,27 +1795,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } if (exec_fault && s2_force_noncacheable) - return -ENOEXEC; - - /* - * Potentially reduce shadow S2 permissions to match the guest's own - * S2. For exec faults, we'd only reach this point if the guest - * actually allowed it (see kvm_s2_handle_perm_fault). - * - * Also encode the level of the original translation in the SW bits - * of the leaf entry as a proxy for the span of that translation. - * This will be retrieved on TLB invalidation from the guest and - * used to limit the invalidation scope if a TTL hint or a range - * isn't provided. - */ - if (nested) { - writable &= kvm_s2_trans_writable(nested); - if (!kvm_s2_trans_readable(nested)) - prot &= ~KVM_PGTABLE_PROT_R; + ret = -ENOEXEC; - prot |= kvm_encode_nested_level(nested); + if (ret) { + kvm_release_page_unused(page); + return ret; } + if (nested) + adjust_nested_fault_perms(nested, &prot, &writable); + kvm_fault_lock(kvm); pgt = vcpu->arch.hw_mmu->pgt; if (mmu_invalidate_retry(kvm, mmu_seq)) { @@ -1811,6 +1899,19 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) read_unlock(&vcpu->kvm->mmu_lock); } +int kvm_handle_guest_sea(struct kvm_vcpu *vcpu) +{ + /* + * Give APEI the opportunity to claim the abort before handling it + * within KVM. apei_claim_sea() expects to be called with IRQs enabled. + */ + lockdep_assert_irqs_enabled(); + if (apei_claim_sea(NULL) == 0) + return 1; + + return kvm_inject_serror(vcpu); +} + /** * kvm_handle_guest_abort - handles all 2nd stage aborts * @vcpu: the VCPU pointer @@ -1834,17 +1935,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) gfn_t gfn; int ret, idx; - /* Synchronous External Abort? */ - if (kvm_vcpu_abt_issea(vcpu)) { - /* - * For RAS the host kernel may handle this abort. - * There is no need to pass the error into the guest. - */ - if (kvm_handle_guest_sea()) - return kvm_inject_serror(vcpu); - - return 1; - } + if (kvm_vcpu_abt_issea(vcpu)) + return kvm_handle_guest_sea(vcpu); esr = kvm_vcpu_get_esr(vcpu); @@ -1981,8 +2073,15 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) goto out_unlock; } - ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva, - esr_fsc_is_permission_fault(esr)); + VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) && + !write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu)); + + if (kvm_slot_has_gmem(memslot)) + ret = gmem_abort(vcpu, fault_ipa, nested, memslot, + esr_fsc_is_permission_fault(esr)); + else + ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva, + esr_fsc_is_permission_fault(esr)); if (ret == 0) ret = 1; out: @@ -2214,6 +2313,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT)) return -EFAULT; + /* + * Only support guest_memfd backed memslots with mappable memory, since + * there aren't any CoCo VMs that support only private memory on arm64. 
+ */ + if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new)) + return -EINVAL; + hva = new->userspace_addr; reg_end = hva + (new->npages << PAGE_SHIFT); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 153b3e11b115..7a045cad6bdf 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -349,7 +349,7 @@ static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi) wi->sl = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); /* Global limit for now, should eventually be per-VM */ wi->max_oa_bits = min(get_kvm_ipa_limit(), - ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr))); + ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false)); } int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, @@ -847,7 +847,7 @@ static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end) ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, vt->wr.level)); - ipa_start = vt->wr.pa & (ipa_size - 1); + ipa_start = vt->wr.pa & ~(ipa_size - 1); ipa_end = ipa_start + ipa_size; if (ipa_end <= start || ipa_start >= end) @@ -887,7 +887,7 @@ static void invalidate_vncr_va(struct kvm *kvm, va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, vt->wr.level)); - va_start = vt->gva & (va_size - 1); + va_start = vt->gva & ~(va_size - 1); va_end = va_start + va_size; switch (scope->type) { @@ -1172,8 +1172,9 @@ static u64 read_vncr_el2(struct kvm_vcpu *vcpu) return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48); } -static int kvm_translate_vncr(struct kvm_vcpu *vcpu) +static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem) { + struct kvm_memory_slot *memslot; bool write_fault, writable; unsigned long mmu_seq; struct vncr_tlb *vt; @@ -1216,10 +1217,25 @@ static int kvm_translate_vncr(struct kvm_vcpu *vcpu) smp_rmb(); gfn = vt->wr.pa >> PAGE_SHIFT; - pfn = kvm_faultin_pfn(vcpu, gfn, write_fault, &writable, &page); - if (is_error_noslot_pfn(pfn) || (write_fault && !writable)) + memslot = gfn_to_memslot(vcpu->kvm, gfn); + if (!memslot) return -EFAULT; + *is_gmem = kvm_slot_has_gmem(memslot); + if (!*is_gmem) { + pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, + &writable, &page); + if (is_error_noslot_pfn(pfn) || (write_fault && !writable)) + return -EFAULT; + } else { + ret = kvm_gmem_get_pfn(vcpu->kvm, memslot, gfn, &pfn, &page, NULL); + if (ret) { + kvm_prepare_memory_fault_exit(vcpu, vt->wr.pa, PAGE_SIZE, + write_fault, false, false); + return ret; + } + } + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) { if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) return -EAGAIN; @@ -1276,7 +1292,7 @@ static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu) !(tcr & TCR_ASID16)) asid &= GENMASK(7, 0); - return asid != vt->wr.asid; + return asid == vt->wr.asid; } return true; @@ -1287,28 +1303,44 @@ int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu) struct vncr_tlb *vt = vcpu->arch.vncr_tlb; u64 esr = kvm_vcpu_get_esr(vcpu); - BUG_ON(!(esr & ESR_ELx_VNCR_SHIFT)); + WARN_ON_ONCE(!(esr & ESR_ELx_VNCR)); + + if (kvm_vcpu_abt_issea(vcpu)) + return kvm_handle_guest_sea(vcpu); if (esr_fsc_is_permission_fault(esr)) { inject_vncr_perm(vcpu); } else if (esr_fsc_is_translation_fault(esr)) { - bool valid; + bool valid, is_gmem = false; int ret; scoped_guard(read_lock, &vcpu->kvm->mmu_lock) valid = kvm_vncr_tlb_lookup(vcpu); if (!valid) - ret = kvm_translate_vncr(vcpu); + ret = kvm_translate_vncr(vcpu, &is_gmem); else ret = -EPERM; switch (ret) { case -EAGAIN: - case -ENOMEM: /* Let's try again... 
*/ break; + case -ENOMEM: + /* + * For guest_memfd, this indicates that it failed to + * create a folio to back the memory. Inform userspace. + */ + if (is_gmem) + return 0; + /* Otherwise, let's try again... */ + break; case -EFAULT: + case -EIO: + case -EHWPOISON: + if (is_gmem) + return 0; + fallthrough; case -EINVAL: case -ENOENT: case -EACCES: @@ -1459,9 +1491,16 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) case SYS_ID_AA64PFR1_EL1: /* Only support BTI, SSBS, CSV2_frac */ - val &= (ID_AA64PFR1_EL1_BT | - ID_AA64PFR1_EL1_SSBS | - ID_AA64PFR1_EL1_CSV2_frac); + val &= ~(ID_AA64PFR1_EL1_PFAR | + ID_AA64PFR1_EL1_MTEX | + ID_AA64PFR1_EL1_THE | + ID_AA64PFR1_EL1_GCS | + ID_AA64PFR1_EL1_MTE_frac | + ID_AA64PFR1_EL1_NMI | + ID_AA64PFR1_EL1_SME | + ID_AA64PFR1_EL1_RES0 | + ID_AA64PFR1_EL1_MPAM_frac | + ID_AA64PFR1_EL1_MTE); break; case SYS_ID_AA64MMFR0_EL1: @@ -1514,12 +1553,11 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) break; case SYS_ID_AA64MMFR1_EL1: - val &= (ID_AA64MMFR1_EL1_HCX | - ID_AA64MMFR1_EL1_PAN | - ID_AA64MMFR1_EL1_LO | - ID_AA64MMFR1_EL1_HPDS | - ID_AA64MMFR1_EL1_VH | - ID_AA64MMFR1_EL1_VMIDBits); + val &= ~(ID_AA64MMFR1_EL1_CMOW | + ID_AA64MMFR1_EL1_nTLBPA | + ID_AA64MMFR1_EL1_ETS | + ID_AA64MMFR1_EL1_XNX | + ID_AA64MMFR1_EL1_HAFDBS); /* FEAT_E2H0 implies no VHE */ if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) val &= ~ID_AA64MMFR1_EL1_VH; @@ -1561,14 +1599,22 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) case SYS_ID_AA64DFR0_EL1: /* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */ - val &= (ID_AA64DFR0_EL1_PMUVer | - ID_AA64DFR0_EL1_WRPs | - ID_AA64DFR0_EL1_BRPs | - ID_AA64DFR0_EL1_DebugVer| - ID_AA64DFR0_EL1_HPMN0); - - /* Cap Debug to ARMv8.1 */ - val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, VHE); + val &= ~(ID_AA64DFR0_EL1_ExtTrcBuff | + ID_AA64DFR0_EL1_BRBE | + ID_AA64DFR0_EL1_MTPMU | + ID_AA64DFR0_EL1_TraceBuffer | + ID_AA64DFR0_EL1_TraceFilt | + ID_AA64DFR0_EL1_PMSVer | + ID_AA64DFR0_EL1_CTX_CMPs | + ID_AA64DFR0_EL1_SEBEP | + ID_AA64DFR0_EL1_PMSS | + ID_AA64DFR0_EL1_TraceVer); + + /* + * FEAT_Debugv8p9 requires support for extended breakpoints / + * watchpoints. + */ + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8); break; } @@ -1793,3 +1839,33 @@ void kvm_nested_sync_hwstate(struct kvm_vcpu *vcpu) if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING))) kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu)); } + +/* + * KVM unconditionally sets most of these traps anyway but use an allowlist + * to document the guest hypervisor traps that may take precedence and guard + * against future changes to the non-nested trap configuration. + */ +#define NV_MDCR_GUEST_INCLUDE (MDCR_EL2_TDE | \ + MDCR_EL2_TDA | \ + MDCR_EL2_TDRA | \ + MDCR_EL2_TTRF | \ + MDCR_EL2_TPMS | \ + MDCR_EL2_TPM | \ + MDCR_EL2_TPMCR | \ + MDCR_EL2_TDCC | \ + MDCR_EL2_TDOSA) + +void kvm_nested_setup_mdcr_el2(struct kvm_vcpu *vcpu) +{ + u64 guest_mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); + + /* + * In yet another example where FEAT_NV2 is fscking broken, accesses + * to MDSCR_EL1 are redirected to the VNCR despite having an effect + * at EL2. Use a big hammer to apply sanity. 
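+	 * Forcing MDCR_EL2_TDA while the vcpu is in a hyp context traps those
+	 * accesses so KVM can handle them instead of the write silently
+	 * landing in the VNCR page.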
+ */ + if (is_hyp_ctxt(vcpu)) + vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; + else + vcpu->arch.mdcr_el2 |= (guest_mdcr & NV_MDCR_GUEST_INCLUDE); +} diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index fcd70bfe44fb..24f0f8a8c943 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -85,16 +85,23 @@ void __init kvm_hyp_reserve(void) hyp_mem_base); } -static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm) +static void __pkvm_destroy_hyp_vm(struct kvm *kvm) { - if (host_kvm->arch.pkvm.handle) { + if (pkvm_hyp_vm_is_created(kvm)) { WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, - host_kvm->arch.pkvm.handle)); + kvm->arch.pkvm.handle)); + } else if (kvm->arch.pkvm.handle) { + /* + * The VM could have been reserved but hyp initialization has + * failed. Make sure to unreserve it. + */ + kvm_call_hyp_nvhe(__pkvm_unreserve_vm, kvm->arch.pkvm.handle); } - host_kvm->arch.pkvm.handle = 0; - free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc); - free_hyp_memcache(&host_kvm->arch.pkvm.stage2_teardown_mc); + kvm->arch.pkvm.handle = 0; + kvm->arch.pkvm.is_created = false; + free_hyp_memcache(&kvm->arch.pkvm.teardown_mc); + free_hyp_memcache(&kvm->arch.pkvm.stage2_teardown_mc); } static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu) @@ -129,16 +136,16 @@ static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu) * * Return 0 on success, negative error code on failure. */ -static int __pkvm_create_hyp_vm(struct kvm *host_kvm) +static int __pkvm_create_hyp_vm(struct kvm *kvm) { size_t pgd_sz, hyp_vm_sz; void *pgd, *hyp_vm; int ret; - if (host_kvm->created_vcpus < 1) + if (kvm->created_vcpus < 1) return -EINVAL; - pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.mmu.vtcr); + pgd_sz = kvm_pgtable_stage2_pgd_size(kvm->arch.mmu.vtcr); /* * The PGD pages will be reclaimed using a hyp_memcache which implies @@ -152,7 +159,7 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) /* Allocate memory to donate to hyp for vm and vcpu pointers. */ hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE, size_mul(sizeof(void *), - host_kvm->created_vcpus))); + kvm->created_vcpus))); hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT); if (!hyp_vm) { ret = -ENOMEM; @@ -160,12 +167,12 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) } /* Donate the VM memory to hyp and let hyp initialize it. 
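	 * With the reservation scheme, the host-side flow is roughly the
	 * following (illustrative sketch, error handling elided):
	 *
	 *	handle = kvm_call_hyp_nvhe(__pkvm_reserve_vm);		at VM init
	 *	ret = kvm_call_hyp_nvhe(__pkvm_init_vm, kvm, hyp_vm, pgd);
	 *	kvm_call_hyp_nvhe(__pkvm_unreserve_vm, handle);		only if init
	 *								never succeeded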
*/ - ret = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, hyp_vm, pgd); - if (ret < 0) + ret = kvm_call_hyp_nvhe(__pkvm_init_vm, kvm, hyp_vm, pgd); + if (ret) goto free_vm; - host_kvm->arch.pkvm.handle = ret; - host_kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2; + kvm->arch.pkvm.is_created = true; + kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2; kvm_account_pgtable_pages(pgd, pgd_sz / PAGE_SIZE); return 0; @@ -176,14 +183,19 @@ free_pgd: return ret; } -int pkvm_create_hyp_vm(struct kvm *host_kvm) +bool pkvm_hyp_vm_is_created(struct kvm *kvm) +{ + return READ_ONCE(kvm->arch.pkvm.is_created); +} + +int pkvm_create_hyp_vm(struct kvm *kvm) { int ret = 0; - mutex_lock(&host_kvm->arch.config_lock); - if (!host_kvm->arch.pkvm.handle) - ret = __pkvm_create_hyp_vm(host_kvm); - mutex_unlock(&host_kvm->arch.config_lock); + mutex_lock(&kvm->arch.config_lock); + if (!pkvm_hyp_vm_is_created(kvm)) + ret = __pkvm_create_hyp_vm(kvm); + mutex_unlock(&kvm->arch.config_lock); return ret; } @@ -200,15 +212,31 @@ int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu) return ret; } -void pkvm_destroy_hyp_vm(struct kvm *host_kvm) +void pkvm_destroy_hyp_vm(struct kvm *kvm) { - mutex_lock(&host_kvm->arch.config_lock); - __pkvm_destroy_hyp_vm(host_kvm); - mutex_unlock(&host_kvm->arch.config_lock); + mutex_lock(&kvm->arch.config_lock); + __pkvm_destroy_hyp_vm(kvm); + mutex_unlock(&kvm->arch.config_lock); } -int pkvm_init_host_vm(struct kvm *host_kvm) +int pkvm_init_host_vm(struct kvm *kvm) { + int ret; + + if (pkvm_hyp_vm_is_created(kvm)) + return -EINVAL; + + /* VM is already reserved, no need to proceed. */ + if (kvm->arch.pkvm.handle) + return 0; + + /* Reserve the VM in hyp and obtain a hyp handle for the VM. */ + ret = kvm_call_hyp_nvhe(__pkvm_reserve_vm); + if (ret < 0) + return ret; + + kvm->arch.pkvm.handle = ret; + return 0; } diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c index 098416d7e5c2..dc5acfb00af9 100644 --- a/arch/arm64/kvm/ptdump.c +++ b/arch/arm64/kvm/ptdump.c @@ -32,23 +32,23 @@ static const struct ptdump_prot_bits stage2_pte_bits[] = { .set = " ", .clear = "F", }, { - .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID, - .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID, + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R, + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R, .set = "R", .clear = " ", }, { - .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID, - .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID, + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, .set = "W", .clear = " ", }, { - .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID, - .val = PTE_VALID, - .set = " ", - .clear = "X", + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .set = "NX", + .clear = "x ", }, { - .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID, - .val = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID, + .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF, + .val = KVM_PTE_LEAF_ATTR_LO_S2_AF, .set = "AF", .clear = " ", }, { diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 82ffb3b3b3cf..91053aa832d0 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -82,43 +82,105 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu, "sys_reg write to read-only register"); } -#define PURE_EL2_SYSREG(el2) \ - case el2: { \ - *el1r = el2; \ - return true; \ +enum sr_loc_attr { + SR_LOC_MEMORY = 0, /* Register definitely in memory */ + SR_LOC_LOADED = BIT(0), /* Register on CPU, unless it cannot */ + SR_LOC_MAPPED = BIT(1), /* 
Register in a different CPU register */ + SR_LOC_XLATED = BIT(2), /* Register translated to fit another reg */ + SR_LOC_SPECIAL = BIT(3), /* Demanding register, implies loaded */ +}; + +struct sr_loc { + enum sr_loc_attr loc; + enum vcpu_sysreg map_reg; + u64 (*xlate)(u64); +}; + +static enum sr_loc_attr locate_direct_register(const struct kvm_vcpu *vcpu, + enum vcpu_sysreg reg) +{ + switch (reg) { + case SCTLR_EL1: + case CPACR_EL1: + case TTBR0_EL1: + case TTBR1_EL1: + case TCR_EL1: + case TCR2_EL1: + case PIR_EL1: + case PIRE0_EL1: + case POR_EL1: + case ESR_EL1: + case AFSR0_EL1: + case AFSR1_EL1: + case FAR_EL1: + case MAIR_EL1: + case VBAR_EL1: + case CONTEXTIDR_EL1: + case AMAIR_EL1: + case CNTKCTL_EL1: + case ELR_EL1: + case SPSR_EL1: + case ZCR_EL1: + case SCTLR2_EL1: + /* + * EL1 registers which have an ELx2 mapping are loaded if + * we're not in hypervisor context. + */ + return is_hyp_ctxt(vcpu) ? SR_LOC_MEMORY : SR_LOC_LOADED; + + case TPIDR_EL0: + case TPIDRRO_EL0: + case TPIDR_EL1: + case PAR_EL1: + case DACR32_EL2: + case IFSR32_EL2: + case DBGVCR32_EL2: + /* These registers are always loaded, no matter what */ + return SR_LOC_LOADED; + + default: + /* Non-mapped EL2 registers are by definition in memory. */ + return SR_LOC_MEMORY; } +} -#define MAPPED_EL2_SYSREG(el2, el1, fn) \ - case el2: { \ - *xlate = fn; \ - *el1r = el1; \ - return true; \ +static void locate_mapped_el2_register(const struct kvm_vcpu *vcpu, + enum vcpu_sysreg reg, + enum vcpu_sysreg map_reg, + u64 (*xlate)(u64), + struct sr_loc *loc) +{ + if (!is_hyp_ctxt(vcpu)) { + loc->loc = SR_LOC_MEMORY; + return; } -static bool get_el2_to_el1_mapping(unsigned int reg, - unsigned int *el1r, u64 (**xlate)(u64)) + loc->loc = SR_LOC_LOADED | SR_LOC_MAPPED; + loc->map_reg = map_reg; + + WARN_ON(locate_direct_register(vcpu, map_reg) != SR_LOC_MEMORY); + + if (xlate != NULL && !vcpu_el2_e2h_is_set(vcpu)) { + loc->loc |= SR_LOC_XLATED; + loc->xlate = xlate; + } +} + +#define MAPPED_EL2_SYSREG(r, m, t) \ + case r: { \ + locate_mapped_el2_register(vcpu, r, m, t, loc); \ + break; \ + } + +static void locate_register(const struct kvm_vcpu *vcpu, enum vcpu_sysreg reg, + struct sr_loc *loc) { + if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) { + loc->loc = SR_LOC_MEMORY; + return; + } + switch (reg) { - PURE_EL2_SYSREG( VPIDR_EL2 ); - PURE_EL2_SYSREG( VMPIDR_EL2 ); - PURE_EL2_SYSREG( ACTLR_EL2 ); - PURE_EL2_SYSREG( HCR_EL2 ); - PURE_EL2_SYSREG( MDCR_EL2 ); - PURE_EL2_SYSREG( HSTR_EL2 ); - PURE_EL2_SYSREG( HACR_EL2 ); - PURE_EL2_SYSREG( VTTBR_EL2 ); - PURE_EL2_SYSREG( VTCR_EL2 ); - PURE_EL2_SYSREG( TPIDR_EL2 ); - PURE_EL2_SYSREG( HPFAR_EL2 ); - PURE_EL2_SYSREG( HCRX_EL2 ); - PURE_EL2_SYSREG( HFGRTR_EL2 ); - PURE_EL2_SYSREG( HFGWTR_EL2 ); - PURE_EL2_SYSREG( HFGITR_EL2 ); - PURE_EL2_SYSREG( HDFGRTR_EL2 ); - PURE_EL2_SYSREG( HDFGWTR_EL2 ); - PURE_EL2_SYSREG( HAFGRTR_EL2 ); - PURE_EL2_SYSREG( CNTVOFF_EL2 ); - PURE_EL2_SYSREG( CNTHCTL_EL2 ); MAPPED_EL2_SYSREG(SCTLR_EL2, SCTLR_EL1, translate_sctlr_el2_to_sctlr_el1 ); MAPPED_EL2_SYSREG(CPTR_EL2, CPACR_EL1, @@ -144,125 +206,189 @@ static bool get_el2_to_el1_mapping(unsigned int reg, MAPPED_EL2_SYSREG(ZCR_EL2, ZCR_EL1, NULL ); MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL ); MAPPED_EL2_SYSREG(SCTLR2_EL2, SCTLR2_EL1, NULL ); + case CNTHCTL_EL2: + /* CNTHCTL_EL2 is super special, until we support NV2.1 */ + loc->loc = ((is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) ? 
+ SR_LOC_SPECIAL : SR_LOC_MEMORY); + break; default: - return false; + loc->loc = locate_direct_register(vcpu, reg); } } -u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) +static u64 read_sr_from_cpu(enum vcpu_sysreg reg) { u64 val = 0x8badf00d8badf00d; - u64 (*xlate)(u64) = NULL; - unsigned int el1r; - if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) - goto memory_read; + switch (reg) { + case SCTLR_EL1: val = read_sysreg_s(SYS_SCTLR_EL12); break; + case CPACR_EL1: val = read_sysreg_s(SYS_CPACR_EL12); break; + case TTBR0_EL1: val = read_sysreg_s(SYS_TTBR0_EL12); break; + case TTBR1_EL1: val = read_sysreg_s(SYS_TTBR1_EL12); break; + case TCR_EL1: val = read_sysreg_s(SYS_TCR_EL12); break; + case TCR2_EL1: val = read_sysreg_s(SYS_TCR2_EL12); break; + case PIR_EL1: val = read_sysreg_s(SYS_PIR_EL12); break; + case PIRE0_EL1: val = read_sysreg_s(SYS_PIRE0_EL12); break; + case POR_EL1: val = read_sysreg_s(SYS_POR_EL12); break; + case ESR_EL1: val = read_sysreg_s(SYS_ESR_EL12); break; + case AFSR0_EL1: val = read_sysreg_s(SYS_AFSR0_EL12); break; + case AFSR1_EL1: val = read_sysreg_s(SYS_AFSR1_EL12); break; + case FAR_EL1: val = read_sysreg_s(SYS_FAR_EL12); break; + case MAIR_EL1: val = read_sysreg_s(SYS_MAIR_EL12); break; + case VBAR_EL1: val = read_sysreg_s(SYS_VBAR_EL12); break; + case CONTEXTIDR_EL1: val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break; + case AMAIR_EL1: val = read_sysreg_s(SYS_AMAIR_EL12); break; + case CNTKCTL_EL1: val = read_sysreg_s(SYS_CNTKCTL_EL12); break; + case ELR_EL1: val = read_sysreg_s(SYS_ELR_EL12); break; + case SPSR_EL1: val = read_sysreg_s(SYS_SPSR_EL12); break; + case ZCR_EL1: val = read_sysreg_s(SYS_ZCR_EL12); break; + case SCTLR2_EL1: val = read_sysreg_s(SYS_SCTLR2_EL12); break; + case TPIDR_EL0: val = read_sysreg_s(SYS_TPIDR_EL0); break; + case TPIDRRO_EL0: val = read_sysreg_s(SYS_TPIDRRO_EL0); break; + case TPIDR_EL1: val = read_sysreg_s(SYS_TPIDR_EL1); break; + case PAR_EL1: val = read_sysreg_par(); break; + case DACR32_EL2: val = read_sysreg_s(SYS_DACR32_EL2); break; + case IFSR32_EL2: val = read_sysreg_s(SYS_IFSR32_EL2); break; + case DBGVCR32_EL2: val = read_sysreg_s(SYS_DBGVCR32_EL2); break; + default: WARN_ON_ONCE(1); + } + + return val; +} + +static void write_sr_to_cpu(enum vcpu_sysreg reg, u64 val) +{ + switch (reg) { + case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break; + case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break; + case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break; + case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break; + case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break; + case TCR2_EL1: write_sysreg_s(val, SYS_TCR2_EL12); break; + case PIR_EL1: write_sysreg_s(val, SYS_PIR_EL12); break; + case PIRE0_EL1: write_sysreg_s(val, SYS_PIRE0_EL12); break; + case POR_EL1: write_sysreg_s(val, SYS_POR_EL12); break; + case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break; + case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break; + case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break; + case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break; + case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break; + case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break; + case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break; + case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break; + case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break; + case ELR_EL1: write_sysreg_s(val, SYS_ELR_EL12); break; + case SPSR_EL1: write_sysreg_s(val, SYS_SPSR_EL12); break; + case ZCR_EL1: 
write_sysreg_s(val, SYS_ZCR_EL12); break; + case SCTLR2_EL1: write_sysreg_s(val, SYS_SCTLR2_EL12); break; + case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break; + case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); break; + case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break; + case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break; + case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break; + case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break; + case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break; + default: WARN_ON_ONCE(1); + } +} + +u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg reg) +{ + struct sr_loc loc = {}; + + locate_register(vcpu, reg, &loc); - if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) { - if (!is_hyp_ctxt(vcpu)) - goto memory_read; + WARN_ON_ONCE(!has_vhe() && loc.loc != SR_LOC_MEMORY); + + if (loc.loc & SR_LOC_SPECIAL) { + u64 val; + + WARN_ON_ONCE(loc.loc & ~SR_LOC_SPECIAL); /* - * CNTHCTL_EL2 requires some special treatment to - * account for the bits that can be set via CNTKCTL_EL1. + * CNTHCTL_EL2 requires some special treatment to account + * for the bits that can be set via CNTKCTL_EL1 when E2H==1. */ switch (reg) { case CNTHCTL_EL2: - if (vcpu_el2_e2h_is_set(vcpu)) { - val = read_sysreg_el1(SYS_CNTKCTL); - val &= CNTKCTL_VALID_BITS; - val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS; - return val; - } - break; + val = read_sysreg_el1(SYS_CNTKCTL); + val &= CNTKCTL_VALID_BITS; + val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS; + return val; + default: + WARN_ON_ONCE(1); } + } - /* - * If this register does not have an EL1 counterpart, - * then read the stored EL2 version. - */ - if (reg == el1r) - goto memory_read; + if (loc.loc & SR_LOC_LOADED) { + enum vcpu_sysreg map_reg = reg; - /* - * If we have a non-VHE guest and that the sysreg - * requires translation to be used at EL1, use the - * in-memory copy instead. - */ - if (!vcpu_el2_e2h_is_set(vcpu) && xlate) - goto memory_read; + if (loc.loc & SR_LOC_MAPPED) + map_reg = loc.map_reg; - /* Get the current version of the EL1 counterpart. */ - WARN_ON(!__vcpu_read_sys_reg_from_cpu(el1r, &val)); - if (reg >= __SANITISED_REG_START__) - val = kvm_vcpu_apply_reg_masks(vcpu, reg, val); + if (!(loc.loc & SR_LOC_XLATED)) { + u64 val = read_sr_from_cpu(map_reg); - return val; - } + if (reg >= __SANITISED_REG_START__) + val = kvm_vcpu_apply_reg_masks(vcpu, reg, val); - /* EL1 register can't be on the CPU if the guest is in vEL2. */ - if (unlikely(is_hyp_ctxt(vcpu))) - goto memory_read; - - if (__vcpu_read_sys_reg_from_cpu(reg, &val)) - return val; + return val; + } + } -memory_read: return __vcpu_sys_reg(vcpu, reg); } -void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) +void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, enum vcpu_sysreg reg) { - u64 (*xlate)(u64) = NULL; - unsigned int el1r; + struct sr_loc loc = {}; - if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) - goto memory_write; + locate_register(vcpu, reg, &loc); - if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) { - if (!is_hyp_ctxt(vcpu)) - goto memory_write; + WARN_ON_ONCE(!has_vhe() && loc.loc != SR_LOC_MEMORY); - /* - * Always store a copy of the write to memory to avoid having - * to reverse-translate virtual EL2 system registers for a - * non-VHE guest hypervisor. 
- */ - __vcpu_assign_sys_reg(vcpu, reg, val); + if (loc.loc & SR_LOC_SPECIAL) { + + WARN_ON_ONCE(loc.loc & ~SR_LOC_SPECIAL); switch (reg) { case CNTHCTL_EL2: /* - * If E2H=0, CNHTCTL_EL2 is a pure shadow register. - * Otherwise, some of the bits are backed by + * If E2H=1, some of the bits are backed by * CNTKCTL_EL1, while the rest is kept in memory. * Yes, this is fun stuff. */ - if (vcpu_el2_e2h_is_set(vcpu)) - write_sysreg_el1(val, SYS_CNTKCTL); - return; + write_sysreg_el1(val, SYS_CNTKCTL); + break; + default: + WARN_ON_ONCE(1); } + } - /* No EL1 counterpart? We're done here.? */ - if (reg == el1r) - return; + if (loc.loc & SR_LOC_LOADED) { + enum vcpu_sysreg map_reg = reg; + u64 xlated_val; - if (!vcpu_el2_e2h_is_set(vcpu) && xlate) - val = xlate(val); + if (reg >= __SANITISED_REG_START__) + val = kvm_vcpu_apply_reg_masks(vcpu, reg, val); - /* Redirect this to the EL1 version of the register. */ - WARN_ON(!__vcpu_write_sys_reg_to_cpu(val, el1r)); - return; - } + if (loc.loc & SR_LOC_MAPPED) + map_reg = loc.map_reg; - /* EL1 register can't be on the CPU if the guest is in vEL2. */ - if (unlikely(is_hyp_ctxt(vcpu))) - goto memory_write; + if (loc.loc & SR_LOC_XLATED) + xlated_val = loc.xlate(val); + else + xlated_val = val; - if (__vcpu_write_sys_reg_to_cpu(val, reg)) - return; + write_sr_to_cpu(map_reg, xlated_val); + + /* + * Fall through to write the backing store anyway, which + * allows translated registers to be directly read without a + * reverse translation. + */ + } -memory_write: __vcpu_assign_sys_reg(vcpu, reg, val); } @@ -1584,6 +1710,7 @@ static u8 pmuver_to_perfmon(u8 pmuver) } static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val); +static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val); static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val); /* Read a sanitised cpufeature ID register by sys_reg_desc */ @@ -1606,19 +1733,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val = sanitise_id_aa64pfr0_el1(vcpu, val); break; case SYS_ID_AA64PFR1_EL1: - if (!kvm_has_mte(vcpu->kvm)) { - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac); - } - - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_RNDR_trap); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_NMI); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_GCS); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_THE); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MPAM_frac); + val = sanitise_id_aa64pfr1_el1(vcpu, val); break; case SYS_ID_AA64PFR2_EL1: val &= ID_AA64PFR2_EL1_FPMR | @@ -1628,21 +1743,22 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) - val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI)); + val &= ~(ID_AA64ISAR1_EL1_APA | + ID_AA64ISAR1_EL1_API | + ID_AA64ISAR1_EL1_GPA | + ID_AA64ISAR1_EL1_GPI); break; case SYS_ID_AA64ISAR2_EL1: if (!vcpu_has_ptrauth(vcpu)) - val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); + val &= ~(ID_AA64ISAR2_EL1_APA3 | + ID_AA64ISAR2_EL1_GPA3); if (!cpus_have_final_cap(ARM64_HAS_WFXT) || has_broken_cntvoff()) - val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT); + 
val &= ~ID_AA64ISAR2_EL1_WFxT; break; case SYS_ID_AA64ISAR3_EL1: - val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_FAMINMAX; + val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_LSFE | + ID_AA64ISAR3_EL1_FAMINMAX; break; case SYS_ID_AA64MMFR2_EL1: val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; @@ -1655,7 +1771,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, ID_AA64MMFR3_EL1_S1PIE; break; case SYS_ID_MMFR4_EL1: - val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX); + val &= ~ID_MMFR4_EL1_CCIDX; break; } @@ -1836,6 +1952,31 @@ static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val) return val; } +static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val) +{ + u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + if (!kvm_has_mte(vcpu->kvm)) { + val &= ~ID_AA64PFR1_EL1_MTE; + val &= ~ID_AA64PFR1_EL1_MTE_frac; + } + + if (!(cpus_have_final_cap(ARM64_HAS_RASV1P1_EXTN) && + SYS_FIELD_GET(ID_AA64PFR0_EL1, RAS, pfr0) == ID_AA64PFR0_EL1_RAS_IMP)) + val &= ~ID_AA64PFR1_EL1_RAS_frac; + + val &= ~ID_AA64PFR1_EL1_SME; + val &= ~ID_AA64PFR1_EL1_RNDR_trap; + val &= ~ID_AA64PFR1_EL1_NMI; + val &= ~ID_AA64PFR1_EL1_GCS; + val &= ~ID_AA64PFR1_EL1_THE; + val &= ~ID_AA64PFR1_EL1_MTEX; + val &= ~ID_AA64PFR1_EL1_PFAR; + val &= ~ID_AA64PFR1_EL1_MPAM_frac; + + return val; +} + static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8); @@ -1857,6 +1998,26 @@ static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) return val; } +/* + * Older versions of KVM erroneously claim support for FEAT_DoubleLock with + * NV-enabled VMs on unsupporting hardware. Silently ignore the incorrect + * value if it is consistent with the bug. + */ +static bool ignore_feat_doublelock(struct kvm_vcpu *vcpu, u64 val) +{ + u8 host, user; + + if (!vcpu_has_nv(vcpu)) + return false; + + host = SYS_FIELD_GET(ID_AA64DFR0_EL1, DoubleLock, + read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1)); + user = SYS_FIELD_GET(ID_AA64DFR0_EL1, DoubleLock, val); + + return host == ID_AA64DFR0_EL1_DoubleLock_NI && + user == ID_AA64DFR0_EL1_DoubleLock_IMP; +} + static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) @@ -1888,6 +2049,11 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, if (debugver < ID_AA64DFR0_EL1_DebugVer_IMP) return -EINVAL; + if (ignore_feat_doublelock(vcpu, val)) { + val &= ~ID_AA64DFR0_EL1_DoubleLock; + val |= SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DoubleLock, NI); + } + return set_id_reg(vcpu, rd, val); } @@ -2008,16 +2174,29 @@ static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, return set_id_reg(vcpu, rd, user_val); } +/* + * Allow userspace to de-feature a stage-2 translation granule but prevent it + * from claiming the impossible. 
+ */ +#define tgran2_val_allowed(tg, safe, user) \ +({ \ + u8 __s = SYS_FIELD_GET(ID_AA64MMFR0_EL1, tg, safe); \ + u8 __u = SYS_FIELD_GET(ID_AA64MMFR0_EL1, tg, user); \ + \ + __s == __u || __u == ID_AA64MMFR0_EL1_##tg##_NI; \ +}) + static int set_id_aa64mmfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 user_val) { u64 sanitized_val = kvm_read_sanitised_id_reg(vcpu, rd); - u64 tgran2_mask = ID_AA64MMFR0_EL1_TGRAN4_2_MASK | - ID_AA64MMFR0_EL1_TGRAN16_2_MASK | - ID_AA64MMFR0_EL1_TGRAN64_2_MASK; - if (vcpu_has_nv(vcpu) && - ((sanitized_val & tgran2_mask) != (user_val & tgran2_mask))) + if (!vcpu_has_nv(vcpu)) + return set_id_reg(vcpu, rd, user_val); + + if (!tgran2_val_allowed(TGRAN4_2, sanitized_val, user_val) || + !tgran2_val_allowed(TGRAN16_2, sanitized_val, user_val) || + !tgran2_val_allowed(TGRAN64_2, sanitized_val, user_val)) return -EINVAL; return set_id_reg(vcpu, rd, user_val); @@ -2697,6 +2876,18 @@ static bool access_ras(struct kvm_vcpu *vcpu, struct kvm *kvm = vcpu->kvm; switch(reg_to_encoding(r)) { + case SYS_ERXPFGCDN_EL1: + case SYS_ERXPFGCTL_EL1: + case SYS_ERXPFGF_EL1: + case SYS_ERXMISC2_EL1: + case SYS_ERXMISC3_EL1: + if (!(kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, V1P1) || + (kvm_has_feat_enum(kvm, ID_AA64PFR0_EL1, RAS, IMP) && + kvm_has_feat(kvm, ID_AA64PFR1_EL1, RAS_frac, RASv1p1)))) { + kvm_inject_undefined(vcpu); + return false; + } + break; default: if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) { kvm_inject_undefined(vcpu); @@ -2929,7 +3120,6 @@ static const struct sys_reg_desc sys_reg_descs[] = { ~(ID_AA64PFR0_EL1_AMU | ID_AA64PFR0_EL1_MPAM | ID_AA64PFR0_EL1_SVE | - ID_AA64PFR0_EL1_RAS | ID_AA64PFR0_EL1_AdvSIMD | ID_AA64PFR0_EL1_FP)), ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1, @@ -2943,7 +3133,6 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64PFR1_EL1_SME | ID_AA64PFR1_EL1_RES0 | ID_AA64PFR1_EL1_MPAM_frac | - ID_AA64PFR1_EL1_RAS_frac | ID_AA64PFR1_EL1_MTE)), ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR | @@ -2991,6 +3180,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64ISAR2_EL1_APA3 | ID_AA64ISAR2_EL1_GPA3)), ID_WRITABLE(ID_AA64ISAR3_EL1, (ID_AA64ISAR3_EL1_FPRCVT | + ID_AA64ISAR3_EL1_LSFE | ID_AA64ISAR3_EL1_FAMINMAX)), ID_UNALLOCATED(6,4), ID_UNALLOCATED(6,5), @@ -3002,8 +3192,6 @@ static const struct sys_reg_desc sys_reg_descs[] = { ~(ID_AA64MMFR0_EL1_RES0 | ID_AA64MMFR0_EL1_ASIDBITS)), ID_WRITABLE(ID_AA64MMFR1_EL1, ~(ID_AA64MMFR1_EL1_RES0 | - ID_AA64MMFR1_EL1_HCX | - ID_AA64MMFR1_EL1_TWED | ID_AA64MMFR1_EL1_XNX | ID_AA64MMFR1_EL1_VH | ID_AA64MMFR1_EL1_VMIDBits)), @@ -3063,8 +3251,13 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_ERXCTLR_EL1), access_ras }, { SYS_DESC(SYS_ERXSTATUS_EL1), access_ras }, { SYS_DESC(SYS_ERXADDR_EL1), access_ras }, + { SYS_DESC(SYS_ERXPFGF_EL1), access_ras }, + { SYS_DESC(SYS_ERXPFGCTL_EL1), access_ras }, + { SYS_DESC(SYS_ERXPFGCDN_EL1), access_ras }, { SYS_DESC(SYS_ERXMISC0_EL1), access_ras }, { SYS_DESC(SYS_ERXMISC1_EL1), access_ras }, + { SYS_DESC(SYS_ERXMISC2_EL1), access_ras }, + { SYS_DESC(SYS_ERXMISC3_EL1), access_ras }, MTE_REG(TFSR_EL1), MTE_REG(TFSRE0_EL1), @@ -3083,6 +3276,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_PMBLIMITR_EL1), undef_access }, { SYS_DESC(SYS_PMBPTR_EL1), undef_access }, { SYS_DESC(SYS_PMBSR_EL1), undef_access }, + { SYS_DESC(SYS_PMSDSFR_EL1), undef_access }, /* PMBIDR_EL1 is not trapped */ { PMU_SYS_REG(PMINTENSET_EL1), diff --git a/arch/arm64/kvm/vgic/vgic-debug.c 
b/arch/arm64/kvm/vgic/vgic-debug.c index 2684f273d9e1..4c1209261b65 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -69,7 +69,7 @@ static int iter_mark_lpis(struct kvm *kvm) int nr_lpis = 0; xa_for_each(&dist->lpi_xa, intid, irq) { - if (!vgic_try_get_irq_kref(irq)) + if (!vgic_try_get_irq_ref(irq)) continue; xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 1e680ad6e863..1796b1a22a72 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -53,7 +53,7 @@ void kvm_vgic_early_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ); + xa_init(&dist->lpi_xa); } /* CREATION */ @@ -208,7 +208,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) raw_spin_lock_init(&irq->irq_lock); irq->vcpu = NULL; irq->target_vcpu = vcpu0; - kref_init(&irq->refcount); + refcount_set(&irq->refcount, 0); switch (dist->vgic_model) { case KVM_DEV_TYPE_ARM_VGIC_V2: irq->targets = 0; @@ -277,7 +277,7 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type) irq->intid = i; irq->vcpu = NULL; irq->target_vcpu = vcpu; - kref_init(&irq->refcount); + refcount_set(&irq->refcount, 0); if (vgic_irq_is_sgi(i)) { /* SGIs */ irq->enabled = 1; @@ -554,7 +554,6 @@ int vgic_lazy_init(struct kvm *kvm) * Also map the virtual CPU interface into the VM. * v2 calls vgic_init() if not already done. * v3 and derivatives return an error if the VGIC is not initialized. - * vgic_ready() returns true if this function has succeeded. */ int kvm_vgic_map_resources(struct kvm *kvm) { @@ -563,12 +562,12 @@ int kvm_vgic_map_resources(struct kvm *kvm) gpa_t dist_base; int ret = 0; - if (likely(vgic_ready(kvm))) + if (likely(smp_load_acquire(&dist->ready))) return 0; mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->arch.config_lock); - if (vgic_ready(kvm)) + if (dist->ready) goto out; if (!irqchip_in_kernel(kvm)) @@ -594,14 +593,7 @@ int kvm_vgic_map_resources(struct kvm *kvm) goto out_slots; } - /* - * kvm_io_bus_register_dev() guarantees all readers see the new MMIO - * registration before returning through synchronize_srcu(), which also - * implies a full memory barrier. As such, marking the distributor as - * 'ready' here is guaranteed to be ordered after all vCPUs having seen - * a completely configured distributor. - */ - dist->ready = true; + smp_store_release(&dist->ready, true); goto out_slots; out: mutex_unlock(&kvm->arch.config_lock); diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 7368c13f16b7..ce3e3ed3f29f 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -78,7 +78,6 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq; - unsigned long flags; int ret; /* In this case there is no put, since we keep the reference. 
*/ @@ -89,7 +88,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, if (!irq) return ERR_PTR(-ENOMEM); - ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT); + ret = xa_reserve(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT); if (ret) { kfree(irq); return ERR_PTR(ret); @@ -99,19 +98,19 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, raw_spin_lock_init(&irq->irq_lock); irq->config = VGIC_CONFIG_EDGE; - kref_init(&irq->refcount); + refcount_set(&irq->refcount, 1); irq->intid = intid; irq->target_vcpu = vcpu; irq->group = 1; - xa_lock_irqsave(&dist->lpi_xa, flags); + xa_lock(&dist->lpi_xa); /* * There could be a race with another vgic_add_lpi(), so we need to * check that we don't add a second list entry with the same LPI. */ oldirq = xa_load(&dist->lpi_xa, intid); - if (vgic_try_get_irq_kref(oldirq)) { + if (vgic_try_get_irq_ref(oldirq)) { /* Someone was faster with adding this LPI, lets use that. */ kfree(irq); irq = oldirq; @@ -126,7 +125,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, } out_unlock: - xa_unlock_irqrestore(&dist->lpi_xa, flags); + xa_unlock(&dist->lpi_xa); if (ret) return ERR_PTR(ret); @@ -547,7 +546,7 @@ static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db, rcu_read_lock(); irq = xa_load(&its->translation_cache, cache_key); - if (!vgic_try_get_irq_kref(irq)) + if (!vgic_try_get_irq_ref(irq)) irq = NULL; rcu_read_unlock(); @@ -571,7 +570,7 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its, * its_lock, as the ITE (and the reference it holds) cannot be freed. */ lockdep_assert_held(&its->its_lock); - vgic_get_irq_kref(irq); + vgic_get_irq_ref(irq); old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT); diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index a3ef185209e9..70d50c77e5dc 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -50,6 +50,14 @@ bool vgic_has_its(struct kvm *kvm) bool vgic_supports_direct_msis(struct kvm *kvm) { + /* + * Deliberately conflate vLPI and vSGI support on GICv4.1 hardware, + * indirectly allowing userspace to control whether or not vPEs are + * allocated for the VM. + */ + if (system_supports_direct_sgis() && !vgic_supports_direct_sgis(kvm)) + return false; + return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm); } diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c index e416e433baff..a573b1f0c6cb 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.c +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -1091,7 +1091,7 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, len = vgic_v3_init_dist_iodev(io_device); break; default: - BUG_ON(1); + BUG(); } io_device->base_addr = dist_base_address; diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index b9ad7c42c5b0..f1c153106c56 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -588,6 +588,7 @@ int vgic_v3_map_resources(struct kvm *kvm) } DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap); +DEFINE_STATIC_KEY_FALSE(vgic_v3_has_v2_compat); static int __init early_group0_trap_cfg(char *buf) { @@ -697,6 +698,13 @@ int vgic_v3_probe(const struct gic_kvm_info *info) if (kvm_vgic_global_state.vcpu_base == 0) kvm_info("disabling GICv2 emulation\n"); + /* + * Flip the static branch if the HW supports v2, even if we're + * not using it (such as in protected mode). 
+ */ + if (has_v2) + static_branch_enable(&vgic_v3_has_v2_compat); + if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_30115)) { group0_trap = true; group1_trap = true; diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index 4d9343d2b0b1..548aec9d5a72 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -518,7 +518,7 @@ static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq) if (!irq->hw || irq->host_irq != host_irq) continue; - if (!vgic_try_get_irq_kref(irq)) + if (!vgic_try_get_irq_ref(irq)) return NULL; return irq; diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c index 6bdbb221bcde..2d3811f4e117 100644 --- a/arch/arm64/kvm/vgic/vgic-v5.c +++ b/arch/arm64/kvm/vgic/vgic-v5.c @@ -15,7 +15,7 @@ int vgic_v5_probe(const struct gic_kvm_info *info) u64 ich_vtr_el2; int ret; - if (!info->has_gcie_v3_compat) + if (!cpus_have_final_cap(ARM64_HAS_GICV5_LEGACY)) return -ENODEV; kvm_vgic_global_state.type = VGIC_V5; diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index f5148b38120a..6dd5a10081e2 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -28,8 +28,8 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { * kvm->arch.config_lock (mutex) * its->cmd_lock (mutex) * its->its_lock (mutex) - * vgic_cpu->ap_list_lock must be taken with IRQs disabled - * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled + * vgic_dist->lpi_xa.xa_lock + * vgic_cpu->ap_list_lock must be taken with IRQs disabled * vgic_irq->irq_lock must be taken with IRQs disabled * * As the ap_list_lock might be taken from the timer interrupt handler, @@ -71,7 +71,7 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) rcu_read_lock(); irq = xa_load(&dist->lpi_xa, intid); - if (!vgic_try_get_irq_kref(irq)) + if (!vgic_try_get_irq_ref(irq)) irq = NULL; rcu_read_unlock(); @@ -114,37 +114,66 @@ struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid) return vgic_get_irq(vcpu->kvm, intid); } -/* - * We can't do anything in here, because we lack the kvm pointer to - * lock and remove the item from the lpi_list. So we keep this function - * empty and use the return value of kref_put() to trigger the freeing. 
- */ -static void vgic_irq_release(struct kref *ref) +static void vgic_release_lpi_locked(struct vgic_dist *dist, struct vgic_irq *irq) +{ + lockdep_assert_held(&dist->lpi_xa.xa_lock); + __xa_erase(&dist->lpi_xa, irq->intid); + kfree_rcu(irq, rcu); +} + +static __must_check bool __vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) +{ + if (irq->intid < VGIC_MIN_LPI) + return false; + + return refcount_dec_and_test(&irq->refcount); +} + +static __must_check bool vgic_put_irq_norelease(struct kvm *kvm, struct vgic_irq *irq) { + if (!__vgic_put_irq(kvm, irq)) + return false; + + irq->pending_release = true; + return true; } void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) { struct vgic_dist *dist = &kvm->arch.vgic; - unsigned long flags; - if (irq->intid < VGIC_MIN_LPI) - return; + if (irq->intid >= VGIC_MIN_LPI) + might_lock(&dist->lpi_xa.xa_lock); - if (!kref_put(&irq->refcount, vgic_irq_release)) + if (!__vgic_put_irq(kvm, irq)) return; - xa_lock_irqsave(&dist->lpi_xa, flags); - __xa_erase(&dist->lpi_xa, irq->intid); - xa_unlock_irqrestore(&dist->lpi_xa, flags); + xa_lock(&dist->lpi_xa); + vgic_release_lpi_locked(dist, irq); + xa_unlock(&dist->lpi_xa); +} - kfree_rcu(irq, rcu); +static void vgic_release_deleted_lpis(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long intid; + struct vgic_irq *irq; + + xa_lock(&dist->lpi_xa); + + xa_for_each(&dist->lpi_xa, intid, irq) { + if (irq->pending_release) + vgic_release_lpi_locked(dist, irq); + } + + xa_unlock(&dist->lpi_xa); } void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq, *tmp; + bool deleted = false; unsigned long flags; raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); @@ -155,11 +184,14 @@ void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) list_del(&irq->ap_list); irq->vcpu = NULL; raw_spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); + deleted |= vgic_put_irq_norelease(vcpu->kvm, irq); } } raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); + + if (deleted) + vgic_release_deleted_lpis(vcpu->kvm); } void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending) @@ -399,7 +431,7 @@ retry: * now in the ap_list. This is safe as the caller must already hold a * reference on the irq. */ - vgic_get_irq_kref(irq); + vgic_get_irq_ref(irq); list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); irq->vcpu = vcpu; @@ -630,6 +662,7 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq, *tmp; + bool deleted_lpis = false; DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); @@ -657,12 +690,12 @@ retry: /* * This vgic_put_irq call matches the - * vgic_get_irq_kref in vgic_queue_irq_unlock, + * vgic_get_irq_ref in vgic_queue_irq_unlock, * where we added the LPI to the ap_list. As * we remove the irq from the list, we drop * also drop the refcount. */ - vgic_put_irq(vcpu->kvm, irq); + deleted_lpis |= vgic_put_irq_norelease(vcpu->kvm, irq); continue; } @@ -725,6 +758,9 @@ retry: } raw_spin_unlock(&vgic_cpu->ap_list_lock); + + if (unlikely(deleted_lpis)) + vgic_release_deleted_lpis(vcpu->kvm); } static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) @@ -818,7 +854,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) * the AP list has been sorted already. 
*/ if (multi_sgi && irq->priority > prio) { - _raw_spin_unlock(&irq->irq_lock); + raw_spin_unlock(&irq->irq_lock); break; } diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 1384a04c0784..ac5f9c5d2b98 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -267,7 +267,7 @@ void vgic_v2_put(struct kvm_vcpu *vcpu); void vgic_v2_save_state(struct kvm_vcpu *vcpu); void vgic_v2_restore_state(struct kvm_vcpu *vcpu); -static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq) +static inline bool vgic_try_get_irq_ref(struct vgic_irq *irq) { if (!irq) return false; @@ -275,12 +275,12 @@ static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq) if (irq->intid < VGIC_MIN_LPI) return true; - return kref_get_unless_zero(&irq->refcount); + return refcount_inc_not_zero(&irq->refcount); } -static inline void vgic_get_irq_kref(struct vgic_irq *irq) +static inline void vgic_get_irq_ref(struct vgic_irq *irq) { - WARN_ON_ONCE(!vgic_try_get_irq_kref(irq)); + WARN_ON_ONCE(!vgic_try_get_irq_ref(irq)); } void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); @@ -396,15 +396,7 @@ bool vgic_supports_direct_sgis(struct kvm *kvm); static inline bool vgic_supports_direct_irqs(struct kvm *kvm) { - /* - * Deliberately conflate vLPI and vSGI support on GICv4.1 hardware, - * indirectly allowing userspace to control whether or not vPEs are - * allocated for the VM. - */ - if (system_supports_direct_sgis()) - return vgic_supports_direct_sgis(kvm); - - return vgic_supports_direct_msis(kvm); + return vgic_supports_direct_msis(kvm) || vgic_supports_direct_sgis(kvm); } int vgic_v4_init(struct kvm *kvm); |
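
To make the vcpu_read_sys_reg() rework in the sys_regs.c hunks above easier to follow, here is a rough, self-contained sketch of the flag-driven lookup it introduces. Only the SR_LOC_* names and the order of the checks come from the patch; the demo_* types and helpers are invented for illustration, the SR_LOC_SPECIAL path (the CNTHCTL_EL2 bit merging) is deliberately stubbed out, and the per-VM sanitisation masks applied by the real code are omitted.

/* Simplified sketch only -- not the KVM implementation. */
#include <stdint.h>

enum {
	SR_LOC_MEMORY  = 0,        /* only the in-memory copy is valid        */
	SR_LOC_LOADED  = 1 << 0,   /* register state is currently on the CPU  */
	SR_LOC_MAPPED  = 1 << 1,   /* reachable through another register slot */
	SR_LOC_XLATED  = 1 << 2,   /* CPU copy holds a translated encoding    */
	SR_LOC_SPECIAL = 1 << 3,   /* bespoke handling (e.g. CNTHCTL_EL2)     */
};

struct demo_loc {
	unsigned int	flags;
	int		map_reg;	/* valid when SR_LOC_MAPPED is set */
};

/* Stand-ins for the real CPU/backing-store accessors. */
static uint64_t demo_read_from_cpu(int reg)    { return 0xc0de0000u + reg; }
static uint64_t demo_read_from_memory(int reg) { return 0xdead0000u + reg; }

static uint64_t demo_read_sys_reg(const struct demo_loc *loc, int reg)
{
	if (loc->flags & SR_LOC_SPECIAL)
		return demo_read_from_memory(reg);	/* real code merges live bits here */

	if (loc->flags & SR_LOC_LOADED) {
		int hw_reg = (loc->flags & SR_LOC_MAPPED) ? loc->map_reg : reg;

		/*
		 * Translated registers are not read back from the CPU:
		 * writes also hit the backing store, so memory is current.
		 */
		if (!(loc->flags & SR_LOC_XLATED))
			return demo_read_from_cpu(hw_reg);
	}

	return demo_read_from_memory(reg);
}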
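
The kvm_vgic_map_resources() hunk above replaces the comment about kvm_io_bus_register_dev() ordering with an explicit smp_store_release()/smp_load_acquire() pair on dist->ready. A minimal sketch of that publish/consume idiom follows; the demo_device structure and demo_setup() helper are invented for the example, and the mutex is assumed to have been set up elsewhere with mutex_init().

/* Simplified sketch only -- not the KVM code. */
#include <linux/mutex.h>
#include <asm/barrier.h>

struct demo_device {
	struct mutex	lock;
	void		*resources;	/* set up once, then read locklessly */
	bool		ready;
};

static int demo_setup(struct demo_device *dev, void *resources)
{
	/* Fast path: already published, no locking needed. */
	if (smp_load_acquire(&dev->ready))
		return 0;

	mutex_lock(&dev->lock);
	if (!dev->ready) {			/* re-check under the lock */
		dev->resources = resources;	/* all initialisation ...  */
		smp_store_release(&dev->ready, true);	/* ... then publish */
	}
	mutex_unlock(&dev->lock);
	return 0;
}

A reader that observes ready == true through the load-acquire is guaranteed to also observe the fully initialised resources, which is what the removed synchronize_srcu() comment used to argue informally.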
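
The vgic changes above convert the LPI reference count from a kref to a plain refcount_t and split the final put into a "mark now, reap later" step (vgic_put_irq_norelease() plus vgic_release_deleted_lpis()), so the last reference can be dropped while ap_list_lock is held and the actual erase and free happen afterwards under the xarray lock. Here is a rough, self-contained sketch of that pattern; the demo_* structure and helper names are invented for illustration and are not KVM code.

/* Simplified sketch only -- the demo_* names are invented for the example. */
#include <linux/refcount.h>
#include <linux/xarray.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_irq {
	unsigned long	intid;
	bool		pending_release;
	refcount_t	refcount;
	struct rcu_head	rcu;
};

/* Take a reference only if the object has not already dropped to zero. */
static inline bool demo_try_get(struct demo_irq *irq)
{
	return irq && refcount_inc_not_zero(&irq->refcount);
}

/*
 * Drop a reference without freeing: a caller that still holds other
 * locks only marks the object and reaps it later, outside those locks.
 */
static inline bool demo_put_norelease(struct demo_irq *irq)
{
	if (!refcount_dec_and_test(&irq->refcount))
		return false;

	irq->pending_release = true;
	return true;
}

/* Reap everything that was marked for release, under the xarray lock. */
static void demo_release_deleted(struct xarray *xa)
{
	struct demo_irq *irq;
	unsigned long intid;

	xa_lock(xa);
	xa_for_each(xa, intid, irq) {
		if (irq->pending_release) {
			__xa_erase(xa, irq->intid);
			kfree_rcu(irq, rcu);
		}
	}
	xa_unlock(xa);
}

Compared with a kref release callback, this keeps the free out of contexts that hold ap_list_lock with interrupts disabled, which is also why the lock-ordering comment at the top of vgic.c is updated in the same patch.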