summaryrefslogtreecommitdiff
path: root/arch/arm64/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/kvm')
-rw-r--r--arch/arm64/kvm/Kconfig1
-rw-r--r--arch/arm64/kvm/arm.c31
-rw-r--r--arch/arm64/kvm/at.c382
-rw-r--r--arch/arm64/kvm/config.c358
-rw-r--r--arch/arm64/kvm/debug.c38
-rw-r--r--arch/arm64/kvm/emulate-nested.c3
-rw-r--r--arch/arm64/kvm/handle_exit.c5
-rw-r--r--arch/arm64/kvm/hyp/exception.c20
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h5
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/pkvm.h4
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/trap_handler.h3
-rw-r--r--arch/arm64/kvm/hyp/nvhe/Makefile1
-rw-r--r--arch/arm64/kvm/hyp/nvhe/ffa.c217
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-main.c14
-rw-r--r--arch/arm64/kvm/hyp/nvhe/list_debug.c2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c9
-rw-r--r--arch/arm64/kvm/hyp/nvhe/pkvm.c177
-rw-r--r--arch/arm64/kvm/hyp/nvhe/setup.c12
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c6
-rw-r--r--arch/arm64/kvm/hyp/nvhe/sys_regs.c5
-rw-r--r--arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c2
-rw-r--r--arch/arm64/kvm/hyp/vgic-v3-sr.c25
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c12
-rw-r--r--arch/arm64/kvm/inject_fault.c27
-rw-r--r--arch/arm64/kvm/mmu.c250
-rw-r--r--arch/arm64/kvm/nested.c132
-rw-r--r--arch/arm64/kvm/pkvm.c76
-rw-r--r--arch/arm64/kvm/ptdump.c20
-rw-r--r--arch/arm64/kvm/sys_regs.c474
-rw-r--r--arch/arm64/kvm/vgic/vgic-debug.c2
-rw-r--r--arch/arm64/kvm/vgic/vgic-init.c20
-rw-r--r--arch/arm64/kvm/vgic/vgic-its.c15
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio-v3.c8
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio.c2
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3.c8
-rw-r--r--arch/arm64/kvm/vgic/vgic-v4.c2
-rw-r--r--arch/arm64/kvm/vgic/vgic-v5.c2
-rw-r--r--arch/arm64/kvm/vgic/vgic.c80
-rw-r--r--arch/arm64/kvm/vgic/vgic.h18
39 files changed, 1706 insertions, 762 deletions
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 713248f240e0..bff62e75d681 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -37,6 +37,7 @@ menuconfig KVM
select HAVE_KVM_VCPU_RUN_PID_CHANGE
select SCHED_INFO
select GUEST_PERF_EVENTS if PERF_EVENTS
+ select KVM_GUEST_MEMFD
help
Support hosting virtualized guest machines.
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 888f7c7abf54..fa79744290f3 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -170,10 +170,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (ret)
return ret;
- ret = pkvm_init_host_vm(kvm);
- if (ret)
- goto err_unshare_kvm;
-
if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) {
ret = -ENOMEM;
goto err_unshare_kvm;
@@ -184,6 +180,16 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (ret)
goto err_free_cpumask;
+ if (is_protected_kvm_enabled()) {
+ /*
+ * If any failures occur after this is successful, make sure to
+ * call __pkvm_unreserve_vm to unreserve the VM in hyp.
+ */
+ ret = pkvm_init_host_vm(kvm);
+ if (ret)
+ goto err_free_cpumask;
+ }
+
kvm_vgic_early_init(kvm);
kvm_timer_init_vm(kvm);
@@ -2113,8 +2119,10 @@ static void cpu_hyp_init_features(void)
{
cpu_set_hyp_vector();
- if (is_kernel_in_hyp_mode())
+ if (is_kernel_in_hyp_mode()) {
kvm_timer_init_vhe();
+ kvm_debug_init_vhe();
+ }
if (vgic_present)
kvm_vgic_init_cpu_hardware();
@@ -2315,8 +2323,9 @@ static int __init init_subsystems(void)
}
if (kvm_mode == KVM_MODE_NV &&
- !(vgic_present && kvm_vgic_global_state.type == VGIC_V3)) {
- kvm_err("NV support requires GICv3, giving up\n");
+ !(vgic_present && (kvm_vgic_global_state.type == VGIC_V3 ||
+ kvm_vgic_global_state.has_gcie_v3_compat))) {
+ kvm_err("NV support requires GICv3 or GICv5 with legacy support, giving up\n");
err = -EINVAL;
goto out;
}
@@ -2408,12 +2417,12 @@ static u64 get_hyp_id_aa64pfr0_el1(void)
*/
u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
- val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) |
- ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3));
+ val &= ~(ID_AA64PFR0_EL1_CSV2 |
+ ID_AA64PFR0_EL1_CSV3);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2),
+ val |= FIELD_PREP(ID_AA64PFR0_EL1_CSV2,
arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3),
+ val |= FIELD_PREP(ID_AA64PFR0_EL1_CSV3,
arm64_get_meltdown_state() == SPECTRE_UNAFFECTED);
return val;
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 0e5610533949..20bb9af125b1 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -28,9 +28,57 @@ static int get_ia_size(struct s1_walk_info *wi)
/* Return true if the IPA is out of the OA range */
static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
{
+ if (wi->pa52bit)
+ return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits));
return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
}
+static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr)
+{
+ switch (BIT(wi->pgshift)) {
+ case SZ_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52))
+ return false;
+ return ((wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_PS_MASK, tcr) :
+ FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110);
+ case SZ_16K:
+ if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT))
+ return false;
+ break;
+ case SZ_4K:
+ if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT))
+ return false;
+ break;
+ }
+
+ return (tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS));
+}
+
+static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc)
+{
+ u64 addr;
+
+ if (!wi->pa52bit)
+ return desc & GENMASK_ULL(47, wi->pgshift);
+
+ switch (BIT(wi->pgshift)) {
+ case SZ_4K:
+ case SZ_16K:
+ addr = desc & GENMASK_ULL(49, wi->pgshift);
+ addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50;
+ break;
+ case SZ_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ addr = desc & GENMASK_ULL(47, wi->pgshift);
+ addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48;
+ break;
+ }
+
+ return addr;
+}
+
/* Return the translation regime that applies to an AT instruction */
static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
{
@@ -50,21 +98,26 @@ static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 o
}
}
+static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime)
+{
+ if (regime == TR_EL10) {
+ if (vcpu_has_nv(vcpu) &&
+ !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En))
+ return 0;
+
+ return vcpu_read_sys_reg(vcpu, TCR2_EL1);
+ }
+
+ return vcpu_read_sys_reg(vcpu, TCR2_EL2);
+}
+
static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
{
if (!kvm_has_s1pie(vcpu->kvm))
return false;
- switch (regime) {
- case TR_EL2:
- case TR_EL20:
- return vcpu_read_sys_reg(vcpu, TCR2_EL2) & TCR2_EL2_PIE;
- case TR_EL10:
- return (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) &&
- (__vcpu_sys_reg(vcpu, TCR2_EL1) & TCR2_EL1_PIE);
- default:
- BUG();
- }
+ /* Abuse TCR2_EL1_PIE and use it for EL2 as well */
+ return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE;
}
static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi)
@@ -76,23 +129,11 @@ static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi)
return;
}
- switch (wi->regime) {
- case TR_EL2:
- case TR_EL20:
- val = vcpu_read_sys_reg(vcpu, TCR2_EL2);
- wi->poe = val & TCR2_EL2_POE;
- wi->e0poe = (wi->regime == TR_EL20) && (val & TCR2_EL2_E0POE);
- break;
- case TR_EL10:
- if (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) {
- wi->poe = wi->e0poe = false;
- return;
- }
+ val = effective_tcr2(vcpu, wi->regime);
- val = __vcpu_sys_reg(vcpu, TCR2_EL1);
- wi->poe = val & TCR2_EL1_POE;
- wi->e0poe = val & TCR2_EL1_E0POE;
- }
+ /* Abuse TCR2_EL1_* for EL2 */
+ wi->poe = val & TCR2_EL1_POE;
+ wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE);
}
static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
@@ -102,14 +143,16 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
unsigned int stride, x;
bool va55, tbi, lva;
- hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
-
va55 = va & BIT(55);
- if (wi->regime == TR_EL2 && va55)
- goto addrsz;
-
- wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));
+ if (vcpu_has_nv(vcpu)) {
+ hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
+ wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));
+ } else {
+ WARN_ON_ONCE(wi->regime != TR_EL10);
+ wi->s2 = false;
+ hcr = 0;
+ }
switch (wi->regime) {
case TR_EL10:
@@ -131,6 +174,46 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
BUG();
}
+ /* Someone was silly enough to encode TG0/TG1 differently */
+ if (va55 && wi->regime != TR_EL2) {
+ wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
+ tg = FIELD_GET(TCR_TG1_MASK, tcr);
+
+ switch (tg << TCR_TG1_SHIFT) {
+ case TCR_TG1_4K:
+ wi->pgshift = 12; break;
+ case TCR_TG1_16K:
+ wi->pgshift = 14; break;
+ case TCR_TG1_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ wi->pgshift = 16; break;
+ }
+ } else {
+ wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
+ tg = FIELD_GET(TCR_TG0_MASK, tcr);
+
+ switch (tg << TCR_TG0_SHIFT) {
+ case TCR_TG0_4K:
+ wi->pgshift = 12; break;
+ case TCR_TG0_16K:
+ wi->pgshift = 14; break;
+ case TCR_TG0_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ wi->pgshift = 16; break;
+ }
+ }
+
+ wi->pa52bit = has_52bit_pa(vcpu, wi, tcr);
+
+ ia_bits = get_ia_size(wi);
+
+ /* AArch64.S1StartLevel() */
+ stride = wi->pgshift - 3;
+ wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
+
+ if (wi->regime == TR_EL2 && va55)
+ goto addrsz;
+
tbi = (wi->regime == TR_EL2 ?
FIELD_GET(TCR_EL2_TBI, tcr) :
(va55 ?
@@ -140,6 +223,12 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
if (!tbi && (u64)sign_extend64(va, 55) != va)
goto addrsz;
+ wi->sh = (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_SH0_MASK, tcr) :
+ (va55 ?
+ FIELD_GET(TCR_SH1_MASK, tcr) :
+ FIELD_GET(TCR_SH0_MASK, tcr)));
+
va = (u64)sign_extend64(va, 55);
/* Let's put the MMU disabled case aside immediately */
@@ -194,53 +283,20 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
/* R_BVXDG */
wi->hpd |= (wi->poe || wi->e0poe);
- /* Someone was silly enough to encode TG0/TG1 differently */
- if (va55) {
- wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
- tg = FIELD_GET(TCR_TG1_MASK, tcr);
-
- switch (tg << TCR_TG1_SHIFT) {
- case TCR_TG1_4K:
- wi->pgshift = 12; break;
- case TCR_TG1_16K:
- wi->pgshift = 14; break;
- case TCR_TG1_64K:
- default: /* IMPDEF: treat any other value as 64k */
- wi->pgshift = 16; break;
- }
- } else {
- wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
- tg = FIELD_GET(TCR_TG0_MASK, tcr);
-
- switch (tg << TCR_TG0_SHIFT) {
- case TCR_TG0_4K:
- wi->pgshift = 12; break;
- case TCR_TG0_16K:
- wi->pgshift = 14; break;
- case TCR_TG0_64K:
- default: /* IMPDEF: treat any other value as 64k */
- wi->pgshift = 16; break;
- }
- }
-
/* R_PLCGL, R_YXNYW */
if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
if (wi->txsz > 39)
- goto transfault_l0;
+ goto transfault;
} else {
if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
- goto transfault_l0;
+ goto transfault;
}
/* R_GTJBY, R_SXWGM */
switch (BIT(wi->pgshift)) {
case SZ_4K:
- lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT);
- lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
- break;
case SZ_16K:
- lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT);
- lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
+ lva = wi->pa52bit;
break;
case SZ_64K:
lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
@@ -248,38 +304,42 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
}
if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
- goto transfault_l0;
-
- ia_bits = get_ia_size(wi);
+ goto transfault;
/* R_YYVYV, I_THCZK */
if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
(va55 && va < GENMASK(63, ia_bits)))
- goto transfault_l0;
+ goto transfault;
/* I_ZFSYQ */
if (wi->regime != TR_EL2 &&
(tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
- goto transfault_l0;
+ goto transfault;
/* R_BNDVG and following statements */
if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
- goto transfault_l0;
-
- /* AArch64.S1StartLevel() */
- stride = wi->pgshift - 3;
- wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
+ goto transfault;
ps = (wi->regime == TR_EL2 ?
FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
- wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));
+ wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit));
/* Compute minimal alignment */
x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
wi->baddr = ttbr & TTBRx_EL1_BADDR;
+ if (wi->pa52bit) {
+ /*
+ * Force the alignment on 64 bytes for top-level tables
+ * smaller than 8 entries, since TTBR.BADDR[5:2] are used to
+ * store bits [51:48] of the first level of lookup.
+ */
+ x = max(x, 6);
+
+ wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48;
+ }
/* R_VPBBF */
if (check_output_size(wi->baddr, wi))
@@ -289,12 +349,17 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
return 0;
-addrsz: /* Address Size Fault level 0 */
+addrsz:
+ /*
+ * Address Size Fault level 0 to indicate it comes from TTBR.
+ * yes, this is an oddity.
+ */
fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false);
return -EFAULT;
-transfault_l0: /* Translation Fault level 0 */
- fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false);
+transfault:
+ /* Translation Fault on start level */
+ fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false);
return -EFAULT;
}
@@ -339,6 +404,17 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
ipa = kvm_s2_trans_output(&s2_trans);
}
+ if (wi->filter) {
+ ret = wi->filter->fn(&(struct s1_walk_context)
+ {
+ .wi = wi,
+ .table_ipa = baddr,
+ .level = level,
+ }, wi->filter->priv);
+ if (ret)
+ return ret;
+ }
+
ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
if (ret) {
fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false);
@@ -369,7 +445,7 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
}
- baddr = desc & GENMASK_ULL(47, wi->pgshift);
+ baddr = desc_to_oa(wi, desc);
/* Check for out-of-range OA */
if (check_output_size(baddr, wi))
@@ -386,11 +462,11 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
switch (BIT(wi->pgshift)) {
case SZ_4K:
- valid_block = level == 1 || level == 2;
+ valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0);
break;
case SZ_16K:
case SZ_64K:
- valid_block = level == 2;
+ valid_block = level == 2 || (wi->pa52bit && level == 1);
break;
}
@@ -398,7 +474,8 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
goto transfault;
}
- if (check_output_size(desc & GENMASK(47, va_bottom), wi))
+ baddr = desc_to_oa(wi, desc);
+ if (check_output_size(baddr & GENMASK(52, va_bottom), wi))
goto addrsz;
if (!(desc & PTE_AF)) {
@@ -411,7 +488,7 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
wr->failed = false;
wr->level = level;
wr->desc = desc;
- wr->pa = desc & GENMASK(47, va_bottom);
+ wr->pa = baddr & GENMASK(52, va_bottom);
wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG);
@@ -640,21 +717,36 @@ static u8 combine_s1_s2_attr(u8 s1, u8 s2)
#define ATTR_OSH 0b10
#define ATTR_ISH 0b11
-static u8 compute_sh(u8 attr, u64 desc)
+static u8 compute_final_sh(u8 attr, u8 sh)
{
- u8 sh;
-
/* Any form of device, as well as NC has SH[1:0]=0b10 */
if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC))
return ATTR_OSH;
- sh = FIELD_GET(PTE_SHARED, desc);
if (sh == ATTR_RSV) /* Reserved, mapped to NSH */
sh = ATTR_NSH;
return sh;
}
+static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr,
+ u8 attr)
+{
+ u8 sh;
+
+ /*
+ * non-52bit and LPA have their basic shareability described in the
+ * descriptor. LPA2 gets it from the corresponding field in TCR,
+ * conveniently recorded in the walk info.
+ */
+ if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K)
+ sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc);
+ else
+ sh = wi->sh;
+
+ return compute_final_sh(attr, sh);
+}
+
static u8 combine_sh(u8 s1_sh, u8 s2_sh)
{
if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
@@ -668,7 +760,7 @@ static u8 combine_sh(u8 s1_sh, u8 s2_sh)
static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
struct kvm_s2_trans *tr)
{
- u8 s1_parattr, s2_memattr, final_attr;
+ u8 s1_parattr, s2_memattr, final_attr, s2_sh;
u64 par;
/* If S2 has failed to translate, report the damage */
@@ -741,17 +833,19 @@ static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
!MEMATTR_IS_DEVICE(final_attr))
final_attr = MEMATTR(NC, NC);
+ s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc);
+
par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
par |= tr->output & GENMASK(47, 12);
par |= FIELD_PREP(SYS_PAR_EL1_SH,
combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
- compute_sh(final_attr, tr->desc)));
+ compute_final_sh(final_attr, s2_sh)));
return par;
}
-static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr,
- enum trans_regime regime)
+static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
+ struct s1_walk_result *wr)
{
u64 par;
@@ -764,9 +858,9 @@ static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr,
} else if (wr->level == S1_MMU_DISABLED) {
/* MMU off or HCR_EL2.DC == 1 */
par = SYS_PAR_EL1_NSE;
- par |= wr->pa & GENMASK_ULL(47, 12);
+ par |= wr->pa & SYS_PAR_EL1_PA;
- if (regime == TR_EL10 &&
+ if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) &&
(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
MEMATTR(WbRaWa, WbRaWa));
@@ -781,14 +875,14 @@ static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr,
par = SYS_PAR_EL1_NSE;
- mair = (regime == TR_EL10 ?
+ mair = (wi->regime == TR_EL10 ?
vcpu_read_sys_reg(vcpu, MAIR_EL1) :
vcpu_read_sys_reg(vcpu, MAIR_EL2));
mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
mair &= 0xff;
- sctlr = (regime == TR_EL10 ?
+ sctlr = (wi->regime == TR_EL10 ?
vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
vcpu_read_sys_reg(vcpu, SCTLR_EL2));
@@ -797,9 +891,9 @@ static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr,
mair = MEMATTR(NC, NC);
par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
- par |= wr->pa & GENMASK_ULL(47, 12);
+ par |= wr->pa & SYS_PAR_EL1_PA;
- sh = compute_sh(mair, wr->desc);
+ sh = compute_s1_sh(wi, wr, mair);
par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
}
@@ -873,7 +967,7 @@ static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu,
wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN);
break;
case TR_EL10:
- wxn = (__vcpu_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN);
+ wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN);
break;
}
@@ -1186,7 +1280,7 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false);
compute_par:
- return compute_par_s1(vcpu, &wr, wi.regime);
+ return compute_par_s1(vcpu, &wi, &wr);
}
/*
@@ -1202,7 +1296,7 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
struct mmu_config config;
struct kvm_s2_mmu *mmu;
- bool fail;
+ bool fail, mmu_cs;
u64 par;
par = SYS_PAR_EL1_F;
@@ -1218,8 +1312,13 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
* If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
* the right one (as we trapped from vEL2). If not, save the
* full MMU context.
+ *
+ * We are also guaranteed to be in the correct context if
+ * we're not in a nested VM.
*/
- if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))
+ mmu_cs = (vcpu_has_nv(vcpu) &&
+ !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)));
+ if (!mmu_cs)
goto skip_mmu_switch;
/*
@@ -1287,7 +1386,7 @@ skip_mmu_switch:
write_sysreg_hcr(HCR_HOST_VHE_FLAGS);
- if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)))
+ if (mmu_cs)
__mmu_config_restore(&config);
return par;
@@ -1420,10 +1519,10 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
return;
/*
- * If we only have a single stage of translation (E2H=0 or
- * TGE=1), exit early. Same thing if {VM,DC}=={0,0}.
+ * If we only have a single stage of translation (EL2&0), exit
+ * early. Same thing if {VM,DC}=={0,0}.
*/
- if (!vcpu_el2_e2h_is_set(vcpu) || vcpu_el2_tge_is_set(vcpu) ||
+ if (compute_translation_regime(vcpu, op) == TR_EL20 ||
!(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
return;
@@ -1470,3 +1569,68 @@ int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
return 0;
}
+
+struct desc_match {
+ u64 ipa;
+ int level;
+};
+
+static int match_s1_desc(struct s1_walk_context *ctxt, void *priv)
+{
+ struct desc_match *dm = priv;
+ u64 ipa = dm->ipa;
+
+ /* Use S1 granule alignment */
+ ipa &= GENMASK(51, ctxt->wi->pgshift);
+
+ /* Not the IPA we're looking for? Continue. */
+ if (ipa != ctxt->table_ipa)
+ return 0;
+
+ /* Note the level and interrupt the walk */
+ dm->level = ctxt->level;
+ return -EINTR;
+}
+
+int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
+{
+ struct desc_match dm = {
+ .ipa = ipa,
+ };
+ struct s1_walk_info wi = {
+ .filter = &(struct s1_walk_filter){
+ .fn = match_s1_desc,
+ .priv = &dm,
+ },
+ .regime = TR_EL10,
+ .as_el0 = false,
+ .pan = false,
+ };
+ struct s1_walk_result wr = {};
+ int ret;
+
+ ret = setup_s1_walk(vcpu, &wi, &wr, va);
+ if (ret)
+ return ret;
+
+ /* We really expect the S1 MMU to be on here... */
+ if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) {
+ *level = 0;
+ return 0;
+ }
+
+ /* Walk the guest's PT, looking for a match along the way */
+ ret = walk_s1(vcpu, &wi, &wr, va);
+ switch (ret) {
+ case -EINTR:
+ /* We interrupted the walk on a match, return the level */
+ *level = dm.level;
+ return 0;
+ case 0:
+ /* The walk completed, we failed to find the entry */
+ return -ENOENT;
+ default:
+ /* Any other error... */
+ return ret;
+ }
+}
diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c
index da66c4a14775..fbd8944a3dea 100644
--- a/arch/arm64/kvm/config.c
+++ b/arch/arm64/kvm/config.c
@@ -7,12 +7,22 @@
#include <linux/kvm_host.h>
#include <asm/sysreg.h>
+/*
+ * Describes the dependencies between a set of bits (or the negation
+ * of a set of RES0 bits) and a feature. The flags indicate how the
+ * data is interpreted.
+ */
struct reg_bits_to_feat_map {
- u64 bits;
+ union {
+ u64 bits;
+ u64 *res0p;
+ };
#define NEVER_FGU BIT(0) /* Can trap, but never UNDEF */
#define CALL_FUNC BIT(1) /* Needs to evaluate tons of crap */
#define FIXED_VALUE BIT(2) /* RAZ/WI or RAO/WI in KVM */
+#define RES0_POINTER BIT(3) /* Pointer to RES0 value instead of bits */
+
unsigned long flags;
union {
@@ -28,9 +38,27 @@ struct reg_bits_to_feat_map {
};
};
-#define __NEEDS_FEAT_3(m, f, id, fld, lim) \
+/*
+ * Describes the dependencies for a given register:
+ *
+ * @feat_map describes the dependency for the whole register. If the
+ * features the register depends on are not present, the whole
+ * register is effectively RES0.
+ *
+ * @bit_feat_map describes the dependencies for a set of bits in that
+ * register. If the features these bits depend on are not present, the
+ * bits are effectively RES0.
+ */
+struct reg_feat_map_desc {
+ const char *name;
+ const struct reg_bits_to_feat_map feat_map;
+ const struct reg_bits_to_feat_map *bit_feat_map;
+ const unsigned int bit_feat_map_sz;
+};
+
+#define __NEEDS_FEAT_3(m, f, w, id, fld, lim) \
{ \
- .bits = (m), \
+ .w = (m), \
.flags = (f), \
.regidx = IDREG_IDX(SYS_ ## id), \
.shift = id ##_## fld ## _SHIFT, \
@@ -39,28 +67,63 @@ struct reg_bits_to_feat_map {
.lo_lim = id ##_## fld ##_## lim \
}
-#define __NEEDS_FEAT_2(m, f, fun, dummy) \
+#define __NEEDS_FEAT_2(m, f, w, fun, dummy) \
{ \
- .bits = (m), \
+ .w = (m), \
.flags = (f) | CALL_FUNC, \
.fval = (fun), \
}
-#define __NEEDS_FEAT_1(m, f, fun) \
+#define __NEEDS_FEAT_1(m, f, w, fun) \
{ \
- .bits = (m), \
+ .w = (m), \
.flags = (f) | CALL_FUNC, \
.match = (fun), \
}
+#define __NEEDS_FEAT_FLAG(m, f, w, ...) \
+ CONCATENATE(__NEEDS_FEAT_, COUNT_ARGS(__VA_ARGS__))(m, f, w, __VA_ARGS__)
+
#define NEEDS_FEAT_FLAG(m, f, ...) \
- CONCATENATE(__NEEDS_FEAT_, COUNT_ARGS(__VA_ARGS__))(m, f, __VA_ARGS__)
+ __NEEDS_FEAT_FLAG(m, f, bits, __VA_ARGS__)
#define NEEDS_FEAT_FIXED(m, ...) \
- NEEDS_FEAT_FLAG(m, FIXED_VALUE, __VA_ARGS__, 0)
+ __NEEDS_FEAT_FLAG(m, FIXED_VALUE, bits, __VA_ARGS__, 0)
+#define NEEDS_FEAT_RES0(p, ...) \
+ __NEEDS_FEAT_FLAG(p, RES0_POINTER, res0p, __VA_ARGS__)
+
+/*
+ * Declare the dependency between a set of bits and a set of features,
+ * generating a struct reg_bit_to_feat_map.
+ */
#define NEEDS_FEAT(m, ...) NEEDS_FEAT_FLAG(m, 0, __VA_ARGS__)
+/*
+ * Declare the dependency between a non-FGT register, a set of
+ * feature, and the set of individual bits it contains. This generates
+ * a struct reg_feat_map_desc.
+ */
+#define DECLARE_FEAT_MAP(n, r, m, f) \
+ struct reg_feat_map_desc n = { \
+ .name = #r, \
+ .feat_map = NEEDS_FEAT(~r##_RES0, f), \
+ .bit_feat_map = m, \
+ .bit_feat_map_sz = ARRAY_SIZE(m), \
+ }
+
+/*
+ * Specialised version of the above for FGT registers that have their
+ * RES0 masks described as struct fgt_masks.
+ */
+#define DECLARE_FEAT_MAP_FGT(n, msk, m, f) \
+ struct reg_feat_map_desc n = { \
+ .name = #msk, \
+ .feat_map = NEEDS_FEAT_RES0(&msk.res0, f),\
+ .bit_feat_map = m, \
+ .bit_feat_map_sz = ARRAY_SIZE(m), \
+ }
+
#define FEAT_SPE ID_AA64DFR0_EL1, PMSVer, IMP
#define FEAT_SPE_FnE ID_AA64DFR0_EL1, PMSVer, V1P2
#define FEAT_BRBE ID_AA64DFR0_EL1, BRBE, IMP
@@ -73,6 +136,7 @@ struct reg_bits_to_feat_map {
#define FEAT_AA32EL0 ID_AA64PFR0_EL1, EL0, AARCH32
#define FEAT_AA32EL1 ID_AA64PFR0_EL1, EL1, AARCH32
#define FEAT_AA64EL1 ID_AA64PFR0_EL1, EL1, IMP
+#define FEAT_AA64EL2 ID_AA64PFR0_EL1, EL2, IMP
#define FEAT_AA64EL3 ID_AA64PFR0_EL1, EL3, IMP
#define FEAT_AIE ID_AA64MMFR3_EL1, AIE, IMP
#define FEAT_S2POE ID_AA64MMFR3_EL1, S2POE, IMP
@@ -131,7 +195,6 @@ struct reg_bits_to_feat_map {
#define FEAT_SPMU ID_AA64DFR1_EL1, SPMU, IMP
#define FEAT_SPE_nVM ID_AA64DFR2_EL1, SPE_nVM, IMP
#define FEAT_STEP2 ID_AA64DFR2_EL1, STEP, IMP
-#define FEAT_SYSREG128 ID_AA64ISAR2_EL1, SYSREG_128, IMP
#define FEAT_CPA2 ID_AA64ISAR3_EL1, CPA, CPA2
#define FEAT_ASID2 ID_AA64MMFR4_EL1, ASID2, IMP
#define FEAT_MEC ID_AA64MMFR3_EL1, MEC, IMP
@@ -143,7 +206,6 @@ struct reg_bits_to_feat_map {
#define FEAT_LSMAOC ID_AA64MMFR2_EL1, LSM, IMP
#define FEAT_MixedEnd ID_AA64MMFR0_EL1, BIGEND, IMP
#define FEAT_MixedEndEL0 ID_AA64MMFR0_EL1, BIGENDEL0, IMP
-#define FEAT_MTE2 ID_AA64PFR1_EL1, MTE, MTE2
#define FEAT_MTE_ASYNC ID_AA64PFR1_EL1, MTE_frac, ASYNC
#define FEAT_MTE_STORE_ONLY ID_AA64PFR2_EL1, MTESTOREONLY, IMP
#define FEAT_PAN ID_AA64MMFR1_EL1, PAN, IMP
@@ -151,7 +213,9 @@ struct reg_bits_to_feat_map {
#define FEAT_SSBS ID_AA64PFR1_EL1, SSBS, IMP
#define FEAT_TIDCP1 ID_AA64MMFR1_EL1, TIDCP1, IMP
#define FEAT_FGT ID_AA64MMFR0_EL1, FGT, IMP
+#define FEAT_FGT2 ID_AA64MMFR0_EL1, FGT, FGT2
#define FEAT_MTPMU ID_AA64DFR0_EL1, MTPMU, IMP
+#define FEAT_HCX ID_AA64MMFR1_EL1, HCX, IMP
static bool not_feat_aa64el3(struct kvm *kvm)
{
@@ -397,6 +461,10 @@ static const struct reg_bits_to_feat_map hfgrtr_feat_map[] = {
NEVER_FGU, FEAT_AA64EL1),
};
+
+static const DECLARE_FEAT_MAP_FGT(hfgrtr_desc, hfgrtr_masks,
+ hfgrtr_feat_map, FEAT_FGT);
+
static const struct reg_bits_to_feat_map hfgwtr_feat_map[] = {
NEEDS_FEAT(HFGWTR_EL2_nAMAIR2_EL1 |
HFGWTR_EL2_nMAIR2_EL1,
@@ -461,6 +529,9 @@ static const struct reg_bits_to_feat_map hfgwtr_feat_map[] = {
NEVER_FGU, FEAT_AA64EL1),
};
+static const DECLARE_FEAT_MAP_FGT(hfgwtr_desc, hfgwtr_masks,
+ hfgwtr_feat_map, FEAT_FGT);
+
static const struct reg_bits_to_feat_map hdfgrtr_feat_map[] = {
NEEDS_FEAT(HDFGRTR_EL2_PMBIDR_EL1 |
HDFGRTR_EL2_PMSLATFR_EL1 |
@@ -528,6 +599,9 @@ static const struct reg_bits_to_feat_map hdfgrtr_feat_map[] = {
NEVER_FGU, FEAT_AA64EL1)
};
+static const DECLARE_FEAT_MAP_FGT(hdfgrtr_desc, hdfgrtr_masks,
+ hdfgrtr_feat_map, FEAT_FGT);
+
static const struct reg_bits_to_feat_map hdfgwtr_feat_map[] = {
NEEDS_FEAT(HDFGWTR_EL2_PMSLATFR_EL1 |
HDFGWTR_EL2_PMSIRR_EL1 |
@@ -588,6 +662,8 @@ static const struct reg_bits_to_feat_map hdfgwtr_feat_map[] = {
NEEDS_FEAT(HDFGWTR_EL2_TRFCR_EL1, FEAT_TRF),
};
+static const DECLARE_FEAT_MAP_FGT(hdfgwtr_desc, hdfgwtr_masks,
+ hdfgwtr_feat_map, FEAT_FGT);
static const struct reg_bits_to_feat_map hfgitr_feat_map[] = {
NEEDS_FEAT(HFGITR_EL2_PSBCSYNC, FEAT_SPEv1p5),
@@ -662,6 +738,9 @@ static const struct reg_bits_to_feat_map hfgitr_feat_map[] = {
NEVER_FGU, FEAT_AA64EL1),
};
+static const DECLARE_FEAT_MAP_FGT(hfgitr_desc, hfgitr_masks,
+ hfgitr_feat_map, FEAT_FGT);
+
static const struct reg_bits_to_feat_map hafgrtr_feat_map[] = {
NEEDS_FEAT(HAFGRTR_EL2_AMEVTYPER115_EL0 |
HAFGRTR_EL2_AMEVTYPER114_EL0 |
@@ -704,11 +783,17 @@ static const struct reg_bits_to_feat_map hafgrtr_feat_map[] = {
FEAT_AMUv1),
};
+static const DECLARE_FEAT_MAP_FGT(hafgrtr_desc, hafgrtr_masks,
+ hafgrtr_feat_map, FEAT_FGT);
+
static const struct reg_bits_to_feat_map hfgitr2_feat_map[] = {
NEEDS_FEAT(HFGITR2_EL2_nDCCIVAPS, FEAT_PoPS),
NEEDS_FEAT(HFGITR2_EL2_TSBCSYNC, FEAT_TRBEv1p1)
};
+static const DECLARE_FEAT_MAP_FGT(hfgitr2_desc, hfgitr2_masks,
+ hfgitr2_feat_map, FEAT_FGT2);
+
static const struct reg_bits_to_feat_map hfgrtr2_feat_map[] = {
NEEDS_FEAT(HFGRTR2_EL2_nPFAR_EL1, FEAT_PFAR),
NEEDS_FEAT(HFGRTR2_EL2_nERXGSR_EL1, FEAT_RASv2),
@@ -728,6 +813,9 @@ static const struct reg_bits_to_feat_map hfgrtr2_feat_map[] = {
NEEDS_FEAT(HFGRTR2_EL2_nRCWSMASK_EL1, FEAT_THE),
};
+static const DECLARE_FEAT_MAP_FGT(hfgrtr2_desc, hfgrtr2_masks,
+ hfgrtr2_feat_map, FEAT_FGT2);
+
static const struct reg_bits_to_feat_map hfgwtr2_feat_map[] = {
NEEDS_FEAT(HFGWTR2_EL2_nPFAR_EL1, FEAT_PFAR),
NEEDS_FEAT(HFGWTR2_EL2_nACTLRALIAS_EL1 |
@@ -746,6 +834,9 @@ static const struct reg_bits_to_feat_map hfgwtr2_feat_map[] = {
NEEDS_FEAT(HFGWTR2_EL2_nRCWSMASK_EL1, FEAT_THE),
};
+static const DECLARE_FEAT_MAP_FGT(hfgwtr2_desc, hfgwtr2_masks,
+ hfgwtr2_feat_map, FEAT_FGT2);
+
static const struct reg_bits_to_feat_map hdfgrtr2_feat_map[] = {
NEEDS_FEAT(HDFGRTR2_EL2_nMDSELR_EL1, FEAT_Debugv8p9),
NEEDS_FEAT(HDFGRTR2_EL2_nPMECR_EL1, feat_ebep_pmuv3_ss),
@@ -776,6 +867,9 @@ static const struct reg_bits_to_feat_map hdfgrtr2_feat_map[] = {
NEEDS_FEAT(HDFGRTR2_EL2_nTRBMPAM_EL1, feat_trbe_mpam),
};
+static const DECLARE_FEAT_MAP_FGT(hdfgrtr2_desc, hdfgrtr2_masks,
+ hdfgrtr2_feat_map, FEAT_FGT2);
+
static const struct reg_bits_to_feat_map hdfgwtr2_feat_map[] = {
NEEDS_FEAT(HDFGWTR2_EL2_nMDSELR_EL1, FEAT_Debugv8p9),
NEEDS_FEAT(HDFGWTR2_EL2_nPMECR_EL1, feat_ebep_pmuv3_ss),
@@ -804,6 +898,10 @@ static const struct reg_bits_to_feat_map hdfgwtr2_feat_map[] = {
NEEDS_FEAT(HDFGWTR2_EL2_nTRBMPAM_EL1, feat_trbe_mpam),
};
+static const DECLARE_FEAT_MAP_FGT(hdfgwtr2_desc, hdfgwtr2_masks,
+ hdfgwtr2_feat_map, FEAT_FGT2);
+
+
static const struct reg_bits_to_feat_map hcrx_feat_map[] = {
NEEDS_FEAT(HCRX_EL2_PACMEn, feat_pauth_lr),
NEEDS_FEAT(HCRX_EL2_EnFPM, FEAT_FPMR),
@@ -833,6 +931,10 @@ static const struct reg_bits_to_feat_map hcrx_feat_map[] = {
NEEDS_FEAT(HCRX_EL2_EnAS0, FEAT_LS64_ACCDATA),
};
+
+static const DECLARE_FEAT_MAP(hcrx_desc, __HCRX_EL2,
+ hcrx_feat_map, FEAT_HCX);
+
static const struct reg_bits_to_feat_map hcr_feat_map[] = {
NEEDS_FEAT(HCR_EL2_TID0, FEAT_AA32EL0),
NEEDS_FEAT_FIXED(HCR_EL2_RW, compute_hcr_rw),
@@ -904,6 +1006,9 @@ static const struct reg_bits_to_feat_map hcr_feat_map[] = {
NEEDS_FEAT_FIXED(HCR_EL2_E2H, compute_hcr_e2h),
};
+static const DECLARE_FEAT_MAP(hcr_desc, HCR_EL2,
+ hcr_feat_map, FEAT_AA64EL2);
+
static const struct reg_bits_to_feat_map sctlr2_feat_map[] = {
NEEDS_FEAT(SCTLR2_EL1_NMEA |
SCTLR2_EL1_EASE,
@@ -921,6 +1026,9 @@ static const struct reg_bits_to_feat_map sctlr2_feat_map[] = {
FEAT_CPA2),
};
+static const DECLARE_FEAT_MAP(sctlr2_desc, SCTLR2_EL1,
+ sctlr2_feat_map, FEAT_SCTLR2);
+
static const struct reg_bits_to_feat_map tcr2_el2_feat_map[] = {
NEEDS_FEAT(TCR2_EL2_FNG1 |
TCR2_EL2_FNG0 |
@@ -943,6 +1051,9 @@ static const struct reg_bits_to_feat_map tcr2_el2_feat_map[] = {
NEEDS_FEAT(TCR2_EL2_PIE, FEAT_S1PIE),
};
+static const DECLARE_FEAT_MAP(tcr2_el2_desc, TCR2_EL2,
+ tcr2_el2_feat_map, FEAT_TCR2);
+
static const struct reg_bits_to_feat_map sctlr_el1_feat_map[] = {
NEEDS_FEAT(SCTLR_EL1_CP15BEN |
SCTLR_EL1_ITD |
@@ -1017,6 +1128,9 @@ static const struct reg_bits_to_feat_map sctlr_el1_feat_map[] = {
FEAT_AA64EL1),
};
+static const DECLARE_FEAT_MAP(sctlr_el1_desc, SCTLR_EL1,
+ sctlr_el1_feat_map, FEAT_AA64EL1);
+
static const struct reg_bits_to_feat_map mdcr_el2_feat_map[] = {
NEEDS_FEAT(MDCR_EL2_EBWE, FEAT_Debugv8p9),
NEEDS_FEAT(MDCR_EL2_TDOSA, FEAT_DoubleLock),
@@ -1048,6 +1162,9 @@ static const struct reg_bits_to_feat_map mdcr_el2_feat_map[] = {
FEAT_AA64EL1),
};
+static const DECLARE_FEAT_MAP(mdcr_el2_desc, MDCR_EL2,
+ mdcr_el2_feat_map, FEAT_AA64EL2);
+
static void __init check_feat_map(const struct reg_bits_to_feat_map *map,
int map_size, u64 res0, const char *str)
{
@@ -1061,32 +1178,36 @@ static void __init check_feat_map(const struct reg_bits_to_feat_map *map,
str, mask ^ ~res0);
}
+static u64 reg_feat_map_bits(const struct reg_bits_to_feat_map *map)
+{
+ return map->flags & RES0_POINTER ? ~(*map->res0p) : map->bits;
+}
+
+static void __init check_reg_desc(const struct reg_feat_map_desc *r)
+{
+ check_feat_map(r->bit_feat_map, r->bit_feat_map_sz,
+ ~reg_feat_map_bits(&r->feat_map), r->name);
+}
+
void __init check_feature_map(void)
{
- check_feat_map(hfgrtr_feat_map, ARRAY_SIZE(hfgrtr_feat_map),
- hfgrtr_masks.res0, hfgrtr_masks.str);
- check_feat_map(hfgwtr_feat_map, ARRAY_SIZE(hfgwtr_feat_map),
- hfgwtr_masks.res0, hfgwtr_masks.str);
- check_feat_map(hfgitr_feat_map, ARRAY_SIZE(hfgitr_feat_map),
- hfgitr_masks.res0, hfgitr_masks.str);
- check_feat_map(hdfgrtr_feat_map, ARRAY_SIZE(hdfgrtr_feat_map),
- hdfgrtr_masks.res0, hdfgrtr_masks.str);
- check_feat_map(hdfgwtr_feat_map, ARRAY_SIZE(hdfgwtr_feat_map),
- hdfgwtr_masks.res0, hdfgwtr_masks.str);
- check_feat_map(hafgrtr_feat_map, ARRAY_SIZE(hafgrtr_feat_map),
- hafgrtr_masks.res0, hafgrtr_masks.str);
- check_feat_map(hcrx_feat_map, ARRAY_SIZE(hcrx_feat_map),
- __HCRX_EL2_RES0, "HCRX_EL2");
- check_feat_map(hcr_feat_map, ARRAY_SIZE(hcr_feat_map),
- HCR_EL2_RES0, "HCR_EL2");
- check_feat_map(sctlr2_feat_map, ARRAY_SIZE(sctlr2_feat_map),
- SCTLR2_EL1_RES0, "SCTLR2_EL1");
- check_feat_map(tcr2_el2_feat_map, ARRAY_SIZE(tcr2_el2_feat_map),
- TCR2_EL2_RES0, "TCR2_EL2");
- check_feat_map(sctlr_el1_feat_map, ARRAY_SIZE(sctlr_el1_feat_map),
- SCTLR_EL1_RES0, "SCTLR_EL1");
- check_feat_map(mdcr_el2_feat_map, ARRAY_SIZE(mdcr_el2_feat_map),
- MDCR_EL2_RES0, "MDCR_EL2");
+ check_reg_desc(&hfgrtr_desc);
+ check_reg_desc(&hfgwtr_desc);
+ check_reg_desc(&hfgitr_desc);
+ check_reg_desc(&hdfgrtr_desc);
+ check_reg_desc(&hdfgwtr_desc);
+ check_reg_desc(&hafgrtr_desc);
+ check_reg_desc(&hfgrtr2_desc);
+ check_reg_desc(&hfgwtr2_desc);
+ check_reg_desc(&hfgitr2_desc);
+ check_reg_desc(&hdfgrtr2_desc);
+ check_reg_desc(&hdfgwtr2_desc);
+ check_reg_desc(&hcrx_desc);
+ check_reg_desc(&hcr_desc);
+ check_reg_desc(&sctlr2_desc);
+ check_reg_desc(&tcr2_el2_desc);
+ check_reg_desc(&sctlr_el1_desc);
+ check_reg_desc(&mdcr_el2_desc);
}
static bool idreg_feat_match(struct kvm *kvm, const struct reg_bits_to_feat_map *map)
@@ -1129,7 +1250,7 @@ static u64 __compute_fixed_bits(struct kvm *kvm,
match = idreg_feat_match(kvm, &map[i]);
if (!match || (map[i].flags & FIXED_VALUE))
- val |= map[i].bits;
+ val |= reg_feat_map_bits(&map[i]);
}
return val;
@@ -1145,15 +1266,36 @@ static u64 compute_res0_bits(struct kvm *kvm,
require, exclude | FIXED_VALUE);
}
-static u64 compute_fixed_bits(struct kvm *kvm,
- const struct reg_bits_to_feat_map *map,
- int map_size,
- u64 *fixed_bits,
- unsigned long require,
- unsigned long exclude)
+static u64 compute_reg_res0_bits(struct kvm *kvm,
+ const struct reg_feat_map_desc *r,
+ unsigned long require, unsigned long exclude)
+
+{
+ u64 res0;
+
+ res0 = compute_res0_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz,
+ require, exclude);
+
+ /*
+ * If computing FGUs, don't take RES0 or register existence
+ * into account -- we're not computing bits for the register
+ * itself.
+ */
+ if (!(exclude & NEVER_FGU)) {
+ res0 |= compute_res0_bits(kvm, &r->feat_map, 1, require, exclude);
+ res0 |= ~reg_feat_map_bits(&r->feat_map);
+ }
+
+ return res0;
+}
+
+static u64 compute_reg_fixed_bits(struct kvm *kvm,
+ const struct reg_feat_map_desc *r,
+ u64 *fixed_bits, unsigned long require,
+ unsigned long exclude)
{
- return __compute_fixed_bits(kvm, map, map_size, fixed_bits,
- require | FIXED_VALUE, exclude);
+ return __compute_fixed_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz,
+ fixed_bits, require | FIXED_VALUE, exclude);
}
void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt)
@@ -1162,51 +1304,40 @@ void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt)
switch (fgt) {
case HFGRTR_GROUP:
- val |= compute_res0_bits(kvm, hfgrtr_feat_map,
- ARRAY_SIZE(hfgrtr_feat_map),
- 0, NEVER_FGU);
- val |= compute_res0_bits(kvm, hfgwtr_feat_map,
- ARRAY_SIZE(hfgwtr_feat_map),
- 0, NEVER_FGU);
+ val |= compute_reg_res0_bits(kvm, &hfgrtr_desc,
+ 0, NEVER_FGU);
+ val |= compute_reg_res0_bits(kvm, &hfgwtr_desc,
+ 0, NEVER_FGU);
break;
case HFGITR_GROUP:
- val |= compute_res0_bits(kvm, hfgitr_feat_map,
- ARRAY_SIZE(hfgitr_feat_map),
- 0, NEVER_FGU);
+ val |= compute_reg_res0_bits(kvm, &hfgitr_desc,
+ 0, NEVER_FGU);
break;
case HDFGRTR_GROUP:
- val |= compute_res0_bits(kvm, hdfgrtr_feat_map,
- ARRAY_SIZE(hdfgrtr_feat_map),
- 0, NEVER_FGU);
- val |= compute_res0_bits(kvm, hdfgwtr_feat_map,
- ARRAY_SIZE(hdfgwtr_feat_map),
- 0, NEVER_FGU);
+ val |= compute_reg_res0_bits(kvm, &hdfgrtr_desc,
+ 0, NEVER_FGU);
+ val |= compute_reg_res0_bits(kvm, &hdfgwtr_desc,
+ 0, NEVER_FGU);
break;
case HAFGRTR_GROUP:
- val |= compute_res0_bits(kvm, hafgrtr_feat_map,
- ARRAY_SIZE(hafgrtr_feat_map),
- 0, NEVER_FGU);
+ val |= compute_reg_res0_bits(kvm, &hafgrtr_desc,
+ 0, NEVER_FGU);
break;
case HFGRTR2_GROUP:
- val |= compute_res0_bits(kvm, hfgrtr2_feat_map,
- ARRAY_SIZE(hfgrtr2_feat_map),
- 0, NEVER_FGU);
- val |= compute_res0_bits(kvm, hfgwtr2_feat_map,
- ARRAY_SIZE(hfgwtr2_feat_map),
- 0, NEVER_FGU);
+ val |= compute_reg_res0_bits(kvm, &hfgrtr2_desc,
+ 0, NEVER_FGU);
+ val |= compute_reg_res0_bits(kvm, &hfgwtr2_desc,
+ 0, NEVER_FGU);
break;
case HFGITR2_GROUP:
- val |= compute_res0_bits(kvm, hfgitr2_feat_map,
- ARRAY_SIZE(hfgitr2_feat_map),
- 0, NEVER_FGU);
+ val |= compute_reg_res0_bits(kvm, &hfgitr2_desc,
+ 0, NEVER_FGU);
break;
case HDFGRTR2_GROUP:
- val |= compute_res0_bits(kvm, hdfgrtr2_feat_map,
- ARRAY_SIZE(hdfgrtr2_feat_map),
- 0, NEVER_FGU);
- val |= compute_res0_bits(kvm, hdfgwtr2_feat_map,
- ARRAY_SIZE(hdfgwtr2_feat_map),
- 0, NEVER_FGU);
+ val |= compute_reg_res0_bits(kvm, &hdfgrtr2_desc,
+ 0, NEVER_FGU);
+ val |= compute_reg_res0_bits(kvm, &hdfgwtr2_desc,
+ 0, NEVER_FGU);
break;
default:
BUG();
@@ -1221,109 +1352,74 @@ void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *r
switch (reg) {
case HFGRTR_EL2:
- *res0 = compute_res0_bits(kvm, hfgrtr_feat_map,
- ARRAY_SIZE(hfgrtr_feat_map), 0, 0);
- *res0 |= hfgrtr_masks.res0;
+ *res0 = compute_reg_res0_bits(kvm, &hfgrtr_desc, 0, 0);
*res1 = HFGRTR_EL2_RES1;
break;
case HFGWTR_EL2:
- *res0 = compute_res0_bits(kvm, hfgwtr_feat_map,
- ARRAY_SIZE(hfgwtr_feat_map), 0, 0);
- *res0 |= hfgwtr_masks.res0;
+ *res0 = compute_reg_res0_bits(kvm, &hfgwtr_desc, 0, 0);
*res1 = HFGWTR_EL2_RES1;
break;
case HFGITR_EL2:
- *res0 = compute_res0_bits(kvm, hfgitr_feat_map,
- ARRAY_SIZE(hfgitr_feat_map), 0, 0);
- *res0 |= hfgitr_masks.res0;
+ *res0 = compute_reg_res0_bits(kvm, &hfgitr_desc, 0, 0);
*res1 = HFGITR_EL2_RES1;
break;
case HDFGRTR_EL2:
- *res0 = compute_res0_bits(kvm, hdfgrtr_feat_map,
- ARRAY_SIZE(hdfgrtr_feat_map), 0, 0);
- *res0 |= hdfgrtr_masks.res0;
+ *res0 = compute_reg_res0_bits(kvm, &hdfgrtr_desc, 0, 0);
*res1 = HDFGRTR_EL2_RES1;
break;
case HDFGWTR_EL2:
- *res0 = compute_res0_bits(kvm, hdfgwtr_feat_map,
- ARRAY_SIZE(hdfgwtr_feat_map), 0, 0);
- *res0 |= hdfgwtr_masks.res0;
+ *res0 = compute_reg_res0_bits(kvm, &hdfgwtr_desc, 0, 0);
*res1 = HDFGWTR_EL2_RES1;
break;
case HAFGRTR_EL2:
- *res0 = compute_res0_bits(kvm, hafgrtr_feat_map,
- ARRAY_SIZE(hafgrtr_feat_map), 0, 0);
- *res0 |= hafgrtr_masks.res0;
+ *res0 = compute_reg_res0_bits(kvm, &hafgrtr_desc, 0, 0);
*res1 = HAFGRTR_EL2_RES1;
break;
case HFGRTR2_EL2:
- *res0 = compute_res0_bits(kvm, hfgrtr2_feat_map,
- ARRAY_SIZE(hfgrtr2_feat_map), 0, 0);
- *res0 |= hfgrtr2_masks.res0;
+ *res0 = compute_reg_res0_bits(kvm, &hfgrtr2_desc, 0, 0);
*res1 = HFGRTR2_EL2_RES1;
break;
case HFGWTR2_EL2:
- *res0 = compute_res0_bits(kvm, hfgwtr2_feat_map,
- ARRAY_SIZE(hfgwtr2_feat_map), 0, 0);
- *res0 |= hfgwtr2_masks.res0;
+ *res0 = compute_reg_res0_bits(kvm, &hfgwtr2_desc, 0, 0);
*res1 = HFGWTR2_EL2_RES1;
break;
case HFGITR2_EL2:
- *res0 = compute_res0_bits(kvm, hfgitr2_feat_map,
- ARRAY_SIZE(hfgitr2_feat_map), 0, 0);
- *res0 |= hfgitr2_masks.res0;
+ *res0 = compute_reg_res0_bits(kvm, &hfgitr2_desc, 0, 0);
*res1 = HFGITR2_EL2_RES1;
break;
case HDFGRTR2_EL2:
- *res0 = compute_res0_bits(kvm, hdfgrtr2_feat_map,
- ARRAY_SIZE(hdfgrtr2_feat_map), 0, 0);
- *res0 |= hdfgrtr2_masks.res0;
+ *res0 = compute_reg_res0_bits(kvm, &hdfgrtr2_desc, 0, 0);
*res1 = HDFGRTR2_EL2_RES1;
break;
case HDFGWTR2_EL2:
- *res0 = compute_res0_bits(kvm, hdfgwtr2_feat_map,
- ARRAY_SIZE(hdfgwtr2_feat_map), 0, 0);
- *res0 |= hdfgwtr2_masks.res0;
+ *res0 = compute_reg_res0_bits(kvm, &hdfgwtr2_desc, 0, 0);
*res1 = HDFGWTR2_EL2_RES1;
break;
case HCRX_EL2:
- *res0 = compute_res0_bits(kvm, hcrx_feat_map,
- ARRAY_SIZE(hcrx_feat_map), 0, 0);
- *res0 |= __HCRX_EL2_RES0;
+ *res0 = compute_reg_res0_bits(kvm, &hcrx_desc, 0, 0);
*res1 = __HCRX_EL2_RES1;
break;
case HCR_EL2:
- mask = compute_fixed_bits(kvm, hcr_feat_map,
- ARRAY_SIZE(hcr_feat_map), &fixed,
- 0, 0);
- *res0 = compute_res0_bits(kvm, hcr_feat_map,
- ARRAY_SIZE(hcr_feat_map), 0, 0);
- *res0 |= HCR_EL2_RES0 | (mask & ~fixed);
+ mask = compute_reg_fixed_bits(kvm, &hcr_desc, &fixed, 0, 0);
+ *res0 = compute_reg_res0_bits(kvm, &hcr_desc, 0, 0);
+ *res0 |= (mask & ~fixed);
*res1 = HCR_EL2_RES1 | (mask & fixed);
break;
case SCTLR2_EL1:
case SCTLR2_EL2:
- *res0 = compute_res0_bits(kvm, sctlr2_feat_map,
- ARRAY_SIZE(sctlr2_feat_map), 0, 0);
- *res0 |= SCTLR2_EL1_RES0;
+ *res0 = compute_reg_res0_bits(kvm, &sctlr2_desc, 0, 0);
*res1 = SCTLR2_EL1_RES1;
break;
case TCR2_EL2:
- *res0 = compute_res0_bits(kvm, tcr2_el2_feat_map,
- ARRAY_SIZE(tcr2_el2_feat_map), 0, 0);
- *res0 |= TCR2_EL2_RES0;
+ *res0 = compute_reg_res0_bits(kvm, &tcr2_el2_desc, 0, 0);
*res1 = TCR2_EL2_RES1;
break;
case SCTLR_EL1:
- *res0 = compute_res0_bits(kvm, sctlr_el1_feat_map,
- ARRAY_SIZE(sctlr_el1_feat_map), 0, 0);
- *res0 |= SCTLR_EL1_RES0;
+ *res0 = compute_reg_res0_bits(kvm, &sctlr_el1_desc, 0, 0);
*res1 = SCTLR_EL1_RES1;
break;
case MDCR_EL2:
- *res0 = compute_res0_bits(kvm, mdcr_el2_feat_map,
- ARRAY_SIZE(mdcr_el2_feat_map), 0, 0);
- *res0 |= MDCR_EL2_RES0;
+ *res0 = compute_reg_res0_bits(kvm, &mdcr_el2_desc, 0, 0);
*res1 = MDCR_EL2_RES1;
break;
default:
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index 381382c19fe4..3515a273eaa2 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -56,6 +56,9 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
if (!kvm_guest_owns_debug_regs(vcpu))
vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
+ if (vcpu_has_nv(vcpu))
+ kvm_nested_setup_mdcr_el2(vcpu);
+
/* Write MDCR_EL2 directly if we're already at EL2 */
if (has_vhe())
write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
@@ -96,6 +99,13 @@ void kvm_init_host_debug_data(void)
}
}
+void kvm_debug_init_vhe(void)
+{
+ /* Clear PMSCR_EL1.E{0,1}SPE which reset to UNKNOWN values. */
+ if (SYS_FIELD_GET(ID_AA64DFR0_EL1, PMSVer, read_sysreg(id_aa64dfr0_el1)))
+ write_sysreg_el1(0, SYS_PMSCR);
+}
+
/*
* Configures the 'external' MDSCR_EL1 value for the guest, i.e. when the host
* has taken over MDSCR_EL1.
@@ -138,6 +148,9 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu)
/* Must be called before kvm_vcpu_load_vhe() */
KVM_BUG_ON(vcpu_get_flag(vcpu, SYSREGS_ON_CPU), vcpu->kvm);
+ if (has_vhe())
+ *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2);
+
/*
* Determine which of the possible debug states we're in:
*
@@ -184,6 +197,9 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu)
void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu)
{
+ if (has_vhe())
+ write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2);
+
if (likely(!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
return;
@@ -230,29 +246,29 @@ void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val)
preempt_enable();
}
-void kvm_enable_trbe(void)
+static bool skip_trbe_access(bool skip_condition)
{
- if (has_vhe() || is_protected_kvm_enabled() ||
- WARN_ON_ONCE(preemptible()))
- return;
+ return (WARN_ON_ONCE(preemptible()) || skip_condition ||
+ is_protected_kvm_enabled() || !is_kvm_arm_initialised());
+}
- host_data_set_flag(TRBE_ENABLED);
+void kvm_enable_trbe(void)
+{
+ if (!skip_trbe_access(has_vhe()))
+ host_data_set_flag(TRBE_ENABLED);
}
EXPORT_SYMBOL_GPL(kvm_enable_trbe);
void kvm_disable_trbe(void)
{
- if (has_vhe() || is_protected_kvm_enabled() ||
- WARN_ON_ONCE(preemptible()))
- return;
-
- host_data_clear_flag(TRBE_ENABLED);
+ if (!skip_trbe_access(has_vhe()))
+ host_data_clear_flag(TRBE_ENABLED);
}
EXPORT_SYMBOL_GPL(kvm_disable_trbe);
void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest)
{
- if (is_protected_kvm_enabled() || WARN_ON_ONCE(preemptible()))
+ if (skip_trbe_access(false))
return;
if (has_vhe()) {
diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c
index 90cb4b7ae0ff..834f13fb1fb7 100644
--- a/arch/arm64/kvm/emulate-nested.c
+++ b/arch/arm64/kvm/emulate-nested.c
@@ -1185,6 +1185,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
SR_TRAP(SYS_PMSIRR_EL1, CGT_MDCR_TPMS),
SR_TRAP(SYS_PMSLATFR_EL1, CGT_MDCR_TPMS),
SR_TRAP(SYS_PMSNEVFR_EL1, CGT_MDCR_TPMS),
+ SR_TRAP(SYS_PMSDSFR_EL1, CGT_MDCR_TPMS),
SR_TRAP(SYS_TRFCR_EL1, CGT_MDCR_TTRF),
SR_TRAP(SYS_TRBBASER_EL1, CGT_MDCR_E2TB),
SR_TRAP(SYS_TRBLIMITR_EL1, CGT_MDCR_E2TB),
@@ -2833,7 +2834,7 @@ int kvm_inject_nested_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr)
iabt ? ESR_ELx_EC_IABT_LOW : ESR_ELx_EC_DABT_LOW);
esr |= ESR_ELx_FSC_EXTABT | ESR_ELx_IL;
- vcpu_write_sys_reg(vcpu, FAR_EL2, addr);
+ vcpu_write_sys_reg(vcpu, addr, FAR_EL2);
if (__vcpu_sys_reg(vcpu, SCTLR2_EL2) & SCTLR2_EL1_EASE)
return kvm_inject_nested(vcpu, esr, except_type_serror);
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index a598072f36d2..bca8c80e11da 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -545,7 +545,7 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
else
print_nvhe_hyp_panic("BUG", panic_addr);
- } else if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) {
+ } else if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr)) {
kvm_nvhe_report_cfi_failure(panic_addr);
} else if (IS_ENABLED(CONFIG_UBSAN_KVM_EL2) &&
ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
@@ -559,6 +559,9 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
/* Dump the nVHE hypervisor backtrace */
kvm_nvhe_dump_backtrace(hyp_offset);
+ /* Dump the faulting instruction */
+ dump_kernel_instr(panic_addr + kaslr_offset());
+
/*
* Hyp has panicked and we're going to handle that by panicking the
* kernel. The kernel offset will be revealed in the panic so we're
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
index 95d186e0bf54..bef40ddb16db 100644
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -22,36 +22,28 @@
static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
{
- u64 val;
-
- if (unlikely(vcpu_has_nv(vcpu)))
+ if (has_vhe())
return vcpu_read_sys_reg(vcpu, reg);
- else if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) &&
- __vcpu_read_sys_reg_from_cpu(reg, &val))
- return val;
return __vcpu_sys_reg(vcpu, reg);
}
static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
{
- if (unlikely(vcpu_has_nv(vcpu)))
+ if (has_vhe())
vcpu_write_sys_reg(vcpu, val, reg);
- else if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU) ||
- !__vcpu_write_sys_reg_to_cpu(val, reg))
+ else
__vcpu_assign_sys_reg(vcpu, reg, val);
}
static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode,
u64 val)
{
- if (unlikely(vcpu_has_nv(vcpu))) {
+ if (has_vhe()) {
if (target_mode == PSR_MODE_EL1h)
vcpu_write_sys_reg(vcpu, val, SPSR_EL1);
else
vcpu_write_sys_reg(vcpu, val, SPSR_EL2);
- } else if (has_vhe()) {
- write_sysreg_el1(val, SYS_SPSR);
} else {
__vcpu_assign_sys_reg(vcpu, SPSR_EL1, val);
}
@@ -59,7 +51,7 @@ static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode,
static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val)
{
- if (has_vhe())
+ if (has_vhe() && vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
write_sysreg(val, spsr_abt);
else
vcpu->arch.ctxt.spsr_abt = val;
@@ -67,7 +59,7 @@ static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val)
static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val)
{
- if (has_vhe())
+ if (has_vhe() && vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
write_sysreg(val, spsr_und);
else
vcpu->arch.ctxt.spsr_und = val;
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 84ec4e100fbb..b6682202edf3 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -431,9 +431,6 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
vcpu_set_flag(vcpu, PMUSERENR_ON_CPU);
}
- *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2);
- write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
-
if (cpus_have_final_cap(ARM64_HAS_HCX)) {
u64 hcrx = vcpu->arch.hcrx_el2;
if (is_nested_ctxt(vcpu)) {
@@ -454,8 +451,6 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
- write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2);
-
write_sysreg(0, hstr_el2);
if (system_supports_pmuv3()) {
write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0);
diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
index ce31d3b73603..184ad7a39950 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
@@ -29,7 +29,7 @@ struct pkvm_hyp_vcpu {
};
/*
- * Holds the relevant data for running a protected vm.
+ * Holds the relevant data for running a vm in protected mode.
*/
struct pkvm_hyp_vm {
struct kvm kvm;
@@ -67,6 +67,8 @@ static inline bool pkvm_hyp_vm_is_protected(struct pkvm_hyp_vm *hyp_vm)
void pkvm_hyp_vm_table_init(void *tbl);
+int __pkvm_reserve_vm(void);
+void __pkvm_unreserve_vm(pkvm_handle_t handle);
int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
unsigned long pgd_hva);
int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h
index 1e6d995968a1..ba5382c12787 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h
@@ -12,7 +12,8 @@
#include <asm/kvm_host.h>
#define cpu_reg(ctxt, r) (ctxt)->regs.regs[r]
-#define DECLARE_REG(type, name, ctxt, reg) \
+#define DECLARE_REG(type, name, ctxt, reg) \
+ __always_unused int ___check_reg_ ## reg; \
type name = (type)cpu_reg(ctxt, (reg))
#endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index 0b0a68b663d4..a244ec25f8c5 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -27,6 +27,7 @@ hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o
cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o
hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
+hyp-obj-y += ../../../kernel/smccc-call.o
hyp-obj-$(CONFIG_LIST_HARDENED) += list_debug.o
hyp-obj-y += $(lib-objs)
diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
index 3369dd0c4009..4e16f9b96f63 100644
--- a/arch/arm64/kvm/hyp/nvhe/ffa.c
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -71,36 +71,68 @@ static u32 hyp_ffa_version;
static bool has_version_negotiated;
static hyp_spinlock_t version_lock;
-static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno)
+static void ffa_to_smccc_error(struct arm_smccc_1_2_regs *res, u64 ffa_errno)
{
- *res = (struct arm_smccc_res) {
+ *res = (struct arm_smccc_1_2_regs) {
.a0 = FFA_ERROR,
.a2 = ffa_errno,
};
}
-static void ffa_to_smccc_res_prop(struct arm_smccc_res *res, int ret, u64 prop)
+static void ffa_to_smccc_res_prop(struct arm_smccc_1_2_regs *res, int ret, u64 prop)
{
if (ret == FFA_RET_SUCCESS) {
- *res = (struct arm_smccc_res) { .a0 = FFA_SUCCESS,
- .a2 = prop };
+ *res = (struct arm_smccc_1_2_regs) { .a0 = FFA_SUCCESS,
+ .a2 = prop };
} else {
ffa_to_smccc_error(res, ret);
}
}
-static void ffa_to_smccc_res(struct arm_smccc_res *res, int ret)
+static void ffa_to_smccc_res(struct arm_smccc_1_2_regs *res, int ret)
{
ffa_to_smccc_res_prop(res, ret, 0);
}
static void ffa_set_retval(struct kvm_cpu_context *ctxt,
- struct arm_smccc_res *res)
+ struct arm_smccc_1_2_regs *res)
{
cpu_reg(ctxt, 0) = res->a0;
cpu_reg(ctxt, 1) = res->a1;
cpu_reg(ctxt, 2) = res->a2;
cpu_reg(ctxt, 3) = res->a3;
+ cpu_reg(ctxt, 4) = res->a4;
+ cpu_reg(ctxt, 5) = res->a5;
+ cpu_reg(ctxt, 6) = res->a6;
+ cpu_reg(ctxt, 7) = res->a7;
+
+ /*
+ * DEN0028C 2.6: SMC32/HVC32 call from aarch64 must preserve x8-x30.
+ *
+ * In FF-A 1.2, we cannot rely on the function ID sent by the caller to
+ * detect 32-bit calls because the CPU cycle management interfaces (e.g.
+ * FFA_MSG_WAIT, FFA_RUN) are 32-bit only but can have 64-bit responses.
+ *
+ * FFA-1.3 introduces 64-bit variants of the CPU cycle management
+ * interfaces. Moreover, FF-A 1.3 clarifies that SMC32 direct requests
+ * complete with SMC32 direct reponses which *should* allow us use the
+ * function ID sent by the caller to determine whether to return x8-x17.
+ *
+ * Note that we also cannot rely on function IDs in the response.
+ *
+ * Given the above, assume SMC64 and send back x0-x17 unconditionally
+ * as the passthrough code (__kvm_hyp_host_forward_smc) does the same.
+ */
+ cpu_reg(ctxt, 8) = res->a8;
+ cpu_reg(ctxt, 9) = res->a9;
+ cpu_reg(ctxt, 10) = res->a10;
+ cpu_reg(ctxt, 11) = res->a11;
+ cpu_reg(ctxt, 12) = res->a12;
+ cpu_reg(ctxt, 13) = res->a13;
+ cpu_reg(ctxt, 14) = res->a14;
+ cpu_reg(ctxt, 15) = res->a15;
+ cpu_reg(ctxt, 16) = res->a16;
+ cpu_reg(ctxt, 17) = res->a17;
}
static bool is_ffa_call(u64 func_id)
@@ -113,82 +145,92 @@ static bool is_ffa_call(u64 func_id)
static int ffa_map_hyp_buffers(u64 ffa_page_count)
{
- struct arm_smccc_res res;
+ struct arm_smccc_1_2_regs res;
- arm_smccc_1_1_smc(FFA_FN64_RXTX_MAP,
- hyp_virt_to_phys(hyp_buffers.tx),
- hyp_virt_to_phys(hyp_buffers.rx),
- ffa_page_count,
- 0, 0, 0, 0,
- &res);
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_FN64_RXTX_MAP,
+ .a1 = hyp_virt_to_phys(hyp_buffers.tx),
+ .a2 = hyp_virt_to_phys(hyp_buffers.rx),
+ .a3 = ffa_page_count,
+ }, &res);
return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2;
}
static int ffa_unmap_hyp_buffers(void)
{
- struct arm_smccc_res res;
+ struct arm_smccc_1_2_regs res;
- arm_smccc_1_1_smc(FFA_RXTX_UNMAP,
- HOST_FFA_ID,
- 0, 0, 0, 0, 0, 0,
- &res);
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_RXTX_UNMAP,
+ .a1 = HOST_FFA_ID,
+ }, &res);
return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2;
}
-static void ffa_mem_frag_tx(struct arm_smccc_res *res, u32 handle_lo,
+static void ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, u32 handle_lo,
u32 handle_hi, u32 fraglen, u32 endpoint_id)
{
- arm_smccc_1_1_smc(FFA_MEM_FRAG_TX,
- handle_lo, handle_hi, fraglen, endpoint_id,
- 0, 0, 0,
- res);
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_MEM_FRAG_TX,
+ .a1 = handle_lo,
+ .a2 = handle_hi,
+ .a3 = fraglen,
+ .a4 = endpoint_id,
+ }, res);
}
-static void ffa_mem_frag_rx(struct arm_smccc_res *res, u32 handle_lo,
+static void ffa_mem_frag_rx(struct arm_smccc_1_2_regs *res, u32 handle_lo,
u32 handle_hi, u32 fragoff)
{
- arm_smccc_1_1_smc(FFA_MEM_FRAG_RX,
- handle_lo, handle_hi, fragoff, HOST_FFA_ID,
- 0, 0, 0,
- res);
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_MEM_FRAG_RX,
+ .a1 = handle_lo,
+ .a2 = handle_hi,
+ .a3 = fragoff,
+ .a4 = HOST_FFA_ID,
+ }, res);
}
-static void ffa_mem_xfer(struct arm_smccc_res *res, u64 func_id, u32 len,
+static void ffa_mem_xfer(struct arm_smccc_1_2_regs *res, u64 func_id, u32 len,
u32 fraglen)
{
- arm_smccc_1_1_smc(func_id, len, fraglen,
- 0, 0, 0, 0, 0,
- res);
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = func_id,
+ .a1 = len,
+ .a2 = fraglen,
+ }, res);
}
-static void ffa_mem_reclaim(struct arm_smccc_res *res, u32 handle_lo,
+static void ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, u32 handle_lo,
u32 handle_hi, u32 flags)
{
- arm_smccc_1_1_smc(FFA_MEM_RECLAIM,
- handle_lo, handle_hi, flags,
- 0, 0, 0, 0,
- res);
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_MEM_RECLAIM,
+ .a1 = handle_lo,
+ .a2 = handle_hi,
+ .a3 = flags,
+ }, res);
}
-static void ffa_retrieve_req(struct arm_smccc_res *res, u32 len)
+static void ffa_retrieve_req(struct arm_smccc_1_2_regs *res, u32 len)
{
- arm_smccc_1_1_smc(FFA_FN64_MEM_RETRIEVE_REQ,
- len, len,
- 0, 0, 0, 0, 0,
- res);
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_FN64_MEM_RETRIEVE_REQ,
+ .a1 = len,
+ .a2 = len,
+ }, res);
}
-static void ffa_rx_release(struct arm_smccc_res *res)
+static void ffa_rx_release(struct arm_smccc_1_2_regs *res)
{
- arm_smccc_1_1_smc(FFA_RX_RELEASE,
- 0, 0,
- 0, 0, 0, 0, 0,
- res);
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_RX_RELEASE,
+ }, res);
}
-static void do_ffa_rxtx_map(struct arm_smccc_res *res,
+static void do_ffa_rxtx_map(struct arm_smccc_1_2_regs *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(phys_addr_t, tx, ctxt, 1);
@@ -267,7 +309,7 @@ err_unmap:
goto out_unlock;
}
-static void do_ffa_rxtx_unmap(struct arm_smccc_res *res,
+static void do_ffa_rxtx_unmap(struct arm_smccc_1_2_regs *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, id, ctxt, 1);
@@ -368,7 +410,7 @@ static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges,
return ret;
}
-static void do_ffa_mem_frag_tx(struct arm_smccc_res *res,
+static void do_ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, handle_lo, ctxt, 1);
@@ -427,7 +469,7 @@ out:
}
static void __do_ffa_mem_xfer(const u64 func_id,
- struct arm_smccc_res *res,
+ struct arm_smccc_1_2_regs *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, len, ctxt, 1);
@@ -521,7 +563,7 @@ err_unshare:
__do_ffa_mem_xfer((fid), (res), (ctxt)); \
} while (0);
-static void do_ffa_mem_reclaim(struct arm_smccc_res *res,
+static void do_ffa_mem_reclaim(struct arm_smccc_1_2_regs *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, handle_lo, ctxt, 1);
@@ -628,13 +670,26 @@ static bool ffa_call_supported(u64 func_id)
case FFA_RXTX_MAP:
case FFA_MEM_DONATE:
case FFA_MEM_RETRIEVE_REQ:
+ /* Optional notification interfaces added in FF-A 1.1 */
+ case FFA_NOTIFICATION_BITMAP_CREATE:
+ case FFA_NOTIFICATION_BITMAP_DESTROY:
+ case FFA_NOTIFICATION_BIND:
+ case FFA_NOTIFICATION_UNBIND:
+ case FFA_NOTIFICATION_SET:
+ case FFA_NOTIFICATION_GET:
+ case FFA_NOTIFICATION_INFO_GET:
+ /* Optional interfaces added in FF-A 1.2 */
+ case FFA_MSG_SEND_DIRECT_REQ2: /* Optional per 7.5.1 */
+ case FFA_MSG_SEND_DIRECT_RESP2: /* Optional per 7.5.1 */
+ case FFA_CONSOLE_LOG: /* Optional per 13.1: not in Table 13.1 */
+ case FFA_PARTITION_INFO_GET_REGS: /* Optional for virtual instances per 13.1 */
return false;
}
return true;
}
-static bool do_ffa_features(struct arm_smccc_res *res,
+static bool do_ffa_features(struct arm_smccc_1_2_regs *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, id, ctxt, 1);
@@ -666,21 +721,25 @@ out_handled:
static int hyp_ffa_post_init(void)
{
size_t min_rxtx_sz;
- struct arm_smccc_res res;
+ struct arm_smccc_1_2_regs res;
- arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res);
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){
+ .a0 = FFA_ID_GET,
+ }, &res);
if (res.a0 != FFA_SUCCESS)
return -EOPNOTSUPP;
if (res.a2 != HOST_FFA_ID)
return -EINVAL;
- arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP,
- 0, 0, 0, 0, 0, 0, &res);
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){
+ .a0 = FFA_FEATURES,
+ .a1 = FFA_FN64_RXTX_MAP,
+ }, &res);
if (res.a0 != FFA_SUCCESS)
return -EOPNOTSUPP;
- switch (res.a2) {
+ switch (res.a2 & FFA_FEAT_RXTX_MIN_SZ_MASK) {
case FFA_FEAT_RXTX_MIN_SZ_4K:
min_rxtx_sz = SZ_4K;
break;
@@ -700,7 +759,7 @@ static int hyp_ffa_post_init(void)
return 0;
}
-static void do_ffa_version(struct arm_smccc_res *res,
+static void do_ffa_version(struct arm_smccc_1_2_regs *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, ffa_req_version, ctxt, 1);
@@ -712,7 +771,10 @@ static void do_ffa_version(struct arm_smccc_res *res,
hyp_spin_lock(&version_lock);
if (has_version_negotiated) {
- res->a0 = hyp_ffa_version;
+ if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version))
+ res->a0 = FFA_RET_NOT_SUPPORTED;
+ else
+ res->a0 = hyp_ffa_version;
goto unlock;
}
@@ -721,9 +783,10 @@ static void do_ffa_version(struct arm_smccc_res *res,
* first if TEE supports it.
*/
if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) {
- arm_smccc_1_1_smc(FFA_VERSION, ffa_req_version, 0,
- 0, 0, 0, 0, 0,
- res);
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_VERSION,
+ .a1 = ffa_req_version,
+ }, res);
if (res->a0 == FFA_RET_NOT_SUPPORTED)
goto unlock;
@@ -740,7 +803,7 @@ unlock:
hyp_spin_unlock(&version_lock);
}
-static void do_ffa_part_get(struct arm_smccc_res *res,
+static void do_ffa_part_get(struct arm_smccc_1_2_regs *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, uuid0, ctxt, 1);
@@ -756,9 +819,14 @@ static void do_ffa_part_get(struct arm_smccc_res *res,
goto out_unlock;
}
- arm_smccc_1_1_smc(FFA_PARTITION_INFO_GET, uuid0, uuid1,
- uuid2, uuid3, flags, 0, 0,
- res);
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_PARTITION_INFO_GET,
+ .a1 = uuid0,
+ .a2 = uuid1,
+ .a3 = uuid2,
+ .a4 = uuid3,
+ .a5 = flags,
+ }, res);
if (res->a0 != FFA_SUCCESS)
goto out_unlock;
@@ -791,7 +859,7 @@ out_unlock:
bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
{
- struct arm_smccc_res res;
+ struct arm_smccc_1_2_regs res;
/*
* There's no way we can tell what a non-standard SMC call might
@@ -860,13 +928,16 @@ out_handled:
int hyp_ffa_init(void *pages)
{
- struct arm_smccc_res res;
+ struct arm_smccc_1_2_regs res;
void *tx, *rx;
if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2)
return 0;
- arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_1, 0, 0, 0, 0, 0, 0, &res);
+ arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_VERSION,
+ .a1 = FFA_VERSION_1_2,
+ }, &res);
if (res.a0 == FFA_RET_NOT_SUPPORTED)
return 0;
@@ -886,10 +957,10 @@ int hyp_ffa_init(void *pages)
if (FFA_MAJOR_VERSION(res.a0) != 1)
return -EOPNOTSUPP;
- if (FFA_MINOR_VERSION(res.a0) < FFA_MINOR_VERSION(FFA_VERSION_1_1))
+ if (FFA_MINOR_VERSION(res.a0) < FFA_MINOR_VERSION(FFA_VERSION_1_2))
hyp_ffa_version = res.a0;
else
- hyp_ffa_version = FFA_VERSION_1_1;
+ hyp_ffa_version = FFA_VERSION_1_2;
tx = pages;
pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE;
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 3206b2c07f82..29430c031095 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -546,6 +546,18 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt)
cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize();
}
+static void handle___pkvm_reserve_vm(struct kvm_cpu_context *host_ctxt)
+{
+ cpu_reg(host_ctxt, 1) = __pkvm_reserve_vm();
+}
+
+static void handle___pkvm_unreserve_vm(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+
+ __pkvm_unreserve_vm(handle);
+}
+
static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1);
@@ -606,6 +618,8 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_timer_set_cntvoff),
HANDLE_FUNC(__vgic_v3_save_vmcr_aprs),
HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs),
+ HANDLE_FUNC(__pkvm_reserve_vm),
+ HANDLE_FUNC(__pkvm_unreserve_vm),
HANDLE_FUNC(__pkvm_init_vm),
HANDLE_FUNC(__pkvm_init_vcpu),
HANDLE_FUNC(__pkvm_teardown_vm),
diff --git a/arch/arm64/kvm/hyp/nvhe/list_debug.c b/arch/arm64/kvm/hyp/nvhe/list_debug.c
index 46a2d4f2b3c6..baa6260f88dc 100644
--- a/arch/arm64/kvm/hyp/nvhe/list_debug.c
+++ b/arch/arm64/kvm/hyp/nvhe/list_debug.c
@@ -17,7 +17,7 @@ static inline __must_check bool nvhe_check_data_corruption(bool v)
bool corruption = unlikely(condition); \
if (corruption) { \
if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \
- BUG_ON(1); \
+ BUG(); \
} else \
WARN_ON(1); \
} \
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 8957734d6183..ddc8beb55eee 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -1010,9 +1010,12 @@ static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ip
return ret;
if (!kvm_pte_valid(pte))
return -ENOENT;
- if (kvm_granule_size(level) != size)
+ if (size && kvm_granule_size(level) != size)
return -E2BIG;
+ if (!size)
+ size = kvm_granule_size(level);
+
state = guest_get_page_state(pte, ipa);
if (state != PKVM_PAGE_SHARED_BORROWED)
return -EPERM;
@@ -1100,7 +1103,7 @@ int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_
if (prot & ~KVM_PGTABLE_PROT_RWX)
return -EINVAL;
- assert_host_shared_guest(vm, ipa, PAGE_SIZE);
+ assert_host_shared_guest(vm, ipa, 0);
guest_lock_component(vm);
ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0);
guest_unlock_component(vm);
@@ -1156,7 +1159,7 @@ int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu)
if (pkvm_hyp_vm_is_protected(vm))
return -EPERM;
- assert_host_shared_guest(vm, ipa, PAGE_SIZE);
+ assert_host_shared_guest(vm, ipa, 0);
guest_lock_component(vm);
kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0);
guest_unlock_component(vm);
diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c
index 338505cb0171..05774aed09cb 100644
--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
+++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -23,8 +23,8 @@ unsigned int kvm_arm_vmid_bits;
unsigned int kvm_host_sve_max_vl;
/*
- * The currently loaded hyp vCPU for each physical CPU. Used only when
- * protected KVM is enabled, but for both protected and non-protected VMs.
+ * The currently loaded hyp vCPU for each physical CPU. Used in protected mode
+ * for both protected and non-protected VMs.
*/
static DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, loaded_hyp_vcpu);
@@ -135,7 +135,7 @@ static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
- /* Protected KVM does not support AArch32 guests. */
+ /* No AArch32 support for protected guests. */
if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL0, AARCH32) ||
kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL1, AARCH32))
return -EINVAL;
@@ -192,6 +192,11 @@ static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu)
*/
#define HANDLE_OFFSET 0x1000
+/*
+ * Marks a reserved but not yet used entry in the VM table.
+ */
+#define RESERVED_ENTRY ((void *)0xa110ca7ed)
+
static unsigned int vm_handle_to_idx(pkvm_handle_t handle)
{
return handle - HANDLE_OFFSET;
@@ -210,8 +215,8 @@ static pkvm_handle_t idx_to_vm_handle(unsigned int idx)
DEFINE_HYP_SPINLOCK(vm_table_lock);
/*
- * The table of VM entries for protected VMs in hyp.
- * Allocated at hyp initialization and setup.
+ * A table that tracks all VMs in protected mode.
+ * Allocated during hyp initialization and setup.
*/
static struct pkvm_hyp_vm **vm_table;
@@ -231,6 +236,10 @@ static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle)
if (unlikely(idx >= KVM_MAX_PVMS))
return NULL;
+ /* A reserved entry doesn't represent an initialized VM. */
+ if (unlikely(vm_table[idx] == RESERVED_ENTRY))
+ return NULL;
+
return vm_table[idx];
}
@@ -401,14 +410,26 @@ static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[],
}
static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm,
- unsigned int nr_vcpus)
+ unsigned int nr_vcpus, pkvm_handle_t handle)
{
+ struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu;
+ int idx = vm_handle_to_idx(handle);
+
+ hyp_vm->kvm.arch.pkvm.handle = handle;
+
hyp_vm->host_kvm = host_kvm;
hyp_vm->kvm.created_vcpus = nr_vcpus;
- hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr;
- hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled);
+ hyp_vm->kvm.arch.pkvm.is_protected = READ_ONCE(host_kvm->arch.pkvm.is_protected);
+ hyp_vm->kvm.arch.pkvm.is_created = true;
hyp_vm->kvm.arch.flags = 0;
pkvm_init_features_from_host(hyp_vm, host_kvm);
+
+ /* VMID 0 is reserved for the host */
+ atomic64_set(&mmu->vmid.id, idx + 1);
+
+ mmu->vtcr = host_mmu.arch.mmu.vtcr;
+ mmu->arch = &hyp_vm->kvm.arch;
+ mmu->pgt = &hyp_vm->pgt;
}
static int pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu)
@@ -480,7 +501,7 @@ done:
return ret;
}
-static int find_free_vm_table_entry(struct kvm *host_kvm)
+static int find_free_vm_table_entry(void)
{
int i;
@@ -493,15 +514,13 @@ static int find_free_vm_table_entry(struct kvm *host_kvm)
}
/*
- * Allocate a VM table entry and insert a pointer to the new vm.
+ * Reserve a VM table entry.
*
- * Return a unique handle to the protected VM on success,
+ * Return a unique handle to the VM on success,
* negative error code on failure.
*/
-static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm,
- struct pkvm_hyp_vm *hyp_vm)
+static int allocate_vm_table_entry(void)
{
- struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu;
int idx;
hyp_assert_lock_held(&vm_table_lock);
@@ -514,20 +533,57 @@ static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm,
if (unlikely(!vm_table))
return -EINVAL;
- idx = find_free_vm_table_entry(host_kvm);
- if (idx < 0)
+ idx = find_free_vm_table_entry();
+ if (unlikely(idx < 0))
return idx;
- hyp_vm->kvm.arch.pkvm.handle = idx_to_vm_handle(idx);
+ vm_table[idx] = RESERVED_ENTRY;
- /* VMID 0 is reserved for the host */
- atomic64_set(&mmu->vmid.id, idx + 1);
+ return idx;
+}
- mmu->arch = &hyp_vm->kvm.arch;
- mmu->pgt = &hyp_vm->pgt;
+static int __insert_vm_table_entry(pkvm_handle_t handle,
+ struct pkvm_hyp_vm *hyp_vm)
+{
+ unsigned int idx;
+
+ hyp_assert_lock_held(&vm_table_lock);
+
+ /*
+ * Initializing protected state might have failed, yet a malicious
+ * host could trigger this function. Thus, ensure that 'vm_table'
+ * exists.
+ */
+ if (unlikely(!vm_table))
+ return -EINVAL;
+
+ idx = vm_handle_to_idx(handle);
+ if (unlikely(idx >= KVM_MAX_PVMS))
+ return -EINVAL;
+
+ if (unlikely(vm_table[idx] != RESERVED_ENTRY))
+ return -EINVAL;
vm_table[idx] = hyp_vm;
- return hyp_vm->kvm.arch.pkvm.handle;
+
+ return 0;
+}
+
+/*
+ * Insert a pointer to the initialized VM into the VM table.
+ *
+ * Return 0 on success, or negative error code on failure.
+ */
+static int insert_vm_table_entry(pkvm_handle_t handle,
+ struct pkvm_hyp_vm *hyp_vm)
+{
+ int ret;
+
+ hyp_spin_lock(&vm_table_lock);
+ ret = __insert_vm_table_entry(handle, hyp_vm);
+ hyp_spin_unlock(&vm_table_lock);
+
+ return ret;
}
/*
@@ -594,10 +650,45 @@ static void unmap_donated_memory_noclear(void *va, size_t size)
}
/*
- * Initialize the hypervisor copy of the protected VM state using the
- * memory donated by the host.
+ * Reserves an entry in the hypervisor for a new VM in protected mode.
*
- * Unmaps the donated memory from the host at stage 2.
+ * Return a unique handle to the VM on success, negative error code on failure.
+ */
+int __pkvm_reserve_vm(void)
+{
+ int ret;
+
+ hyp_spin_lock(&vm_table_lock);
+ ret = allocate_vm_table_entry();
+ hyp_spin_unlock(&vm_table_lock);
+
+ if (ret < 0)
+ return ret;
+
+ return idx_to_vm_handle(ret);
+}
+
+/*
+ * Removes a reserved entry, but only if is hasn't been used yet.
+ * Otherwise, the VM needs to be destroyed.
+ */
+void __pkvm_unreserve_vm(pkvm_handle_t handle)
+{
+ unsigned int idx = vm_handle_to_idx(handle);
+
+ if (unlikely(!vm_table))
+ return;
+
+ hyp_spin_lock(&vm_table_lock);
+ if (likely(idx < KVM_MAX_PVMS && vm_table[idx] == RESERVED_ENTRY))
+ remove_vm_table_entry(handle);
+ hyp_spin_unlock(&vm_table_lock);
+}
+
+/*
+ * Initialize the hypervisor copy of the VM state using host-donated memory.
+ *
+ * Unmap the donated memory from the host at stage 2.
*
* host_kvm: A pointer to the host's struct kvm.
* vm_hva: The host va of the area being donated for the VM state.
@@ -606,8 +697,7 @@ static void unmap_donated_memory_noclear(void *va, size_t size)
* the VM. Must be page aligned. Its size is implied by the VM's
* VTCR.
*
- * Return a unique handle to the protected VM on success,
- * negative error code on failure.
+ * Return 0 success, negative error code on failure.
*/
int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
unsigned long pgd_hva)
@@ -615,6 +705,7 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
struct pkvm_hyp_vm *hyp_vm = NULL;
size_t vm_size, pgd_size;
unsigned int nr_vcpus;
+ pkvm_handle_t handle;
void *pgd = NULL;
int ret;
@@ -628,6 +719,12 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
goto err_unpin_kvm;
}
+ handle = READ_ONCE(host_kvm->arch.pkvm.handle);
+ if (unlikely(handle < HANDLE_OFFSET)) {
+ ret = -EINVAL;
+ goto err_unpin_kvm;
+ }
+
vm_size = pkvm_get_hyp_vm_size(nr_vcpus);
pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.mmu.vtcr);
@@ -641,24 +738,19 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
if (!pgd)
goto err_remove_mappings;
- init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus);
-
- hyp_spin_lock(&vm_table_lock);
- ret = insert_vm_table_entry(host_kvm, hyp_vm);
- if (ret < 0)
- goto err_unlock;
+ init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus, handle);
ret = kvm_guest_prepare_stage2(hyp_vm, pgd);
if (ret)
- goto err_remove_vm_table_entry;
- hyp_spin_unlock(&vm_table_lock);
+ goto err_remove_mappings;
- return hyp_vm->kvm.arch.pkvm.handle;
+ /* Must be called last since this publishes the VM. */
+ ret = insert_vm_table_entry(handle, hyp_vm);
+ if (ret)
+ goto err_remove_mappings;
+
+ return 0;
-err_remove_vm_table_entry:
- remove_vm_table_entry(hyp_vm->kvm.arch.pkvm.handle);
-err_unlock:
- hyp_spin_unlock(&vm_table_lock);
err_remove_mappings:
unmap_donated_memory(hyp_vm, vm_size);
unmap_donated_memory(pgd, pgd_size);
@@ -668,10 +760,9 @@ err_unpin_kvm:
}
/*
- * Initialize the hypervisor copy of the protected vCPU state using the
- * memory donated by the host.
+ * Initialize the hypervisor copy of the vCPU state using host-donated memory.
*
- * handle: The handle for the protected vm.
+ * handle: The hypervisor handle for the vm.
* host_vcpu: A pointer to the corresponding host vcpu.
* vcpu_hva: The host va of the area being donated for the vcpu state.
* Must be page aligned. The size of the area must be equal to
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index a48d3f5a5afb..90bd014e952f 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -192,6 +192,7 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx,
enum pkvm_page_state state;
struct hyp_page *page;
phys_addr_t phys;
+ enum kvm_pgtable_prot prot;
if (!kvm_pte_valid(ctx->old))
return 0;
@@ -210,11 +211,18 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx,
* configured in the hypervisor stage-1, and make sure to propagate them
* to the hyp_vmemmap state.
*/
- state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(ctx->old));
+ prot = kvm_pgtable_hyp_pte_prot(ctx->old);
+ state = pkvm_getstate(prot);
switch (state) {
case PKVM_PAGE_OWNED:
set_hyp_state(page, PKVM_PAGE_OWNED);
- return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP);
+ /* hyp text is RO in the host stage-2 to be inspected on panic. */
+ if (prot == PAGE_HYP_EXEC) {
+ set_host_state(page, PKVM_NOPAGE);
+ return host_stage2_idmap_locked(phys, PAGE_SIZE, KVM_PGTABLE_PROT_R);
+ } else {
+ return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP);
+ }
case PKVM_PAGE_SHARED_OWNED:
set_hyp_state(page, PKVM_PAGE_SHARED_OWNED);
set_host_state(page, PKVM_PAGE_SHARED_BORROWED);
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index ccd575d5f6de..d3b9ec8a7c28 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -50,6 +50,10 @@ extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
static void __activate_traps(struct kvm_vcpu *vcpu)
{
___activate_traps(vcpu, vcpu->arch.hcr_el2);
+
+ *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2);
+ write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
+
__activate_traps_common(vcpu);
__activate_cptr_traps(vcpu);
@@ -93,6 +97,8 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
isb();
}
+ write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2);
+
__deactivate_traps_common(vcpu);
write_sysreg_hcr(this_cpu_ptr(&kvm_init_params)->hcr_el2);
diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
index 1ddd9ed3cbb3..82da9b03692d 100644
--- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c
+++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
@@ -253,6 +253,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
*vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
*vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR);
+ __vcpu_assign_sys_reg(vcpu, VBAR_EL1, read_sysreg_el1(SYS_VBAR));
kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
@@ -372,6 +373,9 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
/* Debug and Trace Registers are restricted. */
+ /* Group 1 ID registers */
+ HOST_HANDLED(SYS_REVIDR_EL1),
+
/* AArch64 mappings of the AArch32 ID registers */
/* CRm=1 */
AARCH32(SYS_ID_PFR0_EL1),
@@ -460,6 +464,7 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
HOST_HANDLED(SYS_CCSIDR_EL1),
HOST_HANDLED(SYS_CLIDR_EL1),
+ HOST_HANDLED(SYS_AIDR_EL1),
HOST_HANDLED(SYS_CSSELR_EL1),
HOST_HANDLED(SYS_CTR_EL0),
diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
index 87a54375bd6e..78579b31a420 100644
--- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
+++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
@@ -20,7 +20,7 @@ static bool __is_be(struct kvm_vcpu *vcpu)
if (vcpu_mode_is_32bit(vcpu))
return !!(read_sysreg_el2(SYS_SPSR) & PSR_AA32_E_BIT);
- return !!(read_sysreg(SCTLR_EL1) & SCTLR_ELx_EE);
+ return !!(read_sysreg_el1(SYS_SCTLR) & SCTLR_ELx_EE);
}
/*
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index d81275790e69..acd909b7f225 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -295,12 +295,8 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
}
}
- /*
- * GICv5 BET0 FEAT_GCIE_LEGACY doesn't include ICC_SRE_EL2. This is due
- * to be relaxed in a future spec release, at which point this in
- * condition can be dropped.
- */
- if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) {
+ /* Only disable SRE if the host implements the GICv2 interface */
+ if (static_branch_unlikely(&vgic_v3_has_v2_compat)) {
/*
* Prevent the guest from touching the ICC_SRE_EL1 system
* register. Note that this may not have any effect, as
@@ -329,19 +325,16 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if)
cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
}
- /*
- * Can be dropped in the future when GICv5 spec is relaxed. See comment
- * above.
- */
- if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) {
+ /* Only restore SRE if the host implements the GICv2 interface */
+ if (static_branch_unlikely(&vgic_v3_has_v2_compat)) {
val = read_gicreg(ICC_SRE_EL2);
write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
- }
- if (!cpu_if->vgic_sre) {
- /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
- isb();
- write_gicreg(1, ICC_SRE_EL1);
+ if (!cpu_if->vgic_sre) {
+ /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
+ isb();
+ write_gicreg(1, ICC_SRE_EL1);
+ }
}
/*
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index e482181c6632..9984c492305a 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -43,8 +43,11 @@ DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
*
* - API/APK: they are already accounted for by vcpu_load(), and can
* only take effect across a load/put cycle (such as ERET)
+ *
+ * - FIEN: no way we let a guest have access to the RAS "Common Fault
+ * Injection" thing, whatever that does
*/
-#define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK)
+#define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK | HCR_FIEN)
static u64 __compute_hcr(struct kvm_vcpu *vcpu)
{
@@ -92,6 +95,13 @@ static u64 __compute_hcr(struct kvm_vcpu *vcpu)
/* Force NV2 in case the guest is forgetful... */
guest_hcr |= HCR_NV2;
}
+
+ /*
+ * Exclude the guest's TWED configuration if it hasn't set TWE
+ * to avoid potentially delaying traps for the host.
+ */
+ if (!(guest_hcr & HCR_TWE))
+ guest_hcr &= ~(HCR_EL2_TWEDEn | HCR_EL2_TWEDEL);
}
BUG_ON(host_data_test_flag(VCPU_IN_HYP_CONTEXT) &&
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index 6745f38b64f9..dfcd66c65517 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -106,7 +106,30 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
{
unsigned long cpsr = *vcpu_cpsr(vcpu);
bool is_aarch32 = vcpu_mode_is_32bit(vcpu);
- u64 esr = 0;
+ u64 esr = 0, fsc;
+ int level;
+
+ /*
+ * If injecting an abort from a failed S1PTW, rewalk the S1 PTs to
+ * find the failing level. If we can't find it, assume the error was
+ * transient and restart without changing the state.
+ */
+ if (kvm_vcpu_abt_iss1tw(vcpu)) {
+ u64 hpfar = kvm_vcpu_get_fault_ipa(vcpu);
+ int ret;
+
+ if (hpfar == INVALID_GPA)
+ return;
+
+ ret = __kvm_find_s1_desc_level(vcpu, addr, hpfar, &level);
+ if (ret)
+ return;
+
+ WARN_ON_ONCE(level < -1 || level > 3);
+ fsc = ESR_ELx_FSC_SEA_TTW(level);
+ } else {
+ fsc = ESR_ELx_FSC_EXTABT;
+ }
/* This delight is brought to you by FEAT_DoubleFault2. */
if (effective_sctlr2_ease(vcpu))
@@ -133,7 +156,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
if (!is_iabt)
esr |= ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT;
- esr |= ESR_ELx_FSC_EXTABT;
+ esr |= fsc;
vcpu_write_sys_reg(vcpu, addr, exception_far_elx(vcpu));
vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu));
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 1c78864767c5..7cc964af8d30 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -4,19 +4,20 @@
* Author: Christoffer Dall <c.dall@virtualopensystems.com>
*/
+#include <linux/acpi.h>
#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
+#include <asm/acpi.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_pkvm.h>
-#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>
@@ -1073,6 +1074,10 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
mmu->pgt = NULL;
free_percpu(mmu->last_vcpu_ran);
}
+
+ if (kvm_is_nested_s2_mmu(kvm, mmu))
+ kvm_init_nested_s2_mmu(mmu);
+
write_unlock(&kvm->mmu_lock);
if (pgt) {
@@ -1426,11 +1431,8 @@ static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
* able to see the page's tags and therefore they must be initialised first. If
* PG_mte_tagged is set, tags have already been initialised.
*
- * The race in the test/set of the PG_mte_tagged flag is handled by:
- * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
- * racing to santise the same page
- * - mmap_lock protects between a VM faulting a page in and the VMM performing
- * an mprotect() to add VM_MTE
+ * Must be called with kvm->mmu_lock held to ensure the memory remains mapped
+ * while the tags are zeroed.
*/
static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
unsigned long size)
@@ -1477,13 +1479,132 @@ static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
}
}
+static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache,
+ void **memcache)
+{
+ int min_pages;
+
+ if (!is_protected_kvm_enabled())
+ *memcache = &vcpu->arch.mmu_page_cache;
+ else
+ *memcache = &vcpu->arch.pkvm_memcache;
+
+ if (!topup_memcache)
+ return 0;
+
+ min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
+
+ if (!is_protected_kvm_enabled())
+ return kvm_mmu_topup_memory_cache(*memcache, min_pages);
+
+ return topup_hyp_memcache(*memcache, min_pages);
+}
+
+/*
+ * Potentially reduce shadow S2 permissions to match the guest's own S2. For
+ * exec faults, we'd only reach this point if the guest actually allowed it (see
+ * kvm_s2_handle_perm_fault).
+ *
+ * Also encode the level of the original translation in the SW bits of the leaf
+ * entry as a proxy for the span of that translation. This will be retrieved on
+ * TLB invalidation from the guest and used to limit the invalidation scope if a
+ * TTL hint or a range isn't provided.
+ */
+static void adjust_nested_fault_perms(struct kvm_s2_trans *nested,
+ enum kvm_pgtable_prot *prot,
+ bool *writable)
+{
+ *writable &= kvm_s2_trans_writable(nested);
+ if (!kvm_s2_trans_readable(nested))
+ *prot &= ~KVM_PGTABLE_PROT_R;
+
+ *prot |= kvm_encode_nested_level(nested);
+}
+
+#define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED)
+
+static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+ struct kvm_s2_trans *nested,
+ struct kvm_memory_slot *memslot, bool is_perm)
+{
+ bool write_fault, exec_fault, writable;
+ enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;
+ enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
+ struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
+ unsigned long mmu_seq;
+ struct page *page;
+ struct kvm *kvm = vcpu->kvm;
+ void *memcache;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+ int ret;
+
+ ret = prepare_mmu_memcache(vcpu, true, &memcache);
+ if (ret)
+ return ret;
+
+ if (nested)
+ gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT;
+ else
+ gfn = fault_ipa >> PAGE_SHIFT;
+
+ write_fault = kvm_is_write_fault(vcpu);
+ exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
+
+ VM_WARN_ON_ONCE(write_fault && exec_fault);
+
+ mmu_seq = kvm->mmu_invalidate_seq;
+ /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
+ smp_rmb();
+
+ ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL);
+ if (ret) {
+ kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE,
+ write_fault, exec_fault, false);
+ return ret;
+ }
+
+ writable = !(memslot->flags & KVM_MEM_READONLY);
+
+ if (nested)
+ adjust_nested_fault_perms(nested, &prot, &writable);
+
+ if (writable)
+ prot |= KVM_PGTABLE_PROT_W;
+
+ if (exec_fault ||
+ (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
+ (!nested || kvm_s2_trans_executable(nested))))
+ prot |= KVM_PGTABLE_PROT_X;
+
+ kvm_fault_lock(kvm);
+ if (mmu_invalidate_retry(kvm, mmu_seq)) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE,
+ __pfn_to_phys(pfn), prot,
+ memcache, flags);
+
+out_unlock:
+ kvm_release_faultin_page(kvm, page, !!ret, writable);
+ kvm_fault_unlock(kvm);
+
+ if (writable && !ret)
+ mark_page_dirty_in_slot(kvm, memslot, gfn);
+
+ return ret != -EAGAIN ? ret : 0;
+}
+
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_s2_trans *nested,
struct kvm_memory_slot *memslot, unsigned long hva,
bool fault_is_perm)
{
int ret = 0;
- bool write_fault, writable, force_pte = false;
+ bool topup_memcache;
+ bool write_fault, writable;
bool exec_fault, mte_allowed, is_vma_cacheable;
bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
unsigned long mmu_seq;
@@ -1495,28 +1616,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
gfn_t gfn;
kvm_pfn_t pfn;
bool logging_active = memslot_is_logging(memslot);
+ bool force_pte = logging_active;
long vma_pagesize, fault_granule;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt;
struct page *page;
vm_flags_t vm_flags;
- enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
+ enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;
if (fault_is_perm)
fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
write_fault = kvm_is_write_fault(vcpu);
exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
- VM_BUG_ON(write_fault && exec_fault);
-
- if (fault_is_perm && !write_fault && !exec_fault) {
- kvm_err("Unexpected L2 read permission error\n");
- return -EFAULT;
- }
-
- if (!is_protected_kvm_enabled())
- memcache = &vcpu->arch.mmu_page_cache;
- else
- memcache = &vcpu->arch.pkvm_memcache;
+ VM_WARN_ON_ONCE(write_fault && exec_fault);
/*
* Permission faults just need to update the existing leaf entry,
@@ -1524,17 +1636,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* only exception to this is when dirty logging is enabled at runtime
* and a write fault needs to collapse a block entry into a table.
*/
- if (!fault_is_perm || (logging_active && write_fault)) {
- int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
-
- if (!is_protected_kvm_enabled())
- ret = kvm_mmu_topup_memory_cache(memcache, min_pages);
- else
- ret = topup_hyp_memcache(memcache, min_pages);
-
- if (ret)
- return ret;
- }
+ topup_memcache = !fault_is_perm || (logging_active && write_fault);
+ ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache);
+ if (ret)
+ return ret;
/*
* Let's check if we will get back a huge page backed by hugetlbfs, or
@@ -1548,16 +1653,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
- /*
- * logging_active is guaranteed to never be true for VM_PFNMAP
- * memslots.
- */
- if (logging_active) {
- force_pte = true;
+ if (force_pte)
vma_shift = PAGE_SHIFT;
- } else {
+ else
vma_shift = get_vma_page_shift(vma, hva);
- }
switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
@@ -1609,7 +1708,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
max_map_size = PAGE_SIZE;
force_pte = (max_map_size == PAGE_SIZE);
- vma_pagesize = min(vma_pagesize, (long)max_map_size);
+ vma_pagesize = min_t(long, vma_pagesize, max_map_size);
}
/*
@@ -1642,7 +1741,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
* with the smp_wmb() in kvm_mmu_invalidate_end().
*/
- mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+ mmu_seq = kvm->mmu_invalidate_seq;
mmap_read_unlock(current->mm);
pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
@@ -1673,7 +1772,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* cache maintenance.
*/
if (!kvm_supports_cacheable_pfnmap())
- return -EFAULT;
+ ret = -EFAULT;
} else {
/*
* If the page was identified as device early by looking at
@@ -1696,27 +1795,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
if (exec_fault && s2_force_noncacheable)
- return -ENOEXEC;
-
- /*
- * Potentially reduce shadow S2 permissions to match the guest's own
- * S2. For exec faults, we'd only reach this point if the guest
- * actually allowed it (see kvm_s2_handle_perm_fault).
- *
- * Also encode the level of the original translation in the SW bits
- * of the leaf entry as a proxy for the span of that translation.
- * This will be retrieved on TLB invalidation from the guest and
- * used to limit the invalidation scope if a TTL hint or a range
- * isn't provided.
- */
- if (nested) {
- writable &= kvm_s2_trans_writable(nested);
- if (!kvm_s2_trans_readable(nested))
- prot &= ~KVM_PGTABLE_PROT_R;
+ ret = -ENOEXEC;
- prot |= kvm_encode_nested_level(nested);
+ if (ret) {
+ kvm_release_page_unused(page);
+ return ret;
}
+ if (nested)
+ adjust_nested_fault_perms(nested, &prot, &writable);
+
kvm_fault_lock(kvm);
pgt = vcpu->arch.hw_mmu->pgt;
if (mmu_invalidate_retry(kvm, mmu_seq)) {
@@ -1811,6 +1899,19 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
read_unlock(&vcpu->kvm->mmu_lock);
}
+int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Give APEI the opportunity to claim the abort before handling it
+ * within KVM. apei_claim_sea() expects to be called with IRQs enabled.
+ */
+ lockdep_assert_irqs_enabled();
+ if (apei_claim_sea(NULL) == 0)
+ return 1;
+
+ return kvm_inject_serror(vcpu);
+}
+
/**
* kvm_handle_guest_abort - handles all 2nd stage aborts
* @vcpu: the VCPU pointer
@@ -1834,17 +1935,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
gfn_t gfn;
int ret, idx;
- /* Synchronous External Abort? */
- if (kvm_vcpu_abt_issea(vcpu)) {
- /*
- * For RAS the host kernel may handle this abort.
- * There is no need to pass the error into the guest.
- */
- if (kvm_handle_guest_sea())
- return kvm_inject_serror(vcpu);
-
- return 1;
- }
+ if (kvm_vcpu_abt_issea(vcpu))
+ return kvm_handle_guest_sea(vcpu);
esr = kvm_vcpu_get_esr(vcpu);
@@ -1981,8 +2073,15 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
goto out_unlock;
}
- ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
- esr_fsc_is_permission_fault(esr));
+ VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
+ !write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));
+
+ if (kvm_slot_has_gmem(memslot))
+ ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
+ esr_fsc_is_permission_fault(esr));
+ else
+ ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
+ esr_fsc_is_permission_fault(esr));
if (ret == 0)
ret = 1;
out:
@@ -2214,6 +2313,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
return -EFAULT;
+ /*
+ * Only support guest_memfd backed memslots with mappable memory, since
+ * there aren't any CoCo VMs that support only private memory on arm64.
+ */
+ if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new))
+ return -EINVAL;
+
hva = new->userspace_addr;
reg_end = hva + (new->npages << PAGE_SHIFT);
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 153b3e11b115..7a045cad6bdf 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -349,7 +349,7 @@ static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi)
wi->sl = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
/* Global limit for now, should eventually be per-VM */
wi->max_oa_bits = min(get_kvm_ipa_limit(),
- ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr)));
+ ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false));
}
int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
@@ -847,7 +847,7 @@ static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end)
ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
vt->wr.level));
- ipa_start = vt->wr.pa & (ipa_size - 1);
+ ipa_start = vt->wr.pa & ~(ipa_size - 1);
ipa_end = ipa_start + ipa_size;
if (ipa_end <= start || ipa_start >= end)
@@ -887,7 +887,7 @@ static void invalidate_vncr_va(struct kvm *kvm,
va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
vt->wr.level));
- va_start = vt->gva & (va_size - 1);
+ va_start = vt->gva & ~(va_size - 1);
va_end = va_start + va_size;
switch (scope->type) {
@@ -1172,8 +1172,9 @@ static u64 read_vncr_el2(struct kvm_vcpu *vcpu)
return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48);
}
-static int kvm_translate_vncr(struct kvm_vcpu *vcpu)
+static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem)
{
+ struct kvm_memory_slot *memslot;
bool write_fault, writable;
unsigned long mmu_seq;
struct vncr_tlb *vt;
@@ -1216,10 +1217,25 @@ static int kvm_translate_vncr(struct kvm_vcpu *vcpu)
smp_rmb();
gfn = vt->wr.pa >> PAGE_SHIFT;
- pfn = kvm_faultin_pfn(vcpu, gfn, write_fault, &writable, &page);
- if (is_error_noslot_pfn(pfn) || (write_fault && !writable))
+ memslot = gfn_to_memslot(vcpu->kvm, gfn);
+ if (!memslot)
return -EFAULT;
+ *is_gmem = kvm_slot_has_gmem(memslot);
+ if (!*is_gmem) {
+ pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
+ &writable, &page);
+ if (is_error_noslot_pfn(pfn) || (write_fault && !writable))
+ return -EFAULT;
+ } else {
+ ret = kvm_gmem_get_pfn(vcpu->kvm, memslot, gfn, &pfn, &page, NULL);
+ if (ret) {
+ kvm_prepare_memory_fault_exit(vcpu, vt->wr.pa, PAGE_SIZE,
+ write_fault, false, false);
+ return ret;
+ }
+ }
+
scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
if (mmu_invalidate_retry(vcpu->kvm, mmu_seq))
return -EAGAIN;
@@ -1276,7 +1292,7 @@ static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu)
!(tcr & TCR_ASID16))
asid &= GENMASK(7, 0);
- return asid != vt->wr.asid;
+ return asid == vt->wr.asid;
}
return true;
@@ -1287,28 +1303,44 @@ int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu)
struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
u64 esr = kvm_vcpu_get_esr(vcpu);
- BUG_ON(!(esr & ESR_ELx_VNCR_SHIFT));
+ WARN_ON_ONCE(!(esr & ESR_ELx_VNCR));
+
+ if (kvm_vcpu_abt_issea(vcpu))
+ return kvm_handle_guest_sea(vcpu);
if (esr_fsc_is_permission_fault(esr)) {
inject_vncr_perm(vcpu);
} else if (esr_fsc_is_translation_fault(esr)) {
- bool valid;
+ bool valid, is_gmem = false;
int ret;
scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
valid = kvm_vncr_tlb_lookup(vcpu);
if (!valid)
- ret = kvm_translate_vncr(vcpu);
+ ret = kvm_translate_vncr(vcpu, &is_gmem);
else
ret = -EPERM;
switch (ret) {
case -EAGAIN:
- case -ENOMEM:
/* Let's try again... */
break;
+ case -ENOMEM:
+ /*
+ * For guest_memfd, this indicates that it failed to
+ * create a folio to back the memory. Inform userspace.
+ */
+ if (is_gmem)
+ return 0;
+ /* Otherwise, let's try again... */
+ break;
case -EFAULT:
+ case -EIO:
+ case -EHWPOISON:
+ if (is_gmem)
+ return 0;
+ fallthrough;
case -EINVAL:
case -ENOENT:
case -EACCES:
@@ -1459,9 +1491,16 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
case SYS_ID_AA64PFR1_EL1:
/* Only support BTI, SSBS, CSV2_frac */
- val &= (ID_AA64PFR1_EL1_BT |
- ID_AA64PFR1_EL1_SSBS |
- ID_AA64PFR1_EL1_CSV2_frac);
+ val &= ~(ID_AA64PFR1_EL1_PFAR |
+ ID_AA64PFR1_EL1_MTEX |
+ ID_AA64PFR1_EL1_THE |
+ ID_AA64PFR1_EL1_GCS |
+ ID_AA64PFR1_EL1_MTE_frac |
+ ID_AA64PFR1_EL1_NMI |
+ ID_AA64PFR1_EL1_SME |
+ ID_AA64PFR1_EL1_RES0 |
+ ID_AA64PFR1_EL1_MPAM_frac |
+ ID_AA64PFR1_EL1_MTE);
break;
case SYS_ID_AA64MMFR0_EL1:
@@ -1514,12 +1553,11 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
break;
case SYS_ID_AA64MMFR1_EL1:
- val &= (ID_AA64MMFR1_EL1_HCX |
- ID_AA64MMFR1_EL1_PAN |
- ID_AA64MMFR1_EL1_LO |
- ID_AA64MMFR1_EL1_HPDS |
- ID_AA64MMFR1_EL1_VH |
- ID_AA64MMFR1_EL1_VMIDBits);
+ val &= ~(ID_AA64MMFR1_EL1_CMOW |
+ ID_AA64MMFR1_EL1_nTLBPA |
+ ID_AA64MMFR1_EL1_ETS |
+ ID_AA64MMFR1_EL1_XNX |
+ ID_AA64MMFR1_EL1_HAFDBS);
/* FEAT_E2H0 implies no VHE */
if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features))
val &= ~ID_AA64MMFR1_EL1_VH;
@@ -1561,14 +1599,22 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
case SYS_ID_AA64DFR0_EL1:
/* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */
- val &= (ID_AA64DFR0_EL1_PMUVer |
- ID_AA64DFR0_EL1_WRPs |
- ID_AA64DFR0_EL1_BRPs |
- ID_AA64DFR0_EL1_DebugVer|
- ID_AA64DFR0_EL1_HPMN0);
-
- /* Cap Debug to ARMv8.1 */
- val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, VHE);
+ val &= ~(ID_AA64DFR0_EL1_ExtTrcBuff |
+ ID_AA64DFR0_EL1_BRBE |
+ ID_AA64DFR0_EL1_MTPMU |
+ ID_AA64DFR0_EL1_TraceBuffer |
+ ID_AA64DFR0_EL1_TraceFilt |
+ ID_AA64DFR0_EL1_PMSVer |
+ ID_AA64DFR0_EL1_CTX_CMPs |
+ ID_AA64DFR0_EL1_SEBEP |
+ ID_AA64DFR0_EL1_PMSS |
+ ID_AA64DFR0_EL1_TraceVer);
+
+ /*
+ * FEAT_Debugv8p9 requires support for extended breakpoints /
+ * watchpoints.
+ */
+ val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8);
break;
}
@@ -1793,3 +1839,33 @@ void kvm_nested_sync_hwstate(struct kvm_vcpu *vcpu)
if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING)))
kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu));
}
+
+/*
+ * KVM unconditionally sets most of these traps anyway but use an allowlist
+ * to document the guest hypervisor traps that may take precedence and guard
+ * against future changes to the non-nested trap configuration.
+ */
+#define NV_MDCR_GUEST_INCLUDE (MDCR_EL2_TDE | \
+ MDCR_EL2_TDA | \
+ MDCR_EL2_TDRA | \
+ MDCR_EL2_TTRF | \
+ MDCR_EL2_TPMS | \
+ MDCR_EL2_TPM | \
+ MDCR_EL2_TPMCR | \
+ MDCR_EL2_TDCC | \
+ MDCR_EL2_TDOSA)
+
+void kvm_nested_setup_mdcr_el2(struct kvm_vcpu *vcpu)
+{
+ u64 guest_mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2);
+
+ /*
+ * In yet another example where FEAT_NV2 is fscking broken, accesses
+ * to MDSCR_EL1 are redirected to the VNCR despite having an effect
+ * at EL2. Use a big hammer to apply sanity.
+ */
+ if (is_hyp_ctxt(vcpu))
+ vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
+ else
+ vcpu->arch.mdcr_el2 |= (guest_mdcr & NV_MDCR_GUEST_INCLUDE);
+}
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index fcd70bfe44fb..24f0f8a8c943 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -85,16 +85,23 @@ void __init kvm_hyp_reserve(void)
hyp_mem_base);
}
-static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
+static void __pkvm_destroy_hyp_vm(struct kvm *kvm)
{
- if (host_kvm->arch.pkvm.handle) {
+ if (pkvm_hyp_vm_is_created(kvm)) {
WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm,
- host_kvm->arch.pkvm.handle));
+ kvm->arch.pkvm.handle));
+ } else if (kvm->arch.pkvm.handle) {
+ /*
+ * The VM could have been reserved but hyp initialization has
+ * failed. Make sure to unreserve it.
+ */
+ kvm_call_hyp_nvhe(__pkvm_unreserve_vm, kvm->arch.pkvm.handle);
}
- host_kvm->arch.pkvm.handle = 0;
- free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc);
- free_hyp_memcache(&host_kvm->arch.pkvm.stage2_teardown_mc);
+ kvm->arch.pkvm.handle = 0;
+ kvm->arch.pkvm.is_created = false;
+ free_hyp_memcache(&kvm->arch.pkvm.teardown_mc);
+ free_hyp_memcache(&kvm->arch.pkvm.stage2_teardown_mc);
}
static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
@@ -129,16 +136,16 @@ static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
*
* Return 0 on success, negative error code on failure.
*/
-static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
+static int __pkvm_create_hyp_vm(struct kvm *kvm)
{
size_t pgd_sz, hyp_vm_sz;
void *pgd, *hyp_vm;
int ret;
- if (host_kvm->created_vcpus < 1)
+ if (kvm->created_vcpus < 1)
return -EINVAL;
- pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.mmu.vtcr);
+ pgd_sz = kvm_pgtable_stage2_pgd_size(kvm->arch.mmu.vtcr);
/*
* The PGD pages will be reclaimed using a hyp_memcache which implies
@@ -152,7 +159,7 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
/* Allocate memory to donate to hyp for vm and vcpu pointers. */
hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE,
size_mul(sizeof(void *),
- host_kvm->created_vcpus)));
+ kvm->created_vcpus)));
hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT);
if (!hyp_vm) {
ret = -ENOMEM;
@@ -160,12 +167,12 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
}
/* Donate the VM memory to hyp and let hyp initialize it. */
- ret = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, hyp_vm, pgd);
- if (ret < 0)
+ ret = kvm_call_hyp_nvhe(__pkvm_init_vm, kvm, hyp_vm, pgd);
+ if (ret)
goto free_vm;
- host_kvm->arch.pkvm.handle = ret;
- host_kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;
+ kvm->arch.pkvm.is_created = true;
+ kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;
kvm_account_pgtable_pages(pgd, pgd_sz / PAGE_SIZE);
return 0;
@@ -176,14 +183,19 @@ free_pgd:
return ret;
}
-int pkvm_create_hyp_vm(struct kvm *host_kvm)
+bool pkvm_hyp_vm_is_created(struct kvm *kvm)
+{
+ return READ_ONCE(kvm->arch.pkvm.is_created);
+}
+
+int pkvm_create_hyp_vm(struct kvm *kvm)
{
int ret = 0;
- mutex_lock(&host_kvm->arch.config_lock);
- if (!host_kvm->arch.pkvm.handle)
- ret = __pkvm_create_hyp_vm(host_kvm);
- mutex_unlock(&host_kvm->arch.config_lock);
+ mutex_lock(&kvm->arch.config_lock);
+ if (!pkvm_hyp_vm_is_created(kvm))
+ ret = __pkvm_create_hyp_vm(kvm);
+ mutex_unlock(&kvm->arch.config_lock);
return ret;
}
@@ -200,15 +212,31 @@ int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
return ret;
}
-void pkvm_destroy_hyp_vm(struct kvm *host_kvm)
+void pkvm_destroy_hyp_vm(struct kvm *kvm)
{
- mutex_lock(&host_kvm->arch.config_lock);
- __pkvm_destroy_hyp_vm(host_kvm);
- mutex_unlock(&host_kvm->arch.config_lock);
+ mutex_lock(&kvm->arch.config_lock);
+ __pkvm_destroy_hyp_vm(kvm);
+ mutex_unlock(&kvm->arch.config_lock);
}
-int pkvm_init_host_vm(struct kvm *host_kvm)
+int pkvm_init_host_vm(struct kvm *kvm)
{
+ int ret;
+
+ if (pkvm_hyp_vm_is_created(kvm))
+ return -EINVAL;
+
+ /* VM is already reserved, no need to proceed. */
+ if (kvm->arch.pkvm.handle)
+ return 0;
+
+ /* Reserve the VM in hyp and obtain a hyp handle for the VM. */
+ ret = kvm_call_hyp_nvhe(__pkvm_reserve_vm);
+ if (ret < 0)
+ return ret;
+
+ kvm->arch.pkvm.handle = ret;
+
return 0;
}
diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
index 098416d7e5c2..dc5acfb00af9 100644
--- a/arch/arm64/kvm/ptdump.c
+++ b/arch/arm64/kvm/ptdump.c
@@ -32,23 +32,23 @@ static const struct ptdump_prot_bits stage2_pte_bits[] = {
.set = " ",
.clear = "F",
}, {
- .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
- .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
+ .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R,
+ .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R,
.set = "R",
.clear = " ",
}, {
- .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
- .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
+ .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
+ .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
.set = "W",
.clear = " ",
}, {
- .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
- .val = PTE_VALID,
- .set = " ",
- .clear = "X",
+ .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN,
+ .val = KVM_PTE_LEAF_ATTR_HI_S2_XN,
+ .set = "NX",
+ .clear = "x ",
}, {
- .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
- .val = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
+ .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF,
+ .val = KVM_PTE_LEAF_ATTR_LO_S2_AF,
.set = "AF",
.clear = " ",
}, {
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 82ffb3b3b3cf..91053aa832d0 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -82,43 +82,105 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
"sys_reg write to read-only register");
}
-#define PURE_EL2_SYSREG(el2) \
- case el2: { \
- *el1r = el2; \
- return true; \
+enum sr_loc_attr {
+ SR_LOC_MEMORY = 0, /* Register definitely in memory */
+ SR_LOC_LOADED = BIT(0), /* Register on CPU, unless it cannot */
+ SR_LOC_MAPPED = BIT(1), /* Register in a different CPU register */
+ SR_LOC_XLATED = BIT(2), /* Register translated to fit another reg */
+ SR_LOC_SPECIAL = BIT(3), /* Demanding register, implies loaded */
+};
+
+struct sr_loc {
+ enum sr_loc_attr loc;
+ enum vcpu_sysreg map_reg;
+ u64 (*xlate)(u64);
+};
+
+static enum sr_loc_attr locate_direct_register(const struct kvm_vcpu *vcpu,
+ enum vcpu_sysreg reg)
+{
+ switch (reg) {
+ case SCTLR_EL1:
+ case CPACR_EL1:
+ case TTBR0_EL1:
+ case TTBR1_EL1:
+ case TCR_EL1:
+ case TCR2_EL1:
+ case PIR_EL1:
+ case PIRE0_EL1:
+ case POR_EL1:
+ case ESR_EL1:
+ case AFSR0_EL1:
+ case AFSR1_EL1:
+ case FAR_EL1:
+ case MAIR_EL1:
+ case VBAR_EL1:
+ case CONTEXTIDR_EL1:
+ case AMAIR_EL1:
+ case CNTKCTL_EL1:
+ case ELR_EL1:
+ case SPSR_EL1:
+ case ZCR_EL1:
+ case SCTLR2_EL1:
+ /*
+ * EL1 registers which have an ELx2 mapping are loaded if
+ * we're not in hypervisor context.
+ */
+ return is_hyp_ctxt(vcpu) ? SR_LOC_MEMORY : SR_LOC_LOADED;
+
+ case TPIDR_EL0:
+ case TPIDRRO_EL0:
+ case TPIDR_EL1:
+ case PAR_EL1:
+ case DACR32_EL2:
+ case IFSR32_EL2:
+ case DBGVCR32_EL2:
+ /* These registers are always loaded, no matter what */
+ return SR_LOC_LOADED;
+
+ default:
+ /* Non-mapped EL2 registers are by definition in memory. */
+ return SR_LOC_MEMORY;
}
+}
-#define MAPPED_EL2_SYSREG(el2, el1, fn) \
- case el2: { \
- *xlate = fn; \
- *el1r = el1; \
- return true; \
+static void locate_mapped_el2_register(const struct kvm_vcpu *vcpu,
+ enum vcpu_sysreg reg,
+ enum vcpu_sysreg map_reg,
+ u64 (*xlate)(u64),
+ struct sr_loc *loc)
+{
+ if (!is_hyp_ctxt(vcpu)) {
+ loc->loc = SR_LOC_MEMORY;
+ return;
}
-static bool get_el2_to_el1_mapping(unsigned int reg,
- unsigned int *el1r, u64 (**xlate)(u64))
+ loc->loc = SR_LOC_LOADED | SR_LOC_MAPPED;
+ loc->map_reg = map_reg;
+
+ WARN_ON(locate_direct_register(vcpu, map_reg) != SR_LOC_MEMORY);
+
+ if (xlate != NULL && !vcpu_el2_e2h_is_set(vcpu)) {
+ loc->loc |= SR_LOC_XLATED;
+ loc->xlate = xlate;
+ }
+}
+
+#define MAPPED_EL2_SYSREG(r, m, t) \
+ case r: { \
+ locate_mapped_el2_register(vcpu, r, m, t, loc); \
+ break; \
+ }
+
+static void locate_register(const struct kvm_vcpu *vcpu, enum vcpu_sysreg reg,
+ struct sr_loc *loc)
{
+ if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) {
+ loc->loc = SR_LOC_MEMORY;
+ return;
+ }
+
switch (reg) {
- PURE_EL2_SYSREG( VPIDR_EL2 );
- PURE_EL2_SYSREG( VMPIDR_EL2 );
- PURE_EL2_SYSREG( ACTLR_EL2 );
- PURE_EL2_SYSREG( HCR_EL2 );
- PURE_EL2_SYSREG( MDCR_EL2 );
- PURE_EL2_SYSREG( HSTR_EL2 );
- PURE_EL2_SYSREG( HACR_EL2 );
- PURE_EL2_SYSREG( VTTBR_EL2 );
- PURE_EL2_SYSREG( VTCR_EL2 );
- PURE_EL2_SYSREG( TPIDR_EL2 );
- PURE_EL2_SYSREG( HPFAR_EL2 );
- PURE_EL2_SYSREG( HCRX_EL2 );
- PURE_EL2_SYSREG( HFGRTR_EL2 );
- PURE_EL2_SYSREG( HFGWTR_EL2 );
- PURE_EL2_SYSREG( HFGITR_EL2 );
- PURE_EL2_SYSREG( HDFGRTR_EL2 );
- PURE_EL2_SYSREG( HDFGWTR_EL2 );
- PURE_EL2_SYSREG( HAFGRTR_EL2 );
- PURE_EL2_SYSREG( CNTVOFF_EL2 );
- PURE_EL2_SYSREG( CNTHCTL_EL2 );
MAPPED_EL2_SYSREG(SCTLR_EL2, SCTLR_EL1,
translate_sctlr_el2_to_sctlr_el1 );
MAPPED_EL2_SYSREG(CPTR_EL2, CPACR_EL1,
@@ -144,125 +206,189 @@ static bool get_el2_to_el1_mapping(unsigned int reg,
MAPPED_EL2_SYSREG(ZCR_EL2, ZCR_EL1, NULL );
MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL );
MAPPED_EL2_SYSREG(SCTLR2_EL2, SCTLR2_EL1, NULL );
+ case CNTHCTL_EL2:
+ /* CNTHCTL_EL2 is super special, until we support NV2.1 */
+ loc->loc = ((is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) ?
+ SR_LOC_SPECIAL : SR_LOC_MEMORY);
+ break;
default:
- return false;
+ loc->loc = locate_direct_register(vcpu, reg);
}
}
-u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
+static u64 read_sr_from_cpu(enum vcpu_sysreg reg)
{
u64 val = 0x8badf00d8badf00d;
- u64 (*xlate)(u64) = NULL;
- unsigned int el1r;
- if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
- goto memory_read;
+ switch (reg) {
+ case SCTLR_EL1: val = read_sysreg_s(SYS_SCTLR_EL12); break;
+ case CPACR_EL1: val = read_sysreg_s(SYS_CPACR_EL12); break;
+ case TTBR0_EL1: val = read_sysreg_s(SYS_TTBR0_EL12); break;
+ case TTBR1_EL1: val = read_sysreg_s(SYS_TTBR1_EL12); break;
+ case TCR_EL1: val = read_sysreg_s(SYS_TCR_EL12); break;
+ case TCR2_EL1: val = read_sysreg_s(SYS_TCR2_EL12); break;
+ case PIR_EL1: val = read_sysreg_s(SYS_PIR_EL12); break;
+ case PIRE0_EL1: val = read_sysreg_s(SYS_PIRE0_EL12); break;
+ case POR_EL1: val = read_sysreg_s(SYS_POR_EL12); break;
+ case ESR_EL1: val = read_sysreg_s(SYS_ESR_EL12); break;
+ case AFSR0_EL1: val = read_sysreg_s(SYS_AFSR0_EL12); break;
+ case AFSR1_EL1: val = read_sysreg_s(SYS_AFSR1_EL12); break;
+ case FAR_EL1: val = read_sysreg_s(SYS_FAR_EL12); break;
+ case MAIR_EL1: val = read_sysreg_s(SYS_MAIR_EL12); break;
+ case VBAR_EL1: val = read_sysreg_s(SYS_VBAR_EL12); break;
+ case CONTEXTIDR_EL1: val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break;
+ case AMAIR_EL1: val = read_sysreg_s(SYS_AMAIR_EL12); break;
+ case CNTKCTL_EL1: val = read_sysreg_s(SYS_CNTKCTL_EL12); break;
+ case ELR_EL1: val = read_sysreg_s(SYS_ELR_EL12); break;
+ case SPSR_EL1: val = read_sysreg_s(SYS_SPSR_EL12); break;
+ case ZCR_EL1: val = read_sysreg_s(SYS_ZCR_EL12); break;
+ case SCTLR2_EL1: val = read_sysreg_s(SYS_SCTLR2_EL12); break;
+ case TPIDR_EL0: val = read_sysreg_s(SYS_TPIDR_EL0); break;
+ case TPIDRRO_EL0: val = read_sysreg_s(SYS_TPIDRRO_EL0); break;
+ case TPIDR_EL1: val = read_sysreg_s(SYS_TPIDR_EL1); break;
+ case PAR_EL1: val = read_sysreg_par(); break;
+ case DACR32_EL2: val = read_sysreg_s(SYS_DACR32_EL2); break;
+ case IFSR32_EL2: val = read_sysreg_s(SYS_IFSR32_EL2); break;
+ case DBGVCR32_EL2: val = read_sysreg_s(SYS_DBGVCR32_EL2); break;
+ default: WARN_ON_ONCE(1);
+ }
+
+ return val;
+}
+
+static void write_sr_to_cpu(enum vcpu_sysreg reg, u64 val)
+{
+ switch (reg) {
+ case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break;
+ case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break;
+ case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break;
+ case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break;
+ case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break;
+ case TCR2_EL1: write_sysreg_s(val, SYS_TCR2_EL12); break;
+ case PIR_EL1: write_sysreg_s(val, SYS_PIR_EL12); break;
+ case PIRE0_EL1: write_sysreg_s(val, SYS_PIRE0_EL12); break;
+ case POR_EL1: write_sysreg_s(val, SYS_POR_EL12); break;
+ case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break;
+ case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break;
+ case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break;
+ case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break;
+ case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break;
+ case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break;
+ case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break;
+ case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break;
+ case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break;
+ case ELR_EL1: write_sysreg_s(val, SYS_ELR_EL12); break;
+ case SPSR_EL1: write_sysreg_s(val, SYS_SPSR_EL12); break;
+ case ZCR_EL1: write_sysreg_s(val, SYS_ZCR_EL12); break;
+ case SCTLR2_EL1: write_sysreg_s(val, SYS_SCTLR2_EL12); break;
+ case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break;
+ case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); break;
+ case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break;
+ case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break;
+ case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break;
+ case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break;
+ case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break;
+ default: WARN_ON_ONCE(1);
+ }
+}
+
+u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg reg)
+{
+ struct sr_loc loc = {};
+
+ locate_register(vcpu, reg, &loc);
- if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) {
- if (!is_hyp_ctxt(vcpu))
- goto memory_read;
+ WARN_ON_ONCE(!has_vhe() && loc.loc != SR_LOC_MEMORY);
+
+ if (loc.loc & SR_LOC_SPECIAL) {
+ u64 val;
+
+ WARN_ON_ONCE(loc.loc & ~SR_LOC_SPECIAL);
/*
- * CNTHCTL_EL2 requires some special treatment to
- * account for the bits that can be set via CNTKCTL_EL1.
+ * CNTHCTL_EL2 requires some special treatment to account
+ * for the bits that can be set via CNTKCTL_EL1 when E2H==1.
*/
switch (reg) {
case CNTHCTL_EL2:
- if (vcpu_el2_e2h_is_set(vcpu)) {
- val = read_sysreg_el1(SYS_CNTKCTL);
- val &= CNTKCTL_VALID_BITS;
- val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS;
- return val;
- }
- break;
+ val = read_sysreg_el1(SYS_CNTKCTL);
+ val &= CNTKCTL_VALID_BITS;
+ val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS;
+ return val;
+ default:
+ WARN_ON_ONCE(1);
}
+ }
- /*
- * If this register does not have an EL1 counterpart,
- * then read the stored EL2 version.
- */
- if (reg == el1r)
- goto memory_read;
+ if (loc.loc & SR_LOC_LOADED) {
+ enum vcpu_sysreg map_reg = reg;
- /*
- * If we have a non-VHE guest and that the sysreg
- * requires translation to be used at EL1, use the
- * in-memory copy instead.
- */
- if (!vcpu_el2_e2h_is_set(vcpu) && xlate)
- goto memory_read;
+ if (loc.loc & SR_LOC_MAPPED)
+ map_reg = loc.map_reg;
- /* Get the current version of the EL1 counterpart. */
- WARN_ON(!__vcpu_read_sys_reg_from_cpu(el1r, &val));
- if (reg >= __SANITISED_REG_START__)
- val = kvm_vcpu_apply_reg_masks(vcpu, reg, val);
+ if (!(loc.loc & SR_LOC_XLATED)) {
+ u64 val = read_sr_from_cpu(map_reg);
- return val;
- }
+ if (reg >= __SANITISED_REG_START__)
+ val = kvm_vcpu_apply_reg_masks(vcpu, reg, val);
- /* EL1 register can't be on the CPU if the guest is in vEL2. */
- if (unlikely(is_hyp_ctxt(vcpu)))
- goto memory_read;
-
- if (__vcpu_read_sys_reg_from_cpu(reg, &val))
- return val;
+ return val;
+ }
+ }
-memory_read:
return __vcpu_sys_reg(vcpu, reg);
}
-void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
+void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, enum vcpu_sysreg reg)
{
- u64 (*xlate)(u64) = NULL;
- unsigned int el1r;
+ struct sr_loc loc = {};
- if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
- goto memory_write;
+ locate_register(vcpu, reg, &loc);
- if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) {
- if (!is_hyp_ctxt(vcpu))
- goto memory_write;
+ WARN_ON_ONCE(!has_vhe() && loc.loc != SR_LOC_MEMORY);
- /*
- * Always store a copy of the write to memory to avoid having
- * to reverse-translate virtual EL2 system registers for a
- * non-VHE guest hypervisor.
- */
- __vcpu_assign_sys_reg(vcpu, reg, val);
+ if (loc.loc & SR_LOC_SPECIAL) {
+
+ WARN_ON_ONCE(loc.loc & ~SR_LOC_SPECIAL);
switch (reg) {
case CNTHCTL_EL2:
/*
- * If E2H=0, CNHTCTL_EL2 is a pure shadow register.
- * Otherwise, some of the bits are backed by
+ * If E2H=1, some of the bits are backed by
* CNTKCTL_EL1, while the rest is kept in memory.
* Yes, this is fun stuff.
*/
- if (vcpu_el2_e2h_is_set(vcpu))
- write_sysreg_el1(val, SYS_CNTKCTL);
- return;
+ write_sysreg_el1(val, SYS_CNTKCTL);
+ break;
+ default:
+ WARN_ON_ONCE(1);
}
+ }
- /* No EL1 counterpart? We're done here.? */
- if (reg == el1r)
- return;
+ if (loc.loc & SR_LOC_LOADED) {
+ enum vcpu_sysreg map_reg = reg;
+ u64 xlated_val;
- if (!vcpu_el2_e2h_is_set(vcpu) && xlate)
- val = xlate(val);
+ if (reg >= __SANITISED_REG_START__)
+ val = kvm_vcpu_apply_reg_masks(vcpu, reg, val);
- /* Redirect this to the EL1 version of the register. */
- WARN_ON(!__vcpu_write_sys_reg_to_cpu(val, el1r));
- return;
- }
+ if (loc.loc & SR_LOC_MAPPED)
+ map_reg = loc.map_reg;
- /* EL1 register can't be on the CPU if the guest is in vEL2. */
- if (unlikely(is_hyp_ctxt(vcpu)))
- goto memory_write;
+ if (loc.loc & SR_LOC_XLATED)
+ xlated_val = loc.xlate(val);
+ else
+ xlated_val = val;
- if (__vcpu_write_sys_reg_to_cpu(val, reg))
- return;
+ write_sr_to_cpu(map_reg, xlated_val);
+
+ /*
+ * Fall through to write the backing store anyway, which
+ * allows translated registers to be directly read without a
+ * reverse translation.
+ */
+ }
-memory_write:
__vcpu_assign_sys_reg(vcpu, reg, val);
}
@@ -1584,6 +1710,7 @@ static u8 pmuver_to_perfmon(u8 pmuver)
}
static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val);
+static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val);
static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val);
/* Read a sanitised cpufeature ID register by sys_reg_desc */
@@ -1606,19 +1733,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
val = sanitise_id_aa64pfr0_el1(vcpu, val);
break;
case SYS_ID_AA64PFR1_EL1:
- if (!kvm_has_mte(vcpu->kvm)) {
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac);
- }
-
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_RNDR_trap);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_NMI);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_GCS);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_THE);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MPAM_frac);
+ val = sanitise_id_aa64pfr1_el1(vcpu, val);
break;
case SYS_ID_AA64PFR2_EL1:
val &= ID_AA64PFR2_EL1_FPMR |
@@ -1628,21 +1743,22 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
break;
case SYS_ID_AA64ISAR1_EL1:
if (!vcpu_has_ptrauth(vcpu))
- val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI));
+ val &= ~(ID_AA64ISAR1_EL1_APA |
+ ID_AA64ISAR1_EL1_API |
+ ID_AA64ISAR1_EL1_GPA |
+ ID_AA64ISAR1_EL1_GPI);
break;
case SYS_ID_AA64ISAR2_EL1:
if (!vcpu_has_ptrauth(vcpu))
- val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) |
- ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3));
+ val &= ~(ID_AA64ISAR2_EL1_APA3 |
+ ID_AA64ISAR2_EL1_GPA3);
if (!cpus_have_final_cap(ARM64_HAS_WFXT) ||
has_broken_cntvoff())
- val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT);
+ val &= ~ID_AA64ISAR2_EL1_WFxT;
break;
case SYS_ID_AA64ISAR3_EL1:
- val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_FAMINMAX;
+ val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_LSFE |
+ ID_AA64ISAR3_EL1_FAMINMAX;
break;
case SYS_ID_AA64MMFR2_EL1:
val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK;
@@ -1655,7 +1771,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
ID_AA64MMFR3_EL1_S1PIE;
break;
case SYS_ID_MMFR4_EL1:
- val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX);
+ val &= ~ID_MMFR4_EL1_CCIDX;
break;
}
@@ -1836,6 +1952,31 @@ static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
return val;
}
+static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val)
+{
+ u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
+
+ if (!kvm_has_mte(vcpu->kvm)) {
+ val &= ~ID_AA64PFR1_EL1_MTE;
+ val &= ~ID_AA64PFR1_EL1_MTE_frac;
+ }
+
+ if (!(cpus_have_final_cap(ARM64_HAS_RASV1P1_EXTN) &&
+ SYS_FIELD_GET(ID_AA64PFR0_EL1, RAS, pfr0) == ID_AA64PFR0_EL1_RAS_IMP))
+ val &= ~ID_AA64PFR1_EL1_RAS_frac;
+
+ val &= ~ID_AA64PFR1_EL1_SME;
+ val &= ~ID_AA64PFR1_EL1_RNDR_trap;
+ val &= ~ID_AA64PFR1_EL1_NMI;
+ val &= ~ID_AA64PFR1_EL1_GCS;
+ val &= ~ID_AA64PFR1_EL1_THE;
+ val &= ~ID_AA64PFR1_EL1_MTEX;
+ val &= ~ID_AA64PFR1_EL1_PFAR;
+ val &= ~ID_AA64PFR1_EL1_MPAM_frac;
+
+ return val;
+}
+
static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
{
val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8);
@@ -1857,6 +1998,26 @@ static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
return val;
}
+/*
+ * Older versions of KVM erroneously claim support for FEAT_DoubleLock with
+ * NV-enabled VMs on unsupporting hardware. Silently ignore the incorrect
+ * value if it is consistent with the bug.
+ */
+static bool ignore_feat_doublelock(struct kvm_vcpu *vcpu, u64 val)
+{
+ u8 host, user;
+
+ if (!vcpu_has_nv(vcpu))
+ return false;
+
+ host = SYS_FIELD_GET(ID_AA64DFR0_EL1, DoubleLock,
+ read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1));
+ user = SYS_FIELD_GET(ID_AA64DFR0_EL1, DoubleLock, val);
+
+ return host == ID_AA64DFR0_EL1_DoubleLock_NI &&
+ user == ID_AA64DFR0_EL1_DoubleLock_IMP;
+}
+
static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd,
u64 val)
@@ -1888,6 +2049,11 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu,
if (debugver < ID_AA64DFR0_EL1_DebugVer_IMP)
return -EINVAL;
+ if (ignore_feat_doublelock(vcpu, val)) {
+ val &= ~ID_AA64DFR0_EL1_DoubleLock;
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DoubleLock, NI);
+ }
+
return set_id_reg(vcpu, rd, val);
}
@@ -2008,16 +2174,29 @@ static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu,
return set_id_reg(vcpu, rd, user_val);
}
+/*
+ * Allow userspace to de-feature a stage-2 translation granule but prevent it
+ * from claiming the impossible.
+ */
+#define tgran2_val_allowed(tg, safe, user) \
+({ \
+ u8 __s = SYS_FIELD_GET(ID_AA64MMFR0_EL1, tg, safe); \
+ u8 __u = SYS_FIELD_GET(ID_AA64MMFR0_EL1, tg, user); \
+ \
+ __s == __u || __u == ID_AA64MMFR0_EL1_##tg##_NI; \
+})
+
static int set_id_aa64mmfr0_el1(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd, u64 user_val)
{
u64 sanitized_val = kvm_read_sanitised_id_reg(vcpu, rd);
- u64 tgran2_mask = ID_AA64MMFR0_EL1_TGRAN4_2_MASK |
- ID_AA64MMFR0_EL1_TGRAN16_2_MASK |
- ID_AA64MMFR0_EL1_TGRAN64_2_MASK;
- if (vcpu_has_nv(vcpu) &&
- ((sanitized_val & tgran2_mask) != (user_val & tgran2_mask)))
+ if (!vcpu_has_nv(vcpu))
+ return set_id_reg(vcpu, rd, user_val);
+
+ if (!tgran2_val_allowed(TGRAN4_2, sanitized_val, user_val) ||
+ !tgran2_val_allowed(TGRAN16_2, sanitized_val, user_val) ||
+ !tgran2_val_allowed(TGRAN64_2, sanitized_val, user_val))
return -EINVAL;
return set_id_reg(vcpu, rd, user_val);
@@ -2697,6 +2876,18 @@ static bool access_ras(struct kvm_vcpu *vcpu,
struct kvm *kvm = vcpu->kvm;
switch(reg_to_encoding(r)) {
+ case SYS_ERXPFGCDN_EL1:
+ case SYS_ERXPFGCTL_EL1:
+ case SYS_ERXPFGF_EL1:
+ case SYS_ERXMISC2_EL1:
+ case SYS_ERXMISC3_EL1:
+ if (!(kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, V1P1) ||
+ (kvm_has_feat_enum(kvm, ID_AA64PFR0_EL1, RAS, IMP) &&
+ kvm_has_feat(kvm, ID_AA64PFR1_EL1, RAS_frac, RASv1p1)))) {
+ kvm_inject_undefined(vcpu);
+ return false;
+ }
+ break;
default:
if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) {
kvm_inject_undefined(vcpu);
@@ -2929,7 +3120,6 @@ static const struct sys_reg_desc sys_reg_descs[] = {
~(ID_AA64PFR0_EL1_AMU |
ID_AA64PFR0_EL1_MPAM |
ID_AA64PFR0_EL1_SVE |
- ID_AA64PFR0_EL1_RAS |
ID_AA64PFR0_EL1_AdvSIMD |
ID_AA64PFR0_EL1_FP)),
ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1,
@@ -2943,7 +3133,6 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_AA64PFR1_EL1_SME |
ID_AA64PFR1_EL1_RES0 |
ID_AA64PFR1_EL1_MPAM_frac |
- ID_AA64PFR1_EL1_RAS_frac |
ID_AA64PFR1_EL1_MTE)),
ID_WRITABLE(ID_AA64PFR2_EL1,
ID_AA64PFR2_EL1_FPMR |
@@ -2991,6 +3180,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_AA64ISAR2_EL1_APA3 |
ID_AA64ISAR2_EL1_GPA3)),
ID_WRITABLE(ID_AA64ISAR3_EL1, (ID_AA64ISAR3_EL1_FPRCVT |
+ ID_AA64ISAR3_EL1_LSFE |
ID_AA64ISAR3_EL1_FAMINMAX)),
ID_UNALLOCATED(6,4),
ID_UNALLOCATED(6,5),
@@ -3002,8 +3192,6 @@ static const struct sys_reg_desc sys_reg_descs[] = {
~(ID_AA64MMFR0_EL1_RES0 |
ID_AA64MMFR0_EL1_ASIDBITS)),
ID_WRITABLE(ID_AA64MMFR1_EL1, ~(ID_AA64MMFR1_EL1_RES0 |
- ID_AA64MMFR1_EL1_HCX |
- ID_AA64MMFR1_EL1_TWED |
ID_AA64MMFR1_EL1_XNX |
ID_AA64MMFR1_EL1_VH |
ID_AA64MMFR1_EL1_VMIDBits)),
@@ -3063,8 +3251,13 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_ERXCTLR_EL1), access_ras },
{ SYS_DESC(SYS_ERXSTATUS_EL1), access_ras },
{ SYS_DESC(SYS_ERXADDR_EL1), access_ras },
+ { SYS_DESC(SYS_ERXPFGF_EL1), access_ras },
+ { SYS_DESC(SYS_ERXPFGCTL_EL1), access_ras },
+ { SYS_DESC(SYS_ERXPFGCDN_EL1), access_ras },
{ SYS_DESC(SYS_ERXMISC0_EL1), access_ras },
{ SYS_DESC(SYS_ERXMISC1_EL1), access_ras },
+ { SYS_DESC(SYS_ERXMISC2_EL1), access_ras },
+ { SYS_DESC(SYS_ERXMISC3_EL1), access_ras },
MTE_REG(TFSR_EL1),
MTE_REG(TFSRE0_EL1),
@@ -3083,6 +3276,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_PMBLIMITR_EL1), undef_access },
{ SYS_DESC(SYS_PMBPTR_EL1), undef_access },
{ SYS_DESC(SYS_PMBSR_EL1), undef_access },
+ { SYS_DESC(SYS_PMSDSFR_EL1), undef_access },
/* PMBIDR_EL1 is not trapped */
{ PMU_SYS_REG(PMINTENSET_EL1),
diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
index 2684f273d9e1..4c1209261b65 100644
--- a/arch/arm64/kvm/vgic/vgic-debug.c
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -69,7 +69,7 @@ static int iter_mark_lpis(struct kvm *kvm)
int nr_lpis = 0;
xa_for_each(&dist->lpi_xa, intid, irq) {
- if (!vgic_try_get_irq_kref(irq))
+ if (!vgic_try_get_irq_ref(irq))
continue;
xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 1e680ad6e863..1796b1a22a72 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -53,7 +53,7 @@ void kvm_vgic_early_init(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
- xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ);
+ xa_init(&dist->lpi_xa);
}
/* CREATION */
@@ -208,7 +208,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
raw_spin_lock_init(&irq->irq_lock);
irq->vcpu = NULL;
irq->target_vcpu = vcpu0;
- kref_init(&irq->refcount);
+ refcount_set(&irq->refcount, 0);
switch (dist->vgic_model) {
case KVM_DEV_TYPE_ARM_VGIC_V2:
irq->targets = 0;
@@ -277,7 +277,7 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type)
irq->intid = i;
irq->vcpu = NULL;
irq->target_vcpu = vcpu;
- kref_init(&irq->refcount);
+ refcount_set(&irq->refcount, 0);
if (vgic_irq_is_sgi(i)) {
/* SGIs */
irq->enabled = 1;
@@ -554,7 +554,6 @@ int vgic_lazy_init(struct kvm *kvm)
* Also map the virtual CPU interface into the VM.
* v2 calls vgic_init() if not already done.
* v3 and derivatives return an error if the VGIC is not initialized.
- * vgic_ready() returns true if this function has succeeded.
*/
int kvm_vgic_map_resources(struct kvm *kvm)
{
@@ -563,12 +562,12 @@ int kvm_vgic_map_resources(struct kvm *kvm)
gpa_t dist_base;
int ret = 0;
- if (likely(vgic_ready(kvm)))
+ if (likely(smp_load_acquire(&dist->ready)))
return 0;
mutex_lock(&kvm->slots_lock);
mutex_lock(&kvm->arch.config_lock);
- if (vgic_ready(kvm))
+ if (dist->ready)
goto out;
if (!irqchip_in_kernel(kvm))
@@ -594,14 +593,7 @@ int kvm_vgic_map_resources(struct kvm *kvm)
goto out_slots;
}
- /*
- * kvm_io_bus_register_dev() guarantees all readers see the new MMIO
- * registration before returning through synchronize_srcu(), which also
- * implies a full memory barrier. As such, marking the distributor as
- * 'ready' here is guaranteed to be ordered after all vCPUs having seen
- * a completely configured distributor.
- */
- dist->ready = true;
+ smp_store_release(&dist->ready, true);
goto out_slots;
out:
mutex_unlock(&kvm->arch.config_lock);
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 7368c13f16b7..ce3e3ed3f29f 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -78,7 +78,6 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
{
struct vgic_dist *dist = &kvm->arch.vgic;
struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq;
- unsigned long flags;
int ret;
/* In this case there is no put, since we keep the reference. */
@@ -89,7 +88,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
if (!irq)
return ERR_PTR(-ENOMEM);
- ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT);
+ ret = xa_reserve(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT);
if (ret) {
kfree(irq);
return ERR_PTR(ret);
@@ -99,19 +98,19 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
raw_spin_lock_init(&irq->irq_lock);
irq->config = VGIC_CONFIG_EDGE;
- kref_init(&irq->refcount);
+ refcount_set(&irq->refcount, 1);
irq->intid = intid;
irq->target_vcpu = vcpu;
irq->group = 1;
- xa_lock_irqsave(&dist->lpi_xa, flags);
+ xa_lock(&dist->lpi_xa);
/*
* There could be a race with another vgic_add_lpi(), so we need to
* check that we don't add a second list entry with the same LPI.
*/
oldirq = xa_load(&dist->lpi_xa, intid);
- if (vgic_try_get_irq_kref(oldirq)) {
+ if (vgic_try_get_irq_ref(oldirq)) {
/* Someone was faster with adding this LPI, lets use that. */
kfree(irq);
irq = oldirq;
@@ -126,7 +125,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
}
out_unlock:
- xa_unlock_irqrestore(&dist->lpi_xa, flags);
+ xa_unlock(&dist->lpi_xa);
if (ret)
return ERR_PTR(ret);
@@ -547,7 +546,7 @@ static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db,
rcu_read_lock();
irq = xa_load(&its->translation_cache, cache_key);
- if (!vgic_try_get_irq_kref(irq))
+ if (!vgic_try_get_irq_ref(irq))
irq = NULL;
rcu_read_unlock();
@@ -571,7 +570,7 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
* its_lock, as the ITE (and the reference it holds) cannot be freed.
*/
lockdep_assert_held(&its->its_lock);
- vgic_get_irq_kref(irq);
+ vgic_get_irq_ref(irq);
old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT);
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index a3ef185209e9..70d50c77e5dc 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -50,6 +50,14 @@ bool vgic_has_its(struct kvm *kvm)
bool vgic_supports_direct_msis(struct kvm *kvm)
{
+ /*
+ * Deliberately conflate vLPI and vSGI support on GICv4.1 hardware,
+ * indirectly allowing userspace to control whether or not vPEs are
+ * allocated for the VM.
+ */
+ if (system_supports_direct_sgis() && !vgic_supports_direct_sgis(kvm))
+ return false;
+
return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm);
}
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
index e416e433baff..a573b1f0c6cb 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio.c
@@ -1091,7 +1091,7 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
len = vgic_v3_init_dist_iodev(io_device);
break;
default:
- BUG_ON(1);
+ BUG();
}
io_device->base_addr = dist_base_address;
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index b9ad7c42c5b0..f1c153106c56 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -588,6 +588,7 @@ int vgic_v3_map_resources(struct kvm *kvm)
}
DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap);
+DEFINE_STATIC_KEY_FALSE(vgic_v3_has_v2_compat);
static int __init early_group0_trap_cfg(char *buf)
{
@@ -697,6 +698,13 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
if (kvm_vgic_global_state.vcpu_base == 0)
kvm_info("disabling GICv2 emulation\n");
+ /*
+ * Flip the static branch if the HW supports v2, even if we're
+ * not using it (such as in protected mode).
+ */
+ if (has_v2)
+ static_branch_enable(&vgic_v3_has_v2_compat);
+
if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
group0_trap = true;
group1_trap = true;
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index 4d9343d2b0b1..548aec9d5a72 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -518,7 +518,7 @@ static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq)
if (!irq->hw || irq->host_irq != host_irq)
continue;
- if (!vgic_try_get_irq_kref(irq))
+ if (!vgic_try_get_irq_ref(irq))
return NULL;
return irq;
diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
index 6bdbb221bcde..2d3811f4e117 100644
--- a/arch/arm64/kvm/vgic/vgic-v5.c
+++ b/arch/arm64/kvm/vgic/vgic-v5.c
@@ -15,7 +15,7 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
u64 ich_vtr_el2;
int ret;
- if (!info->has_gcie_v3_compat)
+ if (!cpus_have_final_cap(ARM64_HAS_GICV5_LEGACY))
return -ENODEV;
kvm_vgic_global_state.type = VGIC_V5;
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index f5148b38120a..6dd5a10081e2 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -28,8 +28,8 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
* kvm->arch.config_lock (mutex)
* its->cmd_lock (mutex)
* its->its_lock (mutex)
- * vgic_cpu->ap_list_lock must be taken with IRQs disabled
- * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled
+ * vgic_dist->lpi_xa.xa_lock
+ * vgic_cpu->ap_list_lock must be taken with IRQs disabled
* vgic_irq->irq_lock must be taken with IRQs disabled
*
* As the ap_list_lock might be taken from the timer interrupt handler,
@@ -71,7 +71,7 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
rcu_read_lock();
irq = xa_load(&dist->lpi_xa, intid);
- if (!vgic_try_get_irq_kref(irq))
+ if (!vgic_try_get_irq_ref(irq))
irq = NULL;
rcu_read_unlock();
@@ -114,37 +114,66 @@ struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid)
return vgic_get_irq(vcpu->kvm, intid);
}
-/*
- * We can't do anything in here, because we lack the kvm pointer to
- * lock and remove the item from the lpi_list. So we keep this function
- * empty and use the return value of kref_put() to trigger the freeing.
- */
-static void vgic_irq_release(struct kref *ref)
+static void vgic_release_lpi_locked(struct vgic_dist *dist, struct vgic_irq *irq)
+{
+ lockdep_assert_held(&dist->lpi_xa.xa_lock);
+ __xa_erase(&dist->lpi_xa, irq->intid);
+ kfree_rcu(irq, rcu);
+}
+
+static __must_check bool __vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
+{
+ if (irq->intid < VGIC_MIN_LPI)
+ return false;
+
+ return refcount_dec_and_test(&irq->refcount);
+}
+
+static __must_check bool vgic_put_irq_norelease(struct kvm *kvm, struct vgic_irq *irq)
{
+ if (!__vgic_put_irq(kvm, irq))
+ return false;
+
+ irq->pending_release = true;
+ return true;
}
void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
{
struct vgic_dist *dist = &kvm->arch.vgic;
- unsigned long flags;
- if (irq->intid < VGIC_MIN_LPI)
- return;
+ if (irq->intid >= VGIC_MIN_LPI)
+ might_lock(&dist->lpi_xa.xa_lock);
- if (!kref_put(&irq->refcount, vgic_irq_release))
+ if (!__vgic_put_irq(kvm, irq))
return;
- xa_lock_irqsave(&dist->lpi_xa, flags);
- __xa_erase(&dist->lpi_xa, irq->intid);
- xa_unlock_irqrestore(&dist->lpi_xa, flags);
+ xa_lock(&dist->lpi_xa);
+ vgic_release_lpi_locked(dist, irq);
+ xa_unlock(&dist->lpi_xa);
+}
- kfree_rcu(irq, rcu);
+static void vgic_release_deleted_lpis(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned long intid;
+ struct vgic_irq *irq;
+
+ xa_lock(&dist->lpi_xa);
+
+ xa_for_each(&dist->lpi_xa, intid, irq) {
+ if (irq->pending_release)
+ vgic_release_lpi_locked(dist, irq);
+ }
+
+ xa_unlock(&dist->lpi_xa);
}
void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_irq *irq, *tmp;
+ bool deleted = false;
unsigned long flags;
raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
@@ -155,11 +184,14 @@ void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
list_del(&irq->ap_list);
irq->vcpu = NULL;
raw_spin_unlock(&irq->irq_lock);
- vgic_put_irq(vcpu->kvm, irq);
+ deleted |= vgic_put_irq_norelease(vcpu->kvm, irq);
}
}
raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+
+ if (deleted)
+ vgic_release_deleted_lpis(vcpu->kvm);
}
void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending)
@@ -399,7 +431,7 @@ retry:
* now in the ap_list. This is safe as the caller must already hold a
* reference on the irq.
*/
- vgic_get_irq_kref(irq);
+ vgic_get_irq_ref(irq);
list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
irq->vcpu = vcpu;
@@ -630,6 +662,7 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_irq *irq, *tmp;
+ bool deleted_lpis = false;
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
@@ -657,12 +690,12 @@ retry:
/*
* This vgic_put_irq call matches the
- * vgic_get_irq_kref in vgic_queue_irq_unlock,
+ * vgic_get_irq_ref in vgic_queue_irq_unlock,
* where we added the LPI to the ap_list. As
* we remove the irq from the list, we drop
* also drop the refcount.
*/
- vgic_put_irq(vcpu->kvm, irq);
+ deleted_lpis |= vgic_put_irq_norelease(vcpu->kvm, irq);
continue;
}
@@ -725,6 +758,9 @@ retry:
}
raw_spin_unlock(&vgic_cpu->ap_list_lock);
+
+ if (unlikely(deleted_lpis))
+ vgic_release_deleted_lpis(vcpu->kvm);
}
static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
@@ -818,7 +854,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
* the AP list has been sorted already.
*/
if (multi_sgi && irq->priority > prio) {
- _raw_spin_unlock(&irq->irq_lock);
+ raw_spin_unlock(&irq->irq_lock);
break;
}
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 1384a04c0784..ac5f9c5d2b98 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -267,7 +267,7 @@ void vgic_v2_put(struct kvm_vcpu *vcpu);
void vgic_v2_save_state(struct kvm_vcpu *vcpu);
void vgic_v2_restore_state(struct kvm_vcpu *vcpu);
-static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq)
+static inline bool vgic_try_get_irq_ref(struct vgic_irq *irq)
{
if (!irq)
return false;
@@ -275,12 +275,12 @@ static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq)
if (irq->intid < VGIC_MIN_LPI)
return true;
- return kref_get_unless_zero(&irq->refcount);
+ return refcount_inc_not_zero(&irq->refcount);
}
-static inline void vgic_get_irq_kref(struct vgic_irq *irq)
+static inline void vgic_get_irq_ref(struct vgic_irq *irq)
{
- WARN_ON_ONCE(!vgic_try_get_irq_kref(irq));
+ WARN_ON_ONCE(!vgic_try_get_irq_ref(irq));
}
void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
@@ -396,15 +396,7 @@ bool vgic_supports_direct_sgis(struct kvm *kvm);
static inline bool vgic_supports_direct_irqs(struct kvm *kvm)
{
- /*
- * Deliberately conflate vLPI and vSGI support on GICv4.1 hardware,
- * indirectly allowing userspace to control whether or not vPEs are
- * allocated for the VM.
- */
- if (system_supports_direct_sgis())
- return vgic_supports_direct_sgis(kvm);
-
- return vgic_supports_direct_msis(kvm);
+ return vgic_supports_direct_msis(kvm) || vgic_supports_direct_sgis(kvm);
}
int vgic_v4_init(struct kvm *kvm);