author     Paolo Bonzini <pbonzini@redhat.com>   2024-11-14 13:05:36 +0100
committer  Paolo Bonzini <pbonzini@redhat.com>   2024-11-14 13:05:36 +0100
commit     7b541d557f705c7e5bcd874f3b960c8fb8dee562 (patch)
tree       68139f92a8dfab7dd4e23f44ee65fec910faabc2 /arch/arm64/kvm/at.c
parent     KVM: x86: expose MSR_PLATFORM_INFO as a feature MSR (diff)
parent     KVM: arm64: Pass on SVE mapping failures (diff)
Merge tag 'kvmarm-6.13' of https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD
KVM/arm64 changes for 6.13, part #1
- Support for stage-1 permission indirection (FEAT_S1PIE) and
permission overlays (FEAT_S1POE), including nested virt + the
emulated page table walker (an illustrative sketch of the new
permission lookup follows this list)
- Introduce PSCI SYSTEM_OFF2 support to KVM + client driver. This call
was introduced in PSCI v1.3 as a mechanism to request hibernation,
similar to the S4 state in ACPI (a guest-side call sketch follows
this list)
- Explicitly trap + hide FEAT_MPAM (QoS controls) from KVM guests. As
part of it, introduce trivial initialization of the host's MPAM
context so KVM can use the corresponding traps
- PMU support under nested virtualization, honoring the guest
hypervisor's trap configuration and event filtering when running a
nested guest
- Fixes to vgic ITS serialization where stale device/interrupt table
entries are not zeroed when the mapping is invalidated by the VM
- Avoid emulated MMIO completion if userspace has requested synchronous
external abort injection
- Various fixes and cleanups affecting pKVM, vCPU initialization, and
selftests
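
For the FEAT_S1PIE item above: with permission indirection, a leaf
descriptor no longer carries direct AP/UXN/PXN bits but a 4-bit index
that selects a nibble of PIR_ELx/PIRE0_ELx (see the perm_idx() macro in
the diff below). A standalone C sketch of that lookup and decode —
pie_lookup() and pie_decode() are hypothetical names; only the
shift-and-mask scheme and the R/W/X table mirror the patch:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Index into a PIR/PIRE0-style register: 16 indices, 4 bits each. */
static uint8_t pie_lookup(uint64_t pir, unsigned int idx)
{
	return (pir >> (idx * 4)) & 0xf;
}

/* Same R/W/X mapping as the patch's R_LLZDZ switch (GCS cases folded
 * to read-only, exactly as the patch does). */
static void pie_decode(uint8_t enc, bool *r, bool *w, bool *x)
{
	static const struct { bool r, w, x; } tbl[16] = {
		[0b0000] = {0, 0, 0}, [0b0001] = {1, 0, 0},
		[0b0010] = {0, 0, 1}, [0b0011] = {1, 0, 1},
		[0b0100] = {0, 0, 0}, [0b0101] = {1, 1, 0},
		[0b0110] = {1, 1, 1}, [0b0111] = {1, 1, 1},
		[0b1000] = {1, 0, 0}, [0b1001] = {1, 0, 0},
		[0b1010] = {1, 0, 1}, [0b1011] = {0, 0, 0},
		[0b1100] = {1, 1, 0}, [0b1101] = {0, 0, 0},
		[0b1110] = {1, 1, 1}, [0b1111] = {0, 0, 0},
	};

	*r = tbl[enc & 0xf].r;
	*w = tbl[enc & 0xf].w;
	*x = tbl[enc & 0xf].x;
}

int main(void)
{
	/* Hypothetical PIR value: permission index 1 holds 0b0101 (RW). */
	uint64_t pir = 0x5ULL << (1 * 4);
	bool r, w, x;

	pie_decode(pie_lookup(pir, 1), &r, &w, &x);
	printf("r=%d w=%d x=%d\n", r, w, x);	/* prints r=1 w=1 x=0 */
	return 0;
}

The real code additionally flags the 0b0110 encoding as WXN (R_HJYGR);
the sketch omits that.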
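For the PSCI SYSTEM_OFF2 item: a guest would request hibernation with a
plain SMCCC call. A minimal guest-side sketch, assuming the SMC32
function ID 0x84000015 for SYSTEM_OFF2 and off_type 0x1 for
HIBERNATE_OFF — both constants are my reading of PSCI v1.3 and should
be checked against the published spec:

/* Kernel context assumed for arm_smccc_1_1_invoke(). */
#include <linux/arm-smccc.h>

#define PSCI_FN_SYSTEM_OFF2	0x84000015	/* assumed PSCI v1.3 ID */
#define PSCI_OFF_TYPE_HIBERNATE	0x1		/* assumed off_type value */

static int psci_hibernate_off(void)
{
	struct arm_smccc_res res;

	/* arg0 = off_type, arg1 = cookie (zero for HIBERNATE_OFF). */
	arm_smccc_1_1_invoke(PSCI_FN_SYSTEM_OFF2,
			     PSCI_OFF_TYPE_HIBERNATE, 0, &res);

	/* SYSTEM_OFF2 does not return on success. */
	return (int)res.a0;
}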
Diffstat (limited to 'arch/arm64/kvm/at.c')
-rw-r--r--   arch/arm64/kvm/at.c   470
1 file changed, 404 insertions(+), 66 deletions(-)
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 39f0e87a340e..8c5d7990e5b3 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -24,6 +24,9 @@ struct s1_walk_info {
 	unsigned int	txsz;
 	int		sl;
 	bool		hpd;
+	bool		e0poe;
+	bool		poe;
+	bool		pan;
 	bool		be;
 	bool		s2;
 };
@@ -37,6 +40,16 @@ struct s1_walk_result {
 			u8	APTable;
 			bool	UXNTable;
 			bool	PXNTable;
+			bool	uwxn;
+			bool	uov;
+			bool	ur;
+			bool	uw;
+			bool	ux;
+			bool	pwxn;
+			bool	pov;
+			bool	pr;
+			bool	pw;
+			bool	px;
 		};
 		struct {
 			u8	fst;
@@ -87,6 +100,51 @@ static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 o
 	}
 }
 
+static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
+{
+	if (!kvm_has_s1pie(vcpu->kvm))
+		return false;
+
+	switch (regime) {
+	case TR_EL2:
+	case TR_EL20:
+		return vcpu_read_sys_reg(vcpu, TCR2_EL2) & TCR2_EL2_PIE;
+	case TR_EL10:
+		return  (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) &&
+			(__vcpu_sys_reg(vcpu, TCR2_EL1) & TCR2_EL1x_PIE);
+	default:
+		BUG();
+	}
+}
+
+static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi)
+{
+	u64 val;
+
+	if (!kvm_has_s1poe(vcpu->kvm)) {
+		wi->poe = wi->e0poe = false;
+		return;
+	}
+
+	switch (wi->regime) {
+	case TR_EL2:
+	case TR_EL20:
+		val = vcpu_read_sys_reg(vcpu, TCR2_EL2);
+		wi->poe = val & TCR2_EL2_POE;
+		wi->e0poe = (wi->regime == TR_EL20) && (val & TCR2_EL2_E0POE);
+		break;
+	case TR_EL10:
+		if (!(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En)) {
+			wi->poe = wi->e0poe = false;
+			return;
+		}
+
+		val = __vcpu_sys_reg(vcpu, TCR2_EL1);
+		wi->poe = val & TCR2_EL1x_POE;
+		wi->e0poe = val & TCR2_EL1x_E0POE;
+	}
+}
+
 static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
 			 struct s1_walk_result *wr, u64 va)
 {
@@ -98,6 +156,8 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
 
 	wi->regime = compute_translation_regime(vcpu, op);
 	as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
+	wi->pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) &&
+		  (*vcpu_cpsr(vcpu) & PSR_PAN_BIT);
 
 	va55 = va & BIT(55);
 
@@ -180,6 +240,14 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
 		    (va55 ?
 		     FIELD_GET(TCR_HPD1, tcr) :
 		     FIELD_GET(TCR_HPD0, tcr)));
+	/* R_JHSVW */
+	wi->hpd |= s1pie_enabled(vcpu, wi->regime);
+
+	/* Do we have POE? */
+	compute_s1poe(vcpu, wi);
+
+	/* R_BVXDG */
+	wi->hpd |= (wi->poe || wi->e0poe);
 
 	/* Someone was silly enough to encode TG0/TG1 differently */
 	if (va55) {
@@ -412,6 +480,11 @@ struct mmu_config {
 	u64	ttbr1;
 	u64	tcr;
 	u64	mair;
+	u64	tcr2;
+	u64	pir;
+	u64	pire0;
+	u64	por_el0;
+	u64	por_el1;
 	u64	sctlr;
 	u64	vttbr;
 	u64	vtcr;
@@ -424,6 +497,17 @@ static void __mmu_config_save(struct mmu_config *config)
 	config->ttbr1 = read_sysreg_el1(SYS_TTBR1);
 	config->tcr = read_sysreg_el1(SYS_TCR);
 	config->mair = read_sysreg_el1(SYS_MAIR);
+	if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
+		config->tcr2 = read_sysreg_el1(SYS_TCR2);
+		if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
+			config->pir = read_sysreg_el1(SYS_PIR);
+			config->pire0 = read_sysreg_el1(SYS_PIRE0);
+		}
+		if (system_supports_poe()) {
+			config->por_el1 = read_sysreg_el1(SYS_POR);
+			config->por_el0 = read_sysreg_s(SYS_POR_EL0);
+		}
+	}
 	config->sctlr = read_sysreg_el1(SYS_SCTLR);
 	config->vttbr = read_sysreg(vttbr_el2);
 	config->vtcr = read_sysreg(vtcr_el2);
@@ -444,6 +528,17 @@ static void __mmu_config_restore(struct mmu_config *config)
 	write_sysreg_el1(config->ttbr1, SYS_TTBR1);
 	write_sysreg_el1(config->tcr, SYS_TCR);
 	write_sysreg_el1(config->mair, SYS_MAIR);
+	if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
+		write_sysreg_el1(config->tcr2, SYS_TCR2);
+		if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
+			write_sysreg_el1(config->pir, SYS_PIR);
+			write_sysreg_el1(config->pire0, SYS_PIRE0);
+		}
+		if (system_supports_poe()) {
+			write_sysreg_el1(config->por_el1, SYS_POR);
+			write_sysreg_s(config->por_el0, SYS_POR_EL0);
+		}
+	}
 	write_sysreg_el1(config->sctlr, SYS_SCTLR);
 	write_sysreg(config->vttbr, vttbr_el2);
 	write_sysreg(config->vtcr, vtcr_el2);
@@ -739,6 +834,9 @@ static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
 	if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
 		return false;
 
+	if (s1pie_enabled(vcpu, regime))
+		return true;
+
 	if (regime == TR_EL10)
 		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
 	else
@@ -747,111 +845,343 @@
 	return sctlr & SCTLR_EL1_EPAN;
 }
 
-static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu,
+					  struct s1_walk_info *wi,
+					  struct s1_walk_result *wr)
 {
-	bool perm_fail, ur, uw, ux, pr, pw, px;
-	struct s1_walk_result wr = {};
-	struct s1_walk_info wi = {};
-	int ret, idx;
+	bool wxn;
 
-	ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr);
-	if (ret)
-		goto compute_par;
-
-	if (wr.level == S1_MMU_DISABLED)
-		goto compute_par;
-
-	idx = srcu_read_lock(&vcpu->kvm->srcu);
-
-	ret = walk_s1(vcpu, &wi, &wr, vaddr);
-
-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
-
-	if (ret)
-		goto compute_par;
-
-	/* FIXME: revisit when adding indirect permission support */
-	/* AArch64.S1DirectBasePermissions() */
-	if (wi.regime != TR_EL2) {
-		switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr.desc)) {
+	/* Non-hierarchical part of AArch64.S1DirectBasePermissions() */
+	if (wi->regime != TR_EL2) {
+		switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) {
 		case 0b00:
-			pr = pw = true;
-			ur = uw = false;
+			wr->pr = wr->pw = true;
+			wr->ur = wr->uw = false;
 			break;
 		case 0b01:
-			pr = pw = ur = uw = true;
+			wr->pr = wr->pw = wr->ur = wr->uw = true;
 			break;
 		case 0b10:
-			pr = true;
-			pw = ur = uw = false;
+			wr->pr = true;
+			wr->pw = wr->ur = wr->uw = false;
 			break;
 		case 0b11:
-			pr = ur = true;
-			pw = uw = false;
+			wr->pr = wr->ur = true;
+			wr->pw = wr->uw = false;
 			break;
 		}
 
-		switch (wr.APTable) {
+		/* We don't use px for anything yet, but hey... */
+		wr->px = !((wr->desc & PTE_PXN) || wr->uw);
+		wr->ux = !(wr->desc & PTE_UXN);
+	} else {
+		wr->ur = wr->uw = wr->ux = false;
+
+		if (!(wr->desc & PTE_RDONLY)) {
+			wr->pr = wr->pw = true;
+		} else {
+			wr->pr = true;
+			wr->pw = false;
+		}
+
+		/* XN maps to UXN */
+		wr->px = !(wr->desc & PTE_UXN);
+	}
+
+	switch (wi->regime) {
+	case TR_EL2:
+	case TR_EL20:
+		wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN);
+		break;
+	case TR_EL10:
+		wxn = (__vcpu_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN);
+		break;
+	}
+
+	wr->pwxn = wr->uwxn = wxn;
+	wr->pov = wi->poe;
+	wr->uov = wi->e0poe;
+}
+
+static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu,
+						struct s1_walk_info *wi,
+						struct s1_walk_result *wr)
+{
+	/* Hierarchical part of AArch64.S1DirectBasePermissions() */
+	if (wi->regime != TR_EL2) {
+		switch (wr->APTable) {
 		case 0b00:
 			break;
 		case 0b01:
-			ur = uw = false;
+			wr->ur = wr->uw = false;
 			break;
 		case 0b10:
-			pw = uw = false;
+			wr->pw = wr->uw = false;
 			break;
 		case 0b11:
-			pw = ur = uw = false;
+			wr->pw = wr->ur = wr->uw = false;
 			break;
 		}
 
-		/* We don't use px for anything yet, but hey... */
-		px = !((wr.desc & PTE_PXN) || wr.PXNTable || uw);
-		ux = !((wr.desc & PTE_UXN) || wr.UXNTable);
+		wr->px &= !wr->PXNTable;
+		wr->ux &= !wr->UXNTable;
+	} else {
+		if (wr->APTable & BIT(1))
+			wr->pw = false;
+
+		/* XN maps to UXN */
+		wr->px &= !wr->UXNTable;
+	}
+}
 
-		if (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) {
-			bool pan;
+#define perm_idx(v, r, i)	((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf)
+
+#define set_priv_perms(wr, r, w, x)	\
+	do {				\
+		(wr)->pr = (r);		\
+		(wr)->pw = (w);		\
+		(wr)->px = (x);		\
+	} while (0)
+
+#define set_unpriv_perms(wr, r, w, x)	\
+	do {				\
+		(wr)->ur = (r);		\
+		(wr)->uw = (w);		\
+		(wr)->ux = (x);		\
+	} while (0)
+
+#define set_priv_wxn(wr, v)		\
+	do {				\
+		(wr)->pwxn = (v);	\
+	} while (0)
+
+#define set_unpriv_wxn(wr, v)		\
+	do {				\
+		(wr)->uwxn = (v);	\
+	} while (0)
+
+/* Similar to AArch64.S1IndirectBasePermissions(), without GCS */
+#define set_perms(w, wr, ip)						\
+	do {								\
+		/* R_LLZDZ */						\
+		switch ((ip)) {						\
+		case 0b0000:						\
+			set_ ## w ## _perms((wr), false, false, false);	\
+			break;						\
+		case 0b0001:						\
+			set_ ## w ## _perms((wr), true , false, false);	\
+			break;						\
+		case 0b0010:						\
+			set_ ## w ## _perms((wr), false, false, true );	\
+			break;						\
+		case 0b0011:						\
+			set_ ## w ## _perms((wr), true , false, true );	\
+			break;						\
+		case 0b0100:						\
+			set_ ## w ## _perms((wr), false, false, false);	\
+			break;						\
+		case 0b0101:						\
+			set_ ## w ## _perms((wr), true , true , false);	\
+			break;						\
+		case 0b0110:						\
+			set_ ## w ## _perms((wr), true , true , true );	\
+			break;						\
+		case 0b0111:						\
+			set_ ## w ## _perms((wr), true , true , true );	\
+			break;						\
+		case 0b1000:						\
+			set_ ## w ## _perms((wr), true , false, false);	\
+			break;						\
+		case 0b1001:						\
+			set_ ## w ## _perms((wr), true , false, false);	\
+			break;						\
+		case 0b1010:						\
+			set_ ## w ## _perms((wr), true , false, true );	\
+			break;						\
+		case 0b1011:						\
+			set_ ## w ## _perms((wr), false, false, false);	\
+			break;						\
+		case 0b1100:						\
+			set_ ## w ## _perms((wr), true , true , false);	\
+			break;						\
+		case 0b1101:						\
+			set_ ## w ## _perms((wr), false, false, false);	\
+			break;						\
+		case 0b1110:						\
+			set_ ## w ## _perms((wr), true , true , true );	\
+			break;						\
+		case 0b1111:						\
+			set_ ## w ## _perms((wr), false, false, false);	\
+			break;						\
+		}							\
+									\
+		/* R_HJYGR */						\
+		set_ ## w ## _wxn((wr), ((ip) == 0b0110));		\
+									\
+	} while (0)
+
+static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu,
+					    struct s1_walk_info *wi,
+					    struct s1_walk_result *wr)
+{
+	u8 up, pp, idx;
 
-			pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT;
-			pan &= ur || uw || (pan3_enabled(vcpu, wi.regime) && ux);
-			pw &= !pan;
-			pr &= !pan;
-		}
-	} else {
-		ur = uw = ux = false;
+	idx = pte_pi_index(wr->desc);
 
-		if (!(wr.desc & PTE_RDONLY)) {
-			pr = pw = true;
-		} else {
-			pr = true;
-			pw = false;
-		}
+	switch (wi->regime) {
+	case TR_EL10:
+		pp = perm_idx(vcpu, PIR_EL1, idx);
+		up = perm_idx(vcpu, PIRE0_EL1, idx);
+		break;
+	case TR_EL20:
+		pp = perm_idx(vcpu, PIR_EL2, idx);
+		up = perm_idx(vcpu, PIRE0_EL2, idx);
+		break;
+	case TR_EL2:
+		pp = perm_idx(vcpu, PIR_EL2, idx);
+		up = 0;
+		break;
+	}
 
-		if (wr.APTable & BIT(1))
-			pw = false;
+	set_perms(priv, wr, pp);
 
-		/* XN maps to UXN */
-		px = !((wr.desc & PTE_UXN) || wr.UXNTable);
+	if (wi->regime != TR_EL2)
+		set_perms(unpriv, wr, up);
+	else
+		set_unpriv_perms(wr, false, false, false);
+
+	wr->pov = wi->poe && !(pp & BIT(3));
+	wr->uov = wi->e0poe && !(up & BIT(3));
+
+	/* R_VFPJF */
+	if (wr->px && wr->uw) {
+		set_priv_perms(wr, false, false, false);
+		set_unpriv_perms(wr, false, false, false);
+	}
+}
+
+static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu,
+					   struct s1_walk_info *wi,
+					   struct s1_walk_result *wr)
+{
+	u8 idx, pov_perms, uov_perms;
+
+	idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc);
+
+	switch (wi->regime) {
+	case TR_EL10:
+		pov_perms = perm_idx(vcpu, POR_EL1, idx);
+		uov_perms = perm_idx(vcpu, POR_EL0, idx);
+		break;
+	case TR_EL20:
+		pov_perms = perm_idx(vcpu, POR_EL2, idx);
+		uov_perms = perm_idx(vcpu, POR_EL0, idx);
+		break;
+	case TR_EL2:
+		pov_perms = perm_idx(vcpu, POR_EL2, idx);
+		uov_perms = 0;
+		break;
+	}
+
+	if (pov_perms & ~POE_RXW)
+		pov_perms = POE_NONE;
+
+	if (wi->poe && wr->pov) {
+		wr->pr &= pov_perms & POE_R;
+		wr->px &= pov_perms & POE_X;
+		wr->pw &= pov_perms & POE_W;
+	}
+
+	if (uov_perms & ~POE_RXW)
+		uov_perms = POE_NONE;
+
+	if (wi->e0poe && wr->uov) {
+		wr->ur &= uov_perms & POE_R;
+		wr->ux &= uov_perms & POE_X;
+		wr->uw &= uov_perms & POE_W;
 	}
+}
+
+static void compute_s1_permissions(struct kvm_vcpu *vcpu,
+				   struct s1_walk_info *wi,
+				   struct s1_walk_result *wr)
+{
+	bool pan;
+
+	if (!s1pie_enabled(vcpu, wi->regime))
+		compute_s1_direct_permissions(vcpu, wi, wr);
+	else
+		compute_s1_indirect_permissions(vcpu, wi, wr);
+
+	if (!wi->hpd)
+		compute_s1_hierarchical_permissions(vcpu, wi, wr);
+
+	if (wi->poe || wi->e0poe)
+		compute_s1_overlay_permissions(vcpu, wi, wr);
+
+	/* R_QXXPC */
+	if (wr->pwxn) {
+		if (!wr->pov && wr->pw)
+			wr->px = false;
+		if (wr->pov && wr->px)
+			wr->pw = false;
+	}
+
+	/* R_NPBXC */
+	if (wr->uwxn) {
+		if (!wr->uov && wr->uw)
+			wr->ux = false;
+		if (wr->uov && wr->ux)
+			wr->uw = false;
+	}
+
+	pan = wi->pan && (wr->ur || wr->uw ||
+			  (pan3_enabled(vcpu, wi->regime) && wr->ux));
+	wr->pw &= !pan;
+	wr->pr &= !pan;
+}
 
-	perm_fail = false;
+static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+	struct s1_walk_result wr = {};
+	struct s1_walk_info wi = {};
+	bool perm_fail = false;
+	int ret, idx;
+
+	ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr);
+	if (ret)
+		goto compute_par;
+
+	if (wr.level == S1_MMU_DISABLED)
+		goto compute_par;
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	ret = walk_s1(vcpu, &wi, &wr, vaddr);
+
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	if (ret)
+		goto compute_par;
+
+	compute_s1_permissions(vcpu, &wi, &wr);
 
 	switch (op) {
 	case OP_AT_S1E1RP:
 	case OP_AT_S1E1R:
 	case OP_AT_S1E2R:
-		perm_fail = !pr;
+		perm_fail = !wr.pr;
 		break;
 	case OP_AT_S1E1WP:
 	case OP_AT_S1E1W:
 	case OP_AT_S1E2W:
-		perm_fail = !pw;
+		perm_fail = !wr.pw;
 		break;
 	case OP_AT_S1E0R:
-		perm_fail = !ur;
+		perm_fail = !wr.ur;
 		break;
 	case OP_AT_S1E0W:
-		perm_fail = !uw;
+		perm_fail = !wr.uw;
 		break;
 	case OP_AT_S1E1A:
 	case OP_AT_S1E2A:
@@ -914,6 +1244,17 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
 	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1);
 	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR);
 	write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR);
+	if (kvm_has_tcr2(vcpu->kvm)) {
+		write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2);
+		if (kvm_has_s1pie(vcpu->kvm)) {
+			write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR);
+			write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0);
+		}
+		if (kvm_has_s1poe(vcpu->kvm)) {
+			write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR);
+			write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0);
+		}
+	}
 	write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR);
 	__load_stage2(mmu, mmu->arch);
 
@@ -992,12 +1333,9 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
 	 * switching context behind everybody's back, disable interrupts...
 	 */
 	scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) {
-		struct kvm_s2_mmu *mmu;
 		u64 val, hcr;
 		bool fail;
 
-		mmu = &vcpu->kvm->arch.mmu;
-
 		val = hcr = read_sysreg(hcr_el2);
 		val &= ~HCR_TGE;
 		val |= HCR_VM;
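
A note on the compute_s1_overlay_permissions() hunk above: the POR_ELx
overlay can only remove rights from the base permissions, never grant
them, and reserved encodings collapse to no access. A standalone C
sketch of that masking step; the POE_* values mirror the arch/arm64
definitions the patch uses, while apply_overlay() is a made-up helper:

#include <stdint.h>
#include <stdbool.h>

/* Values as used by the patch (POE_RXW == POE_R | POE_X | POE_W). */
#define POE_NONE	0x0
#define POE_R		0x1
#define POE_X		0x2
#define POE_W		0x4
#define POE_RXW		(POE_R | POE_X | POE_W)

struct perms { bool r, w, x; };

/* Fetch the 4-bit overlay for 'idx' from a POR_ELx-style value and
 * AND it into the base permissions: overlays never add rights. */
static void apply_overlay(struct perms *p, uint64_t por, unsigned int idx)
{
	uint8_t ov = (por >> (idx * 4)) & 0xf;

	if (ov & ~POE_RXW)	/* reserved encoding => no access */
		ov = POE_NONE;

	p->r &= !!(ov & POE_R);
	p->w &= !!(ov & POE_W);
	p->x &= !!(ov & POE_X);
}

This one-way masking also ties into the R_QXXPC/R_NPBXC handling in
compute_s1_permissions(): with WXN set, a non-overlaid writable page
loses execute, while an overlaid executable page loses write.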