From 8b202ee218395319aec1ef44f72043e1fbaccdd6 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 25 Apr 2022 14:17:42 +0200 Subject: s390: disable -Warray-bounds gcc-12 shows a lot of array bound warnings on s390. This is caused by the S390_lowcore macro which uses a hardcoded address of 0. Wrapping that with absolute_pointer() works, but gcc no longer knows that a 12 bit displacement is sufficient to access lowcore. So it emits instructions like 'lghi %r1,0; l %rx,xxx(%r1)' instead of a single load/store instruction. As s390 stores variables often read/written in lowcore, this is considered problematic. Therefore disable -Warray-bounds on s390 for gcc-12 for the time being, until there is a better solution. Signed-off-by: Sven Schnelle Link: https://lore.kernel.org/r/yt9dzgkelelc.fsf@linux.ibm.com Link: https://lore.kernel.org/r/20220422134308.1613610-1-svens@linux.ibm.com Link: https://lore.kernel.org/r/20220425121742.3222133-1-svens@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/Makefile | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/s390/Makefile b/arch/s390/Makefile index e441b60b1812..df325eacf62d 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -30,6 +30,16 @@ KBUILD_CFLAGS_DECOMPRESSOR += -fno-stack-protector KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option, -gdwarf-4,)) + +ifdef CONFIG_CC_IS_GCC + ifeq ($(call cc-ifversion, -ge, 1200, y), y) + ifeq ($(call cc-ifversion, -lt, 1300, y), y) + KBUILD_CFLAGS += $(call cc-disable-warning, array-bounds) + KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, array-bounds) + endif + endif +endif + UTS_MACHINE := s390x STACK_SIZE := $(if $(CONFIG_KASAN),65536,16384) CHECKFLAGS += -D__s390__ -D__s390x__ -- cgit v1.2.3 From f0a6c68f69981214cb7858738dd2bc81475111f7 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Sun, 24 Apr 2022 12:46:23 +0100 Subject: MIPS: Fix CP0 counter erratum detection for R4k CPUs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the discrepancy between the two places we check for the CP0 counter erratum in along with the incorrect comparison of the R4400 revision number against 0x30 which matches none and consistently consider all R4000 and R4400 processors affected, as documented in processor errata publications[1][2][3], following the mapping between CP0 PRId register values and processor models: PRId | Processor Model ---------+-------------------- 00000422 | R4000 Revision 2.2 00000430 | R4000 Revision 3.0 00000440 | R4400 Revision 1.0 00000450 | R4400 Revision 2.0 00000460 | R4400 Revision 3.0 No other revision of either processor has ever been spotted. Contrary to what has been stated in commit ce202cbb9e0b ("[MIPS] Assume R4000/R4400 newer than 3.0 don't have the mfc0 count bug") marking the CP0 counter as buggy does not preclude it from being used as either a clock event or a clock source device. It just cannot be used as both at a time, because in that case clock event interrupts will be occasionally lost, and the use as a clock event device takes precedence. Compare against 0x4ff in `can_use_mips_counter' so that a single machine instruction is produced. 
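As a sanity check of that comparison, a standalone sketch (not kernel code; the macro values mirror arch/mips/include/asm/cpu.h, and the printout is purely illustrative): the threshold PRID_IMP_R4000 | PRID_REV_ENCODE_44(15, 15) is 0x4ff, and every documented PRId above falls at or below it, so `prid > 0x4ff' identifies the unaffected, later implementations with a single unsigned compare.

#include <stdio.h>

#define PRID_IMP_R4000			0x0400	/* as in arch/mips/include/asm/cpu.h */
#define PRID_REV_ENCODE_44(v, r)	((v) << 4 | (r))

int main(void)
{
	/* All documented R4000/R4400 PRId values from the table above. */
	unsigned int affected[] = { 0x0422, 0x0430, 0x0440, 0x0450, 0x0460 };
	unsigned int limit = PRID_IMP_R4000 | PRID_REV_ENCODE_44(15, 15);
	unsigned int i;

	/* limit is 0x04ff; every affected part satisfies prid <= limit */
	for (i = 0; i < sizeof(affected) / sizeof(affected[0]); i++)
		printf("%#06x affected: %d\n", affected[i],
		       affected[i] <= limit);
	return 0;
}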
References: [1] "MIPS R4000PC/SC Errata, Processor Revision 2.2 and 3.0", MIPS Technologies Inc., May 10, 1994, Erratum 53, p.13 [2] "MIPS R4400PC/SC Errata, Processor Revision 1.0", MIPS Technologies Inc., February 9, 1994, Erratum 21, p.4 [3] "MIPS R4400PC/SC Errata, Processor Revision 2.0 & 3.0", MIPS Technologies Inc., January 24, 1995, Erratum 14, p.3 Signed-off-by: Maciej W. Rozycki Fixes: ce202cbb9e0b ("[MIPS] Assume R4000/R4400 newer than 3.0 don't have the mfc0 count bug") Cc: stable@vger.kernel.org # v2.6.24+ Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/timex.h | 8 ++++---- arch/mips/kernel/time.c | 11 +++-------- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/mips/include/asm/timex.h b/arch/mips/include/asm/timex.h index b05bb70a2e46..8026baf46e72 100644 --- a/arch/mips/include/asm/timex.h +++ b/arch/mips/include/asm/timex.h @@ -40,9 +40,9 @@ typedef unsigned int cycles_t; /* - * On R4000/R4400 before version 5.0 an erratum exists such that if the - * cycle counter is read in the exact moment that it is matching the - * compare register, no interrupt will be generated. + * On R4000/R4400 an erratum exists such that if the cycle counter is + * read in the exact moment that it is matching the compare register, + * no interrupt will be generated. * * There is a suggested workaround and also the erratum can't strike if * the compare interrupt isn't being used as the clock source device. @@ -63,7 +63,7 @@ static inline int can_use_mips_counter(unsigned int prid) if (!__builtin_constant_p(cpu_has_counter)) asm volatile("" : "=m" (cpu_data[0].options)); if (likely(cpu_has_counter && - prid >= (PRID_IMP_R4000 | PRID_REV_ENCODE_44(5, 0)))) + prid > (PRID_IMP_R4000 | PRID_REV_ENCODE_44(15, 15)))) return 1; else return 0; diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c index caa01457dce6..ed339d7979f3 100644 --- a/arch/mips/kernel/time.c +++ b/arch/mips/kernel/time.c @@ -141,15 +141,10 @@ static __init int cpu_has_mfc0_count_bug(void) case CPU_R4400MC: /* * The published errata for the R4400 up to 3.0 say the CPU - * has the mfc0 from count bug. + * has the mfc0 from count bug. This seems the last version + * produced. */ - if ((current_cpu_data.processor_id & 0xff) <= 0x30) - return 1; - - /* - * we assume newer revisions are ok - */ - return 0; + return 1; } return 0; -- cgit v1.2.3 From c6fe81191bd74f7e6ae9ce96a4837df9485f3ab8 Mon Sep 17 00:00:00 2001 From: Nick Kossifidis Date: Tue, 22 Mar 2022 15:28:39 +0200 Subject: RISC-V: relocate DTB if it's outside memory region In case the DTB provided by the bootloader/BootROM is before the kernel image or outside /memory, we won't be able to access it through the linear mapping, and get a segfault on setup_arch(). Currently OpenSBI relocates DTB but that's not always the case (e.g. if FW_JUMP_FDT_ADDR is not specified), and it's also not the most portable approach since the default FW_JUMP_FDT_ADDR of the generic platform relocates the DTB at a specific offset that may not be available. To avoid this situation copy DTB so that it's visible through the linear mapping. 
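In shorthand, the rule the patch below enforces (a sketch; memblock_is_memory() is the real test used in the diff that follows, while copy_dtb_into_ram() is a hypothetical stand-in for the memblock_phys_alloc()/early_memremap()/memcpy() sequence):

/*
 * Sketch of the rule: a DTB outside any memblock memory region cannot
 * be reached via __va() once the linear mapping is up, so copy it into
 * a region that the linear mapping will cover.
 */
static phys_addr_t dtb_relocate_if_needed(phys_addr_t dtb_pa, size_t fdt_size)
{
	if (memblock_is_memory(dtb_pa))
		return dtb_pa;				/* linear mapping covers it */
	return copy_dtb_into_ram(dtb_pa, fdt_size);	/* hypothetical helper */
}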
Signed-off-by: Nick Kossifidis Link: https://lore.kernel.org/r/20220322132839.3653682-1-mick@ics.forth.gr Tested-by: Conor Dooley Fixes: f105aa940e78 ("riscv: add BUILTIN_DTB support for MMU-enabled targets") Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index b0793dc0c291..05ed641a1134 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -208,8 +208,25 @@ static void __init setup_bootmem(void) * early_init_fdt_reserve_self() since __pa() does * not work for DTB pointers that are fixmap addresses */ - if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) - memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); + if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) { + /* + * In case the DTB is not located in a memory region we won't + * be able to locate it later on via the linear mapping and + * get a segfault when accessing it via __va(dtb_early_pa). + * To avoid this situation copy DTB to a memory region. + * Note that memblock_phys_alloc will also reserve DTB region. + */ + if (!memblock_is_memory(dtb_early_pa)) { + size_t fdt_size = fdt_totalsize(dtb_early_va); + phys_addr_t new_dtb_early_pa = memblock_phys_alloc(fdt_size, PAGE_SIZE); + void *new_dtb_early_va = early_memremap(new_dtb_early_pa, fdt_size); + + memcpy(new_dtb_early_va, dtb_early_va, fdt_size); + early_memunmap(new_dtb_early_va, fdt_size); + _dtb_early_pa = new_dtb_early_pa; + } else + memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); + } early_init_fdt_scan_reserved_mem(); dma_contiguous_reserve(dma32_phys_limit); -- cgit v1.2.3 From b5d1274409d0eec6d826f65d6dafebf9d77a1b99 Mon Sep 17 00:00:00 2001 From: Janis Schoetterl-Glausch Date: Fri, 11 Mar 2022 18:00:40 +0100 Subject: KVM: s390: Fix lockdep issue in vm memop Issuing a memop on a protected vm does not make sense, neither is the memory readable/writable, nor does it make sense to check storage keys. This is why the ioctl will return -EINVAL when it detects the vm to be protected. However, in order to ensure that the vm cannot become protected during the memop, the kvm->lock would need to be taken for the duration of the ioctl. This is also required because kvm_s390_pv_is_protected asserts that the lock must be held. Instead, don't try to prevent this. If user space enables secure execution concurrently with a memop it must accept the possibility of the memop failing. Still check if the vm is currently protected, but without locking, and consider it a heuristic.
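The locking trade-off above is the usual "cheap unlocked check now, authoritative failure at access time" idiom; in kernel-context shorthand (illustrative only -- vm_looks_protected() is a made-up stand-in for the unlocked kvm_s390_pv_get_handle() test in the hunk below):

	/*
	 * Reject the obviously-invalid case early without taking kvm->lock.
	 * If userspace flips the VM to protected concurrently, the race is
	 * tolerated: the actual guest access faults and returns -EFAULT.
	 */
	if (vm_looks_protected(kvm))	/* heuristic, no kvm->lock held */
		return -EINVAL;
	/* ... perform the memop; late transitions surface as -EFAULT ... */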
Fixes: ef11c9463ae0 ("KVM: s390: Add vm IOCTL for key checked guest absolute memory access") Signed-off-by: Janis Schoetterl-Glausch Reviewed-by: Janosch Frank Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20220322153204.2637400-1-scgl@linux.ibm.com Signed-off-by: Christian Borntraeger Signed-off-by: Heiko Carstens --- arch/s390/kvm/kvm-s390.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index da3dabda1a12..76ad6408cb2c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2384,7 +2384,16 @@ static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop) return -EINVAL; if (mop->size > MEM_OP_MAX_SIZE) return -E2BIG; - if (kvm_s390_pv_is_protected(kvm)) + /* + * This is technically a heuristic only, if the kvm->lock is not + * taken, it is not guaranteed that the vm is/remains non-protected. + * This is ok from a kernel perspective, wrongdoing is detected + * on the access, -EFAULT is returned and the vm may crash the + * next time it accesses the memory in question. + * There is no sane usecase to do switching and a memop on two + * different CPUs at the same time. + */ + if (kvm_s390_pv_get_handle(kvm)) return -EINVAL; if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) { if (access_key_invalid(mop->key)) -- cgit v1.2.3 From 706c9c55e5a32800605eb6a864ef6e1ca0c6c179 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 23 Apr 2022 03:47:41 +0000 Subject: KVM: x86/mmu: Don't treat fully writable SPTEs as volatile (modulo A/D) Don't treat SPTEs that are truly writable, i.e. writable in hardware, as being volatile (unless they're volatile for other reasons, e.g. A/D bits). KVM _sets_ the WRITABLE bit out of mmu_lock, but never _clears_ the bit out of mmu_lock, so if the WRITABLE bit is set, it cannot magically get cleared just because the SPTE is MMU-writable. Rename the wrapper of MMU-writable to be more literal, the previous name of spte_can_locklessly_be_made_writable() is wrong and misleading. Fixes: c7ba5b48cc8d ("KVM: MMU: fast path of handling guest page fault") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20220423034752.1161007-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 17 +++++++++-------- arch/x86/kvm/mmu/spte.h | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 64a2a7e2be90..48dcb6a782f4 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -484,13 +484,15 @@ static bool spte_has_volatile_bits(u64 spte) * also, it can help us to get a stable is_writable_pte() * to ensure tlb flush is not missed. */ - if (spte_can_locklessly_be_made_writable(spte) || - is_access_track_spte(spte)) + if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) + return true; + + if (is_access_track_spte(spte)) return true; if (spte_ad_enabled(spte)) { - if ((spte & shadow_accessed_mask) == 0 || - (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0)) + if (!(spte & shadow_accessed_mask) || + (is_writable_pte(spte) && !(spte & shadow_dirty_mask))) return true; } @@ -557,7 +559,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * we always atomically update it, see the comments in * spte_has_volatile_bits(). 
*/ - if (spte_can_locklessly_be_made_writable(old_spte) && + if (is_mmu_writable_spte(old_spte) && !is_writable_pte(new_spte)) flush = true; @@ -1187,7 +1189,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) u64 spte = *sptep; if (!is_writable_pte(spte) && - !(pt_protect && spte_can_locklessly_be_made_writable(spte))) + !(pt_protect && is_mmu_writable_spte(spte))) return false; rmap_printk("spte %p %llx\n", sptep, *sptep); @@ -3196,8 +3198,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * be removed in the fast path only if the SPTE was * write-protected for dirty-logging or access tracking. */ - if (fault->write && - spte_can_locklessly_be_made_writable(spte)) { + if (fault->write && is_mmu_writable_spte(spte)) { new_spte |= PT_WRITABLE_MASK; /* diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index e4abeb5df1b1..c571784cb567 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -390,7 +390,7 @@ static inline void check_spte_writable_invariants(u64 spte) "kvm: Writable SPTE is not MMU-writable: %llx", spte); } -static inline bool spte_can_locklessly_be_made_writable(u64 spte) +static inline bool is_mmu_writable_spte(u64 spte) { return spte & shadow_mmu_writable_mask; } -- cgit v1.2.3 From 54eb3ef56f36827aad90915df33387d4c2b5df5a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 23 Apr 2022 03:47:42 +0000 Subject: KVM: x86/mmu: Move shadow-present check out of spte_has_volatile_bits() Move the is_shadow_present_pte() check out of spte_has_volatile_bits() and into its callers. Well, caller, since only one of its two callers doesn't already do the shadow-present check. Opportunistically move the helper to spte.c/h so that it can be used by the TDP MMU, which is also the primary motivation for the shadow-present change. Unlike the legacy MMU, the TDP MMU uses a single path for clearing leaf and non-leaf SPTEs, and to avoid unnecessary atomic updates, the TDP MMU will need to check is_last_spte() prior to calling spte_has_volatile_bits(), and calling is_last_spte() without first calling is_shadow_present_spte() is at best odd, and at worst a violation of KVM's loosely defined SPTE rules. Note, mmu_spte_clear_track_bits() could likely skip the write entirely for SPTEs that are not shadow-present. Leave that cleanup for a future patch to avoid introducing a functional change, and because the shadow-present check can likely be moved further up the stack, e.g. drop_large_spte() appears to be the only path that doesn't already explicitly check for a shadow-present SPTE. No functional change intended. Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20220423034752.1161007-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 29 ++--------------------------- arch/x86/kvm/mmu/spte.c | 28 ++++++++++++++++++++++++++++ arch/x86/kvm/mmu/spte.h | 2 ++ 3 files changed, 32 insertions(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 48dcb6a782f4..311e4e1d7870 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -473,32 +473,6 @@ retry: } #endif -static bool spte_has_volatile_bits(u64 spte) -{ - if (!is_shadow_present_pte(spte)) - return false; - - /* - * Always atomically update spte if it can be updated - * out of mmu-lock, it can ensure dirty bit is not lost, - * also, it can help us to get a stable is_writable_pte() - * to ensure tlb flush is not missed.
- */ - if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) - return true; - - if (is_access_track_spte(spte)) - return true; - - if (spte_ad_enabled(spte)) { - if (!(spte & shadow_accessed_mask) || - (is_writable_pte(spte) && !(spte & shadow_dirty_mask))) - return true; - } - - return false; -} - /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present @@ -593,7 +567,8 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level; - if (!spte_has_volatile_bits(old_spte)) + if (!is_shadow_present_pte(old_spte) || + !spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, 0ull); else old_spte = __update_clear_spte_slow(sptep, 0ull); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 4739b53c9734..e5c0b6db6f2c 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -90,6 +90,34 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) E820_TYPE_RAM); } +/* + * Returns true if the SPTE has bits that may be set without holding mmu_lock. + * The caller is responsible for checking if the SPTE is shadow-present, and + * for determining whether or not the caller cares about non-leaf SPTEs. + */ +bool spte_has_volatile_bits(u64 spte) +{ + /* + * Always atomically update spte if it can be updated + * out of mmu-lock, it can ensure dirty bit is not lost, + * also, it can help us to get a stable is_writable_pte() + * to ensure tlb flush is not missed. + */ + if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) + return true; + + if (is_access_track_spte(spte)) + return true; + + if (spte_ad_enabled(spte)) { + if (!(spte & shadow_accessed_mask) || + (is_writable_pte(spte) && !(spte & shadow_dirty_mask))) + return true; + } + + return false; +} + bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index c571784cb567..80ab0f5cff01 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -404,6 +404,8 @@ static inline u64 get_mmio_spte_generation(u64 spte) return gen; } +bool spte_has_volatile_bits(u64 spte); + bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, -- cgit v1.2.3 From ba3a6120a4e7efc13d19fe43eb6c5caf1da05b72 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 23 Apr 2022 03:47:43 +0000 Subject: KVM: x86/mmu: Use atomic XCHG to write TDP MMU SPTEs with volatile bits Use an atomic XCHG to write TDP MMU SPTEs that have volatile bits, even if mmu_lock is held for write, as volatile SPTEs can be written by other tasks/vCPUs outside of mmu_lock. If a vCPU uses the to-be-modified SPTE to write a page, the CPU can cache the translation as WRITABLE in the TLB despite it being seen by KVM as !WRITABLE, and/or KVM can clobber the Accessed/Dirty bits and not properly tag the backing page. Exempt non-leaf SPTEs from atomic updates as KVM itself doesn't modify non-leaf SPTEs without holding mmu_lock, they do not have Dirty bits, and KVM doesn't consume the Accessed bit of non-leaf SPTEs. Dropping the Dirty and/or Writable bits is most problematic for dirty logging, as doing so can result in a missed TLB flush and eventually a missed dirty page. 
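Before the details below, the new write rule in condensed form (a restatement using the patch's own predicates, not additional logic; see the kvm_tdp_mmu_write_spte() hunk in the diff that follows):

static bool tdp_spte_needs_atomic_write(u64 old_spte, int level)
{
	/*
	 * Only shadow-present leaf SPTEs with volatile bits (Writable,
	 * Accessed, Dirty) can be modified by other tasks/vCPUs outside
	 * of mmu_lock; everything else can take the plain WRITE_ONCE path.
	 */
	return is_shadow_present_pte(old_spte) &&
	       is_last_spte(old_spte, level) &&
	       spte_has_volatile_bits(old_spte);
}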
In the unlikely event that the only dirty page(s) is a clobbered SPTE, clear_dirty_gfn_range() will see the SPTE as not dirty (based on the Dirty or Writable bit depending on the method) and so not update the SPTE and ultimately not flush. If the SPTE is cached in the TLB as writable before it is clobbered, the guest can continue writing the associated page without ever taking a write-protect fault. For most (all?) file back memory, dropping the Dirty bit is a non-issue. The primary MMU write-protects its PTEs on writeback, i.e. KVM's dirty bit is effectively ignored because the primary MMU will mark that page dirty when the write-protection is lifted, e.g. when KVM faults the page back in for write. The Accessed bit is a complete non-issue. Aside from being unused for non-leaf SPTEs, KVM doesn't do a TLB flush when aging SPTEs, i.e. the Accessed bit may be dropped anyways. Lastly, the Writable bit is also problematic as an extension of the Dirty bit, as KVM (correctly) treats the Dirty bit as volatile iff the SPTE is !DIRTY && WRITABLE. If KVM fixes an MMU-writable, but !WRITABLE, SPTE out of mmu_lock, then it can allow the CPU to set the Dirty bit despite the SPTE being !WRITABLE when it is checked by KVM. But that all depends on the Dirty bit being problematic in the first place. Fixes: 2f2fad0897cb ("kvm: x86/mmu: Add functions to handle changed TDP SPTEs") Cc: stable@vger.kernel.org Cc: Ben Gardon Cc: David Matlack Cc: Venkatesh Srinivas Signed-off-by: Sean Christopherson Message-Id: <20220423034752.1161007-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_iter.h | 34 +++++++++++++++++-- arch/x86/kvm/mmu/tdp_mmu.c | 82 +++++++++++++++++++++++++++++---------------- 2 files changed, 85 insertions(+), 31 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index b1eaf6ec0e0b..f0af385c56e0 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -6,6 +6,7 @@ #include #include "mmu.h" +#include "spte.h" /* * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs) @@ -17,9 +18,38 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep) { return READ_ONCE(*rcu_dereference(sptep)); } -static inline void kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 val) + +static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte) +{ + return xchg(rcu_dereference(sptep), new_spte); +} + +static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) +{ + WRITE_ONCE(*rcu_dereference(sptep), new_spte); +} + +static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, + u64 new_spte, int level) { - WRITE_ONCE(*rcu_dereference(sptep), val); + /* + * Atomically write the SPTE if it is a shadow-present, leaf SPTE with + * volatile bits, i.e. has bits that can be set outside of mmu_lock. + * The Writable bit can be set by KVM's fast page fault handler, and + * Accessed and Dirty bits can be set by the CPU. + * + * Note, non-leaf SPTEs do have Accessed bits and those bits are + * technically volatile, but KVM doesn't consume the Accessed bit of + * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This + * logic needs to be reassessed if KVM were to use non-leaf Accessed + * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs. 
+ */ + if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) && + spte_has_volatile_bits(old_spte)) + return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte); + + __kvm_tdp_mmu_write_spte(sptep, new_spte); + return old_spte; } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index edc68538819b..922b06bf4b94 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -426,9 +426,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) tdp_mmu_unlink_sp(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - u64 *sptep = rcu_dereference(pt) + i; + tdp_ptep_t sptep = pt + i; gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); - u64 old_child_spte; + u64 old_spte; if (shared) { /* @@ -440,8 +440,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * value to the removed SPTE value. */ for (;;) { - old_child_spte = xchg(sptep, REMOVED_SPTE); - if (!is_removed_spte(old_child_spte)) + old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE); + if (!is_removed_spte(old_spte)) break; cpu_relax(); } @@ -455,23 +455,43 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * are guarded by the memslots generation, not by being * unreachable. */ - old_child_spte = READ_ONCE(*sptep); - if (!is_shadow_present_pte(old_child_spte)) + old_spte = kvm_tdp_mmu_read_spte(sptep); + if (!is_shadow_present_pte(old_spte)) continue; /* - * Marking the SPTE as a removed SPTE is not - * strictly necessary here as the MMU lock will - * stop other threads from concurrently modifying - * this SPTE. Using the removed SPTE value keeps - * the two branches consistent and simplifies - * the function. + * Use the common helper instead of a raw WRITE_ONCE as + * the SPTE needs to be updated atomically if it can be + * modified by a different vCPU outside of mmu_lock. + * Even though the parent SPTE is !PRESENT, the TLB + * hasn't yet been flushed, and both Intel and AMD + * document that A/D assists can use upper-level PxE + * entries that are cached in the TLB, i.e. the CPU can + * still access the page and mark it dirty. + * + * No retry is needed in the atomic update path as the + * sole concern is dropping a Dirty bit, i.e. no other + * task can zap/remove the SPTE as mmu_lock is held for + * write. Marking the SPTE as a removed SPTE is not + * strictly necessary for the same reason, but using + * the remove SPTE value keeps the shared/exclusive + * paths consistent and allows the handle_changed_spte() + * call below to hardcode the new value to REMOVED_SPTE. + * + * Note, even though dropping a Dirty bit is the only + * scenario where a non-atomic update could result in a + * functional bug, simply checking the Dirty bit isn't + * sufficient as a fast page fault could read the upper + * level SPTE before it is zapped, and then make this + * target SPTE writable, resume the guest, and set the + * Dirty bit between reading the SPTE above and writing + * it here. 
*/ - WRITE_ONCE(*sptep, REMOVED_SPTE); + old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, + REMOVED_SPTE, level); } handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, - old_child_spte, REMOVED_SPTE, level, - shared); + old_spte, REMOVED_SPTE, level, shared); } call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); @@ -667,14 +687,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, KVM_PAGES_PER_HPAGE(iter->level)); /* - * No other thread can overwrite the removed SPTE as they - * must either wait on the MMU lock or use - * tdp_mmu_set_spte_atomic which will not overwrite the - * special removed SPTE value. No bookkeeping is needed - * here since the SPTE is going from non-present - * to non-present. + * No other thread can overwrite the removed SPTE as they must either + * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not + * overwrite the special removed SPTE value. No bookkeeping is needed + * here since the SPTE is going from non-present to non-present. Use + * the raw write helper to avoid an unnecessary check on volatile bits. */ - kvm_tdp_mmu_write_spte(iter->sptep, 0); + __kvm_tdp_mmu_write_spte(iter->sptep, 0); return 0; } @@ -699,10 +718,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, * unless performing certain dirty logging operations. * Leaving record_dirty_log unset in that case prevents page * writes from being double counted. + * + * Returns the old SPTE value, which _may_ be different than @old_spte if the + * SPTE had voldatile bits. */ -static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, - u64 old_spte, u64 new_spte, gfn_t gfn, int level, - bool record_acc_track, bool record_dirty_log) +static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, + u64 old_spte, u64 new_spte, gfn_t gfn, int level, + bool record_acc_track, bool record_dirty_log) { lockdep_assert_held_write(&kvm->mmu_lock); @@ -715,7 +737,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, */ WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte)); - kvm_tdp_mmu_write_spte(sptep, new_spte); + old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); @@ -724,6 +746,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, if (record_dirty_log) handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, new_spte, level); + return old_spte; } static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, @@ -732,9 +755,10 @@ static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, { WARN_ON_ONCE(iter->yielded); - __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte, - new_spte, iter->gfn, iter->level, - record_acc_track, record_dirty_log); + iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, + iter->old_spte, new_spte, + iter->gfn, iter->level, + record_acc_track, record_dirty_log); } static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, -- cgit v1.2.3 From 5eb849322d7f7ae9d5c587c7bc3b4f7c6872cd2f Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Mon, 2 May 2022 22:01:36 -0700 Subject: KVM: x86/svm: Account for family 17h event renumberings in amd_pmc_perf_hw_id Zen renumbered some of the performance counters that correspond to the well known events in perf_hw_id. 
This code in KVM was never updated for that, so guests that attempt to use counters on Zen that correspond to the pre-Zen perf_hw_id values will silently receive the wrong values. This has been observed in the wild with rr[0] when running in Zen 3 guests. rr uses the retired conditional branch counter 00d1, which is incorrectly recognized by KVM as PERF_COUNT_HW_STALLED_CYCLES_BACKEND. [0] https://rr-project.org/ Signed-off-by: Kyle Huey Message-Id: <20220503050136.86298-1-khuey@kylehuey.com> Cc: stable@vger.kernel.org [Check guest family, not host. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/pmu.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 24eb935b6f85..311cbaa0c3dd 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -45,6 +45,22 @@ static struct kvm_event_hw_type_mapping amd_event_mapping[] = { [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, }; +/* duplicated from amd_f17h_perfmon_event_map. */ +static struct kvm_event_hw_type_mapping amd_f17h_event_mapping[] = { + [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, + [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, + [2] = { 0x60, 0xff, PERF_COUNT_HW_CACHE_REFERENCES }, + [3] = { 0x64, 0x09, PERF_COUNT_HW_CACHE_MISSES }, + [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + [6] = { 0x87, 0x02, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, + [7] = { 0x87, 0x01, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, +}; + +/* amd_pmc_perf_hw_id depends on these being the same size */ +static_assert(ARRAY_SIZE(amd_event_mapping) == + ARRAY_SIZE(amd_f17h_event_mapping)); + static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type) { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); @@ -140,6 +156,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) { + struct kvm_event_hw_type_mapping *event_mapping; u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; int i; @@ -148,15 +165,20 @@ static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) if (WARN_ON(pmc_is_fixed(pmc))) return PERF_COUNT_HW_MAX; + if (guest_cpuid_family(pmc->vcpu) >= 0x17) + event_mapping = amd_f17h_event_mapping; + else + event_mapping = amd_event_mapping; + for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) - if (amd_event_mapping[i].eventsel == event_select - && amd_event_mapping[i].unit_mask == unit_mask) + if (event_mapping[i].eventsel == event_select + && event_mapping[i].unit_mask == unit_mask) break; if (i == ARRAY_SIZE(amd_event_mapping)) return PERF_COUNT_HW_MAX; - return amd_event_mapping[i].event_type; + return event_mapping[i].event_type; } /* check if a PMC is enabled by comparing it against global_ctrl bits. Because -- cgit v1.2.3 From 5a1bde46f98b893cda6122b00e94c0c40a6ead3c Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Wed, 27 Apr 2022 17:01:49 +0530 Subject: kvm: x86/cpuid: Only provide CPUID leaf 0xA if host has architectural PMU On some x86 processors, CPUID leaf 0xA provides information on Architectural Performance Monitoring features. It advertises a PMU version which Qemu uses to determine the availability of additional MSRs to manage the PMCs.
Upon receiving a KVM_GET_SUPPORTED_CPUID ioctl request for the same, the kernel constructs return values based on the x86_pmu_capability irrespective of the vendor. This leaf and the additional MSRs are not supported on AMD and Hygon processors. If AMD PerfMonV2 is detected, the PMU version is set to 2 and guest startup breaks because of an attempt to access a non-existent MSR. Return zeros to avoid this. Fixes: a6c06ed1a60a ("KVM: Expose the architectural performance monitoring CPUID leaf") Reported-by: Vasant Hegde Signed-off-by: Sandipan Das Message-Id: <3fef83d9c2b2f7516e8ff50d60851f29a4bcb716.1651058600.git.sandipan.das@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b24ca7f4ed7c..732724ea5b10 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -887,6 +887,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) union cpuid10_eax eax; union cpuid10_edx edx; + if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + perf_get_x86_pmu_capability(&cap); /* -- cgit v1.2.3 From a06afe8383080c630a7a528b8382fc6bb4925b61 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 29 Apr 2022 17:15:26 +0200 Subject: KVM: s390: vsie/gmap: reduce gmap_rmap overhead there are cases that trigger a 2nd shadow event for the same vmaddr/raddr combination. (prefix changes, reboots, some known races) This will increase memory usages and it will result in long latencies when cleaning up, e.g. on shutdown. To avoid cases with a list that has hundreds of identical raddrs we check existing entries at insert time. As this measurably reduces the list length this will be faster than traversing the list at shutdown time. In the long run several places will be optimized to create less entries and a shrinker might be necessary. Fixes: 4be130a08420 ("s390/mm: add shadow gmap support") Signed-off-by: Christian Borntraeger Acked-by: David Hildenbrand Link: https://lore.kernel.org/r/20220429151526.1560-1-borntraeger@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/mm/gmap.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index af03cacf34ec..1ac73917a8d3 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1183,6 +1183,7 @@ EXPORT_SYMBOL_GPL(gmap_read_table); static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, struct gmap_rmap *rmap) { + struct gmap_rmap *temp; void __rcu **slot; BUG_ON(!gmap_is_shadow(sg)); @@ -1190,6 +1191,12 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, if (slot) { rmap->next = radix_tree_deref_slot_protected(slot, &sg->guest_table_lock); + for (temp = rmap->next; temp; temp = temp->next) { + if (temp->raddr == rmap->raddr) { + kfree(rmap); + return; + } + } radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); } else { rmap->next = NULL; -- cgit v1.2.3 From 57831bfb5e78777dc399e351ed68ef77c3aee385 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Sat, 19 Mar 2022 02:28:09 -0700 Subject: powerpc/pseries/vas: Use QoS credits from the userspace The user can change the QoS credits dynamically with the management console interface which notifies OS with sysfs. After returning from the OS interface successfully, the management console updates the hypervisor. 
Since the VAS capabilities in the hypervisor are not updated when the OS gets the update, the kernel is using the old total credits value from the hypervisor. Fix this issue by using the new QoS credits from userspace instead of depending on VAS capabilities from the hypervisor. Signed-off-by: Haren Myneni Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/76d156f8af1e03cc09369d68e0bfad0c40031bcc.camel@linux.ibm.com --- arch/powerpc/platforms/pseries/vas-sysfs.c | 19 ++++++++++++++----- arch/powerpc/platforms/pseries/vas.c | 23 ++++++++++++----------- arch/powerpc/platforms/pseries/vas.h | 2 +- 3 files changed, 27 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/pseries/vas-sysfs.c b/arch/powerpc/platforms/pseries/vas-sysfs.c index 909535ca513a..ec65586cbeb3 100644 --- a/arch/powerpc/platforms/pseries/vas-sysfs.c +++ b/arch/powerpc/platforms/pseries/vas-sysfs.c @@ -27,22 +27,31 @@ struct vas_caps_entry { /* * This function is used to get the notification from the drmgr when - * QoS credits are changed. Though receiving the target total QoS - * credits here, get the official QoS capabilities from the hypervisor. + * QoS credits are changed. */ -static ssize_t update_total_credits_trigger(struct vas_cop_feat_caps *caps, +static ssize_t update_total_credits_store(struct vas_cop_feat_caps *caps, const char *buf, size_t count) { int err; u16 creds; err = kstrtou16(buf, 0, &creds); + /* + * The user space interface from the management console + * notifies OS with the new QoS credits and then the + * hypervisor. So OS has to use this new credits value + * and reconfigure VAS windows (close or reopen depends + * on the credits available) instead of depending on VAS + * QoS capabilities from the hypervisor. + */ if (!err) - err = vas_reconfig_capabilties(caps->win_type); + err = vas_reconfig_capabilties(caps->win_type, creds); if (err) return -EINVAL; + pr_info("Set QoS total credits %u\n", creds); + return count; } @@ -92,7 +101,7 @@ VAS_ATTR_RO(nr_total_credits); VAS_ATTR_RO(nr_used_credits); static struct vas_sysfs_entry update_total_credits_attribute = - __ATTR(update_total_credits, 0200, NULL, update_total_credits_trigger); + __ATTR(update_total_credits, 0200, NULL, update_total_credits_store); static struct attribute *vas_def_capab_attrs[] = { &nr_total_credits_attribute.attr, diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 1f59d78c77a1..ec643bbdb67f 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -779,10 +779,10 @@ static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds, * changes. Reconfig window configurations based on the credits * availability from this new capabilities.
*/ -int vas_reconfig_capabilties(u8 type) +int vas_reconfig_capabilties(u8 type, int new_nr_creds) { struct vas_cop_feat_caps *caps; - int old_nr_creds, new_nr_creds; + int old_nr_creds; struct vas_caps *vcaps; int rc = 0, nr_active_wins; @@ -795,12 +795,6 @@ int vas_reconfig_capabilties(u8 type) caps = &vcaps->caps; mutex_lock(&vas_pseries_mutex); - rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, vcaps->feat, - (u64)virt_to_phys(&hv_cop_caps)); - if (rc) - goto out; - - new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); old_nr_creds = atomic_read(&caps->nr_total_credits); @@ -832,7 +826,6 @@ int vas_reconfig_capabilties(u8 type) false); } -out: mutex_unlock(&vas_pseries_mutex); return rc; } @@ -850,7 +843,7 @@ static int pseries_vas_notifier(struct notifier_block *nb, struct of_reconfig_data *rd = data; struct device_node *dn = rd->dn; const __be32 *intserv = NULL; - int len, rc = 0; + int new_nr_creds, len, rc = 0; if ((action == OF_RECONFIG_ATTACH_NODE) || (action == OF_RECONFIG_DETACH_NODE)) @@ -862,7 +855,15 @@ static int pseries_vas_notifier(struct notifier_block *nb, if (!intserv) return NOTIFY_OK; - rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE); + rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, + vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, + (u64)virt_to_phys(&hv_cop_caps)); + if (!rc) { + new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); + rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, + new_nr_creds); + } + if (rc) pr_err("Failed reconfig VAS capabilities with DLPAR\n"); diff --git a/arch/powerpc/platforms/pseries/vas.h b/arch/powerpc/platforms/pseries/vas.h index 34177881e998..333ffa2f9f42 100644 --- a/arch/powerpc/platforms/pseries/vas.h +++ b/arch/powerpc/platforms/pseries/vas.h @@ -135,7 +135,7 @@ struct pseries_vas_window { }; int sysfs_add_vas_caps(struct vas_cop_feat_caps *caps); -int vas_reconfig_capabilties(u8 type); +int vas_reconfig_capabilties(u8 type, int new_nr_creds); int __init sysfs_pseries_vas_init(struct vas_all_caps *vas_caps); #ifdef CONFIG_PPC_VAS -- cgit v1.2.3 From 6d65028eb67dbb7627651adfc460d64196d38bd8 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 2 May 2022 22:50:10 +1000 Subject: powerpc/vdso: Fix incorrect CFI in gettimeofday.S As reported by Alan, the CFI (Call Frame Information) in the VDSO time routines is incorrect since commit ce7d8056e38b ("powerpc/vdso: Prepare for switching VDSO to generic C implementation."). DWARF has a concept called the CFA (Canonical Frame Address), which on powerpc is calculated as an offset from the stack pointer (r1). That means when the stack pointer is changed there must be a corresponding CFI directive to update the calculation of the CFA. The current code is missing those directives for the changes to r1, which prevents gdb from being able to generate a backtrace from inside VDSO functions, eg: Breakpoint 1, 0x00007ffff7f804dc in __kernel_clock_gettime () (gdb) bt #0 0x00007ffff7f804dc in __kernel_clock_gettime () #1 0x00007ffff7d8872c in clock_gettime@@GLIBC_2.17 () from /lib64/libc.so.6 #2 0x00007fffffffd960 in ?? () #3 0x00007ffff7d8872c in clock_gettime@@GLIBC_2.17 () from /lib64/libc.so.6 Backtrace stopped: frame did not save the PC Alan helpfully describes some rules for correctly maintaining the CFI information: 1) Every adjustment to the current frame address reg (ie. r1) must be described, and exactly at the instruction where r1 changes. Why? Because stack unwinding might want to access previous frames. 
2) If a function changes LR or any non-volatile register, the save location for those regs must be given. The CFI can be at any instruction after the saves up to the point that the reg is changed. (Exception: LR save should be described before a bl, not after.) 3) If asynchronous unwind info is needed then restores of LR and non-volatile regs must also be described. The CFI can be at any instruction after the reg is restored up to the point where the save location is (potentially) trashed. Fix the inability to backtrace by adding CFI directives describing the changes to r1, i.e. satisfying rule 1. Also change the information for LR to point to the copy saved on the stack, not the value in r0 that will be overwritten by the function call. Finally, add CFI directives describing the save/restore of r2. With the fix gdb can correctly back trace and navigate up and down the stack: Breakpoint 1, 0x00007ffff7f804dc in __kernel_clock_gettime () (gdb) bt #0 0x00007ffff7f804dc in __kernel_clock_gettime () #1 0x00007ffff7d8872c in clock_gettime@@GLIBC_2.17 () from /lib64/libc.so.6 #2 0x0000000100015b60 in gettime () #3 0x000000010000c8bc in print_long_format () #4 0x000000010000d180 in print_current_files () #5 0x00000001000054ac in main () (gdb) up #1 0x00007ffff7d8872c in clock_gettime@@GLIBC_2.17 () from /lib64/libc.so.6 (gdb) #2 0x0000000100015b60 in gettime () (gdb) #3 0x000000010000c8bc in print_long_format () (gdb) #4 0x000000010000d180 in print_current_files () (gdb) #5 0x00000001000054ac in main () (gdb) Initial frame selected; you cannot go up. (gdb) down #4 0x000000010000d180 in print_current_files () (gdb) #3 0x000000010000c8bc in print_long_format () (gdb) #2 0x0000000100015b60 in gettime () (gdb) #1 0x00007ffff7d8872c in clock_gettime@@GLIBC_2.17 () from /lib64/libc.so.6 (gdb) #0 0x00007ffff7f804dc in __kernel_clock_gettime () (gdb) Fixes: ce7d8056e38b ("powerpc/vdso: Prepare for switching VDSO to generic C implementation.") Cc: stable@vger.kernel.org # v5.11+ Reported-by: Alan Modra Signed-off-by: Michael Ellerman Reviewed-by: Segher Boessenkool Link: https://lore.kernel.org/r/20220502125010.1319370-1-mpe@ellerman.id.au --- arch/powerpc/kernel/vdso/gettimeofday.S | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/vdso/gettimeofday.S b/arch/powerpc/kernel/vdso/gettimeofday.S index eb9c81e1c218..0c4ecc8fec5a 100644 --- a/arch/powerpc/kernel/vdso/gettimeofday.S +++ b/arch/powerpc/kernel/vdso/gettimeofday.S @@ -22,12 +22,15 @@ .macro cvdso_call funct call_time=0 .cfi_startproc PPC_STLU r1, -PPC_MIN_STKFRM(r1) + .cfi_adjust_cfa_offset PPC_MIN_STKFRM mflr r0 - .cfi_register lr, r0 PPC_STLU r1, -PPC_MIN_STKFRM(r1) + .cfi_adjust_cfa_offset PPC_MIN_STKFRM PPC_STL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) + .cfi_rel_offset lr, PPC_MIN_STKFRM + PPC_LR_STKOFF #ifdef __powerpc64__ PPC_STL r2, PPC_MIN_STKFRM + STK_GOT(r1) + .cfi_rel_offset r2, PPC_MIN_STKFRM + STK_GOT #endif get_datapage r5 .ifeq \call_time @@ -39,13 +42,15 @@ PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) #ifdef __powerpc64__ PPC_LL r2, PPC_MIN_STKFRM + STK_GOT(r1) + .cfi_restore r2 #endif .ifeq \call_time cmpwi r3, 0 .endif mtlr r0 - .cfi_restore lr addi r1, r1, 2 * PPC_MIN_STKFRM + .cfi_restore lr + .cfi_def_cfa_offset 0 crclr so .ifeq \call_time beqlr+ -- cgit v1.2.3 From 59f5ede3bc0f00eb856425f636dab0c10feb06d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 1 May 2022 21:31:43 +0200 Subject: x86/fpu: Prevent FPU state corruption The FPU usage related to task
FPU management is either protected by disabling interrupts (switch_to, return to user) or via fpregs_lock() which is a wrapper around local_bh_disable(). When kernel code wants to use the FPU then it has to check whether it is possible by calling irq_fpu_usable(). But the condition in irq_fpu_usable() is wrong. It allows FPU to be used when: !in_interrupt() || interrupted_user_mode() || interrupted_kernel_fpu_idle() The latter is checking whether some other context already uses FPU in the kernel, but if that's not the case then it allows FPU to be used unconditionally even if the calling context interrupted a fpregs_lock() critical region. If that happens then the FPU state of the interrupted context becomes corrupted. Allow in kernel FPU usage only when no other context has in kernel FPU usage and either the calling context is not hard interrupt context or the hard interrupt did not interrupt a local bottomhalf disabled region. It's hard to find a proper Fixes tag as the condition was broken in one way or the other for a very long time and the eager/lazy FPU changes caused a lot of churn. Picked something remotely connected from the history. This survived undetected for quite some time as FPU usage in interrupt context is rare, but the recent changes to the random code unearthed it at least on a kernel which had FPU debugging enabled. There is probably a higher rate of silent corruption as not all issues can be detected by the FPU debugging code. This will be addressed in a subsequent change. Fixes: 5d2bd7009f30 ("x86, fpu: decouple non-lazy/eager fpu restore from xsave") Reported-by: Filipe Manana Signed-off-by: Thomas Gleixner Tested-by: Filipe Manana Reviewed-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220501193102.588689270@linutronix.de --- arch/x86/kernel/fpu/core.c | 67 ++++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 41 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c049561f373a..e28ab0ecc537 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -41,17 +41,7 @@ struct fpu_state_config fpu_user_cfg __ro_after_init; */ struct fpstate init_fpstate __ro_after_init; -/* - * Track whether the kernel is using the FPU state - * currently. - * - * This flag is used: - * - * - by IRQ context code to potentially use the FPU - * if it's unused. - * - * - to debug kernel_fpu_begin()/end() correctness - */ +/* Track in-kernel FPU usage */ static DEFINE_PER_CPU(bool, in_kernel_fpu); /* @@ -59,42 +49,37 @@ static DEFINE_PER_CPU(bool, in_kernel_fpu); */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -static bool kernel_fpu_disabled(void) -{ - return this_cpu_read(in_kernel_fpu); -} - -static bool interrupted_kernel_fpu_idle(void) -{ - return !kernel_fpu_disabled(); -} - -/* - * Were we in user mode (or vm86 mode) when we were - * interrupted? - * - * Doing kernel_fpu_begin/end() is ok if we are running - * in an interrupt context from user mode - we'll just - * save the FPU state as required. - */ -static bool interrupted_user_mode(void) -{ - struct pt_regs *regs = get_irq_regs(); - return regs && user_mode(regs); -} - /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? - * - * It's always ok in process context (ie "not interrupt") - * but it is sometimes ok even from an irq. 
*/ bool irq_fpu_usable(void) { - return !in_interrupt() || - interrupted_user_mode() || - interrupted_kernel_fpu_idle(); + if (WARN_ON_ONCE(in_nmi())) + return false; + + /* In kernel FPU usage already active? */ + if (this_cpu_read(in_kernel_fpu)) + return false; + + /* + * When not in NMI or hard interrupt context, FPU can be used in: + * + * - Task context except from within fpregs_lock()'ed critical + * regions. + * + * - Soft interrupt processing context which cannot happen + * while in a fpregs_lock()'ed critical region. + */ + if (!in_hardirq()) + return true; + + /* + * In hard interrupt context it's safe when soft interrupts + * are enabled, which means the interrupt did not hit in + * a fpregs_lock()'ed critical region. + */ + return !softirq_count(); } EXPORT_SYMBOL(irq_fpu_usable); -- cgit v1.2.3 From 348c71344111d7a48892e3e52264ff11956fc196 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Thu, 5 May 2022 21:04:51 +0530 Subject: powerpc/papr_scm: Fix buffer overflow issue with CONFIG_FORTIFY_SOURCE With CONFIG_FORTIFY_SOURCE enabled, string functions also perform dynamic checks on string size, which can panic the kernel, e.g. when an overflow is detected. In papr_scm, the papr_scm_pmu_check_events() function uses stat->stat_id with string operations to populate the nvdimm_events_map array. Since the stat_id variable is not NULL terminated, the kernel panics with CONFIG_FORTIFY_SOURCE enabled at boot time. Below are the logs of the kernel panic: detected buffer overflow in __fortify_strlen ------------[ cut here ]------------ kernel BUG at lib/string_helpers.c:980! Oops: Exception in kernel mode, sig: 5 [#1] NIP [c00000000077dad0] fortify_panic+0x28/0x38 LR [c00000000077dacc] fortify_panic+0x24/0x38 Call Trace: [c0000022d77836e0] [c00000000077dacc] fortify_panic+0x24/0x38 (unreliable) [c00800000deb2660] papr_scm_pmu_check_events.constprop.0+0x118/0x220 [papr_scm] [c00800000deb2cb0] papr_scm_probe+0x288/0x62c [papr_scm] [c0000000009b46a8] platform_probe+0x98/0x150 Fix this issue by using kmemdup_nul() to copy the content of stat->stat_id directly to the nvdimm_events_map array. mpe: stat->stat_id comes from the hypervisor, not userspace, so there is no security exposure.
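The crux of the fix in one line, for reference (kmemdup_nul() is the helper the diff below actually uses; the 8-byte bound matches the fixed-size stat_id field):

	/*
	 * stat->stat_id is a fixed 8-byte field with no guaranteed NUL, so
	 * strlen()/strcpy() can read past it -- exactly what FORTIFY_SOURCE
	 * traps.  kmemdup_nul() copies at most 8 bytes and appends '\0'.
	 */
	char *name = kmemdup_nul(stat->stat_id, 8, GFP_KERNEL);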
Fixes: 4c08d4bbc089 ("powerpc/papr_scm: Add perf interface support") Signed-off-by: Kajol Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220505153451.35503-1-kjain@linux.ibm.com --- arch/powerpc/platforms/pseries/papr_scm.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index f58728d5f10d..39962c905542 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -462,7 +462,6 @@ static int papr_scm_pmu_check_events(struct papr_scm_priv *p, struct nvdimm_pmu { struct papr_scm_perf_stat *stat; struct papr_scm_perf_stats *stats; - char *statid; int index, rc, count; u32 available_events; @@ -493,14 +492,12 @@ static int papr_scm_pmu_check_events(struct papr_scm_priv *p, struct nvdimm_pmu for (index = 0, stat = stats->scm_statistic, count = 0; index < available_events; index++, ++stat) { - statid = kzalloc(strlen(stat->stat_id) + 1, GFP_KERNEL); - if (!statid) { + p->nvdimm_events_map[count] = kmemdup_nul(stat->stat_id, 8, GFP_KERNEL); + if (!p->nvdimm_events_map[count]) { rc = -ENOMEM; goto out_nvdimm_events_map; } - strcpy(statid, stat->stat_id); - p->nvdimm_events_map[count] = statid; count++; } p->nvdimm_events_map[count] = NULL; -- cgit v1.2.3 From 0c2c7c069285374fc8feacddc0498f8ab7627117 Mon Sep 17 00:00:00 2001 From: Peter Gonda Date: Mon, 2 May 2022 09:58:07 -0700 Subject: KVM: SEV: Mark nested locking of vcpu->lock svm_vm_migrate_from() uses sev_lock_vcpus_for_migration() to lock all source and target vcpu->locks. Unfortunately there is an 8 subclass limit, so a new subclass cannot be used for each vCPU. Instead maintain ownership of the first vcpu's mutex.dep_map using a role specific subclass: source vs target. Release the other vcpu's mutex.dep_maps. Fixes: b56639318bb2b ("KVM: SEV: Add support for SEV intra host migration") Reported-by: John Sperbeck Suggested-by: David Rientjes Suggested-by: Sean Christopherson Suggested-by: Paolo Bonzini Cc: Hillf Danton Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Gonda Message-Id: <20220502165807.529624-1-pgonda@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0ad70c12c7c3..7c392873626f 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1594,24 +1594,51 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) atomic_set_release(&src_sev->migration_in_progress, 0); } +/* vCPU mutex subclasses. */ +enum sev_migration_role { + SEV_MIGRATION_SOURCE = 0, + SEV_MIGRATION_TARGET, + SEV_NR_MIGRATION_ROLES, +}; -static int sev_lock_vcpus_for_migration(struct kvm *kvm) +static int sev_lock_vcpus_for_migration(struct kvm *kvm, + enum sev_migration_role role) { struct kvm_vcpu *vcpu; unsigned long i, j; + bool first = true; kvm_for_each_vcpu(i, vcpu, kvm) { - if (mutex_lock_killable(&vcpu->mutex)) + if (mutex_lock_killable_nested(&vcpu->mutex, role)) goto out_unlock; + + if (first) { + /* + * Reset the role to one that avoids colliding with + * the role used for the first vcpu mutex. 
+ */ + role = SEV_NR_MIGRATION_ROLES; + first = false; + } else { + mutex_release(&vcpu->mutex.dep_map, _THIS_IP_); + } } return 0; out_unlock: + + first = true; kvm_for_each_vcpu(j, vcpu, kvm) { if (i == j) break; + if (first) + first = false; + else + mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_); + + mutex_unlock(&vcpu->mutex); } return -EINTR; @@ -1621,8 +1648,15 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm) { struct kvm_vcpu *vcpu; unsigned long i; + bool first = true; kvm_for_each_vcpu(i, vcpu, kvm) { + if (first) + first = false; + else + mutex_acquire(&vcpu->mutex.dep_map, + SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_); + mutex_unlock(&vcpu->mutex); } } @@ -1748,10 +1782,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) charged = true; } - ret = sev_lock_vcpus_for_migration(kvm); + ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE); if (ret) goto out_dst_cgroup; - ret = sev_lock_vcpus_for_migration(source_kvm); + ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET); if (ret) goto out_dst_vcpu; -- cgit v1.2.3 From 053d2290c0307e3642e75e0185ddadf084dc36c1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 May 2022 22:18:50 +0000 Subject: KVM: VMX: Exit to userspace if vCPU has injected exception and invalid state Exit to userspace with an emulation error if KVM encounters an injected exception with invalid guest state, in addition to the existing check of bailing if there's a pending exception (KVM doesn't support emulating exceptions except when emulating real mode via vm86). In theory, KVM should never get to such a situation as KVM is supposed to exit to userspace before injecting an exception with invalid guest state. But in practice, userspace can intervene and manually inject an exception and/or stuff registers to force invalid guest state while a previously injected exception is awaiting reinjection. Fixes: fc4fad79fc3d ("KVM: VMX: Reject KVM_RUN if emulation is required with pending exception") Reported-by: syzbot+cfafed3bb76d3e37581b@syzkaller.appspotmail.com Signed-off-by: Sean Christopherson Message-Id: <20220502221850.131873-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d58b763df855..610355b9ccce 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5472,7 +5472,7 @@ static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); return vmx->emulation_required && !vmx->rmode.vm86_active && - vcpu->arch.exception.pending; + (vcpu->arch.exception.pending || vcpu->arch.exception.injected); } static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 9dc4241bb14afecd16518a0760bceb3d7359b12a Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 7 May 2022 15:31:16 +0200 Subject: Revert "parisc: Mark cr16 CPU clocksource unstable on all SMP machines" This reverts commit afdb4a5b1d340e4afffc65daa21cc71890d7d589. It triggers RCU stalls at boot with a 32-bit kernel. 
From 053d2290c0307e3642e75e0185ddadf084dc36c1 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Mon, 2 May 2022 22:18:50 +0000
Subject: KVM: VMX: Exit to userspace if vCPU has injected exception and invalid state

Exit to userspace with an emulation error if KVM encounters an injected
exception with invalid guest state, in addition to the existing check
of bailing if there's a pending exception (KVM doesn't support emulating
exceptions except when emulating real mode via vm86).

In theory, KVM should never get to such a situation as KVM is supposed
to exit to userspace before injecting an exception with invalid guest
state. But in practice, userspace can intervene and manually inject an
exception and/or stuff registers to force invalid guest state while a
previously injected exception is awaiting reinjection.

Fixes: fc4fad79fc3d ("KVM: VMX: Reject KVM_RUN if emulation is required with pending exception")
Reported-by: syzbot+cfafed3bb76d3e37581b@syzkaller.appspotmail.com
Signed-off-by: Sean Christopherson
Message-Id: <20220502221850.131873-1-seanjc@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d58b763df855..610355b9ccce 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5472,7 +5472,7 @@ static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	return vmx->emulation_required && !vmx->rmode.vm86_active &&
-	       vcpu->arch.exception.pending;
+	       (vcpu->arch.exception.pending || vcpu->arch.exception.injected);
 }
 
 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)

-- cgit v1.2.3


From 9dc4241bb14afecd16518a0760bceb3d7359b12a Mon Sep 17 00:00:00 2001
From: Helge Deller
Date: Sat, 7 May 2022 15:31:16 +0200
Subject: Revert "parisc: Mark cr16 CPU clocksource unstable on all SMP machines"

This reverts commit afdb4a5b1d340e4afffc65daa21cc71890d7d589.

It triggers RCU stalls at boot with a 32-bit kernel.

Signed-off-by: Helge Deller
Noticed-by: John David Anglin
Cc: stable@vger.kernel.org # v5.16+
---
 arch/parisc/kernel/time.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index bb27dfeeddfc..95ee9e1a364b 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -251,16 +251,30 @@ void __init time_init(void)
 static int __init init_cr16_clocksource(void)
 {
 	/*
-	 * The cr16 interval timers are not syncronized across CPUs, even if
-	 * they share the same socket.
+	 * The cr16 interval timers are not syncronized across CPUs on
+	 * different sockets, so mark them unstable and lower rating on
+	 * multi-socket SMP systems.
 	 */
 	if (num_online_cpus() > 1 && !running_on_qemu) {
-		/* mark sched_clock unstable */
-		clear_sched_clock_stable();
-
-		clocksource_cr16.name = "cr16_unstable";
-		clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE;
-		clocksource_cr16.rating = 0;
+		int cpu;
+		unsigned long cpu0_loc;
+		cpu0_loc = per_cpu(cpu_data, 0).cpu_loc;
+
+		for_each_online_cpu(cpu) {
+			if (cpu == 0)
+				continue;
+			if ((cpu0_loc != 0) &&
+			    (cpu0_loc == per_cpu(cpu_data, cpu).cpu_loc))
+				continue;
+
+			/* mark sched_clock unstable */
+			clear_sched_clock_stable();
+
+			clocksource_cr16.name = "cr16_unstable";
+			clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE;
+			clocksource_cr16.rating = 0;
+			break;
+		}
 	}
 
 	/* register at clocksource framework */

-- cgit v1.2.3
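One way to see the effect of this revert on a running system is to ask
the kernel which clocksource it actually selected; a minimal userspace
sketch using the standard sysfs clocksource interface:

#include <stdio.h>

/*
 * Sketch: print the clocksource the kernel selected. On a parisc
 * machine this is "cr16" when cr16 is considered usable; once cr16
 * has been marked unstable, the kernel falls back to another source.
 */
int main(void)
{
	const char *path = "/sys/devices/system/clocksource/"
			   "clocksource0/current_clocksource";
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f || !fgets(buf, sizeof(buf), f)) {
		perror("current_clocksource");
		return 1;
	}
	printf("current clocksource: %s", buf);
	fclose(f);
	return 0;
}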
From 7962c0896429af2a0e00ec6bc15d992536453b2d Mon Sep 17 00:00:00 2001
From: Helge Deller
Date: Sat, 7 May 2022 15:32:38 +0200
Subject: Revert "parisc: Mark sched_clock unstable only if clocks are not syncronized"

This reverts commit d97180ad68bdb7ee10f327205a649bc2f558741d.

It triggers RCU stalls at boot with a 32-bit kernel.

Signed-off-by: Helge Deller
Noticed-by: John David Anglin
Cc: stable@vger.kernel.org # v5.15+
---
 arch/parisc/kernel/setup.c | 2 ++
 arch/parisc/kernel/time.c  | 7 ++++---
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c
index b91cb45ffd4e..f005ddedb50e 100644
--- a/arch/parisc/kernel/setup.c
+++ b/arch/parisc/kernel/setup.c
@@ -161,6 +161,8 @@ void __init setup_arch(char **cmdline_p)
 #ifdef CONFIG_PA11
 	dma_ops_init();
 #endif
+
+	clear_sched_clock_stable();
 }
 
 /*
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 95ee9e1a364b..19c31a72fe76 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -267,9 +267,6 @@ static int __init init_cr16_clocksource(void)
 			    (cpu0_loc == per_cpu(cpu_data, cpu).cpu_loc))
 				continue;
 
-			/* mark sched_clock unstable */
-			clear_sched_clock_stable();
-
 			clocksource_cr16.name = "cr16_unstable";
 			clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE;
 			clocksource_cr16.rating = 0;
@@ -277,6 +274,10 @@ static int __init init_cr16_clocksource(void)
 		}
 	}
 
+	/* XXX: We may want to mark sched_clock stable here if cr16 clocks are
+	 * in sync:
+	 * (clocksource_cr16.flags == CLOCK_SOURCE_IS_CONTINUOUS) */
+
 	/* register at clocksource framework */
 	clocksource_register_hz(&clocksource_cr16, 100 * PAGE0->mem_10msec);

-- cgit v1.2.3


From 6c800d7f55fcd78e17deae5ae4374d8e73482c13 Mon Sep 17 00:00:00 2001
From: Helge Deller
Date: Sun, 8 May 2022 10:18:40 +0200
Subject: Revert "parisc: Fix patch code locking and flushing"

This reverts commit a9fe7fa7d874a536e0540469f314772c054a0323.

It leads to segfaults on 32-bit kernels.

Signed-off-by: Helge Deller
---
 arch/parisc/kernel/patch.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/parisc/kernel/patch.c b/arch/parisc/kernel/patch.c
index e59574f65e64..80a0ab372802 100644
--- a/arch/parisc/kernel/patch.c
+++ b/arch/parisc/kernel/patch.c
@@ -40,7 +40,10 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags,
 	*need_unmap = 1;
 	set_fixmap(fixmap, page_to_phys(page));
-	raw_spin_lock_irqsave(&patch_lock, *flags);
+	if (flags)
+		raw_spin_lock_irqsave(&patch_lock, *flags);
+	else
+		__acquire(&patch_lock);
 
 	return (void *) (__fix_to_virt(fixmap) + (uintaddr & ~PAGE_MASK));
 }
@@ -49,7 +52,10 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags)
 {
 	clear_fixmap(fixmap);
 
-	raw_spin_unlock_irqrestore(&patch_lock, *flags);
+	if (flags)
+		raw_spin_unlock_irqrestore(&patch_lock, *flags);
+	else
+		__release(&patch_lock);
 }
 
 void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len)
@@ -61,9 +67,8 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len)
 	int mapped;
 
 	/* Make sure we don't have any aliases in cache */
-	flush_kernel_dcache_range_asm(start, end);
-	flush_kernel_icache_range_asm(start, end);
-	flush_tlb_kernel_range(start, end);
+	flush_kernel_vmap_range(addr, len);
+	flush_icache_range(start, end);
 
 	p = fixmap = patch_map(addr, FIX_TEXT_POKE0, &flags, &mapped);
 
@@ -76,10 +81,8 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len)
 			 * We're crossing a page boundary, so
 			 * need to remap
 			 */
-			flush_kernel_dcache_range_asm((unsigned long)fixmap,
-						      (unsigned long)p);
-			flush_tlb_kernel_range((unsigned long)fixmap,
-					       (unsigned long)p);
+			flush_kernel_vmap_range((void *)fixmap,
+						(p-fixmap) * sizeof(*p));
 			if (mapped)
 				patch_unmap(FIX_TEXT_POKE0, &flags);
 			p = fixmap = patch_map(addr, FIX_TEXT_POKE0, &flags,
@@ -87,10 +90,10 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len)
 		}
 	}
 
-	flush_kernel_dcache_range_asm((unsigned long)fixmap, (unsigned long)p);
-	flush_tlb_kernel_range((unsigned long)fixmap, (unsigned long)p);
+	flush_kernel_vmap_range((void *)fixmap, (p-fixmap) * sizeof(*p));
 	if (mapped)
 		patch_unmap(FIX_TEXT_POKE0, &flags);
+	flush_icache_range(start, end);
 }
 
 void __kprobes __patch_text(void *addr, u32 insn)

-- cgit v1.2.3
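The restored patch_map()/patch_unmap() pair relies on a convention that
is easy to miss: a NULL flags pointer means the caller already holds
patch_lock, and the sparse-only annotations __acquire()/__release() keep
static analysis balanced without generating any code. A stripped-down
sketch of that pattern; example_enter()/example_exit() and example_lock
are illustrative names, not part of the patch:

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);	/* stand-in for patch_lock */

/*
 * Sketch of the optional-locking convention: pass a flags pointer to
 * take the lock, or NULL if it is already held by the caller.
 * __acquire()/__release() only inform sparse; they emit no code.
 */
static void example_enter(unsigned long *flags)
{
	if (flags)
		raw_spin_lock_irqsave(&example_lock, *flags);
	else
		__acquire(&example_lock);
}

static void example_exit(unsigned long *flags)
{
	if (flags)
		raw_spin_unlock_irqrestore(&example_lock, *flags);
	else
		__release(&example_lock);
}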
From 0921244f6f4f0d05698b953fe632a99b38907226 Mon Sep 17 00:00:00 2001
From: Helge Deller
Date: Fri, 1 Apr 2022 09:19:11 +0200
Subject: parisc: Only list existing CPUs in cpu_possible_mask

The inventory knows which CPUs are in the system, so cpu_possible_mask
should be built from that information instead of from a bitmask based
on CONFIG_NR_CPUS. Reset cpu_possible_mask before scanning the system
for CPUs, and mark each existing CPU as possible during initialization
of that CPU.

This also avoids warnings like the following later on:
register_cpu_capacity_sysctl: too early to get CPU4 device!

Signed-off-by: Helge Deller
Noticed-by: John David Anglin
---
 arch/parisc/kernel/processor.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch')

diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c
index d98692115221..9e92b76b0ce0 100644
--- a/arch/parisc/kernel/processor.c
+++ b/arch/parisc/kernel/processor.c
@@ -171,6 +171,7 @@ static int __init processor_probe(struct parisc_device *dev)
 	p->cpu_num = cpu_info.cpu_num;
 	p->cpu_loc = cpu_info.cpu_loc;
 
+	set_cpu_possible(cpuid, true);
 	store_cpu_topology(cpuid);
 
 #ifdef CONFIG_SMP
@@ -461,6 +462,13 @@ static struct parisc_driver cpu_driver __refdata = {
  */
 void __init processor_init(void)
 {
+	unsigned int cpu;
+	reset_cpu_topology();
+
+	/* reset possible mask. We will mark those which are possible. */
+	for_each_possible_cpu(cpu)
+		set_cpu_possible(cpu, false);
+
 	register_parisc_driver(&cpu_driver);
 }

-- cgit v1.2.3
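For background on why an oversized cpu_possible_mask is worth fixing:
generic code sizes and initializes per-CPU state by walking that mask,
so every spuriously "possible" CPU costs memory and provokes
registration attempts like the warning quoted above. A hedged sketch of
a typical consumer; example_percpu_setup() is an illustrative name:

#include <linux/cpumask.h>
#include <linux/printk.h>

/*
 * Illustrative consumer of cpu_possible_mask: generic code commonly
 * walks every possible CPU to set up per-CPU state. If the mask holds
 * CONFIG_NR_CPUS bits instead of the CPUs the inventory actually
 * found, all of this work (and memory) is wasted on phantom CPUs.
 */
static int __init example_percpu_setup(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu)
		pr_info("setting up per-CPU state for cpu %u\n", cpu);

	return 0;
}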
Reported-by: Guenter Roeck
Tested-by: Guenter Roeck
Fixes: 62773112acc5 ("parisc: Switch from GENERIC_CPU_DEVICES to GENERIC_ARCH_TOPOLOGY")
Signed-off-by: Helge Deller
---
 arch/parisc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 52e550b45692..bd22578859d0 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -38,6 +38,7 @@ config PARISC
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_SMP_IDLE_THREAD
 	select GENERIC_ARCH_TOPOLOGY if SMP
+	select GENERIC_CPU_DEVICES if !SMP
 	select GENERIC_LIB_DEVMEM_IS_ALLOWED
 	select SYSCTL_ARCH_UNALIGN_ALLOW
 	select SYSCTL_EXCEPTION_TRACE

-- cgit v1.2.3


From 5b89966bc96a06f6ad65f64ae4b0461918fcc9d3 Mon Sep 17 00:00:00 2001
From: Helge Deller
Date: Sun, 3 Apr 2022 21:57:51 +0200
Subject: parisc: Merge model and model name into one line in /proc/cpuinfo

The Linux tool "lscpu" reports twice the actual number of CPUs if
"model" and "model name" appear on two separate lines in /proc/cpuinfo.
This change combines the model and the model name into one line.

Signed-off-by: Helge Deller
Cc: stable@vger.kernel.org
---
 arch/parisc/kernel/processor.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c
index 9e92b76b0ce0..26eb568f8b96 100644
--- a/arch/parisc/kernel/processor.c
+++ b/arch/parisc/kernel/processor.c
@@ -420,8 +420,7 @@ show_cpuinfo (struct seq_file *m, void *v)
 		}
 		seq_printf(m, " (0x%02lx)\n", boot_cpu_data.pdc.capabilities);
 
-		seq_printf(m, "model\t\t: %s\n"
-				"model name\t: %s\n",
+		seq_printf(m, "model\t\t: %s - %s\n",
 			 boot_cpu_data.pdc.sys_model_name,
 			 cpuinfo->dev ?
 			 cpuinfo->dev->name : "Unknown");

-- cgit v1.2.3


From 234ff4c585d704896450a3634a7c29fa4e1907e1 Mon Sep 17 00:00:00 2001
From: Helge Deller
Date: Tue, 5 Apr 2022 21:28:37 +0200
Subject: parisc: Change MAX_ADDRESS to become unsigned long long

Dave noticed that for the 32-bit kernel MAX_ADDRESS should be a ULL,
otherwise this define evaluates to 0:
MAX_ADDRESS (1UL << MAX_ADDRBITS)

It has no real effect on the kernel.

Signed-off-by: Helge Deller
Noticed-by: John David Anglin
---
 arch/parisc/include/asm/pgtable.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 939db6fe620b..69765a6dbe89 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -160,7 +160,7 @@ extern void __update_cache(pte_t pte);
 #define SPACEID_SHIFT	(MAX_ADDRBITS - 32)
 #else
 #define MAX_ADDRBITS	(BITS_PER_LONG)
-#define MAX_ADDRESS	(1UL << MAX_ADDRBITS)
+#define MAX_ADDRESS	(1ULL << MAX_ADDRBITS)
 #define SPACEID_SHIFT	0
 #endif

-- cgit v1.2.3
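The underlying C rule in the MAX_ADDRESS fix above: shifting a value by
at least the width of its type is undefined behavior, and with a 32-bit
unsigned long the expression 1UL << 32 came out as 0 here. Widening the
constant to unsigned long long keeps the shift within the (64-bit) type
width; a small standalone illustration:

#include <stdio.h>

int main(void)
{
	/*
	 * On a target where unsigned long is 32 bits, 1UL << 32 shifts
	 * by the full type width, which C leaves undefined; in this
	 * case it collapsed to 0, so MAX_ADDRESS was 0. The ULL form
	 * shifts a 64-bit constant by 32, which is well defined.
	 */
	unsigned long long ok = 1ULL << 32;	/* 4294967296 */

	printf("1ULL << 32 = %llu\n", ok);
	return 0;
}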
From a65bcad5421507c2f6c52e1e2ca6a6ce02fd1ad6 Mon Sep 17 00:00:00 2001
From: Julia Lawall
Date: Sat, 30 Apr 2022 21:07:18 +0200
Subject: parisc: Fix typos in comments

Various spelling mistakes in comments.
Detected with the help of Coccinelle.

Signed-off-by: Julia Lawall
Signed-off-by: Helge Deller
---
 arch/parisc/kernel/kprobes.c | 2 +-
 arch/parisc/kernel/traps.c   | 2 +-
 arch/parisc/math-emu/dfadd.c | 2 +-
 arch/parisc/math-emu/dfsub.c | 2 +-
 arch/parisc/math-emu/sfadd.c | 2 +-
 arch/parisc/math-emu/sfsub.c | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/parisc/kernel/kprobes.c b/arch/parisc/kernel/kprobes.c
index 3343d2fb7889..6e0b86652f30 100644
--- a/arch/parisc/kernel/kprobes.c
+++ b/arch/parisc/kernel/kprobes.c
@@ -152,7 +152,7 @@ int __kprobes parisc_kprobe_ss_handler(struct pt_regs *regs)
 	/* for absolute branch instructions we can copy iaoq_b. for relative
 	 * branch instructions we need to calculate the new address based on the
 	 * difference between iaoq_f and iaoq_b. We cannot use iaoq_b without
-	 * modificationt because it's based on our ainsn.insn address.
+	 * modifications because it's based on our ainsn.insn address.
 	 */
 
 	if (p->post_handler)
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index a6e61cf2cad0..b78f1b9d45c1 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -469,7 +469,7 @@ void parisc_terminate(char *msg, struct pt_regs *regs, int code, unsigned long o
 	 * panic notifiers, and we should call panic
 	 * directly from the location that we wish.
 	 * e.g. We should not call panic from
-	 * parisc_terminate, but rather the oter way around.
+	 * parisc_terminate, but rather the other way around.
 	 * This hack works, prints the panic message twice,
 	 * and it enables reboot timers!
 	 */
diff --git a/arch/parisc/math-emu/dfadd.c b/arch/parisc/math-emu/dfadd.c
index ec487e07f004..00e561d4aa55 100644
--- a/arch/parisc/math-emu/dfadd.c
+++ b/arch/parisc/math-emu/dfadd.c
@@ -253,7 +253,7 @@ dbl_fadd(
 	    return(NOEXCEPTION);
 	    }
 	right_exponent = 1;	/* Set exponent to reflect different bias
-				 * with denomalized numbers. */
+				 * with denormalized numbers. */
 	}
     else
 	{
diff --git a/arch/parisc/math-emu/dfsub.c b/arch/parisc/math-emu/dfsub.c
index c4f30acf2d48..4f03782284bd 100644
--- a/arch/parisc/math-emu/dfsub.c
+++ b/arch/parisc/math-emu/dfsub.c
@@ -256,7 +256,7 @@ dbl_fsub(
 	    return(NOEXCEPTION);
 	    }
 	right_exponent = 1;	/* Set exponent to reflect different bias
-				 * with denomalized numbers. */
+				 * with denormalized numbers. */
 	}
     else
 	{
diff --git a/arch/parisc/math-emu/sfadd.c b/arch/parisc/math-emu/sfadd.c
index 838758279d5b..9b98c874dfac 100644
--- a/arch/parisc/math-emu/sfadd.c
+++ b/arch/parisc/math-emu/sfadd.c
@@ -249,7 +249,7 @@ sgl_fadd(
 	    return(NOEXCEPTION);
 	    }
 	right_exponent = 1;	/* Set exponent to reflect different bias
-				 * with denomalized numbers. */
+				 * with denormalized numbers. */
 	}
     else
 	{
diff --git a/arch/parisc/math-emu/sfsub.c b/arch/parisc/math-emu/sfsub.c
index 583d3ace4634..29d9eed09d12 100644
--- a/arch/parisc/math-emu/sfsub.c
+++ b/arch/parisc/math-emu/sfsub.c
@@ -252,7 +252,7 @@ sgl_fsub(
 	    return(NOEXCEPTION);
 	    }
 	right_exponent = 1;	/* Set exponent to reflect different bias
-				 * with denomalized numbers. */
+				 * with denormalized numbers. */
 	}
     else
 	{

-- cgit v1.2.3
From 340233dcc0160aafcce46ca893d1679f16acf409 Mon Sep 17 00:00:00 2001
From: Helge Deller
Date: Sun, 8 May 2022 18:25:00 +0200
Subject: parisc: Mark cr16 clock unstable on all SMP machines

The cr16 interval timers are not synchronized across CPUs, even with
just one dual-core CPU. This becomes visible once the machine has been
up for a while.

Signed-off-by: Helge Deller
---
 arch/parisc/kernel/time.c | 27 ++++-----------------------
 1 file changed, 4 insertions(+), 23 deletions(-)

(limited to 'arch')

diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 19c31a72fe76..9714fbd7c42d 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -251,33 +251,14 @@ void __init time_init(void)
 static int __init init_cr16_clocksource(void)
 {
 	/*
-	 * The cr16 interval timers are not syncronized across CPUs on
-	 * different sockets, so mark them unstable and lower rating on
-	 * multi-socket SMP systems.
+	 * The cr16 interval timers are not synchronized across CPUs.
 	 */
 	if (num_online_cpus() > 1 && !running_on_qemu) {
-		int cpu;
-		unsigned long cpu0_loc;
-		cpu0_loc = per_cpu(cpu_data, 0).cpu_loc;
-
-		for_each_online_cpu(cpu) {
-			if (cpu == 0)
-				continue;
-			if ((cpu0_loc != 0) &&
-			    (cpu0_loc == per_cpu(cpu_data, cpu).cpu_loc))
-				continue;
-
-			clocksource_cr16.name = "cr16_unstable";
-			clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE;
-			clocksource_cr16.rating = 0;
-			break;
-		}
+		clocksource_cr16.name = "cr16_unstable";
+		clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE;
+		clocksource_cr16.rating = 0;
 	}
 
-	/* XXX: We may want to mark sched_clock stable here if cr16 clocks are
-	 * in sync:
-	 * (clocksource_cr16.flags == CLOCK_SOURCE_IS_CONTINUOUS) */
-
 	/* register at clocksource framework */
 	clocksource_register_hz(&clocksource_cr16, 100 * PAGE0->mem_10msec);

-- cgit v1.2.3


From ba0c04104082ca211e108dd8eec6db2ad7676528 Mon Sep 17 00:00:00 2001
From: Helge Deller
Date: Sun, 8 May 2022 19:55:13 +0200
Subject: Revert "parisc: Increase parisc_cache_flush_threshold setting"

This reverts commit a58e9d0984e8dad53f17ec73ae3c1cc7f8d88151.

It triggers segfaults with 32-bit kernels on PA8500 machines.

Signed-off-by: Helge Deller
---
 arch/parisc/kernel/cache.c | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

(limited to 'arch')

diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index 23348199f3f8..e7911225a4f8 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -403,7 +403,7 @@ void __init parisc_setup_cache_timing(void)
 {
 	unsigned long rangetime, alltime;
 	unsigned long size;
-	unsigned long threshold, threshold2;
+	unsigned long threshold;
 
 	alltime = mfctl(16);
 	flush_data_cache();
@@ -418,20 +418,8 @@ void __init parisc_setup_cache_timing(void)
 		alltime, size, rangetime);
 
 	threshold = L1_CACHE_ALIGN(size * alltime / rangetime);
-
-	/*
-	 * The threshold computed above isn't very reliable since the
-	 * flush times depend greatly on the percentage of dirty lines
-	 * in the flush range. Further, the whole cache time doesn't
-	 * include the time to refill lines that aren't in the mm/vma
-	 * being flushed. By timing glibc build and checks on mako cpus,
-	 * the following formula seems to work reasonably well. The
-	 * value from the timing calculation is too small, and increases
-	 * build and check times by almost a factor two.
-	 */
-	threshold2 = cache_info.dc_size * num_online_cpus();
-	if (threshold2 > threshold)
-		threshold = threshold2;
+	if (threshold > cache_info.dc_size)
+		threshold = cache_info.dc_size;
 	if (threshold)
 		parisc_cache_flush_threshold = threshold;
 	printk(KERN_INFO "Cache flush threshold set to %lu KiB\n",

-- cgit v1.2.3
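For context on the calibration whose tuning is reverted above: parisc
times a whole-cache flush against a range flush using the free-running
cr16 counter (read via mfctl(16)) and derives the break-even threshold
from the ratio. A hedged sketch of the measurement pattern;
do_full_flush() and do_range_flush() are illustrative stand-ins for the
real flush primitives, not kernel API:

#include <linux/cache.h>	/* L1_CACHE_ALIGN() */
#include <asm/special_insns.h>	/* mfctl() on parisc */

/* Illustrative stand-ins for the real parisc flush primitives. */
extern void do_full_flush(void);
extern void do_range_flush(unsigned long start, unsigned long end);

/*
 * Sketch of the calibration pattern used by parisc_setup_cache_timing():
 * time a whole-cache flush and a range flush with the cr16 cycle
 * counter, then compute the size at which flushing the entire cache
 * becomes cheaper than flushing by address range.
 */
static unsigned long measure_flush_threshold(unsigned long size)
{
	unsigned long alltime, rangetime;

	alltime = mfctl(16);			/* cr16 snapshot */
	do_full_flush();
	alltime = mfctl(16) - alltime;

	rangetime = mfctl(16);
	do_range_flush(0, size);
	rangetime = mfctl(16) - rangetime;

	/* below this many bytes, a range flush is the cheaper option */
	return L1_CACHE_ALIGN(size * alltime / rangetime);
}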