diff options
Diffstat (limited to 'arch/x86')
132 files changed, 2591 insertions, 2074 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2852fcd82cbd..a3c31b784edc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -93,6 +93,7 @@ config X86 select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_PREEMPT_LAZY select ARCH_HAS_PTE_DEVMAP if X86_64 select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_HW_PTE_YOUNG @@ -145,7 +146,6 @@ config X86 select ARCH_HAS_PARANOID_L1D_FLUSH select BUILDTIME_TABLE_SORT select CLKEVT_I8253 - select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CLOCKSOURCE_WATCHDOG # Word-size accesses may read uninitialized data past the trailing \0 # in strings and cause false KMSAN reports. @@ -1954,6 +1954,7 @@ config X86_USER_SHADOW_STACK depends on AS_WRUSS depends on X86_64 select ARCH_USES_HIGH_VMA_FLAGS + select ARCH_HAS_USER_SHADOW_STACK select X86_CET help Shadow stack protection is a hardware feature that detects function @@ -2084,6 +2085,9 @@ config ARCH_SUPPORTS_KEXEC_JUMP config ARCH_SUPPORTS_CRASH_DUMP def_bool X86_64 || (X86_32 && HIGHMEM) +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + config ARCH_SUPPORTS_CRASH_HOTPLUG def_bool y @@ -2257,6 +2261,7 @@ config RANDOMIZE_MEMORY_PHYSICAL_PADDING config ADDRESS_MASKING bool "Linear Address Masking support" depends on X86_64 + depends on COMPILE_TEST || !CPU_MITIGATIONS # wait for LASS help Linear Address Masking (LAM) modifies the checking that is applied to 64-bit linear addresses, allowing software to use of the @@ -2423,6 +2428,14 @@ config CFI_AUTO_DEFAULT source "kernel/livepatch/Kconfig" +config X86_BUS_LOCK_DETECT + bool "Split Lock Detect and Bus Lock Detect support" + depends on CPU_SUP_INTEL || CPU_SUP_AMD + default y + help + Enable Split Lock Detect and Bus Lock Detect functionalities. + See <file:Documentation/arch/x86/buslock.rst> for more information. + endmenu config CC_HAS_NAMED_AS diff --git a/arch/x86/Makefile b/arch/x86/Makefile index cd75e78a06c1..5b773b34768d 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -142,9 +142,10 @@ ifeq ($(CONFIG_X86_32),y) ifeq ($(CONFIG_STACKPROTECTOR),y) ifeq ($(CONFIG_SMP),y) - KBUILD_CFLAGS += -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard + KBUILD_CFLAGS += -mstack-protector-guard-reg=fs \ + -mstack-protector-guard-symbol=__ref_stack_chk_guard else - KBUILD_CFLAGS += -mstack-protector-guard=global + KBUILD_CFLAGS += -mstack-protector-guard=global endif endif else diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 148ba5c5106e..0f24f7ebec9b 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -305,7 +305,6 @@ void initregs(struct biosregs *regs); int strcmp(const char *str1, const char *str2); int strncmp(const char *cs, const char *ct, size_t count); size_t strnlen(const char *s, size_t maxlen); -unsigned int atou(const char *s); unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); size_t strlen(const char *s); char *strchr(const char *s, int c); diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 04a35b2c26e9..0d37420cad02 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -385,6 +385,19 @@ static void parse_mem_encrypt(struct setup_header *hdr) hdr->xloadflags |= XLF_MEM_ENCRYPTION; } +static void early_sev_detect(void) +{ + /* + * Accessing video memory causes guest termination because + * the boot stage2 #VC handler of SEV-ES/SNP guests does not + * support MMIO handling and kexec -c adds screen_info to the + * boot parameters passed to the kexec kernel, which causes + * console output to be dumped to both video and serial. + */ + if (sev_status & MSR_AMD64_SEV_ES_ENABLED) + lines = cols = 0; +} + /* * The compressed kernel image (ZO), has been moved so that its position * is against the end of the buffer used to hold the uncompressed kernel @@ -440,6 +453,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) */ early_tdx_detect(); + early_sev_detect(); + console_init(); /* diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index c23f3b9c84fe..84f7a883ce1e 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -88,14 +88,6 @@ size_t strnlen(const char *s, size_t maxlen) return (es - s); } -unsigned int atou(const char *s) -{ - unsigned int i = 0; - while (isdigit(*s)) - i = i * 10 + (*s++ - '0'); - return i; -} - /* Works only for digits and letters, but small and fast */ #define TOLOWER(x) ((x) | 0x20) diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h index e5d2c6b8c2f1..a5b05ebc037d 100644 --- a/arch/x86/boot/string.h +++ b/arch/x86/boot/string.h @@ -24,7 +24,6 @@ extern size_t strlen(const char *s); extern char *strstr(const char *s1, const char *s2); extern char *strchr(const char *s, int c); extern size_t strnlen(const char *s, size_t maxlen); -extern unsigned int atou(const char *s); extern unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); long simple_strtol(const char *cp, char **endp, unsigned int base); diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index de1df0cb45da..c5b0148b8c0a 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -92,6 +92,9 @@ static struct ghcb *boot_ghcb __section(".data"); /* Bitmap of SEV features supported by the hypervisor */ static u64 sev_hv_features __ro_after_init; +/* Secrets page physical address from the CC blob */ +static u64 secrets_pa __ro_after_init; + /* #VC handler runtime per-CPU data */ struct sev_es_runtime_data { struct ghcb ghcb_page; @@ -141,33 +144,6 @@ static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); static DEFINE_PER_CPU(struct svsm_ca *, svsm_caa); static DEFINE_PER_CPU(u64, svsm_caa_pa); -struct sev_config { - __u64 debug : 1, - - /* - * Indicates when the per-CPU GHCB has been created and registered - * and thus can be used by the BSP instead of the early boot GHCB. - * - * For APs, the per-CPU GHCB is created before they are started - * and registered upon startup, so this flag can be used globally - * for the BSP and APs. - */ - ghcbs_initialized : 1, - - /* - * Indicates when the per-CPU SVSM CA is to be used instead of the - * boot SVSM CA. - * - * For APs, the per-CPU SVSM CA is created as part of the AP - * bringup, so this flag can be used globally for the BSP and APs. - */ - use_cas : 1, - - __reserved : 61; -}; - -static struct sev_config sev_cfg __read_mostly; - static __always_inline bool on_vc_stack(struct pt_regs *regs) { unsigned long sp = regs->sp; @@ -722,45 +698,13 @@ void noinstr __sev_es_nmi_complete(void) __sev_put_ghcb(&state); } -static u64 __init get_secrets_page(void) -{ - u64 pa_data = boot_params.cc_blob_address; - struct cc_blob_sev_info info; - void *map; - - /* - * The CC blob contains the address of the secrets page, check if the - * blob is present. - */ - if (!pa_data) - return 0; - - map = early_memremap(pa_data, sizeof(info)); - if (!map) { - pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n"); - return 0; - } - memcpy(&info, map, sizeof(info)); - early_memunmap(map, sizeof(info)); - - /* smoke-test the secrets page passed */ - if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) - return 0; - - return info.secrets_phys; -} - static u64 __init get_snp_jump_table_addr(void) { struct snp_secrets_page *secrets; void __iomem *mem; - u64 pa, addr; + u64 addr; - pa = get_secrets_page(); - if (!pa) - return 0; - - mem = ioremap_encrypted(pa, PAGE_SIZE); + mem = ioremap_encrypted(secrets_pa, PAGE_SIZE); if (!mem) { pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); return 0; @@ -1010,6 +954,137 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end) set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); } +static void set_pte_enc(pte_t *kpte, int level, void *va) +{ + struct pte_enc_desc d = { + .kpte = kpte, + .pte_level = level, + .va = va, + .encrypt = true + }; + + prepare_pte_enc(&d); + set_pte_enc_mask(kpte, d.pfn, d.new_pgprot); +} + +static void unshare_all_memory(void) +{ + unsigned long addr, end, size, ghcb; + struct sev_es_runtime_data *data; + unsigned int npages, level; + bool skipped_addr; + pte_t *pte; + int cpu; + + /* Unshare the direct mapping. */ + addr = PAGE_OFFSET; + end = PAGE_OFFSET + get_max_mapped(); + + while (addr < end) { + pte = lookup_address(addr, &level); + size = page_level_size(level); + npages = size / PAGE_SIZE; + skipped_addr = false; + + if (!pte || !pte_decrypted(*pte) || pte_none(*pte)) { + addr += size; + continue; + } + + /* + * Ensure that all the per-CPU GHCBs are made private at the + * end of the unsharing loop so that the switch to the slower + * MSR protocol happens last. + */ + for_each_possible_cpu(cpu) { + data = per_cpu(runtime_data, cpu); + ghcb = (unsigned long)&data->ghcb_page; + + if (addr <= ghcb && ghcb <= addr + size) { + skipped_addr = true; + break; + } + } + + if (!skipped_addr) { + set_pte_enc(pte, level, (void *)addr); + snp_set_memory_private(addr, npages); + } + addr += size; + } + + /* Unshare all bss decrypted memory. */ + addr = (unsigned long)__start_bss_decrypted; + end = (unsigned long)__start_bss_decrypted_unused; + npages = (end - addr) >> PAGE_SHIFT; + + for (; addr < end; addr += PAGE_SIZE) { + pte = lookup_address(addr, &level); + if (!pte || !pte_decrypted(*pte) || pte_none(*pte)) + continue; + + set_pte_enc(pte, level, (void *)addr); + } + addr = (unsigned long)__start_bss_decrypted; + snp_set_memory_private(addr, npages); + + __flush_tlb_all(); +} + +/* Stop new private<->shared conversions */ +void snp_kexec_begin(void) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; + + /* + * Crash kernel ends up here with interrupts disabled: can't wait for + * conversions to finish. + * + * If race happened, just report and proceed. + */ + if (!set_memory_enc_stop_conversion()) + pr_warn("Failed to stop shared<->private conversions\n"); +} + +void snp_kexec_finish(void) +{ + struct sev_es_runtime_data *data; + unsigned int level, cpu; + unsigned long size; + struct ghcb *ghcb; + pte_t *pte; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; + + unshare_all_memory(); + + /* + * Switch to using the MSR protocol to change per-CPU GHCBs to + * private. All the per-CPU GHCBs have been switched back to private, + * so can't do any more GHCB calls to the hypervisor beyond this point + * until the kexec'ed kernel starts running. + */ + boot_ghcb = NULL; + sev_cfg.ghcbs_initialized = false; + + for_each_possible_cpu(cpu) { + data = per_cpu(runtime_data, cpu); + ghcb = &data->ghcb_page; + pte = lookup_address((unsigned long)ghcb, &level); + size = page_level_size(level); + set_pte_enc(pte, level, (void *)ghcb); + snp_set_memory_private((unsigned long)ghcb, (size / PAGE_SIZE)); + } +} + static int snp_set_vmsa(void *va, void *caa, int apic_id, bool make_vmsa) { int ret; @@ -1331,35 +1406,39 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd) return 0; } +/* Writes to the SVSM CAA MSR are ignored */ +static enum es_result __vc_handle_msr_caa(struct pt_regs *regs, bool write) +{ + if (write) + return ES_OK; + + regs->ax = lower_32_bits(this_cpu_read(svsm_caa_pa)); + regs->dx = upper_32_bits(this_cpu_read(svsm_caa_pa)); + + return ES_OK; +} + static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { struct pt_regs *regs = ctxt->regs; enum es_result ret; - u64 exit_info_1; + bool write; /* Is it a WRMSR? */ - exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0; + write = ctxt->insn.opcode.bytes[1] == 0x30; - if (regs->cx == MSR_SVSM_CAA) { - /* Writes to the SVSM CAA msr are ignored */ - if (exit_info_1) - return ES_OK; - - regs->ax = lower_32_bits(this_cpu_read(svsm_caa_pa)); - regs->dx = upper_32_bits(this_cpu_read(svsm_caa_pa)); - - return ES_OK; - } + if (regs->cx == MSR_SVSM_CAA) + return __vc_handle_msr_caa(regs, write); ghcb_set_rcx(ghcb, regs->cx); - if (exit_info_1) { + if (write) { ghcb_set_rax(ghcb, regs->ax); ghcb_set_rdx(ghcb, regs->dx); } - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, write, 0); - if ((ret == ES_OK) && (!exit_info_1)) { + if ((ret == ES_OK) && !write) { regs->ax = ghcb->save.rax; regs->dx = ghcb->save.rdx; } @@ -2300,6 +2379,11 @@ bool __head snp_init(struct boot_params *bp) if (!cc_info) return false; + if (cc_info->secrets_phys && cc_info->secrets_len == PAGE_SIZE) + secrets_pa = cc_info->secrets_phys; + else + return false; + setup_cpuid_table(cc_info); svsm_setup(cc_info); @@ -2374,23 +2458,6 @@ static int __init report_snp_info(void) } arch_initcall(report_snp_info); -static int __init init_sev_config(char *str) -{ - char *s; - - while ((s = strsep(&str, ","))) { - if (!strcmp(s, "debug")) { - sev_cfg.debug = true; - continue; - } - - pr_info("SEV command-line option '%s' was not recognized\n", s); - } - - return 1; -} -__setup("sev=", init_sev_config); - static void update_attest_input(struct svsm_call *call, struct svsm_attest_call *input) { /* If (new) lengths have been returned, propagate them up */ @@ -2441,7 +2508,8 @@ int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, } EXPORT_SYMBOL_GPL(snp_issue_svsm_attest_req); -int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio) +int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input, + struct snp_guest_request_ioctl *rio) { struct ghcb_state state; struct es_em_ctxt ctxt; @@ -2465,12 +2533,12 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct sn vc_ghcb_invalidate(ghcb); - if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { + if (req->exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { ghcb_set_rax(ghcb, input->data_gpa); ghcb_set_rbx(ghcb, input->data_npages); } - ret = sev_es_ghcb_hv_call(ghcb, &ctxt, exit_code, input->req_gpa, input->resp_gpa); + ret = sev_es_ghcb_hv_call(ghcb, &ctxt, req->exit_code, input->req_gpa, input->resp_gpa); if (ret) goto e_put; @@ -2485,7 +2553,7 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct sn case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN): /* Number of expected pages are returned in RBX */ - if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { + if (req->exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { input->data_npages = ghcb_get_rbx(ghcb); ret = -ENOSPC; break; @@ -2513,16 +2581,11 @@ static struct platform_device sev_guest_device = { static int __init snp_init_platform_device(void) { struct sev_guest_platform_data data; - u64 gpa; if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) return -ENODEV; - gpa = get_secrets_page(); - if (!gpa) - return -ENODEV; - - data.secrets_gpa = gpa; + data.secrets_gpa = secrets_pa; if (platform_device_add_data(&sev_guest_device, &data, sizeof(data))) return -ENODEV; diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 7b1bebed879d..3d2e38ba5240 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -363,7 +363,7 @@ config CRYPTO_CHACHA20_X86_64 - AVX-512VL (Advanced Vector Extensions-512VL) config CRYPTO_AEGIS128_AESNI_SSE2 - tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE2)" + tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE4.1)" depends on X86 && 64BIT select CRYPTO_AEAD select CRYPTO_SIMD @@ -372,7 +372,7 @@ config CRYPTO_AEGIS128_AESNI_SSE2 Architecture: x86_64 using: - AES-NI (AES New Instructions) - - SSE2 (Streaming SIMD Extensions 2) + - SSE4.1 (Streaming SIMD Extensions 4.1) config CRYPTO_NHPOLY1305_SSE2 tristate "Hash functions: NHPoly1305 (SSE2)" diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index ad7f4c891625..7294dc0ee7ba 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -1,14 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * AES-NI + SSE2 implementation of AEGIS-128 + * AES-NI + SSE4.1 implementation of AEGIS-128 * * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * Copyright 2024 Google LLC */ #include <linux/linkage.h> -#include <linux/cfi_types.h> -#include <asm/frame.h> #define STATE0 %xmm0 #define STATE1 %xmm1 @@ -20,11 +19,6 @@ #define T0 %xmm6 #define T1 %xmm7 -#define STATEP %rdi -#define LEN %rsi -#define SRC %rdx -#define DST %rcx - .section .rodata.cst16.aegis128_const, "aM", @progbits, 32 .align 16 .Laegis128_const_0: @@ -34,11 +28,11 @@ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd -.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16 -.align 16 -.Laegis128_counter: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +.section .rodata.cst32.zeropad_mask, "aM", @progbits, 32 +.align 32 +.Lzeropad_mask: + .octa 0xffffffffffffffffffffffffffffffff + .octa 0 .text @@ -61,140 +55,102 @@ .endm /* - * __load_partial: internal ABI - * input: - * LEN - bytes - * SRC - src - * output: - * MSG - message block - * changed: - * T0 - * %r8 - * %r9 + * Load 1 <= LEN (%ecx) <= 15 bytes from the pointer SRC into the xmm register + * MSG and zeroize any remaining bytes. Clobbers %rax, %rcx, and %r8. */ -SYM_FUNC_START_LOCAL(__load_partial) - xor %r9d, %r9d - pxor MSG, MSG - - mov LEN, %r8 - and $0x1, %r8 - jz .Lld_partial_1 - - mov LEN, %r8 - and $0x1E, %r8 - add SRC, %r8 - mov (%r8), %r9b - -.Lld_partial_1: - mov LEN, %r8 - and $0x2, %r8 - jz .Lld_partial_2 - - mov LEN, %r8 - and $0x1C, %r8 - add SRC, %r8 - shl $0x10, %r9 - mov (%r8), %r9w - -.Lld_partial_2: - mov LEN, %r8 - and $0x4, %r8 - jz .Lld_partial_4 - - mov LEN, %r8 - and $0x18, %r8 - add SRC, %r8 - shl $32, %r9 - mov (%r8), %r8d - xor %r8, %r9 - -.Lld_partial_4: - movq %r9, MSG - - mov LEN, %r8 - and $0x8, %r8 - jz .Lld_partial_8 - - mov LEN, %r8 - and $0x10, %r8 - add SRC, %r8 - pslldq $8, MSG - movq (%r8), T0 - pxor T0, MSG - -.Lld_partial_8: - RET -SYM_FUNC_END(__load_partial) +.macro load_partial + sub $8, %ecx /* LEN - 8 */ + jle .Lle8\@ + + /* Load 9 <= LEN <= 15 bytes: */ + movq (SRC), MSG /* Load first 8 bytes */ + mov (SRC, %rcx), %rax /* Load last 8 bytes */ + neg %ecx + shl $3, %ecx + shr %cl, %rax /* Discard overlapping bytes */ + pinsrq $1, %rax, MSG + jmp .Ldone\@ + +.Lle8\@: + add $4, %ecx /* LEN - 4 */ + jl .Llt4\@ + + /* Load 4 <= LEN <= 8 bytes: */ + mov (SRC), %eax /* Load first 4 bytes */ + mov (SRC, %rcx), %r8d /* Load last 4 bytes */ + jmp .Lcombine\@ + +.Llt4\@: + /* Load 1 <= LEN <= 3 bytes: */ + add $2, %ecx /* LEN - 2 */ + movzbl (SRC), %eax /* Load first byte */ + jl .Lmovq\@ + movzwl (SRC, %rcx), %r8d /* Load last 2 bytes */ +.Lcombine\@: + shl $3, %ecx + shl %cl, %r8 + or %r8, %rax /* Combine the two parts */ +.Lmovq\@: + movq %rax, MSG +.Ldone\@: +.endm /* - * __store_partial: internal ABI - * input: - * LEN - bytes - * DST - dst - * output: - * T0 - message block - * changed: - * %r8 - * %r9 - * %r10 + * Store 1 <= LEN (%ecx) <= 15 bytes from the xmm register \msg to the pointer + * DST. Clobbers %rax, %rcx, and %r8. */ -SYM_FUNC_START_LOCAL(__store_partial) - mov LEN, %r8 - mov DST, %r9 - - movq T0, %r10 - - cmp $8, %r8 - jl .Lst_partial_8 - - mov %r10, (%r9) - psrldq $8, T0 - movq T0, %r10 - - sub $8, %r8 - add $8, %r9 - -.Lst_partial_8: - cmp $4, %r8 - jl .Lst_partial_4 - - mov %r10d, (%r9) - shr $32, %r10 - - sub $4, %r8 - add $4, %r9 - -.Lst_partial_4: - cmp $2, %r8 - jl .Lst_partial_2 - - mov %r10w, (%r9) - shr $0x10, %r10 - - sub $2, %r8 - add $2, %r9 - -.Lst_partial_2: - cmp $1, %r8 - jl .Lst_partial_1 - - mov %r10b, (%r9) - -.Lst_partial_1: - RET -SYM_FUNC_END(__store_partial) +.macro store_partial msg + sub $8, %ecx /* LEN - 8 */ + jl .Llt8\@ + + /* Store 8 <= LEN <= 15 bytes: */ + pextrq $1, \msg, %rax + mov %ecx, %r8d + shl $3, %ecx + ror %cl, %rax + mov %rax, (DST, %r8) /* Store last LEN - 8 bytes */ + movq \msg, (DST) /* Store first 8 bytes */ + jmp .Ldone\@ + +.Llt8\@: + add $4, %ecx /* LEN - 4 */ + jl .Llt4\@ + + /* Store 4 <= LEN <= 7 bytes: */ + pextrd $1, \msg, %eax + mov %ecx, %r8d + shl $3, %ecx + ror %cl, %eax + mov %eax, (DST, %r8) /* Store last LEN - 4 bytes */ + movd \msg, (DST) /* Store first 4 bytes */ + jmp .Ldone\@ + +.Llt4\@: + /* Store 1 <= LEN <= 3 bytes: */ + pextrb $0, \msg, 0(DST) + cmp $-2, %ecx /* LEN - 4 == -2, i.e. LEN == 2? */ + jl .Ldone\@ + pextrb $1, \msg, 1(DST) + je .Ldone\@ + pextrb $2, \msg, 2(DST) +.Ldone\@: +.endm /* - * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv); + * void aegis128_aesni_init(struct aegis_state *state, + * const struct aegis_block *key, + * const u8 iv[AEGIS128_NONCE_SIZE]); */ -SYM_FUNC_START(crypto_aegis128_aesni_init) - FRAME_BEGIN +SYM_FUNC_START(aegis128_aesni_init) + .set STATEP, %rdi + .set KEYP, %rsi + .set IVP, %rdx /* load IV: */ - movdqu (%rdx), T1 + movdqu (IVP), T1 /* load key: */ - movdqa (%rsi), KEY + movdqa (KEYP), KEY pxor KEY, T1 movdqa T1, STATE0 movdqa KEY, STATE3 @@ -224,20 +180,22 @@ SYM_FUNC_START(crypto_aegis128_aesni_init) movdqu STATE2, 0x20(STATEP) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) - - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_init) +SYM_FUNC_END(aegis128_aesni_init) /* - * void crypto_aegis128_aesni_ad(void *state, unsigned int length, - * const void *data); + * void aegis128_aesni_ad(struct aegis_state *state, const u8 *data, + * unsigned int len); + * + * len must be a multiple of 16. */ -SYM_FUNC_START(crypto_aegis128_aesni_ad) - FRAME_BEGIN +SYM_FUNC_START(aegis128_aesni_ad) + .set STATEP, %rdi + .set SRC, %rsi + .set LEN, %edx - cmp $0x10, LEN - jb .Lad_out + test LEN, LEN + jz .Lad_out /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -246,89 +204,40 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu 0x30(STATEP), STATE3 movdqu 0x40(STATEP), STATE4 - mov SRC, %r8 - and $0xF, %r8 - jnz .Lad_u_loop - -.align 8 -.Lad_a_loop: - movdqa 0x00(SRC), MSG - aegis128_update - pxor MSG, STATE4 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_1 - - movdqa 0x10(SRC), MSG - aegis128_update - pxor MSG, STATE3 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_2 - - movdqa 0x20(SRC), MSG - aegis128_update - pxor MSG, STATE2 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_3 - - movdqa 0x30(SRC), MSG - aegis128_update - pxor MSG, STATE1 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_4 - - movdqa 0x40(SRC), MSG - aegis128_update - pxor MSG, STATE0 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_0 - - add $0x50, SRC - jmp .Lad_a_loop - .align 8 -.Lad_u_loop: +.Lad_loop: movdqu 0x00(SRC), MSG aegis128_update pxor MSG, STATE4 sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_1 + jz .Lad_out_1 movdqu 0x10(SRC), MSG aegis128_update pxor MSG, STATE3 sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_2 + jz .Lad_out_2 movdqu 0x20(SRC), MSG aegis128_update pxor MSG, STATE2 sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_3 + jz .Lad_out_3 movdqu 0x30(SRC), MSG aegis128_update pxor MSG, STATE1 sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_4 + jz .Lad_out_4 movdqu 0x40(SRC), MSG aegis128_update pxor MSG, STATE0 sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_0 + jz .Lad_out_0 add $0x50, SRC - jmp .Lad_u_loop + jmp .Lad_loop /* store the state: */ .Lad_out_0: @@ -337,7 +246,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE2, 0x20(STATEP) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) - FRAME_END RET .Lad_out_1: @@ -346,7 +254,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE1, 0x20(STATEP) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) - FRAME_END RET .Lad_out_2: @@ -355,7 +262,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE0, 0x20(STATEP) movdqu STATE1, 0x30(STATEP) movdqu STATE2, 0x40(STATEP) - FRAME_END RET .Lad_out_3: @@ -364,7 +270,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE4, 0x20(STATEP) movdqu STATE0, 0x30(STATEP) movdqu STATE1, 0x40(STATEP) - FRAME_END RET .Lad_out_4: @@ -373,41 +278,38 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE3, 0x20(STATEP) movdqu STATE4, 0x30(STATEP) movdqu STATE0, 0x40(STATEP) - FRAME_END - RET - .Lad_out: - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_ad) +SYM_FUNC_END(aegis128_aesni_ad) -.macro encrypt_block a s0 s1 s2 s3 s4 i - movdq\a (\i * 0x10)(SRC), MSG +.macro encrypt_block s0 s1 s2 s3 s4 i + movdqu (\i * 0x10)(SRC), MSG movdqa MSG, T0 pxor \s1, T0 pxor \s4, T0 movdqa \s2, T1 pand \s3, T1 pxor T1, T0 - movdq\a T0, (\i * 0x10)(DST) + movdqu T0, (\i * 0x10)(DST) aegis128_update pxor MSG, \s4 sub $0x10, LEN - cmp $0x10, LEN - jl .Lenc_out_\i + jz .Lenc_out_\i .endm /* - * void crypto_aegis128_aesni_enc(void *state, unsigned int length, - * const void *src, void *dst); + * void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, u8 *dst, + * unsigned int len); + * + * len must be nonzero and a multiple of 16. */ -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) - FRAME_BEGIN - - cmp $0x10, LEN - jb .Lenc_out +SYM_FUNC_START(aegis128_aesni_enc) + .set STATEP, %rdi + .set SRC, %rsi + .set DST, %rdx + .set LEN, %ecx /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -416,34 +318,17 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu 0x30(STATEP), STATE3 movdqu 0x40(STATEP), STATE4 - mov SRC, %r8 - or DST, %r8 - and $0xF, %r8 - jnz .Lenc_u_loop - .align 8 -.Lenc_a_loop: - encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 - encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 - encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 - encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 - encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 +.Lenc_loop: + encrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0 + encrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1 + encrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2 + encrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3 + encrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4 add $0x50, SRC add $0x50, DST - jmp .Lenc_a_loop - -.align 8 -.Lenc_u_loop: - encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 - encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 - encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 - encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 - encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 - - add $0x50, SRC - add $0x50, DST - jmp .Lenc_u_loop + jmp .Lenc_loop /* store the state: */ .Lenc_out_0: @@ -452,7 +337,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE1, 0x20(STATEP) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) - FRAME_END RET .Lenc_out_1: @@ -461,7 +345,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE0, 0x20(STATEP) movdqu STATE1, 0x30(STATEP) movdqu STATE2, 0x40(STATEP) - FRAME_END RET .Lenc_out_2: @@ -470,7 +353,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE4, 0x20(STATEP) movdqu STATE0, 0x30(STATEP) movdqu STATE1, 0x40(STATEP) - FRAME_END RET .Lenc_out_3: @@ -479,7 +361,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE3, 0x20(STATEP) movdqu STATE4, 0x30(STATEP) movdqu STATE0, 0x40(STATEP) - FRAME_END RET .Lenc_out_4: @@ -488,20 +369,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE2, 0x20(STATEP) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) - FRAME_END - RET - .Lenc_out: - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_enc) +SYM_FUNC_END(aegis128_aesni_enc) /* - * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length, - * const void *src, void *dst); + * void aegis128_aesni_enc_tail(struct aegis_state *state, const u8 *src, + * u8 *dst, unsigned int len); */ -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) - FRAME_BEGIN +SYM_FUNC_START(aegis128_aesni_enc_tail) + .set STATEP, %rdi + .set SRC, %rsi + .set DST, %rdx + .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */ /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -511,7 +391,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) movdqu 0x40(STATEP), STATE4 /* encrypt message: */ - call __load_partial + mov LEN, %r9d + load_partial movdqa MSG, T0 pxor STATE1, T0 @@ -520,7 +401,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) pand STATE3, T1 pxor T1, T0 - call __store_partial + mov %r9d, LEN + store_partial T0 aegis128_update pxor MSG, STATE4 @@ -531,37 +413,36 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) movdqu STATE1, 0x20(STATEP) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) - - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_enc_tail) +SYM_FUNC_END(aegis128_aesni_enc_tail) -.macro decrypt_block a s0 s1 s2 s3 s4 i - movdq\a (\i * 0x10)(SRC), MSG +.macro decrypt_block s0 s1 s2 s3 s4 i + movdqu (\i * 0x10)(SRC), MSG pxor \s1, MSG pxor \s4, MSG movdqa \s2, T1 pand \s3, T1 pxor T1, MSG - movdq\a MSG, (\i * 0x10)(DST) + movdqu MSG, (\i * 0x10)(DST) aegis128_update pxor MSG, \s4 sub $0x10, LEN - cmp $0x10, LEN - jl .Ldec_out_\i + jz .Ldec_out_\i .endm /* - * void crypto_aegis128_aesni_dec(void *state, unsigned int length, - * const void *src, void *dst); + * void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, u8 *dst, + * unsigned int len); + * + * len must be nonzero and a multiple of 16. */ -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) - FRAME_BEGIN - - cmp $0x10, LEN - jb .Ldec_out +SYM_FUNC_START(aegis128_aesni_dec) + .set STATEP, %rdi + .set SRC, %rsi + .set DST, %rdx + .set LEN, %ecx /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -570,34 +451,17 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu 0x30(STATEP), STATE3 movdqu 0x40(STATEP), STATE4 - mov SRC, %r8 - or DST, %r8 - and $0xF, %r8 - jnz .Ldec_u_loop - .align 8 -.Ldec_a_loop: - decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 - decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 - decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 - decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 - decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 +.Ldec_loop: + decrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0 + decrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1 + decrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2 + decrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3 + decrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4 add $0x50, SRC add $0x50, DST - jmp .Ldec_a_loop - -.align 8 -.Ldec_u_loop: - decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 - decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 - decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 - decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 - decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 - - add $0x50, SRC - add $0x50, DST - jmp .Ldec_u_loop + jmp .Ldec_loop /* store the state: */ .Ldec_out_0: @@ -606,7 +470,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE1, 0x20(STATEP) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) - FRAME_END RET .Ldec_out_1: @@ -615,7 +478,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE0, 0x20(STATEP) movdqu STATE1, 0x30(STATEP) movdqu STATE2, 0x40(STATEP) - FRAME_END RET .Ldec_out_2: @@ -624,7 +486,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE4, 0x20(STATEP) movdqu STATE0, 0x30(STATEP) movdqu STATE1, 0x40(STATEP) - FRAME_END RET .Ldec_out_3: @@ -633,7 +494,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE3, 0x20(STATEP) movdqu STATE4, 0x30(STATEP) movdqu STATE0, 0x40(STATEP) - FRAME_END RET .Ldec_out_4: @@ -642,20 +502,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE2, 0x20(STATEP) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) - FRAME_END - RET - .Ldec_out: - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_dec) +SYM_FUNC_END(aegis128_aesni_dec) /* - * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length, - * const void *src, void *dst); + * void aegis128_aesni_dec_tail(struct aegis_state *state, const u8 *src, + * u8 *dst, unsigned int len); */ -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) - FRAME_BEGIN +SYM_FUNC_START(aegis128_aesni_dec_tail) + .set STATEP, %rdi + .set SRC, %rsi + .set DST, %rdx + .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */ /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -665,7 +524,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) movdqu 0x40(STATEP), STATE4 /* decrypt message: */ - call __load_partial + mov LEN, %r9d + load_partial pxor STATE1, MSG pxor STATE4, MSG @@ -673,17 +533,13 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) pand STATE3, T1 pxor T1, MSG - movdqa MSG, T0 - call __store_partial + mov %r9d, LEN + store_partial MSG /* mask with byte count: */ - movq LEN, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - movdqa .Laegis128_counter(%rip), T1 - pcmpgtb T1, T0 + lea .Lzeropad_mask+16(%rip), %rax + sub %r9, %rax + movdqu (%rax), T0 pand T0, MSG aegis128_update @@ -695,17 +551,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) movdqu STATE1, 0x20(STATEP) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) - - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_dec_tail) +SYM_FUNC_END(aegis128_aesni_dec_tail) /* - * void crypto_aegis128_aesni_final(void *state, void *tag_xor, - * u64 assoclen, u64 cryptlen); + * void aegis128_aesni_final(struct aegis_state *state, + * struct aegis_block *tag_xor, + * unsigned int assoclen, unsigned int cryptlen); */ -SYM_FUNC_START(crypto_aegis128_aesni_final) - FRAME_BEGIN +SYM_FUNC_START(aegis128_aesni_final) + .set STATEP, %rdi + .set TAG_XOR, %rsi + .set ASSOCLEN, %edx + .set CRYPTLEN, %ecx /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -715,10 +573,8 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) movdqu 0x40(STATEP), STATE4 /* prepare length block: */ - movq %rdx, MSG - movq %rcx, T0 - pslldq $8, T0 - pxor T0, MSG + movd ASSOCLEN, MSG + pinsrd $2, CRYPTLEN, MSG psllq $3, MSG /* multiply by 8 (to get bit count) */ pxor STATE3, MSG @@ -733,7 +589,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) aegis128_update; pxor MSG, STATE3 /* xor tag: */ - movdqu (%rsi), MSG + movdqu (TAG_XOR), MSG pxor STATE0, MSG pxor STATE1, MSG @@ -741,8 +597,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) pxor STATE3, MSG pxor STATE4, MSG - movdqu MSG, (%rsi) - - FRAME_END + movdqu MSG, (TAG_XOR) RET -SYM_FUNC_END(crypto_aegis128_aesni_final) +SYM_FUNC_END(aegis128_aesni_final) diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 4623189000d8..c19d8e3d96a3 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * The AEGIS-128 Authenticated-Encryption Algorithm - * Glue for AES-NI + SSE2 implementation + * Glue for AES-NI + SSE4.1 implementation * * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. @@ -23,27 +23,6 @@ #define AEGIS128_MIN_AUTH_SIZE 8 #define AEGIS128_MAX_AUTH_SIZE 16 -asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv); - -asmlinkage void crypto_aegis128_aesni_ad( - void *state, unsigned int length, const void *data); - -asmlinkage void crypto_aegis128_aesni_enc( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128_aesni_dec( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128_aesni_enc_tail( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128_aesni_dec_tail( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128_aesni_final( - void *state, void *tag_xor, unsigned int cryptlen, - unsigned int assoclen); - struct aegis_block { u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN); }; @@ -56,15 +35,31 @@ struct aegis_ctx { struct aegis_block key; }; -struct aegis_crypt_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); +asmlinkage void aegis128_aesni_init(struct aegis_state *state, + const struct aegis_block *key, + const u8 iv[AEGIS128_NONCE_SIZE]); - void (*crypt_blocks)(void *state, unsigned int length, const void *src, - void *dst); - void (*crypt_tail)(void *state, unsigned int length, const void *src, - void *dst); -}; +asmlinkage void aegis128_aesni_ad(struct aegis_state *state, const u8 *data, + unsigned int len); + +asmlinkage void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, + u8 *dst, unsigned int len); + +asmlinkage void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, + u8 *dst, unsigned int len); + +asmlinkage void aegis128_aesni_enc_tail(struct aegis_state *state, + const u8 *src, u8 *dst, + unsigned int len); + +asmlinkage void aegis128_aesni_dec_tail(struct aegis_state *state, + const u8 *src, u8 *dst, + unsigned int len); + +asmlinkage void aegis128_aesni_final(struct aegis_state *state, + struct aegis_block *tag_xor, + unsigned int assoclen, + unsigned int cryptlen); static void crypto_aegis128_aesni_process_ad( struct aegis_state *state, struct scatterlist *sg_src, @@ -85,16 +80,15 @@ static void crypto_aegis128_aesni_process_ad( if (pos > 0) { unsigned int fill = AEGIS128_BLOCK_SIZE - pos; memcpy(buf.bytes + pos, src, fill); - crypto_aegis128_aesni_ad(state, - AEGIS128_BLOCK_SIZE, - buf.bytes); + aegis128_aesni_ad(state, buf.bytes, + AEGIS128_BLOCK_SIZE); pos = 0; left -= fill; src += fill; } - crypto_aegis128_aesni_ad(state, left, src); - + aegis128_aesni_ad(state, src, + left & ~(AEGIS128_BLOCK_SIZE - 1)); src += left & ~(AEGIS128_BLOCK_SIZE - 1); left &= AEGIS128_BLOCK_SIZE - 1; } @@ -110,24 +104,37 @@ static void crypto_aegis128_aesni_process_ad( if (pos > 0) { memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos); - crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes); + aegis128_aesni_ad(state, buf.bytes, AEGIS128_BLOCK_SIZE); } } -static void crypto_aegis128_aesni_process_crypt( - struct aegis_state *state, struct skcipher_walk *walk, - const struct aegis_crypt_ops *ops) +static __always_inline void +crypto_aegis128_aesni_process_crypt(struct aegis_state *state, + struct skcipher_walk *walk, bool enc) { while (walk->nbytes >= AEGIS128_BLOCK_SIZE) { - ops->crypt_blocks(state, - round_down(walk->nbytes, AEGIS128_BLOCK_SIZE), - walk->src.virt.addr, walk->dst.virt.addr); + if (enc) + aegis128_aesni_enc(state, walk->src.virt.addr, + walk->dst.virt.addr, + round_down(walk->nbytes, + AEGIS128_BLOCK_SIZE)); + else + aegis128_aesni_dec(state, walk->src.virt.addr, + walk->dst.virt.addr, + round_down(walk->nbytes, + AEGIS128_BLOCK_SIZE)); skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE); } if (walk->nbytes) { - ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr, - walk->dst.virt.addr); + if (enc) + aegis128_aesni_enc_tail(state, walk->src.virt.addr, + walk->dst.virt.addr, + walk->nbytes); + else + aegis128_aesni_dec_tail(state, walk->src.virt.addr, + walk->dst.virt.addr, + walk->nbytes); skcipher_walk_done(walk, 0); } } @@ -162,42 +169,39 @@ static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm, return 0; } -static void crypto_aegis128_aesni_crypt(struct aead_request *req, - struct aegis_block *tag_xor, - unsigned int cryptlen, - const struct aegis_crypt_ops *ops) +static __always_inline void +crypto_aegis128_aesni_crypt(struct aead_request *req, + struct aegis_block *tag_xor, + unsigned int cryptlen, bool enc) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm); struct skcipher_walk walk; struct aegis_state state; - ops->skcipher_walk_init(&walk, req, true); + if (enc) + skcipher_walk_aead_encrypt(&walk, req, true); + else + skcipher_walk_aead_decrypt(&walk, req, true); kernel_fpu_begin(); - crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv); + aegis128_aesni_init(&state, &ctx->key, req->iv); crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen); - crypto_aegis128_aesni_process_crypt(&state, &walk, ops); - crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); + crypto_aegis128_aesni_process_crypt(&state, &walk, enc); + aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); kernel_fpu_end(); } static int crypto_aegis128_aesni_encrypt(struct aead_request *req) { - static const struct aegis_crypt_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_blocks = crypto_aegis128_aesni_enc, - .crypt_tail = crypto_aegis128_aesni_enc_tail, - }; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aegis_block tag = {}; unsigned int authsize = crypto_aead_authsize(tfm); unsigned int cryptlen = req->cryptlen; - crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); + crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true); scatterwalk_map_and_copy(tag.bytes, req->dst, req->assoclen + cryptlen, authsize, 1); @@ -208,12 +212,6 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req) { static const struct aegis_block zeros = {}; - static const struct aegis_crypt_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_blocks = crypto_aegis128_aesni_dec, - .crypt_tail = crypto_aegis128_aesni_dec_tail, - }; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aegis_block tag; unsigned int authsize = crypto_aead_authsize(tfm); @@ -222,27 +220,16 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req) scatterwalk_map_and_copy(tag.bytes, req->src, req->assoclen + cryptlen, authsize, 0); - crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); + crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false); return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0; } -static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead) -{ - return 0; -} - -static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead) -{ -} - static struct aead_alg crypto_aegis128_aesni_alg = { .setkey = crypto_aegis128_aesni_setkey, .setauthsize = crypto_aegis128_aesni_setauthsize, .encrypt = crypto_aegis128_aesni_encrypt, .decrypt = crypto_aegis128_aesni_decrypt, - .init = crypto_aegis128_aesni_init_tfm, - .exit = crypto_aegis128_aesni_exit_tfm, .ivsize = AEGIS128_NONCE_SIZE, .maxauthsize = AEGIS128_MAX_AUTH_SIZE, @@ -267,7 +254,7 @@ static struct simd_aead_alg *simd_alg; static int __init crypto_aegis128_aesni_module_init(void) { - if (!boot_cpu_has(X86_FEATURE_XMM2) || + if (!boot_cpu_has(X86_FEATURE_XMM4_1) || !boot_cpu_has(X86_FEATURE_AES) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; @@ -286,6 +273,6 @@ module_exit(crypto_aegis128_aesni_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); -MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation"); +MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE4.1 implementation"); MODULE_ALIAS_CRYPTO("aegis128"); MODULE_ALIAS_CRYPTO("aegis128-aesni"); diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index b0dd83555499..fbf43482e1f5 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1747,7 +1747,7 @@ static void __exit aesni_exit(void) unregister_avx_algs(); } -late_initcall(aesni_init); +module_init(aesni_init); module_exit(aesni_exit); MODULE_DESCRIPTION("AES cipher and modes, optimized with AES-NI or VAES instructions"); diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index d45e9c0c42ac..f110708c8038 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -8,7 +8,7 @@ * Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation) */ -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/crypto.h> #include <linux/init.h> #include <linux/module.h> diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index b4e460a87f18..fb95a614249d 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -487,79 +487,3 @@ SYM_FUNC_START(cast5_cbc_dec_16way) FRAME_END RET; SYM_FUNC_END(cast5_cbc_dec_16way) - -SYM_FUNC_START(cast5_ctr_16way) - /* input: - * %rdi: ctx - * %rsi: dst - * %rdx: src - * %rcx: iv (big endian, 64bit) - */ - FRAME_BEGIN - pushq %r12; - pushq %r15; - - movq %rdi, CTX; - movq %rsi, %r11; - movq %rdx, %r12; - - vpcmpeqd RTMP, RTMP, RTMP; - vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */ - - vpcmpeqd RKR, RKR, RKR; - vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */ - vmovdqa .Lbswap_iv_mask(%rip), R1ST; - vmovdqa .Lbswap128_mask(%rip), RKM; - - /* load IV and byteswap */ - vmovq (%rcx), RX; - vpshufb R1ST, RX, RX; - - /* construct IVs */ - vpsubq RTMP, RX, RX; /* le: IV1, IV0 */ - vpshufb RKM, RX, RL1; /* be: IV0, IV1 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RR1; /* be: IV2, IV3 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RL2; /* be: IV4, IV5 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RR2; /* be: IV6, IV7 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RL3; /* be: IV8, IV9 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RR3; /* be: IV10, IV11 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RL4; /* be: IV12, IV13 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RR4; /* be: IV14, IV15 */ - - /* store last IV */ - vpsubq RTMP, RX, RX; /* le: IV16, IV14 */ - vpshufb R1ST, RX, RX; /* be: IV16, IV16 */ - vmovq RX, (%rcx); - - call __cast5_enc_blk16; - - /* dst = src ^ iv */ - vpxor (0*16)(%r12), RR1, RR1; - vpxor (1*16)(%r12), RL1, RL1; - vpxor (2*16)(%r12), RR2, RR2; - vpxor (3*16)(%r12), RL2, RL2; - vpxor (4*16)(%r12), RR3, RR3; - vpxor (5*16)(%r12), RL3, RL3; - vpxor (6*16)(%r12), RR4, RR4; - vpxor (7*16)(%r12), RL4, RL4; - vmovdqu RR1, (0*16)(%r11); - vmovdqu RL1, (1*16)(%r11); - vmovdqu RR2, (2*16)(%r11); - vmovdqu RL2, (3*16)(%r11); - vmovdqu RR3, (4*16)(%r11); - vmovdqu RL3, (5*16)(%r11); - vmovdqu RR4, (6*16)(%r11); - vmovdqu RL4, (7*16)(%r11); - - popq %r15; - popq %r12; - FRAME_END - RET; -SYM_FUNC_END(cast5_ctr_16way) diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index feccb5254c7e..52c5d47ef5a1 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -41,7 +41,7 @@ */ #define CRC32C_PCL_BREAKEVEN 512 -asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, +asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index bbcff1fb78cb..752812bc4991 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -7,6 +7,7 @@ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf * * Copyright (C) 2012 Intel Corporation. + * Copyright 2024 Google LLC * * Authors: * Wajdi Feghali <wajdi.k.feghali@intel.com> @@ -44,185 +45,129 @@ */ #include <linux/linkage.h> -#include <asm/nospec-branch.h> ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction -.macro LABEL prefix n -.L\prefix\n\(): -.endm - -.macro JMPTBL_ENTRY i -.quad .Lcrc_\i -.endm - -.macro JNC_LESS_THAN j - jnc .Lless_than_\j -.endm - -# Define threshold where buffers are considered "small" and routed to more -# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so -# SMALL_SIZE can be no larger than 255. - +# Define threshold below which buffers are considered "small" and routed to +# regular CRC code that does not interleave the CRC instructions. #define SMALL_SIZE 200 -.if (SMALL_SIZE > 255) -.error "SMALL_ SIZE must be < 256" -.endif - -# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); +# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init); .text SYM_FUNC_START(crc_pcl) -#define bufp rdi -#define bufp_dw %edi -#define bufp_w %di -#define bufp_b %dil -#define bufptmp %rcx -#define block_0 %rcx -#define block_1 %rdx -#define block_2 %r11 -#define len %rsi -#define len_dw %esi -#define len_w %si -#define len_b %sil -#define crc_init_arg %rdx -#define tmp %rbx -#define crc_init %r8 -#define crc_init_dw %r8d -#define crc1 %r9 -#define crc2 %r10 - - pushq %rbx - pushq %rdi - pushq %rsi - - ## Move crc_init for Linux to a different - mov crc_init_arg, crc_init +#define bufp %rdi +#define bufp_d %edi +#define len %esi +#define crc_init %edx +#define crc_init_q %rdx +#define n_misaligned %ecx /* overlaps chunk_bytes! */ +#define n_misaligned_q %rcx +#define chunk_bytes %ecx /* overlaps n_misaligned! */ +#define chunk_bytes_q %rcx +#define crc1 %r8 +#define crc2 %r9 + + cmp $SMALL_SIZE, len + jb .Lsmall ################################################################ ## 1) ALIGN: ################################################################ - - mov %bufp, bufptmp # rdi = *buf - neg %bufp - and $7, %bufp # calculate the unalignment amount of + mov bufp_d, n_misaligned + neg n_misaligned + and $7, n_misaligned # calculate the misalignment amount of # the address - je .Lproc_block # Skip if aligned - - ## If len is less than 8 and we're unaligned, we need to jump - ## to special code to avoid reading beyond the end of the buffer - cmp $8, len - jae .Ldo_align - # less_than_8 expects length in upper 3 bits of len_dw - # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] - shl $32-3+1, len_dw - jmp .Lless_than_8_post_shl1 + je .Laligned # Skip if aligned + # Process 1 <= n_misaligned <= 7 bytes individually in order to align + # the remaining data to an 8-byte boundary. .Ldo_align: - #### Calculate CRC of unaligned bytes of the buffer (if any) - movq (bufptmp), tmp # load a quadward from the buffer - add %bufp, bufptmp # align buffer pointer for quadword - # processing - sub %bufp, len # update buffer length + movq (bufp), %rax + add n_misaligned_q, bufp + sub n_misaligned, len .Lalign_loop: - crc32b %bl, crc_init_dw # compute crc32 of 1-byte - shr $8, tmp # get next byte - dec %bufp + crc32b %al, crc_init # compute crc32 of 1-byte + shr $8, %rax # get next byte + dec n_misaligned jne .Lalign_loop - -.Lproc_block: +.Laligned: ################################################################ - ## 2) PROCESS BLOCKS: + ## 2) PROCESS BLOCK: ################################################################ - ## compute num of bytes to be processed - movq len, tmp # save num bytes in tmp - - cmpq $128*24, len + cmp $128*24, len jae .Lfull_block -.Lcontinue_block: - cmpq $SMALL_SIZE, len - jb .Lsmall - - ## len < 128*24 - movq $2731, %rax # 2731 = ceil(2^16 / 24) - mul len_dw - shrq $16, %rax - - ## eax contains floor(bytes / 24) = num 24-byte chunks to do - - ## process rax 24-byte chunks (128 >= rax >= 0) - - ## compute end address of each block - ## block 0 (base addr + RAX * 8) - ## block 1 (base addr + RAX * 16) - ## block 2 (base addr + RAX * 24) - lea (bufptmp, %rax, 8), block_0 - lea (block_0, %rax, 8), block_1 - lea (block_1, %rax, 8), block_2 +.Lpartial_block: + # Compute floor(len / 24) to get num qwords to process from each lane. + imul $2731, len, %eax # 2731 = ceil(2^16 / 24) + shr $16, %eax + jmp .Lcrc_3lanes - xor crc1, crc1 - xor crc2, crc2 - - ## branch into array - leaq jump_table(%rip), %bufp - mov (%bufp,%rax,8), %bufp - JMP_NOSPEC bufp - - ################################################################ - ## 2a) PROCESS FULL BLOCKS: - ################################################################ .Lfull_block: - movl $128,%eax - lea 128*8*2(block_0), block_1 - lea 128*8*3(block_0), block_2 - add $128*8*1, block_0 - - xor crc1,crc1 - xor crc2,crc2 - - # Fall through into top of crc array (crc_128) + # Processing 128 qwords from each lane. + mov $128, %eax ################################################################ - ## 3) CRC Array: + ## 3) CRC each of three lanes: ################################################################ - i=128 -.rept 128-1 -.altmacro -LABEL crc_ %i -.noaltmacro - ENDBR - crc32q -i*8(block_0), crc_init - crc32q -i*8(block_1), crc1 - crc32q -i*8(block_2), crc2 - i=(i-1) -.endr - -.altmacro -LABEL crc_ %i -.noaltmacro - ENDBR - crc32q -i*8(block_0), crc_init - crc32q -i*8(block_1), crc1 -# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet - - mov block_2, block_0 +.Lcrc_3lanes: + xor crc1,crc1 + xor crc2,crc2 + mov %eax, chunk_bytes + shl $3, chunk_bytes # num bytes to process from each lane + sub $5, %eax # 4 for 4x_loop, 1 for special last iter + jl .Lcrc_3lanes_4x_done + + # Unroll the loop by a factor of 4 to reduce the overhead of the loop + # bookkeeping instructions, which can compete with crc32q for the ALUs. +.Lcrc_3lanes_4x_loop: + crc32q (bufp), crc_init_q + crc32q (bufp,chunk_bytes_q), crc1 + crc32q (bufp,chunk_bytes_q,2), crc2 + crc32q 8(bufp), crc_init_q + crc32q 8(bufp,chunk_bytes_q), crc1 + crc32q 8(bufp,chunk_bytes_q,2), crc2 + crc32q 16(bufp), crc_init_q + crc32q 16(bufp,chunk_bytes_q), crc1 + crc32q 16(bufp,chunk_bytes_q,2), crc2 + crc32q 24(bufp), crc_init_q + crc32q 24(bufp,chunk_bytes_q), crc1 + crc32q 24(bufp,chunk_bytes_q,2), crc2 + add $32, bufp + sub $4, %eax + jge .Lcrc_3lanes_4x_loop + +.Lcrc_3lanes_4x_done: + add $4, %eax + jz .Lcrc_3lanes_last_qword + +.Lcrc_3lanes_1x_loop: + crc32q (bufp), crc_init_q + crc32q (bufp,chunk_bytes_q), crc1 + crc32q (bufp,chunk_bytes_q,2), crc2 + add $8, bufp + dec %eax + jnz .Lcrc_3lanes_1x_loop + +.Lcrc_3lanes_last_qword: + crc32q (bufp), crc_init_q + crc32q (bufp,chunk_bytes_q), crc1 +# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet ################################################################ ## 4) Combine three results: ################################################################ - lea (K_table-8)(%rip), %bufp # first entry is for idx 1 - shlq $3, %rax # rax *= 8 - pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 - leal (%eax,%eax,2), %eax # rax *= 3 (total *24) - subq %rax, tmp # tmp -= rax*24 + lea (K_table-8)(%rip), %rax # first entry is for idx 1 + pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2 + lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3 + sub %eax, len # len -= chunk_bytes * 3 - movq crc_init, %xmm1 # CRC for block 1 + movq crc_init_q, %xmm1 # CRC for block 1 pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 movq crc1, %xmm2 # CRC for block 2 @@ -230,103 +175,54 @@ LABEL crc_ %i pxor %xmm2,%xmm1 movq %xmm1, %rax - xor -i*8(block_2), %rax - mov crc2, crc_init - crc32 %rax, crc_init + xor (bufp,chunk_bytes_q,2), %rax + mov crc2, crc_init_q + crc32 %rax, crc_init_q + lea 8(bufp,chunk_bytes_q,2), bufp ################################################################ - ## 5) Check for end: + ## 5) If more blocks remain, goto (2): ################################################################ -LABEL crc_ 0 - ENDBR - mov tmp, len - cmp $128*24, tmp - jae .Lfull_block - cmp $24, tmp - jae .Lcontinue_block - -.Lless_than_24: - shl $32-4, len_dw # less_than_16 expects length - # in upper 4 bits of len_dw - jnc .Lless_than_16 - crc32q (bufptmp), crc_init - crc32q 8(bufptmp), crc_init - jz .Ldo_return - add $16, bufptmp - # len is less than 8 if we got here - # less_than_8 expects length in upper 3 bits of len_dw - # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] - shl $2, len_dw - jmp .Lless_than_8_post_shl1 + cmp $128*24, len + jae .Lfull_block + cmp $SMALL_SIZE, len + jae .Lpartial_block ####################################################################### - ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) + ## 6) Process any remainder without interleaving: ####################################################################### .Lsmall: - shl $32-8, len_dw # Prepare len_dw for less_than_256 - j=256 -.rept 5 # j = {256, 128, 64, 32, 16} -.altmacro -LABEL less_than_ %j # less_than_j: Length should be in - # upper lg(j) bits of len_dw - j=(j/2) - shl $1, len_dw # Get next MSB - JNC_LESS_THAN %j -.noaltmacro - i=0 -.rept (j/8) - crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data - i=i+8 -.endr - jz .Ldo_return # Return if remaining length is zero - add $j, bufptmp # Advance buf -.endr - -.Lless_than_8: # Length should be stored in - # upper 3 bits of len_dw - shl $1, len_dw -.Lless_than_8_post_shl1: - jnc .Lless_than_4 - crc32l (bufptmp), crc_init_dw # CRC of 4 bytes - jz .Ldo_return # return if remaining data is zero - add $4, bufptmp -.Lless_than_4: # Length should be stored in - # upper 2 bits of len_dw - shl $1, len_dw - jnc .Lless_than_2 - crc32w (bufptmp), crc_init_dw # CRC of 2 bytes - jz .Ldo_return # return if remaining data is zero - add $2, bufptmp -.Lless_than_2: # Length should be stored in the MSB - # of len_dw - shl $1, len_dw - jnc .Lless_than_1 - crc32b (bufptmp), crc_init_dw # CRC of 1 byte -.Lless_than_1: # Length should be zero -.Ldo_return: - movq crc_init, %rax - popq %rsi - popq %rdi - popq %rbx + test len, len + jz .Ldone + mov len, %eax + shr $3, %eax + jz .Ldo_dword +.Ldo_qwords: + crc32q (bufp), crc_init_q + add $8, bufp + dec %eax + jnz .Ldo_qwords +.Ldo_dword: + test $4, len + jz .Ldo_word + crc32l (bufp), crc_init + add $4, bufp +.Ldo_word: + test $2, len + jz .Ldo_byte + crc32w (bufp), crc_init + add $2, bufp +.Ldo_byte: + test $1, len + jz .Ldone + crc32b (bufp), crc_init +.Ldone: + mov crc_init, %eax RET SYM_FUNC_END(crc_pcl) .section .rodata, "a", @progbits - ################################################################ - ## jump table Table is 129 entries x 2 bytes each - ################################################################ -.align 4 -jump_table: - i=0 -.rept 129 -.altmacro -JMPTBL_ENTRY %i -.noaltmacro - i=i+1 -.endr - - ################################################################ ## PCLMULQDQ tables ## Table is 128 entries x 2 words (8 bytes) each diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 700ecaee9a08..41bc02e48916 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -19,7 +19,7 @@ #include <crypto/internal/simd.h> #include <asm/cpu_device_id.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index d9feadffa972..b7ea3e8e9ecc 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -9,6 +9,8 @@ #include <asm/unwind_hints.h> #include <asm/segment.h> #include <asm/cache.h> +#include <asm/cpufeatures.h> +#include <asm/nospec-branch.h> #include "calling.h" @@ -19,6 +21,9 @@ SYM_FUNC_START(entry_ibpb) movl $PRED_CMD_IBPB, %eax xorl %edx, %edx wrmsr + + /* Make sure IBPB clears return stack preductions too. */ + FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET RET SYM_FUNC_END(entry_ibpb) /* For KVM */ @@ -46,3 +51,19 @@ EXPORT_SYMBOL_GPL(mds_verw_sel); .popsection THUNK warn_thunk_thunk, __warn_thunk + +#ifndef CONFIG_X86_64 +/* + * Clang's implementation of TLS stack cookies requires the variable in + * question to be a TLS variable. If the variable happens to be defined as an + * ordinary variable with external linkage in the same compilation unit (which + * amounts to the whole of vmlinux with LTO enabled), Clang will drop the + * segment register prefix from the references, resulting in broken code. Work + * around this by avoiding the symbol used in -mstack-protector-guard-symbol= + * entirely in the C code, and use an alias emitted by the linker script + * instead. + */ +#ifdef CONFIG_STACKPROTECTOR +EXPORT_SYMBOL(__ref_stack_chk_guard); +#endif +#endif diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index d3a814efbff6..20be5758c2d2 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -871,6 +871,8 @@ SYM_FUNC_START(entry_SYSENTER_32) /* Now ready to switch the cr3 */ SWITCH_TO_USER_CR3 scratch_reg=%eax + /* Clobbers ZF */ + CLEAR_CPU_BUFFERS /* * Restore all flags except IF. (We restore IF separately because @@ -881,7 +883,6 @@ SYM_FUNC_START(entry_SYSENTER_32) BUG_IF_WRONG_CR3 no_user_check=1 popfl popl %eax - CLEAR_CPU_BUFFERS /* * Return back to the vDSO, which will pop ecx and edx. @@ -1144,7 +1145,6 @@ SYM_CODE_START(asm_exc_nmi) /* Not on SYSENTER stack. */ call exc_nmi - CLEAR_CPU_BUFFERS jmp .Lnmi_return .Lnmi_from_sysenter_stack: @@ -1165,6 +1165,7 @@ SYM_CODE_START(asm_exc_nmi) CHECK_AND_APPLY_ESPFIX RESTORE_ALL_NMI cr3_reg=%edi pop=4 + CLEAR_CPU_BUFFERS jmp .Lirq_return #ifdef CONFIG_X86_ESPFIX32 @@ -1206,6 +1207,7 @@ SYM_CODE_START(asm_exc_nmi) * 1 - orig_ax */ lss (1+5+6)*4(%esp), %esp # back to espfix stack + CLEAR_CPU_BUFFERS jmp .Lirq_return #endif SYM_CODE_END(asm_exc_nmi) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 534c74b14fab..4d0fb2fba7e2 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -468,3 +468,7 @@ 460 i386 lsm_set_self_attr sys_lsm_set_self_attr 461 i386 lsm_list_modules sys_lsm_list_modules 462 i386 mseal sys_mseal +463 i386 setxattrat sys_setxattrat +464 i386 getxattrat sys_getxattrat +465 i386 listxattrat sys_listxattrat +466 i386 removexattrat sys_removexattrat diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 7093ee21c0d1..5eb708bff1c7 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -386,6 +386,10 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal +463 common setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index bafa73f09e92..872947c1004c 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <asm/vdso.h> +#include <asm/vdso/vsyscall.h> /* * Linker script for vDSO. This is an ELF shared object prelinked to @@ -16,23 +17,16 @@ SECTIONS * segment. */ - vvar_start = . - 4 * PAGE_SIZE; + vvar_start = . - __VVAR_PAGES * PAGE_SIZE; vvar_page = vvar_start; - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; -#include <asm/vvar.h> -#undef EMIT_VVAR + vdso_rng_data = vvar_page + __VDSO_RND_DATA_OFFSET; - pvclock_page = vvar_start + PAGE_SIZE; - hvclock_page = vvar_start + 2 * PAGE_SIZE; - timens_page = vvar_start + 3 * PAGE_SIZE; + timens_page = vvar_start + PAGE_SIZE; -#undef _ASM_X86_VVAR_H - /* Place all vvars in timens too at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) timens_ ## name = timens_page + offset; -#include <asm/vvar.h> -#undef EMIT_VVAR + vclock_pages = vvar_start + VDSO_NR_VCLOCK_PAGES * PAGE_SIZE; + pvclock_page = vclock_pages + VDSO_PAGE_PVCLOCK_OFFSET * PAGE_SIZE; + hvclock_page = vclock_pages + VDSO_PAGE_HVCLOCK_OFFSET * PAGE_SIZE; . = SIZEOF_HEADERS; diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index b8fed8b8b9cc..bfc7cabf4017 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -20,26 +20,20 @@ #include <asm/vgtod.h> #include <asm/proto.h> #include <asm/vdso.h> -#include <asm/vvar.h> #include <asm/tlb.h> #include <asm/page.h> #include <asm/desc.h> #include <asm/cpufeature.h> +#include <asm/vdso/vsyscall.h> #include <clocksource/hyperv_timer.h> -#undef _ASM_X86_VVAR_H -#define EMIT_VVAR(name, offset) \ - const size_t name ## _offset = offset; -#include <asm/vvar.h> - struct vdso_data *arch_get_vdso_data(void *vvar_page) { - return (struct vdso_data *)(vvar_page + _vdso_data_offset); + return (struct vdso_data *)vvar_page; } -#undef EMIT_VVAR -DEFINE_VVAR(struct vdso_data, _vdso_data); -DEFINE_VVAR_SINGLE(struct vdso_rng_data, _vdso_rng_data); +static union vdso_data_store vdso_data_store __page_aligned_data; +struct vdso_data *vdso_data = vdso_data_store.data; unsigned int vclocks_used __read_mostly; @@ -154,7 +148,7 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, if (sym_offset == image->sym_vvar_page) { struct page *timens_page = find_timens_vvar_page(vma); - pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT; + pfn = __pa_symbol(vdso_data) >> PAGE_SHIFT; /* * If a task belongs to a time namespace then a namespace @@ -182,32 +176,52 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, } return vmf_insert_pfn(vma, vmf->address, pfn); - } else if (sym_offset == image->sym_pvclock_page) { - struct pvclock_vsyscall_time_info *pvti = - pvclock_get_pvti_cpu0_va(); - if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) { - return vmf_insert_pfn_prot(vma, vmf->address, - __pa(pvti) >> PAGE_SHIFT, - pgprot_decrypted(vma->vm_page_prot)); - } - } else if (sym_offset == image->sym_hvclock_page) { - pfn = hv_get_tsc_pfn(); - if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK)) - return vmf_insert_pfn(vma, vmf->address, pfn); } else if (sym_offset == image->sym_timens_page) { struct page *timens_page = find_timens_vvar_page(vma); if (!timens_page) return VM_FAULT_SIGBUS; - pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT; + pfn = __pa_symbol(vdso_data) >> PAGE_SHIFT; return vmf_insert_pfn(vma, vmf->address, pfn); } return VM_FAULT_SIGBUS; } +static vm_fault_t vvar_vclock_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + switch (vmf->pgoff) { +#ifdef CONFIG_PARAVIRT_CLOCK + case VDSO_PAGE_PVCLOCK_OFFSET: + { + struct pvclock_vsyscall_time_info *pvti = + pvclock_get_pvti_cpu0_va(); + + if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) + return vmf_insert_pfn_prot(vma, vmf->address, + __pa(pvti) >> PAGE_SHIFT, + pgprot_decrypted(vma->vm_page_prot)); + break; + } +#endif /* CONFIG_PARAVIRT_CLOCK */ +#ifdef CONFIG_HYPERV_TIMER + case VDSO_PAGE_HVCLOCK_OFFSET: + { + unsigned long pfn = hv_get_tsc_pfn(); + + if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK)) + return vmf_insert_pfn(vma, vmf->address, pfn); + break; + } +#endif /* CONFIG_HYPERV_TIMER */ + } + + return VM_FAULT_SIGBUS; +} + static const struct vm_special_mapping vdso_mapping = { .name = "[vdso]", .fault = vdso_fault, @@ -217,6 +231,10 @@ static const struct vm_special_mapping vvar_mapping = { .name = "[vvar]", .fault = vvar_fault, }; +static const struct vm_special_mapping vvar_vclock_mapping = { + .name = "[vvar_vclock]", + .fault = vvar_vclock_fault, +}; /* * Add vdso and vvar mappings to current process. @@ -259,7 +277,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) vma = _install_special_mapping(mm, addr, - -image->sym_vvar_start, + (__VVAR_PAGES - VDSO_NR_VCLOCK_PAGES) * PAGE_SIZE, VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| VM_PFNMAP, &vvar_mapping); @@ -267,11 +285,26 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) if (IS_ERR(vma)) { ret = PTR_ERR(vma); do_munmap(mm, text_start, image->size, NULL); - } else { - current->mm->context.vdso = (void __user *)text_start; - current->mm->context.vdso_image = image; + goto up_fail; } + vma = _install_special_mapping(mm, + addr + (__VVAR_PAGES - VDSO_NR_VCLOCK_PAGES) * PAGE_SIZE, + VDSO_NR_VCLOCK_PAGES * PAGE_SIZE, + VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| + VM_PFNMAP, + &vvar_vclock_mapping); + + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + do_munmap(mm, text_start, image->size, NULL); + do_munmap(mm, addr, image->size, NULL); + goto up_fail; + } + + current->mm->context.vdso = (void __user *)text_start; + current->mm->context.vdso_image = image; + up_fail: mmap_write_unlock(mm); return ret; @@ -293,7 +326,8 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) */ for_each_vma(vmi, vma) { if (vma_is_special_mapping(vma, &vdso_mapping) || - vma_is_special_mapping(vma, &vvar_mapping)) { + vma_is_special_mapping(vma, &vvar_mapping) || + vma_is_special_mapping(vma, &vvar_vclock_mapping)) { mmap_write_unlock(mm); return -EEXIST; } diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 920e3a640cad..b4a1a2576510 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -943,11 +943,12 @@ static int amd_pmu_v2_snapshot_branch_stack(struct perf_branch_entry *entries, u static int amd_pmu_v2_handle_irq(struct pt_regs *regs) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + static atomic64_t status_warned = ATOMIC64_INIT(0); + u64 reserved, status, mask, new_bits, prev_bits; struct perf_sample_data data; struct hw_perf_event *hwc; struct perf_event *event; int handled = 0, idx; - u64 reserved, status, mask; bool pmu_enabled; /* @@ -1012,7 +1013,12 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) * the corresponding PMCs are expected to be inactive according to the * active_mask */ - WARN_ON(status > 0); + if (status > 0) { + prev_bits = atomic64_fetch_or(status, &status_warned); + // A new bit was set for the very first time. + new_bits = status & ~prev_bits; + WARN(new_bits, "New overflows for inactive PMCs: %llx\n", new_bits); + } /* Clear overflow and freeze bits */ amd_pmu_ack_global_status(~status); diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 0bfde2ea5cb8..49c26ce2b115 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -916,7 +916,8 @@ int amd_uncore_umc_ctx_init(struct amd_uncore *uncore, unsigned int cpu) u8 group_num_pmcs[UNCORE_GROUP_MAX] = { 0 }; union amd_uncore_info info; struct amd_uncore_pmu *pmu; - int index = 0, gid, i; + int gid, i; + u16 index = 0; if (pmu_version < 2) return 0; @@ -948,7 +949,7 @@ int amd_uncore_umc_ctx_init(struct amd_uncore *uncore, unsigned int cpu) for_each_set_bit(gid, gmask, UNCORE_GROUP_MAX) { for (i = 0; i < group_num_pmus[gid]; i++) { pmu = &uncore->pmus[index]; - snprintf(pmu->name, sizeof(pmu->name), "amd_umc_%d", index); + snprintf(pmu->name, sizeof(pmu->name), "amd_umc_%hu", index); pmu->num_counters = group_num_pmcs[gid] / group_num_pmus[gid]; pmu->msr_base = MSR_F19H_UMC_PERF_CTL + i * pmu->num_counters * 2; pmu->rdpmc_base = -1; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 65ab6460aed4..c75c482d4c52 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -3003,35 +3003,57 @@ static unsigned long code_segment_base(struct pt_regs *regs) return 0; } -unsigned long perf_instruction_pointer(struct pt_regs *regs) +unsigned long perf_arch_instruction_pointer(struct pt_regs *regs) { - if (perf_guest_state()) - return perf_guest_get_ip(); - return regs->ip + code_segment_base(regs); } -unsigned long perf_misc_flags(struct pt_regs *regs) +static unsigned long common_misc_flags(struct pt_regs *regs) { - unsigned int guest_state = perf_guest_state(); - int misc = 0; + if (regs->flags & PERF_EFLAGS_EXACT) + return PERF_RECORD_MISC_EXACT_IP; - if (guest_state) { - if (guest_state & PERF_GUEST_USER) - misc |= PERF_RECORD_MISC_GUEST_USER; - else - misc |= PERF_RECORD_MISC_GUEST_KERNEL; - } else { - if (user_mode(regs)) - misc |= PERF_RECORD_MISC_USER; - else - misc |= PERF_RECORD_MISC_KERNEL; - } + return 0; +} - if (regs->flags & PERF_EFLAGS_EXACT) - misc |= PERF_RECORD_MISC_EXACT_IP; +static unsigned long guest_misc_flags(struct pt_regs *regs) +{ + unsigned long guest_state = perf_guest_state(); + + if (!(guest_state & PERF_GUEST_ACTIVE)) + return 0; + + if (guest_state & PERF_GUEST_USER) + return PERF_RECORD_MISC_GUEST_USER; + else + return PERF_RECORD_MISC_GUEST_KERNEL; + +} + +static unsigned long host_misc_flags(struct pt_regs *regs) +{ + if (user_mode(regs)) + return PERF_RECORD_MISC_USER; + else + return PERF_RECORD_MISC_KERNEL; +} + +unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) +{ + unsigned long flags = common_misc_flags(regs); + + flags |= guest_misc_flags(regs); + + return flags; +} + +unsigned long perf_arch_misc_flags(struct pt_regs *regs) +{ + unsigned long flags = common_misc_flags(regs); + + flags |= host_misc_flags(regs); - return misc; + return flags; } void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index d879478db3f5..bb284aff7bfd 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3962,8 +3962,8 @@ static int intel_pmu_hw_config(struct perf_event *event) if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; - if (!(event->attr.sample_type & - ~intel_pmu_large_pebs_flags(event))) { + if (!(event->attr.sample_type & ~intel_pmu_large_pebs_flags(event)) && + !has_aux_action(event)) { event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS; event->attach_state |= PERF_ATTACH_SCHED_CB; } @@ -4599,6 +4599,28 @@ static inline bool erratum_hsw11(struct perf_event *event) X86_CONFIG(.event=0xc0, .umask=0x01); } +static struct event_constraint * +arl_h_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + + if (pmu->pmu_type == hybrid_tiny) + return cmt_get_event_constraints(cpuc, idx, event); + + return mtl_get_event_constraints(cpuc, idx, event); +} + +static int arl_h_hw_config(struct perf_event *event) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + + if (pmu->pmu_type == hybrid_tiny) + return intel_pmu_hw_config(event); + + return adl_hw_config(event); +} + /* * The HSW11 requires a period larger than 100 which is the same as the BDM11. * A minimum period of 128 is enforced as well for the INST_RETIRED.ALL. @@ -4924,17 +4946,26 @@ static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void) /* * This essentially just maps between the 'hybrid_cpu_type' - * and 'hybrid_pmu_type' enums: + * and 'hybrid_pmu_type' enums except for ARL-H processor + * which needs to compare atom uarch native id since ARL-H + * contains two different atom uarchs. */ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { enum hybrid_pmu_type pmu_type = x86_pmu.hybrid_pmu[i].pmu_type; + u32 native_id; - if (cpu_type == HYBRID_INTEL_CORE && - pmu_type == hybrid_big) - return &x86_pmu.hybrid_pmu[i]; - if (cpu_type == HYBRID_INTEL_ATOM && - pmu_type == hybrid_small) + if (cpu_type == HYBRID_INTEL_CORE && pmu_type == hybrid_big) return &x86_pmu.hybrid_pmu[i]; + if (cpu_type == HYBRID_INTEL_ATOM) { + if (x86_pmu.num_hybrid_pmus == 2 && pmu_type == hybrid_small) + return &x86_pmu.hybrid_pmu[i]; + + native_id = get_this_hybrid_cpu_native_id(); + if (native_id == skt_native_id && pmu_type == hybrid_small) + return &x86_pmu.hybrid_pmu[i]; + if (native_id == cmt_native_id && pmu_type == hybrid_tiny) + return &x86_pmu.hybrid_pmu[i]; + } } return NULL; @@ -5965,6 +5996,37 @@ static struct attribute *lnl_hybrid_events_attrs[] = { NULL }; +/* The event string must be in PMU IDX order. */ +EVENT_ATTR_STR_HYBRID(topdown-retiring, + td_retiring_arl_h, + "event=0xc2,umask=0x02;event=0x00,umask=0x80;event=0xc2,umask=0x0", + hybrid_big_small_tiny); +EVENT_ATTR_STR_HYBRID(topdown-bad-spec, + td_bad_spec_arl_h, + "event=0x73,umask=0x0;event=0x00,umask=0x81;event=0x73,umask=0x0", + hybrid_big_small_tiny); +EVENT_ATTR_STR_HYBRID(topdown-fe-bound, + td_fe_bound_arl_h, + "event=0x9c,umask=0x01;event=0x00,umask=0x82;event=0x71,umask=0x0", + hybrid_big_small_tiny); +EVENT_ATTR_STR_HYBRID(topdown-be-bound, + td_be_bound_arl_h, + "event=0xa4,umask=0x02;event=0x00,umask=0x83;event=0x74,umask=0x0", + hybrid_big_small_tiny); + +static struct attribute *arl_h_hybrid_events_attrs[] = { + EVENT_PTR(slots_adl), + EVENT_PTR(td_retiring_arl_h), + EVENT_PTR(td_bad_spec_arl_h), + EVENT_PTR(td_fe_bound_arl_h), + EVENT_PTR(td_be_bound_arl_h), + EVENT_PTR(td_heavy_ops_adl), + EVENT_PTR(td_br_mis_adl), + EVENT_PTR(td_fetch_lat_adl), + EVENT_PTR(td_mem_bound_adl), + NULL, +}; + /* Must be in IDX order */ EVENT_ATTR_STR_HYBRID(mem-loads, mem_ld_adl, "event=0xd0,umask=0x5,ldlat=3;event=0xcd,umask=0x1,ldlat=3", hybrid_big_small); EVENT_ATTR_STR_HYBRID(mem-stores, mem_st_adl, "event=0xd0,umask=0x6;event=0xcd,umask=0x2", hybrid_big_small); @@ -5983,6 +6045,21 @@ static struct attribute *mtl_hybrid_mem_attrs[] = { NULL }; +EVENT_ATTR_STR_HYBRID(mem-loads, + mem_ld_arl_h, + "event=0xd0,umask=0x5,ldlat=3;event=0xcd,umask=0x1,ldlat=3;event=0xd0,umask=0x5,ldlat=3", + hybrid_big_small_tiny); +EVENT_ATTR_STR_HYBRID(mem-stores, + mem_st_arl_h, + "event=0xd0,umask=0x6;event=0xcd,umask=0x2;event=0xd0,umask=0x6", + hybrid_big_small_tiny); + +static struct attribute *arl_h_hybrid_mem_attrs[] = { + EVENT_PTR(mem_ld_arl_h), + EVENT_PTR(mem_st_arl_h), + NULL, +}; + EVENT_ATTR_STR_HYBRID(tx-start, tx_start_adl, "event=0xc9,umask=0x1", hybrid_big); EVENT_ATTR_STR_HYBRID(tx-commit, tx_commit_adl, "event=0xc9,umask=0x2", hybrid_big); EVENT_ATTR_STR_HYBRID(tx-abort, tx_abort_adl, "event=0xc9,umask=0x4", hybrid_big); @@ -6006,8 +6083,8 @@ static struct attribute *adl_hybrid_tsx_attrs[] = { FORMAT_ATTR_HYBRID(in_tx, hybrid_big); FORMAT_ATTR_HYBRID(in_tx_cp, hybrid_big); -FORMAT_ATTR_HYBRID(offcore_rsp, hybrid_big_small); -FORMAT_ATTR_HYBRID(ldlat, hybrid_big_small); +FORMAT_ATTR_HYBRID(offcore_rsp, hybrid_big_small_tiny); +FORMAT_ATTR_HYBRID(ldlat, hybrid_big_small_tiny); FORMAT_ATTR_HYBRID(frontend, hybrid_big); #define ADL_HYBRID_RTM_FORMAT_ATTR \ @@ -6030,7 +6107,7 @@ static struct attribute *adl_hybrid_extra_attr[] = { NULL }; -FORMAT_ATTR_HYBRID(snoop_rsp, hybrid_small); +FORMAT_ATTR_HYBRID(snoop_rsp, hybrid_small_tiny); static struct attribute *mtl_hybrid_extra_attr_rtm[] = { ADL_HYBRID_RTM_FORMAT_ATTR, @@ -6238,8 +6315,9 @@ static inline int intel_pmu_v6_addr_offset(int index, bool eventsel) } static const struct { enum hybrid_pmu_type id; char *name; } intel_hybrid_pmu_type_map[] __initconst = { - { hybrid_small, "cpu_atom" }, - { hybrid_big, "cpu_core" }, + { hybrid_small, "cpu_atom" }, + { hybrid_big, "cpu_core" }, + { hybrid_tiny, "cpu_lowpower" }, }; static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus) @@ -6272,7 +6350,7 @@ static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus) 0, x86_pmu_num_counters(&pmu->pmu), 0, 0); pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities; - if (pmu->pmu_type & hybrid_small) { + if (pmu->pmu_type & hybrid_small_tiny) { pmu->intel_cap.perf_metrics = 0; pmu->intel_cap.pebs_output_pt_available = 1; pmu->mid_ack = true; @@ -7111,6 +7189,37 @@ __init int intel_pmu_init(void) name = "lunarlake_hybrid"; break; + case INTEL_ARROWLAKE_H: + intel_pmu_init_hybrid(hybrid_big_small_tiny); + + x86_pmu.pebs_latency_data = arl_h_latency_data; + x86_pmu.get_event_constraints = arl_h_get_event_constraints; + x86_pmu.hw_config = arl_h_hw_config; + + td_attr = arl_h_hybrid_events_attrs; + mem_attr = arl_h_hybrid_mem_attrs; + tsx_attr = adl_hybrid_tsx_attrs; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr; + + /* Initialize big core specific PerfMon capabilities. */ + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX]; + intel_pmu_init_lnc(&pmu->pmu); + + /* Initialize Atom core specific PerfMon capabilities. */ + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX]; + intel_pmu_init_skt(&pmu->pmu); + + /* Initialize Lower Power Atom specific PerfMon capabilities. */ + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_TINY_IDX]; + intel_pmu_init_grt(&pmu->pmu); + pmu->extra_regs = intel_cmt_extra_regs; + + intel_pmu_pebs_data_source_arl_h(); + pr_cont("ArrowLake-H Hybrid events, "); + name = "arrowlake_h_hybrid"; + break; + default: switch (x86_pmu.version) { case 1: diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index fa5ea65de0d0..8afc4ad3cd16 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -177,6 +177,17 @@ void __init intel_pmu_pebs_data_source_mtl(void) __intel_pmu_pebs_data_source_cmt(data_source); } +void __init intel_pmu_pebs_data_source_arl_h(void) +{ + u64 *data_source; + + intel_pmu_pebs_data_source_lnl(); + + data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_TINY_IDX].pebs_data_source; + memcpy(data_source, pebs_data_source, sizeof(pebs_data_source)); + __intel_pmu_pebs_data_source_cmt(data_source); +} + void __init intel_pmu_pebs_data_source_cmt(void) { __intel_pmu_pebs_data_source_cmt(pebs_data_source); @@ -388,6 +399,16 @@ u64 lnl_latency_data(struct perf_event *event, u64 status) return lnc_latency_data(event, status); } +u64 arl_h_latency_data(struct perf_event *event, u64 status) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + + if (pmu->pmu_type == hybrid_tiny) + return cmt_latency_data(event, status); + + return lnl_latency_data(event, status); +} + static u64 load_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index fd4670a6694e..4b0373bc8ab4 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -418,6 +418,9 @@ static void pt_config_start(struct perf_event *event) struct pt *pt = this_cpu_ptr(&pt_ctx); u64 ctl = event->hw.aux_config; + if (READ_ONCE(event->hw.aux_paused)) + return; + ctl |= RTIT_CTL_TRACEEN; if (READ_ONCE(pt->vmx_on)) perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL); @@ -534,7 +537,24 @@ static void pt_config(struct perf_event *event) reg |= (event->attr.config & PT_CONFIG_MASK); event->hw.aux_config = reg; + + /* + * Allow resume before starting so as not to overwrite a value set by a + * PMI. + */ + barrier(); + WRITE_ONCE(pt->resume_allowed, 1); + /* Configuration is complete, it is now OK to handle an NMI */ + barrier(); + WRITE_ONCE(pt->handle_nmi, 1); + barrier(); pt_config_start(event); + barrier(); + /* + * Allow pause after starting so its pt_config_stop() doesn't race with + * pt_config_start(). + */ + WRITE_ONCE(pt->pause_allowed, 1); } static void pt_config_stop(struct perf_event *event) @@ -828,11 +848,13 @@ static void pt_buffer_advance(struct pt_buffer *buf) buf->cur_idx++; if (buf->cur_idx == buf->cur->last) { - if (buf->cur == buf->last) + if (buf->cur == buf->last) { buf->cur = buf->first; - else + buf->wrapped = true; + } else { buf->cur = list_entry(buf->cur->list.next, struct topa, list); + } buf->cur_idx = 0; } } @@ -846,8 +868,11 @@ static void pt_buffer_advance(struct pt_buffer *buf) static void pt_update_head(struct pt *pt) { struct pt_buffer *buf = perf_get_aux(&pt->handle); + bool wrapped = buf->wrapped; u64 topa_idx, base, old; + buf->wrapped = false; + if (buf->single) { local_set(&buf->data_size, buf->output_off); return; @@ -865,7 +890,7 @@ static void pt_update_head(struct pt *pt) } else { old = (local64_xchg(&buf->head, base) & ((buf->nr_pages << PAGE_SHIFT) - 1)); - if (base < old) + if (base < old || (base == old && wrapped)) base += buf->nr_pages << PAGE_SHIFT; local_add(base - old, &buf->data_size); @@ -1511,6 +1536,7 @@ void intel_pt_interrupt(void) buf = perf_aux_output_begin(&pt->handle, event); if (!buf) { event->hw.state = PERF_HES_STOPPED; + WRITE_ONCE(pt->resume_allowed, 0); return; } @@ -1519,6 +1545,7 @@ void intel_pt_interrupt(void) ret = pt_buffer_reset_markers(buf, &pt->handle); if (ret) { perf_aux_output_end(&pt->handle, 0); + WRITE_ONCE(pt->resume_allowed, 0); return; } @@ -1573,6 +1600,26 @@ static void pt_event_start(struct perf_event *event, int mode) struct pt *pt = this_cpu_ptr(&pt_ctx); struct pt_buffer *buf; + if (mode & PERF_EF_RESUME) { + if (READ_ONCE(pt->resume_allowed)) { + u64 status; + + /* + * Only if the trace is not active and the error and + * stopped bits are clear, is it safe to start, but a + * PMI might have just cleared these, so resume_allowed + * must be checked again also. + */ + rdmsrl(MSR_IA32_RTIT_STATUS, status); + if (!(status & (RTIT_STATUS_TRIGGEREN | + RTIT_STATUS_ERROR | + RTIT_STATUS_STOPPED)) && + READ_ONCE(pt->resume_allowed)) + pt_config_start(event); + } + return; + } + buf = perf_aux_output_begin(&pt->handle, event); if (!buf) goto fail_stop; @@ -1583,7 +1630,6 @@ static void pt_event_start(struct perf_event *event, int mode) goto fail_end_stop; } - WRITE_ONCE(pt->handle_nmi, 1); hwc->state = 0; pt_config_buffer(buf); @@ -1601,6 +1647,12 @@ static void pt_event_stop(struct perf_event *event, int mode) { struct pt *pt = this_cpu_ptr(&pt_ctx); + if (mode & PERF_EF_PAUSE) { + if (READ_ONCE(pt->pause_allowed)) + pt_config_stop(event); + return; + } + /* * Protect against the PMI racing with disabling wrmsr, * see comment in intel_pt_interrupt(). @@ -1608,6 +1660,15 @@ static void pt_event_stop(struct perf_event *event, int mode) WRITE_ONCE(pt->handle_nmi, 0); barrier(); + /* + * Prevent a resume from attempting to restart tracing, or a pause + * during a subsequent start. Do this after clearing handle_nmi so that + * pt_event_snapshot_aux() will not re-allow them. + */ + WRITE_ONCE(pt->pause_allowed, 0); + WRITE_ONCE(pt->resume_allowed, 0); + barrier(); + pt_config_stop(event); if (event->hw.state == PERF_HES_STOPPED) @@ -1657,6 +1718,10 @@ static long pt_event_snapshot_aux(struct perf_event *event, if (WARN_ON_ONCE(!buf->snapshot)) return 0; + /* Prevent pause/resume from attempting to start/stop tracing */ + WRITE_ONCE(pt->pause_allowed, 0); + WRITE_ONCE(pt->resume_allowed, 0); + barrier(); /* * There is no PT interrupt in this mode, so stop the trace and it will * remain stopped while the buffer is copied. @@ -1676,8 +1741,13 @@ static long pt_event_snapshot_aux(struct perf_event *event, * Here, handle_nmi tells us if the tracing was on. * If the tracing was on, restart it. */ - if (READ_ONCE(pt->handle_nmi)) + if (READ_ONCE(pt->handle_nmi)) { + WRITE_ONCE(pt->resume_allowed, 1); + barrier(); pt_config_start(event); + barrier(); + WRITE_ONCE(pt->pause_allowed, 1); + } return ret; } @@ -1793,7 +1863,9 @@ static __init int pt_init(void) if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG; - pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE; + pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | + PERF_PMU_CAP_ITRACE | + PERF_PMU_CAP_AUX_PAUSE; pt_pmu.pmu.attr_groups = pt_attr_groups; pt_pmu.pmu.task_ctx_nr = perf_sw_context; pt_pmu.pmu.event_init = pt_event_init; diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index f5e46c04c145..7ee94fc6d7cb 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -65,6 +65,7 @@ struct pt_pmu { * @head: logical write offset inside the buffer * @snapshot: if this is for a snapshot/overwrite counter * @single: use Single Range Output instead of ToPA + * @wrapped: buffer advance wrapped back to the first topa table * @stop_pos: STOP topa entry index * @intr_pos: INT topa entry index * @stop_te: STOP topa entry pointer @@ -82,6 +83,7 @@ struct pt_buffer { local64_t head; bool snapshot; bool single; + bool wrapped; long stop_pos, intr_pos; struct topa_entry *stop_te, *intr_te; void **data_pages; @@ -117,6 +119,8 @@ struct pt_filters { * @filters: last configured filters * @handle_nmi: do handle PT PMI on this cpu, there's an active event * @vmx_on: 1 if VMX is ON on this cpu + * @pause_allowed: PERF_EF_PAUSE is allowed to stop tracing + * @resume_allowed: PERF_EF_RESUME is allowed to start tracing * @output_base: cached RTIT_OUTPUT_BASE MSR value * @output_mask: cached RTIT_OUTPUT_MASK MSR value */ @@ -125,6 +129,8 @@ struct pt { struct pt_filters filters; int handle_nmi; int vmx_on; + int pause_allowed; + int resume_allowed; u64 output_base; u64 output_mask; }; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ac1182141bf6..82c6f45ce975 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -668,24 +668,38 @@ enum { #define PERF_PEBS_DATA_SOURCE_GRT_MAX 0x10 #define PERF_PEBS_DATA_SOURCE_GRT_MASK (PERF_PEBS_DATA_SOURCE_GRT_MAX - 1) +/* + * CPUID.1AH.EAX[31:0] uniquely identifies the microarchitecture + * of the core. Bits 31-24 indicates its core type (Core or Atom) + * and Bits [23:0] indicates the native model ID of the core. + * Core type and native model ID are defined in below enumerations. + */ enum hybrid_cpu_type { HYBRID_INTEL_NONE, HYBRID_INTEL_ATOM = 0x20, HYBRID_INTEL_CORE = 0x40, }; +#define X86_HYBRID_PMU_ATOM_IDX 0 +#define X86_HYBRID_PMU_CORE_IDX 1 +#define X86_HYBRID_PMU_TINY_IDX 2 + enum hybrid_pmu_type { not_hybrid, - hybrid_small = BIT(0), - hybrid_big = BIT(1), - - hybrid_big_small = hybrid_big | hybrid_small, /* only used for matching */ + hybrid_small = BIT(X86_HYBRID_PMU_ATOM_IDX), + hybrid_big = BIT(X86_HYBRID_PMU_CORE_IDX), + hybrid_tiny = BIT(X86_HYBRID_PMU_TINY_IDX), + + /* The belows are only used for matching */ + hybrid_big_small = hybrid_big | hybrid_small, + hybrid_small_tiny = hybrid_small | hybrid_tiny, + hybrid_big_small_tiny = hybrid_big | hybrid_small_tiny, }; -#define X86_HYBRID_PMU_ATOM_IDX 0 -#define X86_HYBRID_PMU_CORE_IDX 1 - -#define X86_HYBRID_NUM_PMUS 2 +enum atom_native_id { + cmt_native_id = 0x2, /* Crestmont */ + skt_native_id = 0x3, /* Skymont */ +}; struct x86_hybrid_pmu { struct pmu pmu; @@ -1578,6 +1592,8 @@ u64 cmt_latency_data(struct perf_event *event, u64 status); u64 lnl_latency_data(struct perf_event *event, u64 status); +u64 arl_h_latency_data(struct perf_event *event, u64 status); + extern struct event_constraint intel_core2_pebs_event_constraints[]; extern struct event_constraint intel_atom_pebs_event_constraints[]; @@ -1697,6 +1713,8 @@ void intel_pmu_pebs_data_source_grt(void); void intel_pmu_pebs_data_source_mtl(void); +void intel_pmu_pebs_data_source_arl_h(void); + void intel_pmu_pebs_data_source_cmt(void); void intel_pmu_pebs_data_source_lnl(void); diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index a481a939862e..a8defc813c36 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -148,7 +148,6 @@ struct rapl_model { /* 1/2^hw_unit Joule */ static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; static struct rapl_pmus *rapl_pmus; -static cpumask_t rapl_cpu_mask; static unsigned int rapl_cntr_mask; static u64 rapl_timer_ms; static struct perf_msr *rapl_msrs; @@ -369,8 +368,6 @@ static int rapl_pmu_event_init(struct perf_event *event) if (event->cpu < 0) return -EINVAL; - event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; - if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) return -EINVAL; @@ -389,7 +386,6 @@ static int rapl_pmu_event_init(struct perf_event *event) pmu = cpu_to_rapl_pmu(event->cpu); if (!pmu) return -EINVAL; - event->cpu = pmu->cpu; event->pmu_private = pmu; event->hw.event_base = rapl_msrs[bit].msr; event->hw.config = cfg; @@ -403,23 +399,6 @@ static void rapl_pmu_event_read(struct perf_event *event) rapl_event_update(event); } -static ssize_t rapl_get_attr_cpumask(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); -} - -static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); - -static struct attribute *rapl_pmu_attrs[] = { - &dev_attr_cpumask.attr, - NULL, -}; - -static struct attribute_group rapl_pmu_attr_group = { - .attrs = rapl_pmu_attrs, -}; - RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); @@ -467,7 +446,6 @@ static struct attribute_group rapl_pmu_format_group = { }; static const struct attribute_group *rapl_attr_groups[] = { - &rapl_pmu_attr_group, &rapl_pmu_format_group, &rapl_pmu_events_group, NULL, @@ -570,65 +548,6 @@ static struct perf_msr amd_rapl_msrs[] = { [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, }; -static int rapl_cpu_offline(unsigned int cpu) -{ - struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); - int target; - - /* Check if exiting cpu is used for collecting rapl events */ - if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask)) - return 0; - - pmu->cpu = -1; - /* Find a new cpu to collect rapl events */ - target = cpumask_any_but(get_rapl_pmu_cpumask(cpu), cpu); - - /* Migrate rapl events to the new target */ - if (target < nr_cpu_ids) { - cpumask_set_cpu(target, &rapl_cpu_mask); - pmu->cpu = target; - perf_pmu_migrate_context(pmu->pmu, cpu, target); - } - return 0; -} - -static int rapl_cpu_online(unsigned int cpu) -{ - s32 rapl_pmu_idx = get_rapl_pmu_idx(cpu); - if (rapl_pmu_idx < 0) { - pr_err("topology_logical_(package/die)_id() returned a negative value"); - return -EINVAL; - } - struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); - int target; - - if (!pmu) { - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); - if (!pmu) - return -ENOMEM; - - raw_spin_lock_init(&pmu->lock); - INIT_LIST_HEAD(&pmu->active_list); - pmu->pmu = &rapl_pmus->pmu; - pmu->timer_interval = ms_to_ktime(rapl_timer_ms); - rapl_hrtimer_init(pmu); - - rapl_pmus->pmus[rapl_pmu_idx] = pmu; - } - - /* - * Check if there is an online cpu in the package which collects rapl - * events already. - */ - target = cpumask_any_and(&rapl_cpu_mask, get_rapl_pmu_cpumask(cpu)); - if (target < nr_cpu_ids) - return 0; - - cpumask_set_cpu(cpu, &rapl_cpu_mask); - pmu->cpu = cpu; - return 0; -} - static int rapl_check_hw_unit(struct rapl_model *rm) { u64 msr_rapl_power_unit_bits; @@ -707,12 +626,41 @@ static const struct attribute_group *rapl_attr_update[] = { NULL, }; +static int __init init_rapl_pmu(void) +{ + struct rapl_pmu *pmu; + int idx; + + for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) { + pmu = kzalloc(sizeof(*pmu), GFP_KERNEL); + if (!pmu) + goto free; + + raw_spin_lock_init(&pmu->lock); + INIT_LIST_HEAD(&pmu->active_list); + pmu->pmu = &rapl_pmus->pmu; + pmu->timer_interval = ms_to_ktime(rapl_timer_ms); + rapl_hrtimer_init(pmu); + + rapl_pmus->pmus[idx] = pmu; + } + + return 0; +free: + for (; idx > 0; idx--) + kfree(rapl_pmus->pmus[idx - 1]); + return -ENOMEM; +} + static int __init init_rapl_pmus(void) { int nr_rapl_pmu = topology_max_packages(); + int rapl_pmu_scope = PERF_PMU_SCOPE_PKG; - if (!rapl_pmu_is_pkg_scope()) + if (!rapl_pmu_is_pkg_scope()) { nr_rapl_pmu *= topology_max_dies_per_package(); + rapl_pmu_scope = PERF_PMU_SCOPE_DIE; + } rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL); if (!rapl_pmus) @@ -728,9 +676,11 @@ static int __init init_rapl_pmus(void) rapl_pmus->pmu.start = rapl_pmu_event_start; rapl_pmus->pmu.stop = rapl_pmu_event_stop; rapl_pmus->pmu.read = rapl_pmu_event_read; + rapl_pmus->pmu.scope = rapl_pmu_scope; rapl_pmus->pmu.module = THIS_MODULE; rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; - return 0; + + return init_rapl_pmu(); } static struct rapl_model model_snb = { @@ -876,24 +826,13 @@ static int __init rapl_pmu_init(void) if (ret) return ret; - /* - * Install callbacks. Core will call them for each online cpu. - */ - ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE, - "perf/x86/rapl:online", - rapl_cpu_online, rapl_cpu_offline); - if (ret) - goto out; - ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); if (ret) - goto out1; + goto out; rapl_advertise(); return 0; -out1: - cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); out: pr_warn("Initialization failed (%d), disabled\n", ret); cleanup_rapl_pmus(); @@ -903,7 +842,6 @@ module_init(rapl_pmu_init); static void __exit intel_rapl_exit(void) { - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); perf_pmu_unregister(&rapl_pmus->pmu); cleanup_rapl_pmus(); } diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 6f3b6aef47ba..d0caac26533f 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -116,7 +116,10 @@ static inline bool amd_gart_present(void) #define amd_nb_num(x) 0 #define amd_nb_has_feature(x) false -#define node_to_amd_nb(x) NULL +static inline struct amd_northbridge *node_to_amd_nb(int node) +{ + return NULL; +} #define amd_gart_present(x) false #endif diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 25466c4d2134..3674006e3974 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -20,3 +20,6 @@ extern void cmpxchg8b_emu(void); #endif +#if defined(__GENKSYMS__) && defined(CONFIG_STACKPROTECTOR) +extern unsigned long __ref_stack_chk_guard; +#endif diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 1f650b4dde50..6c6e9b9f98a4 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -51,7 +51,8 @@ static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v) #ifdef CONFIG_X86_CMPXCHG64 #define __alternative_atomic64(f, g, out, in...) \ asm volatile("call %c[func]" \ - : out : [func] "i" (atomic64_##g##_cx8), ## in) + : ALT_OUTPUT_SP(out) \ + : [func] "i" (atomic64_##g##_cx8), ## in) #define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8) #else diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 62cef2113ca7..fd1282a783dd 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -94,7 +94,7 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, asm volatile(ALTERNATIVE(_lock_loc \ "call cmpxchg8b_emu", \ _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ - : "+a" (o.low), "+d" (o.high) \ + : ALT_OUTPUT_SP("+a" (o.low), "+d" (o.high)) \ : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \ : "memory"); \ \ @@ -123,8 +123,8 @@ static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 "call cmpxchg8b_emu", \ _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ CC_SET(e) \ - : CC_OUT(e) (ret), \ - "+a" (o.low), "+d" (o.high) \ + : ALT_OUTPUT_SP(CC_OUT(e) (ret), \ + "+a" (o.low), "+d" (o.high)) \ : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \ : "memory"); \ \ diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index aa30fd8cad7f..98eced5084ca 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -26,12 +26,13 @@ int mwait_usable(const struct cpuinfo_x86 *); unsigned int x86_family(unsigned int sig); unsigned int x86_model(unsigned int sig); unsigned int x86_stepping(unsigned int sig); -#ifdef CONFIG_CPU_SUP_INTEL +#ifdef CONFIG_X86_BUS_LOCK_DETECT extern void __init sld_setup(struct cpuinfo_x86 *c); extern bool handle_user_split_lock(struct pt_regs *regs, long error_code); extern bool handle_guest_split_lock(unsigned long ip); extern void handle_bus_lock(struct pt_regs *regs); -u8 get_this_hybrid_cpu_type(void); +void split_lock_init(void); +void bus_lock_init(void); #else static inline void __init sld_setup(struct cpuinfo_x86 *c) {} static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code) @@ -45,11 +46,23 @@ static inline bool handle_guest_split_lock(unsigned long ip) } static inline void handle_bus_lock(struct pt_regs *regs) {} +static inline void split_lock_init(void) {} +static inline void bus_lock_init(void) {} +#endif +#ifdef CONFIG_CPU_SUP_INTEL +u8 get_this_hybrid_cpu_type(void); +u32 get_this_hybrid_cpu_native_id(void); +#else static inline u8 get_this_hybrid_cpu_type(void) { return 0; } + +static inline u32 get_this_hybrid_cpu_native_id(void) +{ + return 0; +} #endif #ifdef CONFIG_IA32_FEAT_CTL void init_ia32_feat_ctl(struct cpuinfo_x86 *c); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index dd4682857c12..ea33439a5d00 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -215,7 +215,7 @@ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* Disable Speculative Store Bypass. */ #define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* AMD SSBD implementation via LS_CFG MSR */ #define X86_FEATURE_IBRS ( 7*32+25) /* "ibrs" Indirect Branch Restricted Speculation */ -#define X86_FEATURE_IBPB ( 7*32+26) /* "ibpb" Indirect Branch Prediction Barrier */ +#define X86_FEATURE_IBPB ( 7*32+26) /* "ibpb" Indirect Branch Prediction Barrier without a guaranteed RSB flush */ #define X86_FEATURE_STIBP ( 7*32+27) /* "stibp" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ZEN ( 7*32+28) /* Generic flag for all Zen and newer */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* L1TF workaround PTE inversion */ @@ -348,6 +348,7 @@ #define X86_FEATURE_CPPC (13*32+27) /* "cppc" Collaborative Processor Performance Control */ #define X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ #define X86_FEATURE_BTC_NO (13*32+29) /* Not vulnerable to Branch Type Confusion */ +#define X86_FEATURE_AMD_IBPB_RET (13*32+30) /* IBPB clears return address predictor */ #define X86_FEATURE_BRS (13*32+31) /* "brs" Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ @@ -472,7 +473,9 @@ #define X86_FEATURE_BHI_CTRL (21*32+ 2) /* BHI_DIS_S HW control available */ #define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */ #define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */ -#define X86_FEATURE_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */ +#define X86_FEATURE_AMD_FAST_CPPC (21*32 + 5) /* Fast CPPC */ +#define X86_FEATURE_AMD_HETEROGENEOUS_CORES (21*32 + 6) /* Heterogeneous Core Topology */ +#define X86_FEATURE_AMD_WORKLOAD_CLASS (21*32 + 7) /* Workload Classification */ /* * BUG word(s) @@ -523,4 +526,5 @@ #define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* "div0" AMD DIV0 speculation bug */ #define X86_BUG_RFDS X86_BUG(1*32 + 2) /* "rfds" CPU is vulnerable to Register File Data Sampling */ #define X86_BUG_BHI X86_BUG(1*32 + 3) /* "bhi" CPU is affected by Branch History Injection */ +#define X86_BUG_IBPB_NO_RET X86_BUG(1*32 + 4) /* "ibpb_no_ret" IBPB omits return target predictions */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 0152a81d9b4a..6e8cf0fa48fc 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_FTRACE_H #define _ASM_X86_FTRACE_H +#include <asm/ptrace.h> + #ifdef CONFIG_FUNCTION_TRACER #ifndef CC_USING_FENTRY # error Compiler does not support fentry? @@ -33,37 +35,21 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) } #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS -struct ftrace_regs { - struct pt_regs regs; -}; + +#include <linux/ftrace_regs.h> static __always_inline struct pt_regs * arch_ftrace_get_regs(struct ftrace_regs *fregs) { /* Only when FL_SAVE_REGS is set, cs will be non zero */ - if (!fregs->regs.cs) + if (!arch_ftrace_regs(fregs)->regs.cs) return NULL; - return &fregs->regs; + return &arch_ftrace_regs(fregs)->regs; } #define ftrace_regs_set_instruction_pointer(fregs, _ip) \ - do { (fregs)->regs.ip = (_ip); } while (0) - -#define ftrace_regs_get_instruction_pointer(fregs) \ - ((fregs)->regs.ip) - -#define ftrace_regs_get_argument(fregs, n) \ - regs_get_kernel_argument(&(fregs)->regs, n) -#define ftrace_regs_get_stack_pointer(fregs) \ - kernel_stack_pointer(&(fregs)->regs) -#define ftrace_regs_return_value(fregs) \ - regs_return_value(&(fregs)->regs) -#define ftrace_regs_set_return_value(fregs, ret) \ - regs_set_return_value(&(fregs)->regs, ret) -#define ftrace_override_function_with_return(fregs) \ - override_function_with_return(&(fregs)->regs) -#define ftrace_regs_query_register_offset(name) \ - regs_query_register_offset(name) + do { arch_ftrace_regs(fregs)->regs.ip = (_ip); } while (0) + struct ftrace_ops; #define ftrace_graph_func ftrace_graph_func @@ -88,7 +74,7 @@ __arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) regs->orig_ax = addr; } #define arch_ftrace_set_direct_caller(fregs, addr) \ - __arch_ftrace_set_direct_caller(&(fregs)->regs, addr) + __arch_ftrace_set_direct_caller(&arch_ftrace_regs(fregs)->regs, addr) #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 1a42f829667a..6d7b04ffc5fd 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -177,10 +177,15 @@ #define INTEL_XEON_PHI_KNM IFM(6, 0x85) /* Knights Mill */ /* Family 5 */ -#define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */ #define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */ /* Family 19 */ #define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */ +/* CPU core types */ +enum intel_cpu_type { + INTEL_CPU_TYPE_ATOM = 0x20, + INTEL_CPU_TYPE_CORE = 0x40, +}; + #endif /* _ASM_X86_INTEL_FAMILY_H */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 3b9970117a0f..4543cf2eb5e8 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -61,6 +61,7 @@ * - TCC bit is present in MCx_STATUS. */ #define MCI_CONFIG_MCAX 0x1 +#define MCI_CONFIG_FRUTEXT BIT_ULL(9) #define MCI_IPID_MCATYPE 0xFFFF0000 #define MCI_IPID_HWID 0xFFF @@ -122,6 +123,9 @@ #define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008 #define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009 #define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a +/* Registers MISC2 to MISC4 are at offsets B to D. */ +#define MSR_AMD64_SMCA_MC0_SYND1 0xc000200e +#define MSR_AMD64_SMCA_MC0_SYND2 0xc000200f #define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x)) @@ -132,6 +136,8 @@ #define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x))) +#define MSR_AMD64_SMCA_MCx_SYND1(x) (MSR_AMD64_SMCA_MC0_SYND1 + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_SYND2(x) (MSR_AMD64_SMCA_MC0_SYND2 + 0x10*(x)) #define XEC(x, mask) (((x) >> 16) & mask) @@ -187,6 +193,32 @@ enum mce_notifier_prios { MCE_PRIO_HIGHEST = MCE_PRIO_CEC }; +/** + * struct mce_hw_err - Hardware Error Record. + * @m: Machine Check record. + * @vendor: Vendor-specific error information. + * + * Vendor-specific fields should not be added to struct mce. Instead, vendors + * should export their vendor-specific data through their structure in the + * vendor union below. + * + * AMD's vendor data is parsed by error decoding tools for supplemental error + * information. Thus, current offsets of existing fields must be maintained. + * Only add new fields at the end of AMD's vendor structure. + */ +struct mce_hw_err { + struct mce m; + + union vendor_info { + struct { + u64 synd1; /* MCA_SYND1 MSR */ + u64 synd2; /* MCA_SYND2 MSR */ + } amd; + } vendor; +}; + +#define to_mce_hw_err(mce) container_of(mce, struct mce_hw_err, m) + struct notifier_block; extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); @@ -221,8 +253,8 @@ static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { return -EINVAL; } #endif -void mce_prep_record(struct mce *m); -void mce_log(struct mce *m); +void mce_prep_record(struct mce_hw_err *err); +void mce_log(struct mce_hw_err *err); DECLARE_PER_CPU(struct device *, mce_device); /* Maximum number of MCA banks per CPU. */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index ff5f1ecc7d1e..96b410b1d4e8 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -323,7 +323,16 @@ * Note: Only the memory operand variant of VERW clears the CPU buffers. */ .macro CLEAR_CPU_BUFFERS - ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF +#ifdef CONFIG_X86_64 + ALTERNATIVE "", "verw mds_verw_sel(%rip)", X86_FEATURE_CLEAR_CPU_BUF +#else + /* + * In 32bit mode, the memory operand must be a %cs reference. The data + * segments may not be usable (vm86 mode), and the stack segment may not + * be flat (ESPFIX32). + */ + ALTERNATIVE "", "verw %cs:mds_verw_sel", X86_FEATURE_CLEAR_CPU_BUF +#endif .endm #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 52f1b4ff0cc1..974688973cf6 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -6,10 +6,7 @@ #include <linux/types.h> #include <linux/mem_encrypt.h> -/* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT CONFIG_PAGE_SHIFT -#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) +#include <vdso/page.h> #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 91b73571412f..d95f902acc52 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -536,15 +536,17 @@ struct x86_perf_regs { u64 *xmm_regs; }; -extern unsigned long perf_instruction_pointer(struct pt_regs *regs); -extern unsigned long perf_misc_flags(struct pt_regs *regs); -#define perf_misc_flags(regs) perf_misc_flags(regs) +extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs); +extern unsigned long perf_arch_misc_flags(struct pt_regs *regs); +extern unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs); +#define perf_arch_misc_flags(regs) perf_arch_misc_flags(regs) +#define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) #include <asm/stacktrace.h> /* - * We abuse bit 3 from flags to pass exact information, see perf_misc_flags - * and the comment with PERF_EFLAGS_EXACT. + * We abuse bit 3 from flags to pass exact information, see + * perf_arch_misc_flags() and the comment with PERF_EFLAGS_EXACT. */ #define perf_arch_fetch_caller_regs(regs, __ip) { \ (regs)->ip = (__ip); \ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 4a686f0e5dbf..c0975815980c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -105,6 +105,24 @@ struct cpuinfo_topology { // Cache level topology IDs u32 llc_id; u32 l2c_id; + + // Hardware defined CPU-type + union { + u32 cpu_type; + struct { + // CPUID.1A.EAX[23-0] + u32 intel_native_model_id :24; + // CPUID.1A.EAX[31-24] + u32 intel_type :8; + }; + struct { + // CPUID 0x80000026.EBX + u32 amd_num_processors :16, + amd_power_eff_ranking :8, + amd_native_model_id :4, + amd_type :4; + }; + }; }; struct cpuinfo_x86 { diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index c02183d3cdd7..ecd58ea9a837 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -26,7 +26,7 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_APM 1 typedef void (cpu_emergency_virt_cb)(void); -#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) +#if IS_ENABLED(CONFIG_KVM_X86) void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_disable_virtualization(void); @@ -34,7 +34,7 @@ void cpu_emergency_disable_virtualization(void); static inline void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) {} static inline void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) {} static inline void cpu_emergency_disable_virtualization(void) {} -#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ +#endif /* CONFIG_KVM_X86 */ typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); diff --git a/arch/x86/include/asm/runtime-const.h b/arch/x86/include/asm/runtime-const.h index 24e3a53ca255..6652ebddfd02 100644 --- a/arch/x86/include/asm/runtime-const.h +++ b/arch/x86/include/asm/runtime-const.h @@ -6,7 +6,7 @@ typeof(sym) __ret; \ asm_inline("mov %1,%0\n1:\n" \ ".pushsection runtime_ptr_" #sym ",\"a\"\n\t" \ - ".long 1b - %c2 - .\n\t" \ + ".long 1b - %c2 - .\n" \ ".popsection" \ :"=r" (__ret) \ :"i" ((unsigned long)0x0123456789abcdefull), \ @@ -20,7 +20,7 @@ typeof(0u+(val)) __ret = (val); \ asm_inline("shrl $12,%k0\n1:\n" \ ".pushsection runtime_shift_" #sym ",\"a\"\n\t" \ - ".long 1b - 1 - .\n\t" \ + ".long 1b - 1 - .\n" \ ".popsection" \ :"+r" (__ret)); \ __ret; }) diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 98726c2b04f8..50f5666938c0 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -220,4 +220,31 @@ struct snp_psc_desc { #define GHCB_ERR_INVALID_INPUT 5 #define GHCB_ERR_INVALID_EVENT 6 +struct sev_config { + __u64 debug : 1, + + /* + * Indicates when the per-CPU GHCB has been created and registered + * and thus can be used by the BSP instead of the early boot GHCB. + * + * For APs, the per-CPU GHCB is created before they are started + * and registered upon startup, so this flag can be used globally + * for the BSP and APs. + */ + ghcbs_initialized : 1, + + /* + * Indicates when the per-CPU SVSM CA is to be used instead of the + * boot SVSM CA. + * + * For APs, the per-CPU SVSM CA is created as part of the AP + * bringup, so this flag can be used globally for the BSP and APs. + */ + use_cas : 1, + + __reserved : 61; +}; + +extern struct sev_config sev_cfg; + #endif diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index ee34ab00a8d6..91f08af31078 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -120,6 +120,9 @@ struct snp_req_data { }; #define MAX_AUTHTAG_LEN 32 +#define AUTHTAG_LEN 16 +#define AAD_LEN 48 +#define MSG_HDR_VER 1 /* See SNP spec SNP_GUEST_REQUEST section for the structure */ enum msg_type { @@ -171,6 +174,19 @@ struct sev_guest_platform_data { u64 secrets_gpa; }; +struct snp_guest_req { + void *req_buf; + size_t req_sz; + + void *resp_buf; + size_t resp_sz; + + u64 exit_code; + unsigned int vmpck_id; + u8 msg_version; + u8 msg_type; +}; + /* * The secrets page contains 96-bytes of reserved field that can be used by * the guest OS. The guest OS uses the area to save the message sequence @@ -218,6 +234,27 @@ struct snp_secrets_page { u8 rsvd4[3744]; } __packed; +struct snp_msg_desc { + /* request and response are in unencrypted memory */ + struct snp_guest_msg *request, *response; + + /* + * Avoid information leakage by double-buffering shared messages + * in fields that are in regular encrypted memory. + */ + struct snp_guest_msg secret_request, secret_response; + + struct snp_secrets_page *secrets; + struct snp_req_data input; + + void *certs_data; + + struct aesgcm_ctx *ctx; + + u32 *os_area_msg_seqno; + u8 *vmpck; +}; + /* * The SVSM Calling Area (CA) related structures. */ @@ -285,6 +322,22 @@ struct svsm_attest_call { u8 rsvd[4]; }; +/* PTE descriptor used for the prepare_pte_enc() operations. */ +struct pte_enc_desc { + pte_t *kpte; + int pte_level; + bool encrypt; + /* pfn of the kpte above */ + unsigned long pfn; + /* physical address of @pfn */ + unsigned long pa; + /* virtual address of @pfn */ + void *va; + /* memory covered by the pte */ + unsigned long size; + pgprot_t new_pgprot; +}; + /* * SVSM protocol structure */ @@ -392,13 +445,18 @@ void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); void __noreturn snp_abort(void); void snp_dmi_setup(void); -int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio); +int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input, + struct snp_guest_request_ioctl *rio); int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input); void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 snp_get_unsupported_features(u64 status); u64 sev_get_status(void); void sev_show_status(void); void snp_update_svsm_ca(void); +int prepare_pte_enc(struct pte_enc_desc *d); +void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot); +void snp_kexec_finish(void); +void snp_kexec_begin(void); #else /* !CONFIG_AMD_MEM_ENCRYPT */ @@ -422,7 +480,8 @@ static inline void snp_set_wakeup_secondary_cpu(void) { } static inline bool snp_init(struct boot_params *bp) { return false; } static inline void snp_abort(void) { } static inline void snp_dmi_setup(void) { } -static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio) +static inline int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input, + struct snp_guest_request_ioctl *rio) { return -ENOTTY; } @@ -435,6 +494,10 @@ static inline u64 snp_get_unsupported_features(u64 status) { return 0; } static inline u64 sev_get_status(void) { return 0; } static inline void sev_show_status(void) { } static inline void snp_update_svsm_ca(void) { } +static inline int prepare_pte_enc(struct pte_enc_desc *d) { return 0; } +static inline void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot) { } +static inline void snp_kexec_finish(void) { } +static inline void snp_kexec_begin(void) { } #endif /* CONFIG_AMD_MEM_ENCRYPT */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 12da7dfd5ef1..a55c214f3ba6 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -87,8 +87,9 @@ struct thread_info { #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ -#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ -#define TIF_SSBD 5 /* Speculative store bypass disable */ +#define TIF_NEED_RESCHED_LAZY 4 /* Lazy rescheduling needed */ +#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/ +#define TIF_SSBD 6 /* Speculative store bypass disable */ #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ #define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ @@ -110,6 +111,7 @@ struct thread_info { #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_SSBD (1 << TIF_SSBD) #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 7365dd4acffb..23baf8c9b34c 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -6,8 +6,6 @@ #include <linux/interrupt.h> #include <linux/math64.h> -#define TICK_SIZE (tick_nsec / 1000) - unsigned long long native_sched_clock(void); extern void recalibrate_cpu_khz(void); diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index aef70336d624..fd41103ad342 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -114,6 +114,12 @@ enum x86_topology_domains { TOPO_MAX_DOMAIN, }; +enum x86_topology_cpu_type { + TOPO_CPU_TYPE_PERFORMANCE, + TOPO_CPU_TYPE_EFFICIENCY, + TOPO_CPU_TYPE_UNKNOWN, +}; + struct x86_topology_system { unsigned int dom_shifts[TOPO_MAX_DOMAIN]; unsigned int dom_size[TOPO_MAX_DOMAIN]; @@ -149,6 +155,9 @@ extern unsigned int __max_threads_per_core; extern unsigned int __num_threads_per_package; extern unsigned int __num_cores_per_package; +const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c); +enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c); + static inline unsigned int topology_max_packages(void) { return __max_logical_packages; @@ -305,9 +314,4 @@ static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled extern void arch_scale_freq_tick(void); #define arch_scale_freq_tick arch_scale_freq_tick -#ifdef CONFIG_ACPI_CPPC_LIB -void init_freq_invariance_cppc(void); -#define arch_init_invariance_cppc init_freq_invariance_cppc -#endif - #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index afce8ee5d7b7..b0a887209400 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -12,6 +12,13 @@ #include <asm/cpufeatures.h> #include <asm/page.h> #include <asm/percpu.h> +#include <asm/runtime-const.h> + +/* + * Virtual variable: there's no actual backing store for this, + * it can purely be used as 'runtime_const_ptr(USER_PTR_MAX)' + */ +extern unsigned long USER_PTR_MAX; #ifdef CONFIG_ADDRESS_MASKING /* @@ -46,19 +53,24 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, #endif -/* - * The virtual address space space is logically divided into a kernel - * half and a user half. When cast to a signed type, user pointers - * are positive and kernel pointers are negative. - */ -#define valid_user_address(x) ((__force long)(x) >= 0) +#define valid_user_address(x) \ + ((__force unsigned long)(x) <= runtime_const_ptr(USER_PTR_MAX)) /* * Masking the user address is an alternative to a conditional * user_access_begin that can avoid the fencing. This only works * for dense accesses starting at the address. */ -#define mask_user_address(x) ((typeof(x))((long)(x)|((long)(x)>>63))) +static inline void __user *mask_user_address(const void __user *ptr) +{ + unsigned long mask; + asm("cmp %1,%0\n\t" + "sbb %0,%0" + :"=r" (mask) + :"r" (ptr), + "0" (runtime_const_ptr(USER_PTR_MAX))); + return (__force void __user *)(mask | (__force unsigned long)ptr); +} #define masked_user_access_begin(x) ({ \ __auto_type __masked_ptr = (x); \ __masked_ptr = mask_user_address(__masked_ptr); \ @@ -69,23 +81,16 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, * arbitrary values in those bits rather then masking them off. * * Enforce two rules: - * 1. 'ptr' must be in the user half of the address space + * 1. 'ptr' must be in the user part of the address space * 2. 'ptr+size' must not overflow into kernel addresses * - * Note that addresses around the sign change are not valid addresses, - * and will GP-fault even with LAM enabled if the sign bit is set (see - * "CR3.LAM_SUP" that can narrow the canonicality check if we ever - * enable it, but not remove it entirely). - * - * So the "overflow into kernel addresses" does not imply some sudden - * exact boundary at the sign bit, and we can allow a lot of slop on the - * size check. + * Note that we always have at least one guard page between the + * max user address and the non-canonical gap, allowing us to + * ignore small sizes entirely. * * In fact, we could probably remove the size check entirely, since * any kernel accesses will be in increasing address order starting - * at 'ptr', and even if the end might be in kernel space, we'll - * hit the GP faults for non-canonical accesses before we ever get - * there. + * at 'ptr'. * * That's a separate optimization, for now just handle the small * constant case. diff --git a/arch/x86/include/asm/vdso/getrandom.h b/arch/x86/include/asm/vdso/getrandom.h index ff5334ad32a0..2bf9c0e970c3 100644 --- a/arch/x86/include/asm/vdso/getrandom.h +++ b/arch/x86/include/asm/vdso/getrandom.h @@ -8,7 +8,6 @@ #ifndef __ASSEMBLY__ #include <asm/unistd.h> -#include <asm/vvar.h> /** * getrandom_syscall - Invoke the getrandom() syscall. @@ -28,13 +27,14 @@ static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsig return ret; } -#define __vdso_rng_data (VVAR(_vdso_rng_data)) +extern struct vdso_rng_data vdso_rng_data + __attribute__((visibility("hidden"))); static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void) { - if (IS_ENABLED(CONFIG_TIME_NS) && __vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS) - return (void *)&__vdso_rng_data + ((void *)&__timens_vdso_data - (void *)&__vdso_data); - return &__vdso_rng_data; + if (IS_ENABLED(CONFIG_TIME_NS) && __arch_get_vdso_data()->clock_mode == VDSO_CLOCKMODE_TIMENS) + return (void *)&vdso_rng_data + ((void *)&timens_page - (void *)__arch_get_vdso_data()); + return &vdso_rng_data; } #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index b2d2df026f6e..375a34b0f365 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -14,14 +14,16 @@ #include <uapi/linux/time.h> #include <asm/vgtod.h> -#include <asm/vvar.h> #include <asm/unistd.h> #include <asm/msr.h> #include <asm/pvclock.h> #include <clocksource/hyperv_timer.h> -#define __vdso_data (VVAR(_vdso_data)) -#define __timens_vdso_data (TIMENS(_vdso_data)) +extern struct vdso_data vvar_page + __attribute__((visibility("hidden"))); + +extern struct vdso_data timens_page + __attribute__((visibility("hidden"))); #define VDSO_HAS_TIME 1 @@ -61,7 +63,7 @@ extern struct ms_hyperv_tsc_page hvclock_page static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) { - return __timens_vdso_data; + return &timens_page; } #endif @@ -275,7 +277,7 @@ static inline u64 __arch_get_hw_counter(s32 clock_mode, static __always_inline const struct vdso_data *__arch_get_vdso_data(void) { - return __vdso_data; + return &vvar_page; } static inline bool arch_vdso_clocksource_ok(const struct vdso_data *vd) diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h index 67fedf1698b5..37b4a70559a8 100644 --- a/arch/x86/include/asm/vdso/vsyscall.h +++ b/arch/x86/include/asm/vdso/vsyscall.h @@ -2,12 +2,19 @@ #ifndef __ASM_VDSO_VSYSCALL_H #define __ASM_VDSO_VSYSCALL_H +#define __VDSO_RND_DATA_OFFSET 640 +#define __VVAR_PAGES 4 + +#define VDSO_NR_VCLOCK_PAGES 2 +#define VDSO_PAGE_PVCLOCK_OFFSET 0 +#define VDSO_PAGE_HVCLOCK_OFFSET 1 + #ifndef __ASSEMBLY__ -#include <linux/timekeeper_internal.h> #include <vdso/datapage.h> #include <asm/vgtod.h> -#include <asm/vvar.h> + +extern struct vdso_data *vdso_data; /* * Update the vDSO data page to keep in sync with kernel timekeeping. @@ -15,14 +22,14 @@ static __always_inline struct vdso_data *__x86_get_k_vdso_data(void) { - return _vdso_data; + return vdso_data; } #define __arch_get_k_vdso_data __x86_get_k_vdso_data static __always_inline struct vdso_rng_data *__x86_get_k_vdso_rng_data(void) { - return &_vdso_rng_data; + return (void *)vdso_data + __VDSO_RND_DATA_OFFSET; } #define __arch_get_k_vdso_rng_data __x86_get_k_vdso_rng_data diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h deleted file mode 100644 index 9d9af37f7cab..000000000000 --- a/arch/x86/include/asm/vvar.h +++ /dev/null @@ -1,71 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * vvar.h: Shared vDSO/kernel variable declarations - * Copyright (c) 2011 Andy Lutomirski - * - * A handful of variables are accessible (read-only) from userspace - * code in the vsyscall page and the vdso. They are declared here. - * Some other file must define them with DEFINE_VVAR. - * - * In normal kernel code, they are used like any other variable. - * In user code, they are accessed through the VVAR macro. - * - * These variables live in a page of kernel data that has an extra RO - * mapping for userspace. Each variable needs a unique offset within - * that page; specify that offset with the DECLARE_VVAR macro. (If - * you mess up, the linker will catch it.) - */ - -#ifndef _ASM_X86_VVAR_H -#define _ASM_X86_VVAR_H - -#ifdef EMIT_VVAR -/* - * EMIT_VVAR() is used by the kernel linker script to put vvars in the - * right place. Also, it's used by kernel code to import offsets values. - */ -#define DECLARE_VVAR(offset, type, name) \ - EMIT_VVAR(name, offset) -#define DECLARE_VVAR_SINGLE(offset, type, name) \ - EMIT_VVAR(name, offset) - -#else - -extern char __vvar_page; - -#define DECLARE_VVAR(offset, type, name) \ - extern type vvar_ ## name[CS_BASES] \ - __attribute__((visibility("hidden"))); \ - extern type timens_ ## name[CS_BASES] \ - __attribute__((visibility("hidden"))); \ - -#define DECLARE_VVAR_SINGLE(offset, type, name) \ - extern type vvar_ ## name \ - __attribute__((visibility("hidden"))); \ - -#define VVAR(name) (vvar_ ## name) -#define TIMENS(name) (timens_ ## name) - -#define DEFINE_VVAR(type, name) \ - type name[CS_BASES] \ - __attribute__((section(".vvar_" #name), aligned(16))) __visible - -#define DEFINE_VVAR_SINGLE(type, name) \ - type name \ - __attribute__((section(".vvar_" #name), aligned(16))) __visible - -#endif - -/* DECLARE_VVAR(offset, type, name) */ - -DECLARE_VVAR(128, struct vdso_data, _vdso_data) - -#if !defined(_SINGLE_DATA) -#define _SINGLE_DATA -DECLARE_VVAR_SINGLE(640, struct vdso_rng_data, _vdso_rng_data) -#endif - -#undef DECLARE_VVAR -#undef DECLARE_VVAR_SINGLE - -#endif diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index db9adc081c5a..cb6b48a7c22b 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -8,7 +8,8 @@ /* * Fields are zero when not available. Also, this struct is shared with * userspace mcelog and thus must keep existing fields at current offsets. - * Only add new fields to the end of the structure + * Only add new, shared fields to the end of the structure. + * Do not add vendor-specific fields. */ struct mce { __u64 status; /* Bank's MCi_STATUS MSR */ diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index 46cdc941f958..ac1e6277212b 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -5,9 +5,6 @@ #define MAP_32BIT 0x40 /* only give out 32bit addresses */ #define MAP_ABOVE4G 0x80 /* only map above 4GB */ -/* Flags for map_shadow_stack(2) */ -#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */ - #include <asm-generic/mman.h> #endif /* _ASM_X86_MMAN_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4efecac49863..3a44a9dc3fb7 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1171,7 +1171,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) } count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, - acpi_parse_int_src_ovr, nr_irqs); + acpi_parse_int_src_ovr, + irq_get_nr_irqs()); if (count < 0) { pr_err("Error parsing interrupt source overrides entry\n"); /* TBD: Cleanup to allow fallback to MPS */ @@ -1191,7 +1192,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) mp_config_acpi_legacy_irqs(); count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, - acpi_parse_nmi_src, nr_irqs); + acpi_parse_nmi_src, + irq_get_nr_irqs()); if (count < 0) { pr_err("Error parsing NMI SRC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index 956984054bf3..d745dd586303 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -110,7 +110,7 @@ static void amd_set_max_freq_ratio(void) static DEFINE_MUTEX(freq_invariance_lock); -void init_freq_invariance_cppc(void) +static inline void init_freq_invariance_cppc(void) { static bool init_done; @@ -127,6 +127,11 @@ void init_freq_invariance_cppc(void) mutex_unlock(&freq_invariance_lock); } +void acpi_processor_init_invariance_cppc(void) +{ + init_freq_invariance_cppc(); +} + /* * Get the highest performance register value. * @cpu: CPU from which to get highest performance. @@ -234,8 +239,10 @@ EXPORT_SYMBOL_GPL(amd_detect_prefcore); */ int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) { + enum x86_topology_cpu_type core_type = get_topology_cpu_type(&cpu_data(cpu)); bool prefcore; int ret; + u32 tmp; ret = amd_detect_prefcore(&prefcore); if (ret) @@ -261,6 +268,27 @@ int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) break; } } + + /* detect if running on heterogeneous design */ + if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) { + switch (core_type) { + case TOPO_CPU_TYPE_UNKNOWN: + pr_warn("Undefined core type found for cpu %d\n", cpu); + break; + case TOPO_CPU_TYPE_PERFORMANCE: + /* use the max scale for performance cores */ + *numerator = CPPC_HIGHEST_PERF_PERFORMANCE; + return 0; + case TOPO_CPU_TYPE_EFFICIENCY: + /* use the highest perf value for efficiency cores */ + ret = amd_get_highest_perf(cpu, &tmp); + if (ret) + return ret; + *numerator = tmp; + return 0; + } + } + *numerator = CPPC_HIGHEST_PERF_PREFCORE; return 0; diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 94ff83f3d3fe..b200a193beeb 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -87,6 +87,7 @@ SYM_FUNC_START(do_suspend_lowlevel) .align 4 .Lresume_point: + ANNOTATE_NOENDBR /* We don't restore %rax, it must be 0 anyway */ movq $saved_context, %rax movq saved_context_cr4(%rax), %rbx diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index dc5d3216af24..9fe9972d2071 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -44,6 +44,7 @@ #define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 #define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4 +#define PCI_DEVICE_ID_AMD_1AH_M20H_DF_F4 0x16fc #define PCI_DEVICE_ID_AMD_1AH_M60H_DF_F4 0x124c #define PCI_DEVICE_ID_AMD_1AH_M70H_DF_F4 0x12bc #define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4 @@ -127,6 +128,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) }, diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 6513c53c9459..c5fb28e6451a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -440,7 +440,19 @@ static int lapic_timer_shutdown(struct clock_event_device *evt) v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write(APIC_LVTT, v); - apic_write(APIC_TMICT, 0); + + /* + * Setting APIC_LVT_MASKED (above) should be enough to tell + * the hardware that this timer will never fire. But AMD + * erratum 411 and some Intel CPU behavior circa 2024 say + * otherwise. Time for belt and suspenders programming: mask + * the timer _and_ zero the counter registers: + */ + if (v & APIC_LVT_TIMER_TSCDEADLINE) + wrmsrl(MSR_IA32_TSC_DEADLINE, 0); + else + apic_write(APIC_TMICT, 0); + return 0; } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 557318145038..736f62812f5c 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -712,8 +712,8 @@ int __init arch_probe_nr_irqs(void) { int nr; - if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) - nr_irqs = NR_VECTORS * nr_cpu_ids; + if (irq_get_nr_irqs() > NR_VECTORS * nr_cpu_ids) + irq_set_nr_irqs(NR_VECTORS * nr_cpu_ids); nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids; #if defined(CONFIG_PCI_MSI) @@ -725,8 +725,8 @@ int __init arch_probe_nr_irqs(void) else nr += gsi_top * 16; #endif - if (nr < nr_irqs) - nr_irqs = nr; + if (nr < irq_get_nr_irqs()) + irq_set_nr_irqs(nr); /* * We don't know if PIC is present at this point so we need to do diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 5857a0f5d514..4efdf5c2efc8 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -59,6 +59,8 @@ obj-$(CONFIG_ACRN_GUEST) += acrn.o obj-$(CONFIG_DEBUG_FS) += debugfs.o +obj-$(CONFIG_X86_BUS_LOCK_DETECT) += bus_lock.o + quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(CONFIG_SHELL) $(src)/mkcapflags.sh $@ $^ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 015971adadfc..823f44f7bc94 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -924,6 +924,17 @@ static void init_amd_zen4(struct cpuinfo_x86 *c) { if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); + + /* + * These Zen4 SoCs advertise support for virtualized VMLOAD/VMSAVE + * in some BIOS versions but they can lead to random host reboots. + */ + switch (c->x86_model) { + case 0x18 ... 0x1f: + case 0x60 ... 0x7f: + clear_cpu_cap(c, X86_FEATURE_V_VMSAVE_VMLOAD); + break; + } } static void init_amd_zen5(struct cpuinfo_x86 *c) @@ -1202,5 +1213,6 @@ void amd_check_microcode(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return; - on_each_cpu(zenbleed_check_cpu, NULL, 1); + if (cpu_feature_enabled(X86_FEATURE_ZEN2)) + on_each_cpu(zenbleed_check_cpu, NULL, 1); } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d1915427b4ff..47a01d4028f6 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1115,8 +1115,25 @@ do_cmd_auto: case RETBLEED_MITIGATION_IBPB: setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + + /* + * IBPB on entry already obviates the need for + * software-based untraining so clear those in case some + * other mitigation like SRSO has selected them. + */ + setup_clear_cpu_cap(X86_FEATURE_UNRET); + setup_clear_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); mitigate_smt = true; + + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. + */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); + break; case RETBLEED_MITIGATION_STUFF: @@ -2627,6 +2644,14 @@ static void __init srso_select_mitigation(void) if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); srso_mitigation = SRSO_MITIGATION_IBPB; + + /* + * IBPB on entry already obviates the need for + * software-based untraining so clear those in case some + * other mitigation like Retbleed has selected them. + */ + setup_clear_cpu_cap(X86_FEATURE_UNRET); + setup_clear_cpu_cap(X86_FEATURE_RETHUNK); } } else { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); @@ -2638,6 +2663,13 @@ static void __init srso_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; + + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. + */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); } } else { pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c new file mode 100644 index 000000000000..704e9241b964 --- /dev/null +++ b/arch/x86/kernel/cpu/bus_lock.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "x86/split lock detection: " fmt + +#include <linux/semaphore.h> +#include <linux/workqueue.h> +#include <linux/delay.h> +#include <linux/cpuhotplug.h> +#include <asm/cpu_device_id.h> +#include <asm/cmdline.h> +#include <asm/traps.h> +#include <asm/cpu.h> + +enum split_lock_detect_state { + sld_off = 0, + sld_warn, + sld_fatal, + sld_ratelimit, +}; + +/* + * Default to sld_off because most systems do not support split lock detection. + * sld_state_setup() will switch this to sld_warn on systems that support + * split lock/bus lock detect, unless there is a command line override. + */ +static enum split_lock_detect_state sld_state __ro_after_init = sld_off; +static u64 msr_test_ctrl_cache __ro_after_init; + +/* + * With a name like MSR_TEST_CTL it should go without saying, but don't touch + * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it + * on CPUs that do not support SLD can cause fireworks, even when writing '0'. + */ +static bool cpu_model_supports_sld __ro_after_init; + +static const struct { + const char *option; + enum split_lock_detect_state state; +} sld_options[] __initconst = { + { "off", sld_off }, + { "warn", sld_warn }, + { "fatal", sld_fatal }, + { "ratelimit:", sld_ratelimit }, +}; + +static struct ratelimit_state bld_ratelimit; + +static unsigned int sysctl_sld_mitigate = 1; +static DEFINE_SEMAPHORE(buslock_sem, 1); + +#ifdef CONFIG_PROC_SYSCTL +static struct ctl_table sld_sysctls[] = { + { + .procname = "split_lock_mitigate", + .data = &sysctl_sld_mitigate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +}; + +static int __init sld_mitigate_sysctl_init(void) +{ + register_sysctl_init("kernel", sld_sysctls); + return 0; +} + +late_initcall(sld_mitigate_sysctl_init); +#endif + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt), ratelimit; + + if (strncmp(arg, opt, len)) + return false; + + /* + * Min ratelimit is 1 bus lock/sec. + * Max ratelimit is 1000 bus locks/sec. + */ + if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 && + ratelimit > 0 && ratelimit <= 1000) { + ratelimit_state_init(&bld_ratelimit, HZ, ratelimit); + ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE); + return true; + } + + return len == arglen; +} + +static bool split_lock_verify_msr(bool on) +{ + u64 ctrl, tmp; + + if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) + return false; + if (on) + ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + else + ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) + return false; + rdmsrl(MSR_TEST_CTRL, tmp); + return ctrl == tmp; +} + +static void __init sld_state_setup(void) +{ + enum split_lock_detect_state state = sld_warn; + char arg[20]; + int i, ret; + + if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + return; + + ret = cmdline_find_option(boot_command_line, "split_lock_detect", + arg, sizeof(arg)); + if (ret >= 0) { + for (i = 0; i < ARRAY_SIZE(sld_options); i++) { + if (match_option(arg, ret, sld_options[i].option)) { + state = sld_options[i].state; + break; + } + } + } + sld_state = state; +} + +static void __init __split_lock_setup(void) +{ + if (!split_lock_verify_msr(false)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + + if (!split_lock_verify_msr(true)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + /* Restore the MSR to its cached value. */ + wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + + setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); +} + +/* + * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking + * is not implemented as one thread could undo the setting of the other + * thread immediately after dropping the lock anyway. + */ +static void sld_update_msr(bool on) +{ + u64 test_ctrl_val = msr_test_ctrl_cache; + + if (on) + test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + + wrmsrl(MSR_TEST_CTRL, test_ctrl_val); +} + +void split_lock_init(void) +{ + /* + * #DB for bus lock handles ratelimit and #AC for split lock is + * disabled. + */ + if (sld_state == sld_ratelimit) { + split_lock_verify_msr(false); + return; + } + + if (cpu_model_supports_sld) + split_lock_verify_msr(sld_state != sld_off); +} + +static void __split_lock_reenable_unlock(struct work_struct *work) +{ + sld_update_msr(true); + up(&buslock_sem); +} + +static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock); + +static void __split_lock_reenable(struct work_struct *work) +{ + sld_update_msr(true); +} +static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); + +/* + * If a CPU goes offline with pending delayed work to re-enable split lock + * detection then the delayed work will be executed on some other CPU. That + * handles releasing the buslock_sem, but because it executes on a + * different CPU probably won't re-enable split lock detection. This is a + * problem on HT systems since the sibling CPU on the same core may then be + * left running with split lock detection disabled. + * + * Unconditionally re-enable detection here. + */ +static int splitlock_cpu_offline(unsigned int cpu) +{ + sld_update_msr(true); + + return 0; +} + +static void split_lock_warn(unsigned long ip) +{ + struct delayed_work *work; + int cpu; + + if (!current->reported_split_lock) + pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", + current->comm, current->pid, ip); + current->reported_split_lock = 1; + + if (sysctl_sld_mitigate) { + /* + * misery factor #1: + * sleep 10ms before trying to execute split lock. + */ + if (msleep_interruptible(10) > 0) + return; + /* + * Misery factor #2: + * only allow one buslocked disabled core at a time. + */ + if (down_interruptible(&buslock_sem) == -EINTR) + return; + work = &sl_reenable_unlock; + } else { + work = &sl_reenable; + } + + cpu = get_cpu(); + schedule_delayed_work_on(cpu, work, 2); + + /* Disable split lock detection on this CPU to make progress */ + sld_update_msr(false); + put_cpu(); +} + +bool handle_guest_split_lock(unsigned long ip) +{ + if (sld_state == sld_warn) { + split_lock_warn(ip); + return true; + } + + pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n", + current->comm, current->pid, + sld_state == sld_fatal ? "fatal" : "bogus", ip); + + current->thread.error_code = 0; + current->thread.trap_nr = X86_TRAP_AC; + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + return false; +} +EXPORT_SYMBOL_GPL(handle_guest_split_lock); + +void bus_lock_init(void) +{ + u64 val; + + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + return; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, val); + + if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + (sld_state == sld_warn || sld_state == sld_fatal)) || + sld_state == sld_off) { + /* + * Warn and fatal are handled by #AC for split lock if #AC for + * split lock is supported. + */ + val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; + } else { + val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + } + + wrmsrl(MSR_IA32_DEBUGCTLMSR, val); +} + +bool handle_user_split_lock(struct pt_regs *regs, long error_code) +{ + if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) + return false; + split_lock_warn(regs->ip); + return true; +} + +void handle_bus_lock(struct pt_regs *regs) +{ + switch (sld_state) { + case sld_off: + break; + case sld_ratelimit: + /* Enforce no more than bld_ratelimit bus locks/sec. */ + while (!__ratelimit(&bld_ratelimit)) + msleep(20); + /* Warn on the bus lock. */ + fallthrough; + case sld_warn: + pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", + current->comm, current->pid, regs->ip); + break; + case sld_fatal: + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + break; + } +} + +/* + * CPU models that are known to have the per-core split-lock detection + * feature even though they do not enumerate IA32_CORE_CAPABILITIES. + */ +static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { + X86_MATCH_VFM(INTEL_ICELAKE_X, 0), + X86_MATCH_VFM(INTEL_ICELAKE_L, 0), + X86_MATCH_VFM(INTEL_ICELAKE_D, 0), + {} +}; + +static void __init split_lock_setup(struct cpuinfo_x86 *c) +{ + const struct x86_cpu_id *m; + u64 ia32_core_caps; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + /* Check for CPUs that have support but do not enumerate it: */ + m = x86_match_cpu(split_lock_cpu_ids); + if (m) + goto supported; + + if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) + return; + + /* + * Not all bits in MSR_IA32_CORE_CAPS are architectural, but + * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set + * it have split lock detection. + */ + rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); + if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) + goto supported; + + /* CPU is not in the model list and does not have the MSR bit: */ + return; + +supported: + cpu_model_supports_sld = true; + __split_lock_setup(); +} + +static void sld_state_show(void) +{ + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + return; + + switch (sld_state) { + case sld_off: + pr_info("disabled\n"); + break; + case sld_warn: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/splitlock", NULL, splitlock_cpu_offline) < 0) + pr_warn("No splitlock CPU offline handler\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: warning on user-space bus_locks\n"); + } + break; + case sld_fatal: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n", + boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ? + " from non-WB" : ""); + } + break; + case sld_ratelimit: + if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst); + break; + } +} + +void __init sld_setup(struct cpuinfo_x86 *c) +{ + split_lock_setup(c); + sld_state_setup(); + sld_state_show(); +} diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 07a34d723505..02637365d1a9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -69,6 +69,7 @@ #include <asm/sev.h> #include <asm/tdx.h> #include <asm/posted_intr.h> +#include <asm/runtime-const.h> #include "cpu.h" @@ -1443,6 +1444,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) boot_cpu_has(X86_FEATURE_HYPERVISOR))) setup_force_cpu_bug(X86_BUG_BHI); + if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET)) + setup_force_cpu_bug(X86_BUG_IBPB_NO_RET); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -1837,6 +1841,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_init) this_cpu->c_init(c); + bus_lock_init(); + /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); @@ -1902,9 +1908,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) /* Init Machine Check Exception if available. */ mcheck_cpu_init(c); -#ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); -#endif } /* @@ -2085,8 +2089,10 @@ void syscall_init(void) #ifdef CONFIG_STACKPROTECTOR DEFINE_PER_CPU(unsigned long, __stack_chk_guard); +#ifndef CONFIG_SMP EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); #endif +#endif #endif /* CONFIG_X86_64 */ @@ -2386,6 +2392,15 @@ void __init arch_cpu_finalize_init(void) alternative_instructions(); if (IS_ENABLED(CONFIG_X86_64)) { + unsigned long USER_PTR_MAX = TASK_SIZE_MAX-1; + + /* + * Enable this when LAM is gated on LASS support + if (cpu_feature_enabled(X86_FEATURE_LAM)) + USER_PTR_MAX = (1ul << 63) - PAGE_SIZE - 1; + */ + runtime_const_init(ptr, USER_PTR_MAX); + /* * Make sure the first 2MB area is not mapped by huge pages * There are typically fixed size MTRRs in there and overlapping diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c index 3baf3e435834..10719aba6276 100644 --- a/arch/x86/kernel/cpu/debugfs.c +++ b/arch/x86/kernel/cpu/debugfs.c @@ -22,6 +22,7 @@ static int cpu_debug_show(struct seq_file *m, void *p) seq_printf(m, "die_id: %u\n", c->topo.die_id); seq_printf(m, "cu_id: %u\n", c->topo.cu_id); seq_printf(m, "core_id: %u\n", c->topo.core_id); + seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c)); seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); seq_printf(m, "llc_id: %u\n", c->topo.llc_id); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index e7656cbef68d..d1de300af173 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -7,13 +7,9 @@ #include <linux/smp.h> #include <linux/sched.h> #include <linux/sched/clock.h> -#include <linux/semaphore.h> #include <linux/thread_info.h> #include <linux/init.h> #include <linux/uaccess.h> -#include <linux/workqueue.h> -#include <linux/delay.h> -#include <linux/cpuhotplug.h> #include <asm/cpufeature.h> #include <asm/msr.h> @@ -24,8 +20,6 @@ #include <asm/hwcap2.h> #include <asm/elf.h> #include <asm/cpu_device_id.h> -#include <asm/cmdline.h> -#include <asm/traps.h> #include <asm/resctrl.h> #include <asm/numa.h> #include <asm/thermal.h> @@ -41,28 +35,6 @@ #include <asm/apic.h> #endif -enum split_lock_detect_state { - sld_off = 0, - sld_warn, - sld_fatal, - sld_ratelimit, -}; - -/* - * Default to sld_off because most systems do not support split lock detection. - * sld_state_setup() will switch this to sld_warn on systems that support - * split lock/bus lock detect, unless there is a command line override. - */ -static enum split_lock_detect_state sld_state __ro_after_init = sld_off; -static u64 msr_test_ctrl_cache __ro_after_init; - -/* - * With a name like MSR_TEST_CTL it should go without saying, but don't touch - * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it - * on CPUs that do not support SLD can cause fireworks, even when writing '0'. - */ -static bool cpu_model_supports_sld __ro_after_init; - /* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. However, there exists @@ -549,9 +521,6 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); } -static void split_lock_init(void); -static void bus_lock_init(void); - static void init_intel(struct cpuinfo_x86 *c) { early_init_intel(c); @@ -643,7 +612,6 @@ static void init_intel(struct cpuinfo_x86 *c) init_intel_misc_features(c); split_lock_init(); - bus_lock_init(); intel_init_thermal(c); } @@ -909,381 +877,6 @@ static const struct cpu_dev intel_cpu_dev = { cpu_dev_register(intel_cpu_dev); -#undef pr_fmt -#define pr_fmt(fmt) "x86/split lock detection: " fmt - -static const struct { - const char *option; - enum split_lock_detect_state state; -} sld_options[] __initconst = { - { "off", sld_off }, - { "warn", sld_warn }, - { "fatal", sld_fatal }, - { "ratelimit:", sld_ratelimit }, -}; - -static struct ratelimit_state bld_ratelimit; - -static unsigned int sysctl_sld_mitigate = 1; -static DEFINE_SEMAPHORE(buslock_sem, 1); - -#ifdef CONFIG_PROC_SYSCTL -static struct ctl_table sld_sysctls[] = { - { - .procname = "split_lock_mitigate", - .data = &sysctl_sld_mitigate, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_douintvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -}; - -static int __init sld_mitigate_sysctl_init(void) -{ - register_sysctl_init("kernel", sld_sysctls); - return 0; -} - -late_initcall(sld_mitigate_sysctl_init); -#endif - -static inline bool match_option(const char *arg, int arglen, const char *opt) -{ - int len = strlen(opt), ratelimit; - - if (strncmp(arg, opt, len)) - return false; - - /* - * Min ratelimit is 1 bus lock/sec. - * Max ratelimit is 1000 bus locks/sec. - */ - if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 && - ratelimit > 0 && ratelimit <= 1000) { - ratelimit_state_init(&bld_ratelimit, HZ, ratelimit); - ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE); - return true; - } - - return len == arglen; -} - -static bool split_lock_verify_msr(bool on) -{ - u64 ctrl, tmp; - - if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) - return false; - if (on) - ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - else - ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) - return false; - rdmsrl(MSR_TEST_CTRL, tmp); - return ctrl == tmp; -} - -static void __init sld_state_setup(void) -{ - enum split_lock_detect_state state = sld_warn; - char arg[20]; - int i, ret; - - if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && - !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) - return; - - ret = cmdline_find_option(boot_command_line, "split_lock_detect", - arg, sizeof(arg)); - if (ret >= 0) { - for (i = 0; i < ARRAY_SIZE(sld_options); i++) { - if (match_option(arg, ret, sld_options[i].option)) { - state = sld_options[i].state; - break; - } - } - } - sld_state = state; -} - -static void __init __split_lock_setup(void) -{ - if (!split_lock_verify_msr(false)) { - pr_info("MSR access failed: Disabled\n"); - return; - } - - rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); - - if (!split_lock_verify_msr(true)) { - pr_info("MSR access failed: Disabled\n"); - return; - } - - /* Restore the MSR to its cached value. */ - wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); - - setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); -} - -/* - * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking - * is not implemented as one thread could undo the setting of the other - * thread immediately after dropping the lock anyway. - */ -static void sld_update_msr(bool on) -{ - u64 test_ctrl_val = msr_test_ctrl_cache; - - if (on) - test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - - wrmsrl(MSR_TEST_CTRL, test_ctrl_val); -} - -static void split_lock_init(void) -{ - /* - * #DB for bus lock handles ratelimit and #AC for split lock is - * disabled. - */ - if (sld_state == sld_ratelimit) { - split_lock_verify_msr(false); - return; - } - - if (cpu_model_supports_sld) - split_lock_verify_msr(sld_state != sld_off); -} - -static void __split_lock_reenable_unlock(struct work_struct *work) -{ - sld_update_msr(true); - up(&buslock_sem); -} - -static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock); - -static void __split_lock_reenable(struct work_struct *work) -{ - sld_update_msr(true); -} -static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); - -/* - * If a CPU goes offline with pending delayed work to re-enable split lock - * detection then the delayed work will be executed on some other CPU. That - * handles releasing the buslock_sem, but because it executes on a - * different CPU probably won't re-enable split lock detection. This is a - * problem on HT systems since the sibling CPU on the same core may then be - * left running with split lock detection disabled. - * - * Unconditionally re-enable detection here. - */ -static int splitlock_cpu_offline(unsigned int cpu) -{ - sld_update_msr(true); - - return 0; -} - -static void split_lock_warn(unsigned long ip) -{ - struct delayed_work *work; - int cpu; - - if (!current->reported_split_lock) - pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", - current->comm, current->pid, ip); - current->reported_split_lock = 1; - - if (sysctl_sld_mitigate) { - /* - * misery factor #1: - * sleep 10ms before trying to execute split lock. - */ - if (msleep_interruptible(10) > 0) - return; - /* - * Misery factor #2: - * only allow one buslocked disabled core at a time. - */ - if (down_interruptible(&buslock_sem) == -EINTR) - return; - work = &sl_reenable_unlock; - } else { - work = &sl_reenable; - } - - cpu = get_cpu(); - schedule_delayed_work_on(cpu, work, 2); - - /* Disable split lock detection on this CPU to make progress */ - sld_update_msr(false); - put_cpu(); -} - -bool handle_guest_split_lock(unsigned long ip) -{ - if (sld_state == sld_warn) { - split_lock_warn(ip); - return true; - } - - pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n", - current->comm, current->pid, - sld_state == sld_fatal ? "fatal" : "bogus", ip); - - current->thread.error_code = 0; - current->thread.trap_nr = X86_TRAP_AC; - force_sig_fault(SIGBUS, BUS_ADRALN, NULL); - return false; -} -EXPORT_SYMBOL_GPL(handle_guest_split_lock); - -static void bus_lock_init(void) -{ - u64 val; - - if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) - return; - - rdmsrl(MSR_IA32_DEBUGCTLMSR, val); - - if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && - (sld_state == sld_warn || sld_state == sld_fatal)) || - sld_state == sld_off) { - /* - * Warn and fatal are handled by #AC for split lock if #AC for - * split lock is supported. - */ - val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; - } else { - val |= DEBUGCTLMSR_BUS_LOCK_DETECT; - } - - wrmsrl(MSR_IA32_DEBUGCTLMSR, val); -} - -bool handle_user_split_lock(struct pt_regs *regs, long error_code) -{ - if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) - return false; - split_lock_warn(regs->ip); - return true; -} - -void handle_bus_lock(struct pt_regs *regs) -{ - switch (sld_state) { - case sld_off: - break; - case sld_ratelimit: - /* Enforce no more than bld_ratelimit bus locks/sec. */ - while (!__ratelimit(&bld_ratelimit)) - msleep(20); - /* Warn on the bus lock. */ - fallthrough; - case sld_warn: - pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", - current->comm, current->pid, regs->ip); - break; - case sld_fatal: - force_sig_fault(SIGBUS, BUS_ADRALN, NULL); - break; - } -} - -/* - * CPU models that are known to have the per-core split-lock detection - * feature even though they do not enumerate IA32_CORE_CAPABILITIES. - */ -static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { - X86_MATCH_VFM(INTEL_ICELAKE_X, 0), - X86_MATCH_VFM(INTEL_ICELAKE_L, 0), - X86_MATCH_VFM(INTEL_ICELAKE_D, 0), - {} -}; - -static void __init split_lock_setup(struct cpuinfo_x86 *c) -{ - const struct x86_cpu_id *m; - u64 ia32_core_caps; - - if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return; - - /* Check for CPUs that have support but do not enumerate it: */ - m = x86_match_cpu(split_lock_cpu_ids); - if (m) - goto supported; - - if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) - return; - - /* - * Not all bits in MSR_IA32_CORE_CAPS are architectural, but - * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set - * it have split lock detection. - */ - rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); - if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) - goto supported; - - /* CPU is not in the model list and does not have the MSR bit: */ - return; - -supported: - cpu_model_supports_sld = true; - __split_lock_setup(); -} - -static void sld_state_show(void) -{ - if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && - !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) - return; - - switch (sld_state) { - case sld_off: - pr_info("disabled\n"); - break; - case sld_warn: - if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { - pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); - if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, - "x86/splitlock", NULL, splitlock_cpu_offline) < 0) - pr_warn("No splitlock CPU offline handler\n"); - } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { - pr_info("#DB: warning on user-space bus_locks\n"); - } - break; - case sld_fatal: - if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { - pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n"); - } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { - pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n", - boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ? - " from non-WB" : ""); - } - break; - case sld_ratelimit: - if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) - pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst); - break; - } -} - -void __init sld_setup(struct cpuinfo_x86 *c) -{ - split_lock_setup(c); - sld_state_setup(); - sld_state_show(); -} - #define X86_HYBRID_CPU_TYPE_ID_SHIFT 24 /** @@ -1299,3 +892,18 @@ u8 get_this_hybrid_cpu_type(void) return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT; } + +/** + * get_this_hybrid_cpu_native_id() - Get the native id of this hybrid CPU + * + * Returns the uarch native ID [23:0] of a CPU in a hybrid processor. + * If the processor is not hybrid, returns 0. + */ +u32 get_this_hybrid_cpu_native_id(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + return 0; + + return cpuid_eax(0x0000001a) & + (BIT_ULL(X86_HYBRID_CPU_TYPE_ID_SHIFT) - 1); +} diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 14bf8c232e45..6ca80fff1fea 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -778,29 +778,33 @@ bool amd_mce_usable_address(struct mce *m) static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { - struct mce m; + struct mce_hw_err err; + struct mce *m = &err.m; - mce_prep_record(&m); + mce_prep_record(&err); - m.status = status; - m.misc = misc; - m.bank = bank; - m.tsc = rdtsc(); + m->status = status; + m->misc = misc; + m->bank = bank; + m->tsc = rdtsc(); - if (m.status & MCI_STATUS_ADDRV) { - m.addr = addr; + if (m->status & MCI_STATUS_ADDRV) { + m->addr = addr; - smca_extract_err_addr(&m); + smca_extract_err_addr(m); } if (mce_flags.smca) { - rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid); + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); - if (m.status & MCI_STATUS_SYNDV) - rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); + if (m->status & MCI_STATUS_SYNDV) { + rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); + rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1); + rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2); + } } - mce_log(&m); + mce_log(&err); } DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 3885fe05f01e..0a89947e47bc 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -28,7 +28,8 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) { - struct mce m; + struct mce_hw_err err; + struct mce *m; int lsb; if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) @@ -44,31 +45,33 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) else lsb = PAGE_SHIFT; - mce_prep_record(&m); - m.bank = -1; + mce_prep_record(&err); + m = &err.m; + m->bank = -1; /* Fake a memory read error with unknown channel */ - m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; - m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; + m->status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; + m->misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; if (severity >= GHES_SEV_RECOVERABLE) - m.status |= MCI_STATUS_UC; + m->status |= MCI_STATUS_UC; if (severity >= GHES_SEV_PANIC) { - m.status |= MCI_STATUS_PCC; - m.tsc = rdtsc(); + m->status |= MCI_STATUS_PCC; + m->tsc = rdtsc(); } - m.addr = mem_err->physical_addr; - mce_log(&m); + m->addr = mem_err->physical_addr; + mce_log(&err); } EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { const u64 *i_mce = ((const u64 *) (ctx_info + 1)); + unsigned int cpu, num_regs; bool apicid_found = false; - unsigned int cpu; - struct mce m; + struct mce_hw_err err; + struct mce *m; if (!boot_cpu_has(X86_FEATURE_SMCA)) return -EINVAL; @@ -86,16 +89,12 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) return -EINVAL; /* - * The register array size must be large enough to include all the - * SMCA registers which need to be extracted. - * * The number of registers in the register array is determined by * Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2. - * The register layout is fixed and currently the raw data in the - * register array includes 6 SMCA registers which the kernel can - * extract. + * Sanity-check registers array size. */ - if (ctx_info->reg_arr_size < 48) + num_regs = ctx_info->reg_arr_size >> 3; + if (!num_regs) return -EINVAL; for_each_possible_cpu(cpu) { @@ -108,18 +107,68 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) if (!apicid_found) return -EINVAL; - mce_prep_record_common(&m); - mce_prep_record_per_cpu(cpu, &m); + m = &err.m; + memset(&err, 0, sizeof(struct mce_hw_err)); + mce_prep_record_common(m); + mce_prep_record_per_cpu(cpu, m); + + m->bank = (ctx_info->msr_addr >> 4) & 0xFF; - m.bank = (ctx_info->msr_addr >> 4) & 0xFF; - m.status = *i_mce; - m.addr = *(i_mce + 1); - m.misc = *(i_mce + 2); - /* Skipping MCA_CONFIG */ - m.ipid = *(i_mce + 4); - m.synd = *(i_mce + 5); + /* + * The SMCA register layout is fixed and includes 16 registers. + * The end of the array may be variable, but the beginning is known. + * Cap the number of registers to expected max (15). + */ + if (num_regs > 15) + num_regs = 15; + + switch (num_regs) { + /* MCA_SYND2 */ + case 15: + err.vendor.amd.synd2 = *(i_mce + 14); + fallthrough; + /* MCA_SYND1 */ + case 14: + err.vendor.amd.synd1 = *(i_mce + 13); + fallthrough; + /* MCA_MISC4 */ + case 13: + /* MCA_MISC3 */ + case 12: + /* MCA_MISC2 */ + case 11: + /* MCA_MISC1 */ + case 10: + /* MCA_DEADDR */ + case 9: + /* MCA_DESTAT */ + case 8: + /* reserved */ + case 7: + /* MCA_SYND */ + case 6: + m->synd = *(i_mce + 5); + fallthrough; + /* MCA_IPID */ + case 5: + m->ipid = *(i_mce + 4); + fallthrough; + /* MCA_CONFIG */ + case 4: + /* MCA_MISC0 */ + case 3: + m->misc = *(i_mce + 2); + fallthrough; + /* MCA_ADDR */ + case 2: + m->addr = *(i_mce + 1); + fallthrough; + /* MCA_STATUS */ + case 1: + m->status = *i_mce; + } - mce_log(&m); + mce_log(&err); return 0; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 2a938f429c4d..7fb5556a0b53 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -88,7 +88,7 @@ struct mca_config mca_cfg __read_mostly = { .monarch_timeout = -1 }; -static DEFINE_PER_CPU(struct mce, mces_seen); +static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen); static unsigned long mce_need_notify; /* @@ -119,8 +119,6 @@ BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); void mce_prep_record_common(struct mce *m) { - memset(m, 0, sizeof(struct mce)); - m->cpuid = cpuid_eax(1); m->cpuvendor = boot_cpu_data.x86_vendor; m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); @@ -138,9 +136,12 @@ void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m) m->socketid = topology_physical_package_id(cpu); } -/* Do initial initialization of a struct mce */ -void mce_prep_record(struct mce *m) +/* Do initial initialization of struct mce_hw_err */ +void mce_prep_record(struct mce_hw_err *err) { + struct mce *m = &err->m; + + memset(err, 0, sizeof(struct mce_hw_err)); mce_prep_record_common(m); mce_prep_record_per_cpu(smp_processor_id(), m); } @@ -148,9 +149,9 @@ void mce_prep_record(struct mce *m) DEFINE_PER_CPU(struct mce, injectm); EXPORT_PER_CPU_SYMBOL_GPL(injectm); -void mce_log(struct mce *m) +void mce_log(struct mce_hw_err *err) { - if (!mce_gen_pool_add(m)) + if (!mce_gen_pool_add(err)) irq_work_queue(&mce_irq_work); } EXPORT_SYMBOL_GPL(mce_log); @@ -171,8 +172,10 @@ void mce_unregister_decode_chain(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); -static void __print_mce(struct mce *m) +static void __print_mce(struct mce_hw_err *err) { + struct mce *m = &err->m; + pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", m->extcpu, (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""), @@ -199,6 +202,10 @@ static void __print_mce(struct mce *m) if (mce_flags.smca) { if (m->synd) pr_cont("SYND %llx ", m->synd); + if (err->vendor.amd.synd1) + pr_cont("SYND1 %llx ", err->vendor.amd.synd1); + if (err->vendor.amd.synd2) + pr_cont("SYND2 %llx ", err->vendor.amd.synd2); if (m->ipid) pr_cont("IPID %llx ", m->ipid); } @@ -214,9 +221,11 @@ static void __print_mce(struct mce *m) m->microcode); } -static void print_mce(struct mce *m) +static void print_mce(struct mce_hw_err *err) { - __print_mce(m); + struct mce *m = &err->m; + + __print_mce(err); if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON) pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); @@ -251,7 +260,7 @@ static const char *mce_dump_aux_info(struct mce *m) return NULL; } -static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) +static noinstr void mce_panic(const char *msg, struct mce_hw_err *final, char *exp) { struct llist_node *pending; struct mce_evt_llist *l; @@ -282,20 +291,22 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) pending = mce_gen_pool_prepare_records(); /* First print corrected ones that are still unlogged */ llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; + struct mce_hw_err *err = &l->err; + struct mce *m = &err->m; if (!(m->status & MCI_STATUS_UC)) { - print_mce(m); + print_mce(err); if (!apei_err) apei_err = apei_write_mce(m); } } /* Now print uncorrected but with the final one last */ llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; + struct mce_hw_err *err = &l->err; + struct mce *m = &err->m; if (!(m->status & MCI_STATUS_UC)) continue; - if (!final || mce_cmp(m, final)) { - print_mce(m); + if (!final || mce_cmp(m, &final->m)) { + print_mce(err); if (!apei_err) apei_err = apei_write_mce(m); } @@ -303,12 +314,12 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) if (final) { print_mce(final); if (!apei_err) - apei_err = apei_write_mce(final); + apei_err = apei_write_mce(&final->m); } if (exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); - memmsg = mce_dump_aux_info(final); + memmsg = mce_dump_aux_info(&final->m); if (memmsg) pr_emerg(HW_ERR "Machine check: %s\n", memmsg); @@ -323,9 +334,9 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) * panic. */ if (kexec_crash_loaded()) { - if (final && (final->status & MCI_STATUS_ADDRV)) { + if (final && (final->m.status & MCI_STATUS_ADDRV)) { struct page *p; - p = pfn_to_online_page(final->addr >> PAGE_SHIFT); + p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT); if (p) SetPageHWPoison(p); } @@ -445,16 +456,18 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) * check into our "mce" struct so that we can use it later to assess * the severity of the problem as we read per-bank specific details. */ -static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) +static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs) { + struct mce *m; /* * Enable instrumentation around mce_prep_record() which calls external * facilities. */ instrumentation_begin(); - mce_prep_record(m); + mce_prep_record(err); instrumentation_end(); + m = &err->m; m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); if (regs) { /* @@ -574,13 +587,13 @@ EXPORT_SYMBOL_GPL(mce_is_correctable); static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { - struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(data); - if (!m) + if (!err) return NOTIFY_DONE; /* Emit the trace record: */ - trace_mce_record(m); + trace_mce_record(err); set_bit(0, &mce_need_notify); @@ -624,13 +637,13 @@ static struct notifier_block mce_uc_nb = { static int mce_default_notifier(struct notifier_block *nb, unsigned long val, void *data) { - struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(data); - if (!m) + if (!err) return NOTIFY_DONE; - if (mca_cfg.print_all || !m->kflags) - __print_mce(m); + if (mca_cfg.print_all || !(err->m.kflags)) + __print_mce(err); return NOTIFY_DONE; } @@ -644,8 +657,10 @@ static struct notifier_block mce_default_nb = { /* * Read ADDR and MISC registers. */ -static noinstr void mce_read_aux(struct mce *m, int i) +static noinstr void mce_read_aux(struct mce_hw_err *err, int i) { + struct mce *m = &err->m; + if (m->status & MCI_STATUS_MISCV) m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); @@ -667,8 +682,11 @@ static noinstr void mce_read_aux(struct mce *m, int i) if (mce_flags.smca) { m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); - if (m->status & MCI_STATUS_SYNDV) + if (m->status & MCI_STATUS_SYNDV) { m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); + err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i)); + err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i)); + } } } @@ -692,26 +710,28 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - struct mce m; + struct mce_hw_err err; + struct mce *m; int i; this_cpu_inc(mce_poll_count); - mce_gather_info(&m, NULL); + mce_gather_info(&err, NULL); + m = &err.m; if (flags & MCP_TIMESTAMP) - m.tsc = rdtsc(); + m->tsc = rdtsc(); for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; - m.misc = 0; - m.addr = 0; - m.bank = i; + m->misc = 0; + m->addr = 0; + m->bank = i; barrier(); - m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); /* * Update storm tracking here, before checking for the @@ -721,17 +741,17 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * storm status. */ if (!mca_cfg.cmci_disabled) - mce_track_storm(&m); + mce_track_storm(m); /* If this entry is not valid, ignore it */ - if (!(m.status & MCI_STATUS_VAL)) + if (!(m->status & MCI_STATUS_VAL)) continue; /* * If we are logging everything (at CPU online) or this * is a corrected error, then we must log it. */ - if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) + if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) goto log_it; /* @@ -741,20 +761,20 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * everything else. */ if (!mca_cfg.ser) { - if (m.status & MCI_STATUS_UC) + if (m->status & MCI_STATUS_UC) continue; goto log_it; } /* Log "not enabled" (speculative) errors */ - if (!(m.status & MCI_STATUS_EN)) + if (!(m->status & MCI_STATUS_EN)) goto log_it; /* * Log UCNA (SDM: 15.6.3 "UCR Error Classification") * UC == 1 && PCC == 0 && S == 0 */ - if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) + if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) goto log_it; /* @@ -768,20 +788,20 @@ log_it: if (flags & MCP_DONTLOG) goto clear_it; - mce_read_aux(&m, i); - m.severity = mce_severity(&m, NULL, NULL, false); + mce_read_aux(&err, i); + m->severity = mce_severity(m, NULL, NULL, false); /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (mca_cfg.dont_log_ce && !mce_usable_address(&m)) + if (mca_cfg.dont_log_ce && !mce_usable_address(m)) goto clear_it; if (flags & MCP_QUEUE_LOG) - mce_gen_pool_add(&m); + mce_gen_pool_add(&err); else - mce_log(&m); + mce_log(&err); clear_it: /* @@ -905,9 +925,10 @@ static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_reg * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ -static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, +static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp, struct pt_regs *regs) { + struct mce *m = &err->m; char *tmp = *msg; int i; @@ -925,7 +946,7 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo m->bank = i; if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) { - mce_read_aux(m, i); + mce_read_aux(err, i); *msg = tmp; return 1; } @@ -1016,10 +1037,11 @@ out: */ static void mce_reign(void) { - int cpu; + struct mce_hw_err *err = NULL; struct mce *m = NULL; int global_worst = 0; char *msg = NULL; + int cpu; /* * This CPU is the Monarch and the other CPUs have run @@ -1027,11 +1049,13 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. */ for_each_possible_cpu(cpu) { - struct mce *mtmp = &per_cpu(mces_seen, cpu); + struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu); + struct mce *mtmp = &etmp->m; if (mtmp->severity > global_worst) { global_worst = mtmp->severity; - m = &per_cpu(mces_seen, cpu); + err = &per_cpu(hw_errs_seen, cpu); + m = &err->m; } } @@ -1043,7 +1067,7 @@ static void mce_reign(void) if (m && global_worst >= MCE_PANIC_SEVERITY) { /* call mce_severity() to get "msg" for panic */ mce_severity(m, NULL, &msg, true); - mce_panic("Fatal machine check", m, msg); + mce_panic("Fatal machine check", err, msg); } /* @@ -1060,11 +1084,11 @@ static void mce_reign(void) mce_panic("Fatal machine check from unknown source", NULL, NULL); /* - * Now clear all the mces_seen so that they don't reappear on + * Now clear all the hw_errs_seen so that they don't reappear on * the next mce. */ for_each_possible_cpu(cpu) - memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); + memset(&per_cpu(hw_errs_seen, cpu), 0, sizeof(struct mce_hw_err)); } static atomic_t global_nwo; @@ -1268,13 +1292,14 @@ static noinstr bool mce_check_crashing_cpu(void) } static __always_inline int -__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, - unsigned long *toclear, unsigned long *valid_banks, int no_way_out, - int *worst) +__mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs, + struct mce_hw_err *final, unsigned long *toclear, + unsigned long *valid_banks, int no_way_out, int *worst) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; int severity, i, taint = 0; + struct mce *m = &err->m; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { arch___clear_bit(i, toclear); @@ -1319,7 +1344,7 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, if (severity == MCE_NO_SEVERITY) continue; - mce_read_aux(m, i); + mce_read_aux(err, i); /* assuming valid severity level != 0 */ m->severity = severity; @@ -1329,17 +1354,17 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, * done in #MC context, where instrumentation is disabled. */ instrumentation_begin(); - mce_log(m); + mce_log(err); instrumentation_end(); if (severity > *worst) { - *final = *m; + *final = *err; *worst = severity; } } /* mce_clear_state will clear *final, save locally for use later */ - *m = *final; + *err = *final; return taint; } @@ -1399,9 +1424,10 @@ static void kill_me_never(struct callback_head *cb) set_mce_nospec(pfn); } -static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) +static void queue_task_work(struct mce_hw_err *err, char *msg, void (*func)(struct callback_head *)) { int count = ++current->mce_count; + struct mce *m = &err->m; /* First call, save all the details */ if (count == 1) { @@ -1414,11 +1440,12 @@ static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callba /* Ten is likely overkill. Don't expect more than two faults before task_work() */ if (count > 10) - mce_panic("Too many consecutive machine checks while accessing user data", m, msg); + mce_panic("Too many consecutive machine checks while accessing user data", + err, msg); /* Second or later call, make sure page address matches the one from first call */ if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT)) - mce_panic("Consecutive machine checks to different user pages", m, msg); + mce_panic("Consecutive machine checks to different user pages", err, msg); /* Do not call task_work_add() more than once */ if (count > 1) @@ -1467,8 +1494,10 @@ noinstr void do_machine_check(struct pt_regs *regs) int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0; DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 }; DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 }; - struct mce m, *final; + struct mce_hw_err *final; + struct mce_hw_err err; char *msg = NULL; + struct mce *m; if (unlikely(mce_flags.p5)) return pentium_machine_check(regs); @@ -1506,13 +1535,14 @@ noinstr void do_machine_check(struct pt_regs *regs) this_cpu_inc(mce_exception_count); - mce_gather_info(&m, regs); - m.tsc = rdtsc(); + mce_gather_info(&err, regs); + m = &err.m; + m->tsc = rdtsc(); - final = this_cpu_ptr(&mces_seen); - *final = m; + final = this_cpu_ptr(&hw_errs_seen); + *final = err; - no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); + no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs); barrier(); @@ -1521,15 +1551,15 @@ noinstr void do_machine_check(struct pt_regs *regs) * Assume the worst for now, but if we find the * severity is MCE_AR_SEVERITY we have other options. */ - if (!(m.mcgstatus & MCG_STATUS_RIPV)) + if (!(m->mcgstatus & MCG_STATUS_RIPV)) kill_current_task = 1; /* * Check if this MCE is signaled to only this logical processor, * on Intel, Zhaoxin only. */ - if (m.cpuvendor == X86_VENDOR_INTEL || - m.cpuvendor == X86_VENDOR_ZHAOXIN) - lmce = m.mcgstatus & MCG_STATUS_LMCES; + if (m->cpuvendor == X86_VENDOR_INTEL || + m->cpuvendor == X86_VENDOR_ZHAOXIN) + lmce = m->mcgstatus & MCG_STATUS_LMCES; /* * Local machine check may already know that we have to panic. @@ -1540,12 +1570,12 @@ noinstr void do_machine_check(struct pt_regs *regs) */ if (lmce) { if (no_way_out) - mce_panic("Fatal local machine check", &m, msg); + mce_panic("Fatal local machine check", &err, msg); } else { order = mce_start(&no_way_out); } - taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); + taint = __mc_scan_banks(&err, regs, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1560,7 +1590,7 @@ noinstr void do_machine_check(struct pt_regs *regs) no_way_out = worst >= MCE_PANIC_SEVERITY; if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); + mce_panic("Fatal machine check on current CPU", &err, msg); } } else { /* @@ -1572,8 +1602,8 @@ noinstr void do_machine_check(struct pt_regs *regs) * make sure we have the right "msg". */ if (worst >= MCE_PANIC_SEVERITY) { - mce_severity(&m, regs, &msg, true); - mce_panic("Local fatal machine check!", &m, msg); + mce_severity(m, regs, &msg, true); + mce_panic("Local fatal machine check!", &err, msg); } } @@ -1591,16 +1621,16 @@ noinstr void do_machine_check(struct pt_regs *regs) goto out; /* Fault was in user mode and we need to take some action */ - if ((m.cs & 3) == 3) { + if ((m->cs & 3) == 3) { /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - if (!mce_usable_address(&m)) - queue_task_work(&m, msg, kill_me_now); + if (!mce_usable_address(m)) + queue_task_work(&err, msg, kill_me_now); else - queue_task_work(&m, msg, kill_me_maybe); + queue_task_work(&err, msg, kill_me_maybe); - } else if (m.mcgstatus & MCG_STATUS_SEAM_NR) { + } else if (m->mcgstatus & MCG_STATUS_SEAM_NR) { /* * Saved RIP on stack makes it look like the machine check * was taken in the kernel on the instruction following @@ -1612,8 +1642,8 @@ noinstr void do_machine_check(struct pt_regs *regs) * not occur there. Mark the page as poisoned so it won't * be added to free list when the guest is terminated. */ - if (mce_usable_address(&m)) { - struct page *p = pfn_to_online_page(m.addr >> PAGE_SHIFT); + if (mce_usable_address(m)) { + struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT); if (p) SetPageHWPoison(p); @@ -1628,13 +1658,13 @@ noinstr void do_machine_check(struct pt_regs *regs) * corresponding exception handler which would do that is the * proper one. */ - if (m.kflags & MCE_IN_KERNEL_RECOV) { + if (m->kflags & MCE_IN_KERNEL_RECOV) { if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) - mce_panic("Failed kernel mode recovery", &m, msg); + mce_panic("Failed kernel mode recovery", &err, msg); } - if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, msg, kill_me_never); + if (m->kflags & MCE_IN_KERNEL_COPYIN) + queue_task_work(&err, msg, kill_me_never); } out: diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index af44fd5dbd7c..8d023239ce18 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -264,15 +264,8 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, return put_user(sizeof(struct mce), p); case MCE_GET_LOG_LEN: return put_user(mcelog->len, p); - case MCE_GETCLEAR_FLAGS: { - unsigned flags; - - do { - flags = mcelog->flags; - } while (cmpxchg(&mcelog->flags, flags, 0) != flags); - - return put_user(flags, p); - } + case MCE_GETCLEAR_FLAGS: + return put_user(xchg(&mcelog->flags, 0), p); default: return -ENOTTY; } diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c index 4284749ec803..d0be6dda0c14 100644 --- a/arch/x86/kernel/cpu/mce/genpool.c +++ b/arch/x86/kernel/cpu/mce/genpool.c @@ -31,15 +31,15 @@ static LLIST_HEAD(mce_event_llist); */ static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l) { + struct mce_hw_err *err1, *err2; struct mce_evt_llist *node; - struct mce *m1, *m2; - m1 = &t->mce; + err1 = &t->err; llist_for_each_entry(node, &l->llnode, llnode) { - m2 = &node->mce; + err2 = &node->err; - if (!mce_cmp(m1, m2)) + if (!mce_cmp(&err1->m, &err2->m)) return true; } return false; @@ -73,8 +73,8 @@ struct llist_node *mce_gen_pool_prepare_records(void) void mce_gen_pool_process(struct work_struct *__unused) { - struct llist_node *head; struct mce_evt_llist *node, *tmp; + struct llist_node *head; struct mce *mce; head = llist_del_all(&mce_event_llist); @@ -83,7 +83,7 @@ void mce_gen_pool_process(struct work_struct *__unused) head = llist_reverse_order(head); llist_for_each_entry_safe(node, tmp, head, llnode) { - mce = &node->mce; + mce = &node->err.m; blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); } @@ -94,11 +94,11 @@ bool mce_gen_pool_empty(void) return llist_empty(&mce_event_llist); } -int mce_gen_pool_add(struct mce *mce) +int mce_gen_pool_add(struct mce_hw_err *err) { struct mce_evt_llist *node; - if (filter_mce(mce)) + if (filter_mce(&err->m)) return -EINVAL; if (!mce_evt_pool) @@ -110,7 +110,7 @@ int mce_gen_pool_add(struct mce *mce) return -ENOMEM; } - memcpy(&node->mce, mce, sizeof(*mce)); + memcpy(&node->err, err, sizeof(*err)); llist_add(&node->llnode, &mce_event_llist); return 0; diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 49ed3428785d..313fe682db33 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -502,8 +502,9 @@ static void prepare_msrs(void *info) static void do_inject(void) { - u64 mcg_status = 0; unsigned int cpu = i_mce.extcpu; + struct mce_hw_err err; + u64 mcg_status = 0; u8 b = i_mce.bank; i_mce.tsc = rdtsc_ordered(); @@ -517,7 +518,8 @@ static void do_inject(void) i_mce.status |= MCI_STATUS_SYNDV; if (inj_type == SW_INJ) { - mce_log(&i_mce); + err.m = i_mce; + mce_log(&err); return; } diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index f6103e6bf69a..b3cd2c61b11d 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -94,7 +94,7 @@ static int cmci_supported(int *banks) if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) return 0; rdmsrl(MSR_IA32_MCG_CAP, cap); - *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); + *banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK); return !!(cap & MCG_CMCI_P); } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 43c7f3b71df5..84f810598231 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -26,12 +26,12 @@ extern struct blocking_notifier_head x86_mce_decoder_chain; struct mce_evt_llist { struct llist_node llnode; - struct mce mce; + struct mce_hw_err err; }; void mce_gen_pool_process(struct work_struct *__unused); bool mce_gen_pool_empty(void); -int mce_gen_pool_add(struct mce *mce); +int mce_gen_pool_add(struct mce_hw_err *err); int mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index f63b051f25a0..31a73715d755 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -584,7 +584,7 @@ void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_ native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy); } -static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); +static enum ucode_state _load_microcode_amd(u8 family, const u8 *data, size_t size); static int __init save_microcode_in_initrd(void) { @@ -605,7 +605,7 @@ static int __init save_microcode_in_initrd(void) if (!desc.mc) return -EINVAL; - ret = load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); + ret = _load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); if (ret > UCODE_UPDATED) return -EINVAL; @@ -613,16 +613,19 @@ static int __init save_microcode_in_initrd(void) } early_initcall(save_microcode_in_initrd); -static inline bool patch_cpus_equivalent(struct ucode_patch *p, struct ucode_patch *n) +static inline bool patch_cpus_equivalent(struct ucode_patch *p, + struct ucode_patch *n, + bool ignore_stepping) { /* Zen and newer hardcode the f/m/s in the patch ID */ if (x86_family(bsp_cpuid_1_eax) >= 0x17) { union cpuid_1_eax p_cid = ucode_rev_to_cpuid(p->patch_id); union cpuid_1_eax n_cid = ucode_rev_to_cpuid(n->patch_id); - /* Zap stepping */ - p_cid.stepping = 0; - n_cid.stepping = 0; + if (ignore_stepping) { + p_cid.stepping = 0; + n_cid.stepping = 0; + } return p_cid.full == n_cid.full; } else { @@ -644,13 +647,13 @@ static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equi WARN_ON_ONCE(!n.patch_id); list_for_each_entry(p, µcode_cache, plist) - if (patch_cpus_equivalent(p, &n)) + if (patch_cpus_equivalent(p, &n, false)) return p; return NULL; } -static inline bool patch_newer(struct ucode_patch *p, struct ucode_patch *n) +static inline int patch_newer(struct ucode_patch *p, struct ucode_patch *n) { /* Zen and newer hardcode the f/m/s in the patch ID */ if (x86_family(bsp_cpuid_1_eax) >= 0x17) { @@ -659,6 +662,9 @@ static inline bool patch_newer(struct ucode_patch *p, struct ucode_patch *n) zp.ucode_rev = p->patch_id; zn.ucode_rev = n->patch_id; + if (zn.stepping != zp.stepping) + return -1; + return zn.rev > zp.rev; } else { return n->patch_id > p->patch_id; @@ -668,10 +674,14 @@ static inline bool patch_newer(struct ucode_patch *p, struct ucode_patch *n) static void update_cache(struct ucode_patch *new_patch) { struct ucode_patch *p; + int ret; list_for_each_entry(p, µcode_cache, plist) { - if (patch_cpus_equivalent(p, new_patch)) { - if (!patch_newer(p, new_patch)) { + if (patch_cpus_equivalent(p, new_patch, true)) { + ret = patch_newer(p, new_patch); + if (ret < 0) + continue; + else if (!ret) { /* we already have the latest patch */ kfree(new_patch->data); kfree(new_patch); @@ -944,21 +954,30 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, return UCODE_OK; } -static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +static enum ucode_state _load_microcode_amd(u8 family, const u8 *data, size_t size) { - struct cpuinfo_x86 *c; - unsigned int nid, cpu; - struct ucode_patch *p; enum ucode_state ret; /* free old equiv table */ free_equiv_cpu_table(); ret = __load_microcode_amd(family, data, size); - if (ret != UCODE_OK) { + if (ret != UCODE_OK) cleanup(); + + return ret; +} + +static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +{ + struct cpuinfo_x86 *c; + unsigned int nid, cpu; + struct ucode_patch *p; + enum ucode_state ret; + + ret = _load_microcode_amd(family, data, size); + if (ret != UCODE_OK) return ret; - } for_each_node(nid) { cpu = cpumask_first(cpumask_of_node(nid)); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 815fa67356a2..f3d534807d91 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -319,12 +319,6 @@ static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci, return UCODE_OK; } - /* - * Writeback and invalidate caches before updating microcode to avoid - * internal issues depending on what the microcode is updating. - */ - native_wbinvd(); - /* write microcode via MSR 0x79 */ native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); @@ -574,14 +568,14 @@ static bool is_blacklisted(unsigned int cpu) /* * Late loading on model 79 with microcode revision less than 0x0b000021 * and LLC size per core bigger than 2.5MB may result in a system hang. - * This behavior is documented in item BDF90, #334165 (Intel Xeon + * This behavior is documented in item BDX90, #334165 (Intel Xeon * Processor E7-8800/4800 v4 Product Family). */ if (c->x86_vfm == INTEL_BROADWELL_X && c->x86_stepping == 0x01 && llc_size_per_core > 2621440 && c->microcode < 0x0b000021) { - pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); + pr_err_once("Erratum BDX90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); return true; } diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index e65fae63660e..41ed01f46bd9 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -41,11 +41,11 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) "fpu_exception\t: %s\n" "cpuid level\t: %d\n" "wp\t\t: yes\n", - boot_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", - boot_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", - boot_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", - boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", - boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + str_yes_no(boot_cpu_has_bug(X86_BUG_FDIV)), + str_yes_no(boot_cpu_has_bug(X86_BUG_F00F)), + str_yes_no(boot_cpu_has_bug(X86_BUG_COMA)), + str_yes_no(boot_cpu_has(X86_FEATURE_FPU)), + str_yes_no(boot_cpu_has(X86_FEATURE_FPU)), c->cpuid_level); } #else diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 8591d53c144b..b681c2e07dbf 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -207,7 +207,7 @@ static inline bool rdt_get_mb_table(struct rdt_resource *r) return false; } -static bool __get_mem_config_intel(struct rdt_resource *r) +static __init bool __get_mem_config_intel(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_3_eax eax; @@ -241,7 +241,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r) return true; } -static bool __rdt_get_mem_config_amd(struct rdt_resource *r) +static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); u32 eax, ebx, ecx, edx, subleaf; diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 50fa1fe9a073..200d89a64027 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -29,10 +29,10 @@ * hardware. The allocated bandwidth percentage is rounded to the next * control step available on the hardware. */ -static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) +static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) { - unsigned long bw; int ret; + u32 bw; /* * Only linear delay values is supported for current Intel SKUs. @@ -42,16 +42,21 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return false; } - ret = kstrtoul(buf, 10, &bw); + ret = kstrtou32(buf, 10, &bw); if (ret) { - rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); + rdt_last_cmd_printf("Invalid MB value %s\n", buf); return false; } - if ((bw < r->membw.min_bw || bw > r->default_ctrl) && - !is_mba_sc(r)) { - rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw, - r->membw.min_bw, r->default_ctrl); + /* Nothing else to do if software controller is enabled. */ + if (is_mba_sc(r)) { + *data = bw; + return true; + } + + if (bw < r->membw.min_bw || bw > r->default_ctrl) { + rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", + bw, r->membw.min_bw, r->default_ctrl); return false; } @@ -65,7 +70,7 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, struct resctrl_staged_config *cfg; u32 closid = data->rdtgrp->closid; struct rdt_resource *r = s->res; - unsigned long bw_val; + u32 bw_val; cfg = &d->staged_config[s->conf_type]; if (cfg->have_new_ctrl) { diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 851b561850e0..5fcb3d635d91 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -1158,11 +1158,12 @@ static __init int snc_get_config(void) ret = cpus_per_l3 / cpus_per_node; - /* sanity check: Only valid results are 1, 2, 3, 4 */ + /* sanity check: Only valid results are 1, 2, 3, 4, 6 */ switch (ret) { case 1: break; case 2 ... 4: + case 6: pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret); rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE; break; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index d7163b764c62..d906a1cd8491 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1596,7 +1596,7 @@ static void mondata_config_read(struct rdt_mon_domain *d, struct mon_config_info static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) { - struct mon_config_info mon_info = {0}; + struct mon_config_info mon_info; struct rdt_mon_domain *dom; bool sep = false; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index c84c30188fdf..16f3ca30626a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -24,34 +24,36 @@ struct cpuid_bit { * levels are different and there is a separate entry for each. */ static const struct cpuid_bit cpuid_bits[] = { - { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, - { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, - { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, - { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 }, - { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, - { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, - { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, - { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 }, - { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, - { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, - { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, - { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, - { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, - { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, - { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, - { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, - { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 }, - { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, - { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, - { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, - { X86_FEATURE_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, - { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, - { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, - { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, - { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, - { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, + { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, + { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 }, + { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, + { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 }, + { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, + { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, + { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, + { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, + { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, + { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, + { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, + { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, + { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 }, + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, + { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, + { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, + { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, + { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 }, + { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 }, + { X86_FEATURE_AMD_HETEROGENEOUS_CORES, CPUID_EAX, 30, 0x80000026, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 9ace84486499..eb5848d1851a 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -901,19 +901,15 @@ static struct miscdevice sgx_dev_provision = { int sgx_set_attribute(unsigned long *allowed_attributes, unsigned int attribute_fd) { - struct fd f = fdget(attribute_fd); + CLASS(fd, f)(attribute_fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EINVAL; - if (fd_file(f)->f_op != &sgx_provision_fops) { - fdput(f); + if (fd_file(f)->f_op != &sgx_provision_fops) return -EINVAL; - } *allowed_attributes |= SGX_ATTR_PROVISIONKEY; - - fdput(f); return 0; } EXPORT_SYMBOL_GPL(sgx_set_attribute); diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 7d476fa697ca..03b3c9c3a45e 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -182,6 +182,9 @@ static void parse_topology_amd(struct topo_scan *tscan) if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) has_topoext = cpu_parse_topology_ext(tscan); + if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) + tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); + if (!has_topoext && !parse_8000_0008(tscan)) return; diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c index 9a6069e7133c..8277c64f88db 100644 --- a/arch/x86/kernel/cpu/topology_common.c +++ b/arch/x86/kernel/cpu/topology_common.c @@ -3,6 +3,7 @@ #include <xen/xen.h> +#include <asm/intel-family.h> #include <asm/apic.h> #include <asm/processor.h> #include <asm/smp.h> @@ -27,6 +28,36 @@ void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom, } } +enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c) +{ + if (c->x86_vendor == X86_VENDOR_INTEL) { + switch (c->topo.intel_type) { + case INTEL_CPU_TYPE_ATOM: return TOPO_CPU_TYPE_EFFICIENCY; + case INTEL_CPU_TYPE_CORE: return TOPO_CPU_TYPE_PERFORMANCE; + } + } + if (c->x86_vendor == X86_VENDOR_AMD) { + switch (c->topo.amd_type) { + case 0: return TOPO_CPU_TYPE_PERFORMANCE; + case 1: return TOPO_CPU_TYPE_EFFICIENCY; + } + } + + return TOPO_CPU_TYPE_UNKNOWN; +} + +const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c) +{ + switch (get_topology_cpu_type(c)) { + case TOPO_CPU_TYPE_PERFORMANCE: + return "performance"; + case TOPO_CPU_TYPE_EFFICIENCY: + return "efficiency"; + default: + return "unknown"; + } +} + static unsigned int __maybe_unused parse_num_cores_legacy(struct cpuinfo_x86 *c) { struct { @@ -87,6 +118,7 @@ static void parse_topology(struct topo_scan *tscan, bool early) .cu_id = 0xff, .llc_id = BAD_APICID, .l2c_id = BAD_APICID, + .cpu_type = TOPO_CPU_TYPE_UNKNOWN, }; struct cpuinfo_x86 *c = tscan->c; struct { @@ -132,6 +164,8 @@ static void parse_topology(struct topo_scan *tscan, bool early) case X86_VENDOR_INTEL: if (!IS_ENABLED(CONFIG_CPU_SUP_INTEL) || !cpu_parse_topology_ext(tscan)) parse_legacy(tscan); + if (c->cpuid_level >= 0x1a) + c->topo.cpu_type = cpuid_eax(0x1a); break; case X86_VENDOR_HYGON: if (IS_ENABLED(CONFIG_CPU_SUP_HYGON)) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8da0e66ca22d..adb09f78edb2 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -647,7 +647,7 @@ void prepare_ftrace_return(unsigned long ip, unsigned long *parent, void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { - struct pt_regs *regs = &fregs->regs; + struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs; unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs); prepare_ftrace_return(ip, (unsigned long *)stack, 0); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 16752b8dfa89..56163e2124cf 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -77,6 +77,7 @@ SYM_CODE_START_NOALIGN(startup_64) lretq .Lon_kernel_cs: + ANNOTATE_NOENDBR UNWIND_HINT_END_OF_STACK #ifdef CONFIG_AMD_MEM_ENCRYPT diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 15af7e98e161..2be55ec3f392 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -9,6 +9,7 @@ #include <linux/hardirq.h> #include <linux/preempt.h> #include <linux/ftrace.h> +#include <asm/text-patching.h> #include "common.h" @@ -36,23 +37,25 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (kprobe_running()) { kprobes_inc_nmissed_count(p); } else { - unsigned long orig_ip = regs->ip; + unsigned long orig_ip = instruction_pointer(regs); + /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ - regs->ip = ip + sizeof(kprobe_opcode_t); + instruction_pointer_set(regs, ip + INT3_INSN_SIZE); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) { - /* - * Emulate singlestep (and also recover regs->ip) - * as if there is a 5byte nop - */ - regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; if (unlikely(p->post_handler)) { + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ + instruction_pointer_set(regs, ip + MCOUNT_INSN_SIZE); kcb->kprobe_status = KPROBE_HIT_SSDONE; p->post_handler(p, regs, 0); } - regs->ip = orig_ip; + /* Recover IP address */ + instruction_pointer_set(regs, orig_ip); } /* * If pre_handler returns !0, it changes regs->ip. We have to diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 263f8aed4e2c..21e9e4845354 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -37,6 +37,7 @@ #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> +#include <asm/mtrr.h> #include <asm/tlb.h> #include <asm/cpuidle_haltpoll.h> #include <asm/ptrace.h> @@ -980,6 +981,9 @@ static void __init kvm_init_platform(void) } kvmclock_init(); x86_platform.apic_post_init = kvm_apic_init; + + /* Set WB as the default cache mode for SEV-SNP and TDX */ + mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); } #if defined(CONFIG_AMD_MEM_ENCRYPT) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 0e0a4cf6b5eb..615922838c51 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -530,7 +530,7 @@ static inline void kb_wait(void) static inline void nmi_shootdown_cpus_on_restart(void); -#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) +#if IS_ENABLED(CONFIG_KVM_X86) /* RCU-protected callback to disable virtualization prior to reboot. */ static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; @@ -600,7 +600,7 @@ static void emergency_reboot_disable_virtualization(void) } #else static void emergency_reboot_disable_virtualization(void) { } -#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ +#endif /* CONFIG_KVM_X86 */ void __attribute__((weak)) mach_reboot_fixups(void) { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 766f092dab80..b5a8f0891135 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -497,8 +497,9 @@ static int x86_cluster_flags(void) static int x86_die_flags(void) { - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - return x86_sched_itmt_flags(); + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU) || + cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) + return x86_sched_itmt_flags(); return 0; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d05392db5d0f..2dbadf347b5f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -261,12 +261,6 @@ static noinstr bool handle_bug(struct pt_regs *regs) int ud_type; u32 imm; - /* - * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() - * is a rare case that uses @regs without passing them to - * irqentry_enter(). - */ - kmsan_unpoison_entry_regs(regs); ud_type = decode_bug(regs->ip, &imm); if (ud_type == BUG_NONE) return handled; @@ -276,6 +270,12 @@ static noinstr bool handle_bug(struct pt_regs *regs) */ instrumentation_begin(); /* + * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() + * is a rare case that uses @regs without passing them to + * irqentry_enter(). + */ + kmsan_unpoison_entry_regs(regs); + /* * Since we're emulating a CALL with exceptions, restore the interrupt * state to what it was at the exception site. */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index dfe6847fd99e..67aeaba4ba9c 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -174,10 +174,11 @@ static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long ts c2n = per_cpu_ptr(&cyc2ns, cpu); - raw_write_seqcount_latch(&c2n->seq); + write_seqcount_latch_begin(&c2n->seq); c2n->data[0] = data; - raw_write_seqcount_latch(&c2n->seq); + write_seqcount_latch(&c2n->seq); c2n->data[1] = data; + write_seqcount_latch_end(&c2n->seq); } static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index d00c28aaa5be..d4705a348a80 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -723,7 +723,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, state->sp = task->thread.sp + sizeof(*frame); state->bp = READ_ONCE_NOCHECK(frame->bp); state->ip = READ_ONCE_NOCHECK(frame->ret_addr); - state->signal = (void *)state->ip == ret_from_fork; + state->signal = (void *)state->ip == ret_from_fork_asm; } if (get_stack_info((unsigned long *)state->sp, state->task, diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 6726be89b7a6..68efd8cd8bf1 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -193,29 +193,6 @@ SECTIONS ORC_UNWIND_TABLE - . = ALIGN(PAGE_SIZE); - __vvar_page = .; - - .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { - /* work around gold bug 13023 */ - __vvar_beginning_hack = .; - - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) \ - . = __vvar_beginning_hack + offset; \ - *(.vvar_ ## name) -#include <asm/vvar.h> -#undef EMIT_VVAR - - /* - * Pad the rest of the page with zeros. Otherwise the loader - * can leave garbage here. - */ - . = __vvar_beginning_hack + PAGE_SIZE; - } :data - - . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); - /* Init code and data - will be freed after init */ . = ALIGN(PAGE_SIZE); .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { @@ -358,6 +335,7 @@ SECTIONS #endif RUNTIME_CONST_VARIABLES + RUNTIME_CONST(ptr, USER_PTR_MAX) . = ALIGN(PAGE_SIZE); @@ -490,6 +468,9 @@ SECTIONS . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), "kernel image bigger than KERNEL_IMAGE_SIZE"); +/* needed for Clang - see arch/x86/entry/entry.S */ +PROVIDE(__ref_stack_chk_guard = __stack_chk_guard); + #ifdef CONFIG_X86_64 /* * Per-cpu symbols which need to be offset from __per_cpu_load @@ -527,3 +508,22 @@ INIT_PER_CPU(irq_stack_backing_store); #endif #endif /* CONFIG_X86_64 */ + +/* + * The symbols below are referenced using relative relocations in the + * respective ELF notes. This produces build time constants that the + * linker will never mark as relocatable. (Using just ABSOLUTE() is not + * sufficient for that). + */ +#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PV +xen_elfnote_entry_value = + ABSOLUTE(xen_elfnote_entry) + ABSOLUTE(startup_xen); +#endif +xen_elfnote_hypercall_page_value = + ABSOLUTE(xen_elfnote_hypercall_page) + ABSOLUTE(hypercall_page); +#endif +#ifdef CONFIG_PVH +xen_elfnote_phys32_entry_value = + ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET); +#endif diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 730c2f34d347..f09f13c01c6b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -17,8 +17,8 @@ menuconfig VIRTUALIZATION if VIRTUALIZATION -config KVM - tristate "Kernel-based Virtual Machine (KVM) support" +config KVM_X86 + def_tristate KVM if KVM_INTEL || KVM_AMD depends on X86_LOCAL_APIC select KVM_COMMON select KVM_GENERIC_MMU_NOTIFIER @@ -44,7 +44,11 @@ config KVM select HAVE_KVM_PM_NOTIFIER if PM select KVM_GENERIC_HARDWARE_ENABLING select KVM_GENERIC_PRE_FAULT_MEMORY + select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM select KVM_WERROR if WERROR + +config KVM + tristate "Kernel-based Virtual Machine (KVM) support" help Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent @@ -77,7 +81,6 @@ config KVM_SW_PROTECTED_VM bool "Enable support for KVM software-protected VMs" depends on EXPERT depends on KVM && X86_64 - select KVM_GENERIC_PRIVATE_MEM help Enable support for KVM software-protected VMs. Currently, software- protected VMs are purely a development and testing vehicle for diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 5494669a055a..f9dddb8cb466 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -32,7 +32,7 @@ kvm-intel-y += vmx/vmx_onhyperv.o vmx/hyperv_evmcs.o kvm-amd-y += svm/svm_onhyperv.o endif -obj-$(CONFIG_KVM) += kvm.o +obj-$(CONFIG_KVM_X86) += kvm.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o obj-$(CONFIG_KVM_AMD) += kvm-amd.o diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2098dc689088..95c6beb8ce27 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2629,19 +2629,26 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (apic->apicv_active) { - /* irr_pending is always true when apicv is activated. */ - apic->irr_pending = true; + /* + * When APICv is enabled, KVM must always search the IRR for a pending + * IRQ, as other vCPUs and devices can set IRR bits even if the vCPU + * isn't running. If APICv is disabled, KVM _should_ search the IRR + * for a pending IRQ. But KVM currently doesn't ensure *all* hardware, + * e.g. CPUs and IOMMUs, has seen the change in state, i.e. searching + * the IRR at this time could race with IRQ delivery from hardware that + * still sees APICv as being enabled. + * + * FIXME: Ensure other vCPUs and devices observe the change in APICv + * state prior to updating KVM's metadata caches, so that KVM + * can safely search the IRR and set irr_pending accordingly. + */ + apic->irr_pending = true; + + if (apic->apicv_active) apic->isr_count = 1; - } else { - /* - * Don't clear irr_pending, searching the IRR can race with - * updates from the CPU as APICv is still active from hardware's - * perspective. The flag will be cleared as appropriate when - * KVM injects the interrupt. - */ + else apic->isr_count = count_vectors(apic->regs + APIC_ISR); - } + apic->highest_isr_cache = -1; } diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e52f990548df..8e853a5fc867 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1556,6 +1556,17 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { bool flush = false; + /* + * To prevent races with vCPUs faulting in a gfn using stale data, + * zapping a gfn range must be protected by mmu_invalidate_in_progress + * (and mmu_invalidate_seq). The only exception is memslot deletion; + * in that case, SRCU synchronization ensures that SPTEs are zapped + * after all vCPUs have unlocked SRCU, guaranteeing that vCPUs see the + * invalid slot. + */ + lockdep_assert_once(kvm->mmu_invalidate_in_progress || + lockdep_is_held(&kvm->slots_lock)); + if (kvm_memslots_have_rmaps(kvm)) flush = __kvm_rmap_zap_gfn_range(kvm, range->slot, range->start, range->end, @@ -7047,14 +7058,42 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) kvm_mmu_zap_all(kvm); } -/* - * Zapping leaf SPTEs with memslot range when a memslot is moved/deleted. - * - * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst - * case scenario we'll have unused shadow pages lying around until they - * are recycled due to age or when the VM is destroyed. - */ -static void kvm_mmu_zap_memslot_leafs(struct kvm *kvm, struct kvm_memory_slot *slot) +static void kvm_mmu_zap_memslot_pages_and_flush(struct kvm *kvm, + struct kvm_memory_slot *slot, + bool flush) +{ + LIST_HEAD(invalid_list); + unsigned long i; + + if (list_empty(&kvm->arch.active_mmu_pages)) + goto out_flush; + + /* + * Since accounting information is stored in struct kvm_arch_memory_slot, + * all MMU pages that are shadowing guest PTEs must be zapped before the + * memslot is deleted, as freeing such pages after the memslot is freed + * will result in use-after-free, e.g. in unaccount_shadowed(). + */ + for (i = 0; i < slot->npages; i++) { + struct kvm_mmu_page *sp; + gfn_t gfn = slot->base_gfn + i; + + for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); + flush = false; + cond_resched_rwlock_write(&kvm->mmu_lock); + } + } + +out_flush: + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); +} + +static void kvm_mmu_zap_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) { struct kvm_gfn_range range = { .slot = slot, @@ -7062,11 +7101,11 @@ static void kvm_mmu_zap_memslot_leafs(struct kvm *kvm, struct kvm_memory_slot *s .end = slot->base_gfn + slot->npages, .may_block = true, }; + bool flush; write_lock(&kvm->mmu_lock); - if (kvm_unmap_gfn_range(kvm, &range)) - kvm_flush_remote_tlbs_memslot(kvm, slot); - + flush = kvm_unmap_gfn_range(kvm, &range); + kvm_mmu_zap_memslot_pages_and_flush(kvm, slot, flush); write_unlock(&kvm->mmu_lock); } @@ -7082,7 +7121,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, if (kvm_memslot_flush_zap_all(kvm)) kvm_mmu_zap_all_fast(kvm); else - kvm_mmu_zap_memslot_leafs(kvm, slot); + kvm_mmu_zap_memslot(kvm, slot); } void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index d5314cb7dff4..cf84103ce38b 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -63,8 +63,12 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) u64 pdpte; int ret; + /* + * Note, nCR3 is "assumed" to be 32-byte aligned, i.e. the CPU ignores + * nCR3[4:0] when loading PDPTEs from memory. + */ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte, - offset_in_page(cr3) + index * 8, 8); + (cr3 & GENMASK(11, 5)) + index * 8, 8); if (ret) return 0; return pdpte; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0b851ef937f2..92d4711fd1e4 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -450,8 +450,11 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, goto e_free; /* This needs to happen after SEV/SNP firmware initialization. */ - if (vm_type == KVM_X86_SNP_VM && snp_guest_req_init(kvm)) - goto e_free; + if (vm_type == KVM_X86_SNP_VM) { + ret = snp_guest_req_init(kvm); + if (ret) + goto e_free; + } INIT_LIST_HEAD(&sev->regions_list); INIT_LIST_HEAD(&sev->mirror_vms); @@ -530,17 +533,12 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) static int __sev_issue_cmd(int fd, int id, void *data, int *error) { - struct fd f; - int ret; + CLASS(fd, f)(fd); - f = fdget(fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; - ret = sev_issue_cmd_external_user(fd_file(f), id, data, error); - - fdput(f); - return ret; + return sev_issue_cmd_external_user(fd_file(f), id, data, error); } static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) @@ -2073,23 +2071,21 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) { struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_info *src_sev, *cg_cleanup_sev; - struct fd f = fdget(source_fd); + CLASS(fd, f)(source_fd); struct kvm *source_kvm; bool charged = false; int ret; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; - if (!file_is_kvm(fd_file(f))) { - ret = -EBADF; - goto out_fput; - } + if (!file_is_kvm(fd_file(f))) + return -EBADF; source_kvm = fd_file(f)->private_data; ret = sev_lock_two_vms(kvm, source_kvm); if (ret) - goto out_fput; + return ret; if (kvm->arch.vm_type != source_kvm->arch.vm_type || sev_guest(kvm) || !sev_guest(source_kvm)) { @@ -2136,8 +2132,6 @@ out_dst_cgroup: cg_cleanup_sev->misc_cg = NULL; out_unlock: sev_unlock_two_vms(kvm, source_kvm); -out_fput: - fdput(f); return ret; } @@ -2212,10 +2206,6 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (sev->snp_context) return -EINVAL; - sev->snp_context = snp_context_create(kvm, argp); - if (!sev->snp_context) - return -ENOTTY; - if (params.flags) return -EINVAL; @@ -2230,6 +2220,10 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET) return -EINVAL; + sev->snp_context = snp_context_create(kvm, argp); + if (!sev->snp_context) + return -ENOTTY; + start.gctx_paddr = __psp_pa(sev->snp_context); start.policy = params.policy; memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw)); @@ -2798,23 +2792,21 @@ failed: int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) { - struct fd f = fdget(source_fd); + CLASS(fd, f)(source_fd); struct kvm *source_kvm; struct kvm_sev_info *source_sev, *mirror_sev; int ret; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; - if (!file_is_kvm(fd_file(f))) { - ret = -EBADF; - goto e_source_fput; - } + if (!file_is_kvm(fd_file(f))) + return -EBADF; source_kvm = fd_file(f)->private_data; ret = sev_lock_two_vms(kvm, source_kvm); if (ret) - goto e_source_fput; + return ret; /* * Mirrors of mirrors should work, but let's not get silly. Also @@ -2857,8 +2849,6 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) e_unlock: sev_unlock_two_vms(kvm, source_kvm); -e_source_fput: - fdput(f); return ret; } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a8e7bc04d9bf..931a7361c30f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1197,11 +1197,14 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept); /* - * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings - * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a - * full TLB flush from the guest's perspective. This is required even - * if VPID is disabled in the host as KVM may need to synchronize the - * MMU in response to the guest TLB flush. + * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the + * same VPID as the host, and so architecturally, linear and combined + * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM + * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2, + * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This + * is required if VPID is disabled in KVM, as a TLB flush (there are no + * VPIDs) still occurs from L1's perspective, and KVM may need to + * synchronize the MMU in response to the guest TLB flush. * * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. * EPT is a special snowflake, as guest-physical mappings aren't @@ -2315,6 +2318,17 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); + /* + * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the + * same VPID as the host. Emulate this behavior by using vpid01 for L2 + * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter + * and VM-Exit are architecturally required to flush VPID=0, but *only* + * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the + * required flushes), but doing so would cause KVM to over-flush. E.g. + * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled, + * and then runs L2 X again, then KVM can and should retain TLB entries + * for VPID12=1. + */ if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); @@ -5950,6 +5964,12 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + /* + * Always flush the effective vpid02, i.e. never flush the current VPID + * and never explicitly flush vpid01. INVVPID targets a VPID, not a + * VMCS, and so whether or not the current vmcs12 has VPID enabled is + * irrelevant (and there may not be a loaded vmcs12). + */ vpid02 = nested_get_vpid02(vcpu); switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1a4438358c5e..d28618e9277e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -217,9 +217,11 @@ module_param(ple_window_shrink, uint, 0444); static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, uint, 0444); -/* Default is SYSTEM mode, 1 for host-guest mode */ +/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */ int __read_mostly pt_mode = PT_MODE_SYSTEM; +#ifdef CONFIG_BROKEN module_param(pt_mode, int, S_IRUGO); +#endif struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; @@ -3216,7 +3218,7 @@ void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) + if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu))) return nested_get_vpid02(vcpu); return to_vmx(vcpu)->vpid; } @@ -4888,9 +4890,6 @@ void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); - vmx_segment_cache_clear(vmx); - kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); - seg_setup(VCPU_SREG_CS); vmcs_write16(GUEST_CS_SELECTOR, 0xf000); vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); @@ -4917,6 +4916,9 @@ void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_writel(GUEST_IDTR_BASE, 0); vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); + vmx_segment_cache_clear(vmx); + kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); + vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 622fe24da910..a909b817b9c0 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -263,13 +263,6 @@ static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu) atomic_set(&vcpu->arch.xen.timer_pending, 0); } -static void kvm_xen_init_timer(struct kvm_vcpu *vcpu) -{ - hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_HARD); - vcpu->arch.xen.timer.function = xen_timer_callback; -} - static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) { struct kvm_vcpu_xen *vx = &v->arch.xen; @@ -1070,9 +1063,6 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } - if (!vcpu->arch.xen.timer.function) - kvm_xen_init_timer(vcpu); - /* Stop the timer (if it's running) before changing the vector */ kvm_xen_stop_timer(vcpu); vcpu->arch.xen.timer_virq = data->u.timer.port; @@ -2235,6 +2225,8 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) vcpu->arch.xen.poll_evtchn = 0; timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); + hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + vcpu->arch.xen.timer.function = xen_timer_callback; kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm); kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm); diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index d066aecf8aeb..4357ec2a0bfc 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -39,8 +39,13 @@ .macro check_range size:req .if IS_ENABLED(CONFIG_X86_64) - mov %rax, %rdx - sar $63, %rdx + movq $0x0123456789abcdef,%rdx + 1: + .pushsection runtime_ptr_USER_PTR_MAX,"a" + .long 1b - 8 - . + .popsection + cmp %rax, %rdx + sbb %rdx, %rdx or %rdx, %rax .else cmp $TASK_SIZE_MAX-\size+1, %eax diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 5952ab41c60f..6ffb931b9fb1 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -13,7 +13,7 @@ #endif #include <asm/inat.h> /*__ignore_sync_check__ */ #include <asm/insn.h> /* __ignore_sync_check__ */ -#include <asm/unaligned.h> /* __ignore_sync_check__ */ +#include <linux/unaligned.h> /* __ignore_sync_check__ */ #include <linux/errno.h> #include <linux/kconfig.h> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index eb503f53c319..101725c149c4 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -263,28 +263,33 @@ static void __init probe_page_size_mask(void) } /* - * INVLPG may not properly flush Global entries - * on these CPUs when PCIDs are enabled. + * INVLPG may not properly flush Global entries on + * these CPUs. New microcode fixes the issue. */ static const struct x86_cpu_id invlpg_miss_ids[] = { - X86_MATCH_VFM(INTEL_ALDERLAKE, 0), - X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0), - X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0), - X86_MATCH_VFM(INTEL_RAPTORLAKE, 0), - X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0), - X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0), + X86_MATCH_VFM(INTEL_ALDERLAKE, 0x2e), + X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0x42c), + X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0x11), + X86_MATCH_VFM(INTEL_RAPTORLAKE, 0x118), + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0x4117), + X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0x2e), {} }; static void setup_pcid(void) { + const struct x86_cpu_id *invlpg_miss_match; + if (!IS_ENABLED(CONFIG_X86_64)) return; if (!boot_cpu_has(X86_FEATURE_PCID)) return; - if (x86_match_cpu(invlpg_miss_ids)) { + invlpg_miss_match = x86_match_cpu(invlpg_miss_ids); + + if (invlpg_miss_match && + boot_cpu_data.microcode < invlpg_miss_match->driver_data) { pr_info("Incomplete global flushes, disabling PCID"); setup_clear_cpu_cap(X86_FEATURE_PCID); return; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 70b02fc61d93..8d29163568a7 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -656,7 +656,8 @@ static bool memremap_is_setup_data(resource_size_t phys_addr, paddr_next = data->next; len = data->len; - if ((phys_addr > paddr) && (phys_addr < (paddr + len))) { + if ((phys_addr > paddr) && + (phys_addr < (paddr + sizeof(struct setup_data) + len))) { memunmap(data); return true; } @@ -718,7 +719,8 @@ static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, paddr_next = data->next; len = data->len; - if ((phys_addr > paddr) && (phys_addr < (paddr + len))) { + if ((phys_addr > paddr) && + (phys_addr < (paddr + sizeof(struct setup_data) + len))) { early_memunmap(data, sizeof(*data)); return true; } diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 230f1dee4f09..e17e6e27b7ec 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -22,7 +22,7 @@ #include <linux/kernel.h> #include <linux/init.h> -#include <linux/random.h> +#include <linux/prandom.h> #include <linux/memblock.h> #include <linux/pgtable.h> diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 86a476a426c2..774f9677458f 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -311,59 +311,82 @@ static int amd_enc_status_change_finish(unsigned long vaddr, int npages, bool en return 0; } -static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) +int prepare_pte_enc(struct pte_enc_desc *d) { - pgprot_t old_prot, new_prot; - unsigned long pfn, pa, size; - pte_t new_pte; + pgprot_t old_prot; - pfn = pg_level_to_pfn(level, kpte, &old_prot); - if (!pfn) - return; + d->pfn = pg_level_to_pfn(d->pte_level, d->kpte, &old_prot); + if (!d->pfn) + return 1; - new_prot = old_prot; - if (enc) - pgprot_val(new_prot) |= _PAGE_ENC; + d->new_pgprot = old_prot; + if (d->encrypt) + pgprot_val(d->new_pgprot) |= _PAGE_ENC; else - pgprot_val(new_prot) &= ~_PAGE_ENC; + pgprot_val(d->new_pgprot) &= ~_PAGE_ENC; /* If prot is same then do nothing. */ - if (pgprot_val(old_prot) == pgprot_val(new_prot)) - return; + if (pgprot_val(old_prot) == pgprot_val(d->new_pgprot)) + return 1; - pa = pfn << PAGE_SHIFT; - size = page_level_size(level); + d->pa = d->pfn << PAGE_SHIFT; + d->size = page_level_size(d->pte_level); /* - * We are going to perform in-place en-/decryption and change the - * physical page attribute from C=1 to C=0 or vice versa. Flush the - * caches to ensure that data gets accessed with the correct C-bit. + * In-place en-/decryption and physical page attribute change + * from C=1 to C=0 or vice versa will be performed. Flush the + * caches to ensure that data gets accessed with the correct + * C-bit. */ - clflush_cache_range(__va(pa), size); + if (d->va) + clflush_cache_range(d->va, d->size); + else + clflush_cache_range(__va(d->pa), d->size); + + return 0; +} + +void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot) +{ + pte_t new_pte; + + /* Change the page encryption mask. */ + new_pte = pfn_pte(pfn, new_prot); + set_pte_atomic(kpte, new_pte); +} + +static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) +{ + struct pte_enc_desc d = { + .kpte = kpte, + .pte_level = level, + .encrypt = enc + }; + + if (prepare_pte_enc(&d)) + return; /* Encrypt/decrypt the contents in-place */ if (enc) { - sme_early_encrypt(pa, size); + sme_early_encrypt(d.pa, d.size); } else { - sme_early_decrypt(pa, size); + sme_early_decrypt(d.pa, d.size); /* * ON SNP, the page state in the RMP table must happen * before the page table updates. */ - early_snp_set_memory_shared((unsigned long)__va(pa), pa, 1); + early_snp_set_memory_shared((unsigned long)__va(d.pa), d.pa, 1); } - /* Change the page encryption mask. */ - new_pte = pfn_pte(pfn, new_prot); - set_pte_atomic(kpte, new_pte); + set_pte_enc_mask(kpte, d.pfn, d.new_pgprot); /* * If page is set encrypted in the page table, then update the RMP table to * add this page as private. */ if (enc) - early_snp_set_memory_private((unsigned long)__va(pa), pa, 1); + early_snp_set_memory_private((unsigned long)__va(d.pa), d.pa, 1); } static int __init early_set_memory_enc_dec(unsigned long vaddr, @@ -467,6 +490,8 @@ void __init sme_early_init(void) x86_platform.guest.enc_status_change_finish = amd_enc_status_change_finish; x86_platform.guest.enc_tlb_flush_required = amd_enc_tlb_flush_required; x86_platform.guest.enc_cache_flush_required = amd_enc_cache_flush_required; + x86_platform.guest.enc_kexec_begin = snp_kexec_begin; + x86_platform.guest.enc_kexec_finish = snp_kexec_finish; /* * AMD-SEV-ES intercepts the RDMSR to read the X2APIC ID in the diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index ac33b2263a43..e6c7686f443a 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -495,10 +495,10 @@ void __head sme_enable(struct boot_params *bp) unsigned int eax, ebx, ecx, edx; unsigned long feature_mask; unsigned long me_mask; - bool snp; + bool snp_en; u64 msr; - snp = snp_init(bp); + snp_en = snp_init(bp); /* Check for the SME/SEV support leaf */ eax = 0x80000000; @@ -531,8 +531,11 @@ void __head sme_enable(struct boot_params *bp) RIP_REL_REF(sev_status) = msr = __rdmsr(MSR_AMD64_SEV); feature_mask = (msr & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; - /* The SEV-SNP CC blob should never be present unless SEV-SNP is enabled. */ - if (snp && !(msr & MSR_AMD64_SEV_SNP_ENABLED)) + /* + * Any discrepancies between the presence of a CC blob and SNP + * enablement abort the guest. + */ + if (snp_en ^ !!(msr & MSR_AMD64_SEV_SNP_ENABLED)) snp_abort(); /* Check if memory encryption is enabled */ diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index a2cabb1c81e1..b8a6ffffb451 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -163,11 +163,6 @@ unsigned long get_mmap_base(int is_legacy) return is_legacy ? mm->mmap_legacy_base : mm->mmap_base; } -const char *arch_vma_name(struct vm_area_struct *vma) -{ - return NULL; -} - /** * mmap_address_hint_valid - Validate the address hint of mmap * @addr: Address hint diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 86593d1b787d..b0d5a644fc84 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -568,7 +568,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, * mm_cpumask. The TLB shootdown code can figure out from * cpu_tlbstate_shared.is_lazy whether or not to send an IPI. */ - if (WARN_ON_ONCE(prev != &init_mm && + if (IS_ENABLED(CONFIG_DEBUG_VM) && WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index f0cc00032751..846bf49f2508 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -656,8 +656,7 @@ static int qrk_capsule_setup_info(struct capsule_info *cap_info, void **pkbuff, } static const struct x86_cpu_id efi_capsule_quirk_ids[] = { - X86_MATCH_VENDOR_FAM_MODEL(INTEL, 5, INTEL_FAM5_QUARK_X1000, - &qrk_capsule_setup_info), + X86_MATCH_VFM(INTEL_QUARK_X1000, &qrk_capsule_setup_info), { } }; diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c index 27288d8d3f71..cd7e0c71adde 100644 --- a/arch/x86/platform/intel-mid/pwr.c +++ b/arch/x86/platform/intel-mid/pwr.c @@ -358,18 +358,18 @@ static int mid_pwr_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; } - ret = pcim_iomap_regions(pdev, 1 << 0, pci_name(pdev)); - if (ret) { - dev_err(&pdev->dev, "I/O memory remapping failed\n"); - return ret; - } - pwr = devm_kzalloc(dev, sizeof(*pwr), GFP_KERNEL); if (!pwr) return -ENOMEM; + pwr->regs = pcim_iomap_region(pdev, 0, "intel_mid_pwr"); + ret = PTR_ERR_OR_ZERO(pwr->regs); + if (ret) { + dev_err(&pdev->dev, "Could not request / ioremap I/O-Mem: %d\n", ret); + return ret; + } + pwr->dev = dev; - pwr->regs = pcim_iomap_table(pdev)[0]; pwr->irq = pdev->irq; mutex_init(&pwr->lock); diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index d3d456925b2a..ee25b032c0b3 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -569,7 +569,7 @@ static void __init imr_fixup_memmap(struct imr_device *idev) } static const struct x86_cpu_id imr_ids[] __initconst = { - X86_MATCH_VENDOR_FAM_MODEL(INTEL, 5, INTEL_FAM5_QUARK_X1000, NULL), + X86_MATCH_VFM(INTEL_QUARK_X1000, NULL), {} }; diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index 84ba715f44d1..657925b0f428 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -105,7 +105,7 @@ static void __init imr_self_test(void) } static const struct x86_cpu_id imr_ids[] __initconst = { - X86_MATCH_VENDOR_FAM_MODEL(INTEL, 5, INTEL_FAM5_QUARK_X1000, NULL), + X86_MATCH_VFM(INTEL_QUARK_X1000, NULL), {} }; diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c index c5f3bbdbdcfe..5591a2d9cfe8 100644 --- a/arch/x86/platform/iris/iris.c +++ b/arch/x86/platform/iris/iris.c @@ -73,7 +73,7 @@ static struct platform_driver iris_driver = { .name = "iris", }, .probe = iris_probe, - .remove_new = iris_remove, + .remove = iris_remove, }; static struct resource iris_resources[] = { diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c index 6a9c42de74e7..424eeae12759 100644 --- a/arch/x86/platform/olpc/olpc-xo1-pm.c +++ b/arch/x86/platform/olpc/olpc-xo1-pm.c @@ -159,7 +159,7 @@ static struct platform_driver cs5535_pms_driver = { .name = "cs5535-pms", }, .probe = xo1_pm_probe, - .remove_new = xo1_pm_remove, + .remove = xo1_pm_remove, }; static struct platform_driver cs5535_acpi_driver = { @@ -167,7 +167,7 @@ static struct platform_driver cs5535_acpi_driver = { .name = "olpc-xo1-pm-acpi", }, .probe = xo1_pm_probe, - .remove_new = xo1_pm_remove, + .remove = xo1_pm_remove, }; static int __init xo1_pm_init(void) diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c index 46d42ff6e18a..ccb23c73cbe8 100644 --- a/arch/x86/platform/olpc/olpc-xo1-sci.c +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c @@ -616,7 +616,7 @@ static struct platform_driver xo1_sci_driver = { .dev_groups = lid_groups, }, .probe = xo1_sci_probe, - .remove_new = xo1_sci_remove, + .remove = xo1_sci_remove, .suspend = xo1_sci_suspend, .resume = xo1_sci_resume, }; diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index 64fca49cd88f..4733a5f467b8 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -6,7 +6,9 @@ .code32 .text +#ifdef CONFIG_X86_32 #define _pa(x) ((x) - __START_KERNEL_map) +#endif #define rva(x) ((x) - pvh_start_xen) #include <linux/elfnote.h> @@ -52,7 +54,7 @@ #define PVH_CS_SEL (PVH_GDT_ENTRY_CS * 8) #define PVH_DS_SEL (PVH_GDT_ENTRY_DS * 8) -SYM_CODE_START_LOCAL(pvh_start_xen) +SYM_CODE_START(pvh_start_xen) UNWIND_HINT_END_OF_STACK cld @@ -72,8 +74,7 @@ SYM_CODE_START_LOCAL(pvh_start_xen) movl $0, %esp leal rva(gdt)(%ebp), %eax - leal rva(gdt_start)(%ebp), %ecx - movl %ecx, 2(%eax) + addl %eax, 2(%eax) lgdt (%eax) mov $PVH_DS_SEL,%eax @@ -103,10 +104,23 @@ SYM_CODE_START_LOCAL(pvh_start_xen) btsl $_EFER_LME, %eax wrmsr + /* + * Reuse the non-relocatable symbol emitted for the ELF note to + * subtract the build time physical address of pvh_start_xen() from + * its actual runtime address, without relying on absolute 32-bit ELF + * relocations, as these are not supported by the linker when running + * in -pie mode, and should be avoided in .head.text in general. + */ mov %ebp, %ebx - subl $_pa(pvh_start_xen), %ebx /* offset */ + subl rva(xen_elfnote_phys32_entry)(%ebp), %ebx jz .Lpagetable_done + /* + * Store the resulting load offset in phys_base. __pa() needs + * phys_base set to calculate the hypercall page in xen_pvh_init(). + */ + movl %ebx, rva(phys_base)(%ebp) + /* Fixup page-tables for relocation. */ leal rva(pvh_init_top_pgt)(%ebp), %edi movl $PTRS_PER_PGD, %ecx @@ -165,20 +179,12 @@ SYM_CODE_START_LOCAL(pvh_start_xen) xor %edx, %edx wrmsr - /* - * Calculate load offset and store in phys_base. __pa() needs - * phys_base set to calculate the hypercall page in xen_pvh_init(). - */ - movq %rbp, %rbx - subq $_pa(pvh_start_xen), %rbx - movq %rbx, phys_base(%rip) - call xen_prepare_pvh - /* - * Clear phys_base. __startup_64 will *add* to its value, - * so reset to 0. - */ - xor %rbx, %rbx - movq %rbx, phys_base(%rip) + /* Call xen_prepare_pvh() via the kernel virtual mapping */ + leaq xen_prepare_pvh(%rip), %rax + subq phys_base(%rip), %rax + addq $__START_KERNEL_map, %rax + ANNOTATE_RETPOLINE_SAFE + call *%rax /* startup_64 expects boot_params in %rsi. */ lea pvh_bootparams(%rip), %rsi @@ -217,8 +223,8 @@ SYM_CODE_END(pvh_start_xen) .section ".init.data","aw" .balign 8 SYM_DATA_START_LOCAL(gdt) - .word gdt_end - gdt_start - .long _pa(gdt_start) /* x86-64 will overwrite if relocated. */ + .word gdt_end - gdt_start - 1 + .long gdt_start - gdt .word 0 SYM_DATA_END(gdt) SYM_DATA_START_LOCAL(gdt_start) @@ -300,5 +306,5 @@ SYM_DATA_END(pvh_level2_kernel_pgt) .long KERNEL_IMAGE_SIZE - 1) #endif - ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY, - _ASM_PTR (pvh_start_xen - __START_KERNEL_map)) + ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY, .global xen_elfnote_phys32_entry; + xen_elfnote_phys32_entry: _ASM_PTR xen_elfnote_phys32_entry_value - .) diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index c101bed61940..27441e5863b2 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -56,6 +56,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { [S_ABS] = "^(xen_irq_disable_direct_reloc$|" "xen_save_fl_direct_reloc$|" + "xen_elfnote_.+_offset$|" "VDSO|" "__kcfi_typeid_|" "__crc_)", @@ -89,7 +90,6 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "init_per_cpu__.*|" "__end_rodata_hpage_align|" #endif - "__vvar_page|" "_end)$" }; diff --git a/arch/x86/virt/svm/Makefile b/arch/x86/virt/svm/Makefile index ef2a31bdcc70..eca6d71355fa 100644 --- a/arch/x86/virt/svm/Makefile +++ b/arch/x86/virt/svm/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_KVM_AMD_SEV) += sev.o +obj-$(CONFIG_CPU_SUP_AMD) += cmdline.o diff --git a/arch/x86/virt/svm/cmdline.c b/arch/x86/virt/svm/cmdline.c new file mode 100644 index 000000000000..affa2759fa20 --- /dev/null +++ b/arch/x86/virt/svm/cmdline.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD SVM-SEV command line parsing support + * + * Copyright (C) 2023 - 2024 Advanced Micro Devices, Inc. + * + * Author: Michael Roth <michael.roth@amd.com> + */ + +#include <linux/string.h> +#include <linux/printk.h> +#include <linux/cache.h> +#include <linux/cpufeature.h> + +#include <asm/sev-common.h> + +struct sev_config sev_cfg __read_mostly; + +static int __init init_sev_config(char *str) +{ + char *s; + + while ((s = strsep(&str, ","))) { + if (!strcmp(s, "debug")) { + sev_cfg.debug = true; + continue; + } + + if (!strcmp(s, "nosnp")) { + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) { + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); + continue; + } else { + goto warn; + } + } + +warn: + pr_info("SEV command-line option '%s' was not recognized\n", s); + } + + return 1; +} +__setup("sev=", init_sev_config); diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index 0ce17766c0e5..9a6a943d8e41 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -173,6 +173,8 @@ static void __init __snp_fixup_e820_tables(u64 pa) e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + if (!memblock_is_region_reserved(pa, PMD_SIZE)) + memblock_reserve(pa, PMD_SIZE); } } diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 2c12ae42dc8b..d6818c6cafda 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1032,6 +1032,10 @@ static u64 xen_do_read_msr(unsigned int msr, int *err) switch (msr) { case MSR_IA32_APICBASE: val &= ~X2APIC_ENABLE; + if (smp_processor_id() == 0) + val |= MSR_IA32_APICBASE_BSP; + else + val &= ~MSR_IA32_APICBASE_BSP; break; } return val; diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 758bcd47b72d..7f6c69dbb816 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -94,7 +94,8 @@ SYM_CODE_END(xen_cpu_bringup_again) ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) /* Map the p2m table to a 512GB-aligned user address. */ ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad (PUD_SIZE * PTRS_PER_PUD)) - ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .globl xen_elfnote_entry; + xen_elfnote_entry: _ASM_PTR xen_elfnote_entry_value - .) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables") ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, @@ -115,7 +116,8 @@ SYM_CODE_END(xen_cpu_bringup_again) #else # define FEATURES_DOM0 0 #endif - ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .globl xen_elfnote_hypercall_page; + xen_elfnote_hypercall_page: _ASM_PTR xen_elfnote_hypercall_page_value - .) ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long FEATURES_PV | FEATURES_PVH | FEATURES_DOM0) ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") |