120 files changed, 2467 insertions, 744 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 05496622b4ef..ef03e6e16bdb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3185,6 +3185,10 @@ allowed (eg kernel_enable_fpu()/kernel_disable_fpu()). There is some performance impact when enabling this. + ppc_tm= [PPC] + Format: {"off"} + Disable Hardware Transactional Memory + print-fatal-signals= [KNL] debug: print fatal signals diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 809c468edab1..b8b75b423316 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -334,7 +334,7 @@ config PPC_OF_PLATFORM_PCI default n config ARCH_SUPPORTS_DEBUG_PAGEALLOC - depends on PPC32 || PPC_STD_MMU_64 + depends on PPC32 || PPC_BOOK3S_64 def_bool y config ARCH_SUPPORTS_UPROBES @@ -721,7 +721,7 @@ config PPC_16K_PAGES config PPC_64K_PAGES bool "64k page size" - depends on !PPC_FSL_BOOK3E && (44x || PPC_STD_MMU_64 || PPC_BOOK3E_64) + depends on !PPC_FSL_BOOK3E && (44x || PPC_BOOK3S_64 || PPC_BOOK3E_64) select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64 config PPC_256K_PAGES @@ -780,7 +780,7 @@ config FORCE_MAX_ZONEORDER config PPC_SUBPAGE_PROT bool "Support setting protections for 4k subpages" - depends on PPC_STD_MMU_64 && PPC_64K_PAGES + depends on PPC_BOOK3S_64 && PPC_64K_PAGES help This option adds support for a system call to allow user programs to set access permissions (read/write, readonly, or no access) diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index c86df246339e..b8a1eb42a38d 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -369,4 +369,10 @@ config PPC_HTDUMP def_bool y depends on PPC_PTDUMP && PPC_BOOK3S +config PPC_FAST_ENDIAN_SWITCH + bool "Deprecated fast endian-switch syscall" + depends on DEBUG_KERNEL && PPC_BOOK3S_64 + help + If you're unsure what this is, say N. 
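As an aside on the new ppc_tm= parameter documented above: command-line switches like this are normally consumed by an early_param() handler. A minimal sketch, assuming a hypothetical tm_disabled flag that the TM setup code would then consult; the actual handler and flag names in the patch may differ:

	#include <linux/init.h>
	#include <linux/kernel.h>	/* kstrtobool() */

	static bool tm_disabled __initdata;

	/* Parse "ppc_tm=off" from the kernel command line. */
	static int __init parse_ppc_tm(char *str)
	{
		bool res;

		if (kstrtobool(str, &res))
			return -EINVAL;

		tm_disabled = !res;	/* "off" disables HTM at boot */
		return 0;
	}
	early_param("ppc_tm", parse_ppc_tm);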
+ endmenu diff --git a/arch/powerpc/boot/dts/acadia.dts b/arch/powerpc/boot/dts/acadia.dts index 57291f61ffe7..86266159521e 100644 --- a/arch/powerpc/boot/dts/acadia.dts +++ b/arch/powerpc/boot/dts/acadia.dts @@ -183,7 +183,7 @@ usb@ef603000 { compatible = "ohci-be"; reg = <0xef603000 0x80>; - interrupts-parent = <&UIC0>; + interrupt-parent = <&UIC0>; interrupts = <0xd 0x4 0xe 0x4>; }; diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index caee834760d2..4891bbed6258 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -192,6 +192,7 @@ CONFIG_IPMI_DEVICE_INTERFACE=y CONFIG_IPMI_POWERNV=y CONFIG_RAW_DRIVER=y CONFIG_MAX_RAW_DEVS=1024 +CONFIG_I2C_CHARDEV=y CONFIG_DRM=y CONFIG_DRM_AST=y CONFIG_FIRMWARE_EDID=y @@ -295,6 +296,7 @@ CONFIG_FUNCTION_GRAPH_TRACER=y CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_PPC_EMULATED_STATS=y CONFIG_CODE_PATCHING_SELFTEST=y CONFIG_FTR_FIXUP_SELFTEST=y CONFIG_MSI_BITMAP_SELFTEST=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 3d935969e5a2..bde2cd1005a2 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -193,6 +193,7 @@ CONFIG_VIRTIO_CONSOLE=m CONFIG_IBM_BSR=m CONFIG_RAW_DRIVER=y CONFIG_MAX_RAW_DEVS=1024 +CONFIG_I2C_CHARDEV=y CONFIG_FB=y CONFIG_FIRMWARE_EDID=y CONFIG_FB_OF=y diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig new file mode 100644 index 000000000000..6bd5e7261335 --- /dev/null +++ b/arch/powerpc/configs/skiroot_defconfig @@ -0,0 +1,232 @@ +CONFIG_PPC64=y +CONFIG_ALTIVEC=y +CONFIG_VSX=y +CONFIG_NR_CPUS=2048 +CONFIG_CPU_LITTLE_ENDIAN=y +# CONFIG_SWAP is not set +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +# CONFIG_CROSS_MEMORY_ATTACH is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=20 +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y +# CONFIG_RD_GZIP is not set +# CONFIG_RD_BZIP2 is not set +# CONFIG_RD_LZMA is not set +# CONFIG_RD_LZO is not set +# CONFIG_RD_LZ4 is not set +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_PERF_EVENTS=y +# CONFIG_COMPAT_BRK is not set +CONFIG_JUMP_LABEL=y +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_SIG=y +CONFIG_MODULE_SIG_FORCE=y +CONFIG_MODULE_SIG_SHA512=y +CONFIG_PARTITION_ADVANCED=y +# CONFIG_IOSCHED_DEADLINE is not set +# CONFIG_PPC_PSERIES is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y +CONFIG_CPU_IDLE=y +CONFIG_HZ_100=y +CONFIG_KEXEC=y +CONFIG_IRQ_ALL_CPUS=y +CONFIG_NUMA=y +# CONFIG_COMPACTION is not set +# CONFIG_MIGRATION is not set +# CONFIG_BOUNCE is not set +CONFIG_PPC_64K_PAGES=y +CONFIG_SCHED_SMT=y +CONFIG_CMDLINE_BOOL=y +CONFIG_CMDLINE="console=tty0 console=hvc0 powersave=off" +# CONFIG_SECCOMP is not set +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_NET_IPIP=y +CONFIG_SYN_COOKIES=y +# CONFIG_INET_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET_XFRM_MODE_TUNNEL is not set +# CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_IPV6 is not set +CONFIG_DNS_RESOLVER=y +# CONFIG_WIRELESS is not set +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_MTD=m +CONFIG_MTD_POWERNV_FLASH=m +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=65536 +CONFIG_VIRTIO_BLK=m 
+CONFIG_BLK_DEV_NVME=m +CONFIG_EEPROM_AT24=y +# CONFIG_CXL is not set +CONFIG_BLK_DEV_SD=m +CONFIG_BLK_DEV_SR=m +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_SG=m +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_SCAN_ASYNC=y +CONFIG_SCSI_FC_ATTRS=y +CONFIG_SCSI_CXGB3_ISCSI=m +CONFIG_SCSI_CXGB4_ISCSI=m +CONFIG_SCSI_BNX2_ISCSI=m +CONFIG_BE2ISCSI=m +CONFIG_SCSI_AACRAID=m +CONFIG_MEGARAID_NEWGEN=y +CONFIG_MEGARAID_MM=m +CONFIG_MEGARAID_MAILBOX=m +CONFIG_MEGARAID_SAS=m +CONFIG_SCSI_MPT2SAS=m +CONFIG_SCSI_IPR=m +# CONFIG_SCSI_IPR_TRACE is not set +# CONFIG_SCSI_IPR_DUMP is not set +CONFIG_SCSI_QLA_FC=m +CONFIG_SCSI_QLA_ISCSI=m +CONFIG_SCSI_LPFC=m +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_ALUA=m +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +# CONFIG_ATA_SFF is not set +CONFIG_MD=y +CONFIG_BLK_DEV_MD=m +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID10=m +CONFIG_MD_RAID456=m +CONFIG_MD_MULTIPATH=m +CONFIG_MD_FAULTY=m +CONFIG_BLK_DEV_DM=m +CONFIG_DM_CRYPT=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_MIRROR=m +CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_ACENIC=m +CONFIG_ACENIC_OMIT_TIGON_I=y +CONFIG_TIGON3=y +CONFIG_BNX2X=m +CONFIG_CHELSIO_T1=y +CONFIG_BE2NET=m +CONFIG_S2IO=m +CONFIG_E100=m +CONFIG_E1000=m +CONFIG_E1000E=m +CONFIG_IXGB=m +CONFIG_IXGBE=m +CONFIG_MLX4_EN=m +CONFIG_MLX5_CORE=m +CONFIG_MLX5_CORE_EN=y +CONFIG_MYRI10GE=m +CONFIG_QLGE=m +CONFIG_NETXEN_NIC=m +CONFIG_SFC=m +# CONFIG_USB_NET_DRIVERS is not set +# CONFIG_WLAN is not set +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_MISC=y +# CONFIG_SERIO_SERPORT is not set +# CONFIG_DEVMEM is not set +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_IPMI_HANDLER=y +CONFIG_IPMI_DEVICE_INTERFACE=y +CONFIG_IPMI_POWERNV=y +CONFIG_HW_RANDOM=y +CONFIG_TCG_TIS_I2C_NUVOTON=y +# CONFIG_I2C_COMPAT is not set +CONFIG_I2C_CHARDEV=y +# CONFIG_I2C_HELPER_AUTO is not set +CONFIG_DRM=y +CONFIG_DRM_RADEON=y +CONFIG_DRM_AST=m +CONFIG_FIRMWARE_EDID=y +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_OF=y +CONFIG_FB_MATROX=y +CONFIG_FB_MATROX_MILLENIUM=y +CONFIG_FB_MATROX_MYSTIQUE=y +CONFIG_FB_MATROX_G=y +# CONFIG_LCD_CLASS_DEVICE is not set +# CONFIG_BACKLIGHT_GENERIC is not set +# CONFIG_VGA_CONSOLE is not set +CONFIG_LOGO=y +# CONFIG_LOGO_LINUX_MONO is not set +# CONFIG_LOGO_LINUX_VGA16 is not set +CONFIG_USB_HIDDEV=y +CONFIG_USB=y +CONFIG_USB_MON=y +CONFIG_USB_XHCI_HCD=y +CONFIG_USB_EHCI_HCD=y +# CONFIG_USB_EHCI_HCD_PPC_OF is not set +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_STORAGE=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_DRV_GENERIC=m +CONFIG_VIRT_DRIVERS=y +CONFIG_VIRTIO_PCI=y +# CONFIG_IOMMU_SUPPORT is not set +CONFIG_EXT4_FS=m +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_XFS_FS=m +CONFIG_XFS_POSIX_ACL=y +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y +CONFIG_ISO9660_FS=m +CONFIG_UDF_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +# CONFIG_MISC_FILESYSTEMS is not set +# CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ASCII=y +CONFIG_NLS_ISO8859_1=y +CONFIG_NLS_UTF8=y +CONFIG_CRC16=y +CONFIG_CRC_ITU_T=y +CONFIG_LIBCRC32C=y +CONFIG_PRINTK_TIME=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_STACKOVERFLOW=y +CONFIG_SOFTLOCKUP_DETECTOR=y +CONFIG_HARDLOCKUP_DETECTOR=y +CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_WQ_WATCHDOG=y +CONFIG_SCHEDSTATS=y +# CONFIG_FTRACE is not set +CONFIG_XMON=y +CONFIG_XMON_DEFAULT=y +CONFIG_SECURITY=y +CONFIG_IMA=y +CONFIG_EVM=y +# CONFIG_CRYPTO_ECHAINIV is not set 
+CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_CMAC=y +CONFIG_CRYPTO_MD4=y +CONFIG_CRYPTO_ARC4=y +CONFIG_CRYPTO_DES=y +# CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 2f6373144e2c..99c99bb04353 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -65,6 +65,28 @@ static inline void hash__flush_tlb_mm(struct mm_struct *mm) { } +static inline void hash__local_flush_all_mm(struct mm_struct *mm) +{ + /* + * There's no Page Walk Cache for hash, so what is needed is + * the same as flush_tlb_mm(), which doesn't really make sense + * with hash. So the only thing we could do is flush the + * entire LPID! Punt for now, as it's not being used. + */ + WARN_ON_ONCE(1); +} + +static inline void hash__flush_all_mm(struct mm_struct *mm) +{ + /* + * There's no Page Walk Cache for hash, so what is needed is + * the same as flush_tlb_mm(), which doesn't really make sense + * with hash. So the only thing we could do is flush the + * entire LPID! Punt for now, as it's not being used. + */ + WARN_ON_ONCE(1); +} + static inline void hash__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h index 9b433a624bf3..af06c6fe8a9f 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -21,17 +21,20 @@ extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long sta extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end); extern void radix__local_flush_tlb_mm(struct mm_struct *mm); +extern void radix__local_flush_all_mm(struct mm_struct *mm); extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); extern void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize); extern void radix__tlb_flush(struct mmu_gather *tlb); #ifdef CONFIG_SMP extern void radix__flush_tlb_mm(struct mm_struct *mm); +extern void radix__flush_all_mm(struct mm_struct *mm); extern void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); extern void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize); #else #define radix__flush_tlb_mm(mm) radix__local_flush_tlb_mm(mm) +#define radix__flush_all_mm(mm) radix__local_flush_all_mm(mm) #define radix__flush_tlb_page(vma,addr) radix__local_flush_tlb_page(vma,addr) #define radix__flush_tlb_page_psize(mm,addr,p) radix__local_flush_tlb_page_psize(mm,addr,p) #endif diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index 72b925f97bab..70760d018bcd 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -57,6 +57,13 @@ static inline void local_flush_tlb_page(struct vm_area_struct *vma, return hash__local_flush_tlb_page(vma, vmaddr); } +static inline void local_flush_all_mm(struct mm_struct *mm) +{ + if (radix_enabled()) + return radix__local_flush_all_mm(mm); + return hash__local_flush_all_mm(mm); +} + static inline void tlb_flush(struct mmu_gather *tlb) { if (radix_enabled()) @@ -79,9 +86,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, return radix__flush_tlb_page(vma, vmaddr); return hash__flush_tlb_page(vma, vmaddr); } + +static inline void 
flush_all_mm(struct mm_struct *mm) +{ + if (radix_enabled()) + return radix__flush_all_mm(mm); + return hash__flush_all_mm(mm); +} #else #define flush_tlb_mm(mm) local_flush_tlb_mm(mm) #define flush_tlb_page(vma, addr) local_flush_tlb_page(vma, addr) +#define flush_all_mm(mm) local_flush_all_mm(mm) #endif /* CONFIG_SMP */ /* * flush the page walk cache for the address diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index a9bf921f4efc..1ca26ca85ec6 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -206,7 +206,7 @@ enum { #define CPU_FTR_STCX_CHECKS_ADDRESS LONG_ASM_CONST(0x0004000000000000) #define CPU_FTR_POPCNTB LONG_ASM_CONST(0x0008000000000000) #define CPU_FTR_POPCNTD LONG_ASM_CONST(0x0010000000000000) -#define CPU_FTR_ICSWX LONG_ASM_CONST(0x0020000000000000) +/* Free LONG_ASM_CONST(0x0020000000000000) */ #define CPU_FTR_VMX_COPY LONG_ASM_CONST(0x0040000000000000) #define CPU_FTR_TM LONG_ASM_CONST(0x0080000000000000) #define CPU_FTR_CFAR LONG_ASM_CONST(0x0100000000000000) @@ -215,6 +215,7 @@ enum { #define CPU_FTR_DABRX LONG_ASM_CONST(0x0800000000000000) #define CPU_FTR_PMAO_BUG LONG_ASM_CONST(0x1000000000000000) #define CPU_FTR_POWER9_DD1 LONG_ASM_CONST(0x4000000000000000) +#define CPU_FTR_POWER9_DD20 LONG_ASM_CONST(0x8000000000000000) #ifndef __ASSEMBLY__ @@ -451,7 +452,7 @@ enum { CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT | \ CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \ - CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | \ + CPU_FTR_CFAR | CPU_FTR_HVMODE | \ CPU_FTR_VMX_COPY | CPU_FTR_HAS_PPR | CPU_FTR_DABRX) #define CPU_FTRS_POWER8 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\ @@ -460,7 +461,7 @@ enum { CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ CPU_FTR_DSCR | CPU_FTR_SAO | \ CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \ - CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ + CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \ CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP) #define CPU_FTRS_POWER8E (CPU_FTRS_POWER8 | CPU_FTR_PMAO_BUG) @@ -477,6 +478,7 @@ enum { CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300) #define CPU_FTRS_POWER9_DD1 ((CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) & \ (~CPU_FTR_SAO)) +#define CPU_FTRS_POWER9_DD20 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD20) #define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ @@ -495,7 +497,8 @@ enum { (CPU_FTRS_POWER4 | CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \ CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \ CPU_FTRS_POWER8 | CPU_FTRS_POWER8_DD1 | CPU_FTRS_CELL | \ - CPU_FTRS_PA6T | CPU_FTR_VSX | CPU_FTRS_POWER9 | CPU_FTRS_POWER9_DD1) + CPU_FTRS_PA6T | CPU_FTR_VSX | CPU_FTRS_POWER9 | \ + CPU_FTRS_POWER9_DD1 | CPU_FTRS_POWER9_DD20) #endif #else enum { diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 9847ae3a12d1..5161c37dd039 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -93,7 +93,7 @@ struct eeh_pe { struct pci_bus *bus; /* Top PCI bus for bus PE */ int check_count; /* Times of ignored error */ int freeze_count; /* Times of froze up */ - struct timeval tstamp; /* Time on first-time freeze */ + time64_t tstamp; /* Time on first-time freeze */ int false_positives; /* 
Times of reported #ff's */ atomic_t pass_dev_cnt; /* Count of passed through devs */ struct eeh_pe *parent; /* Parent PE */ @@ -200,7 +200,6 @@ enum { struct eeh_ops { char *name; int (*init)(void); - int (*post_init)(void); void* (*probe)(struct pci_dn *pdn, void *data); int (*set_option)(struct eeh_pe *pe, int option); int (*get_pe_addr)(struct eeh_pe *pe); @@ -275,7 +274,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe); struct eeh_dev *eeh_dev_init(struct pci_dn *pdn); void eeh_dev_phb_init_dynamic(struct pci_controller *phb); -int eeh_init(void); +void eeh_probe_devices(void); int __init eeh_ops_register(struct eeh_ops *ops); int __exit eeh_ops_unregister(const char *name); int eeh_check_failure(const volatile void __iomem *token); @@ -321,10 +320,7 @@ static inline bool eeh_enabled(void) return false; } -static inline int eeh_init(void) -{ - return 0; -} +static inline void eeh_probe_devices(void) { } static inline void *eeh_dev_init(struct pci_dn *pdn, void *data) { diff --git a/arch/powerpc/include/asm/emulated_ops.h b/arch/powerpc/include/asm/emulated_ops.h index f00e10e2a335..651e1354498e 100644 --- a/arch/powerpc/include/asm/emulated_ops.h +++ b/arch/powerpc/include/asm/emulated_ops.h @@ -55,6 +55,10 @@ extern struct ppc_emulated { struct ppc_emulated_entry mfdscr; struct ppc_emulated_entry mtdscr; struct ppc_emulated_entry lq_stq; + struct ppc_emulated_entry lxvw4x; + struct ppc_emulated_entry lxvh8x; + struct ppc_emulated_entry lxvd2x; + struct ppc_emulated_entry lxvb16x; #endif } ppc_emulated; diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h index 334459ad145b..90863245df53 100644 --- a/arch/powerpc/include/asm/epapr_hcalls.h +++ b/arch/powerpc/include/asm/epapr_hcalls.h @@ -508,7 +508,7 @@ static unsigned long epapr_hypercall(unsigned long *in, static inline long epapr_hypercall0_1(unsigned int nr, unsigned long *r2) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; unsigned long r; @@ -520,7 +520,7 @@ static inline long epapr_hypercall0_1(unsigned int nr, unsigned long *r2) static inline long epapr_hypercall0(unsigned int nr) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; return epapr_hypercall(in, out, nr); @@ -528,7 +528,7 @@ static inline long epapr_hypercall0(unsigned int nr) static inline long epapr_hypercall1(unsigned int nr, unsigned long p1) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; in[0] = p1; @@ -538,7 +538,7 @@ static inline long epapr_hypercall1(unsigned int nr, unsigned long p1) static inline long epapr_hypercall2(unsigned int nr, unsigned long p1, unsigned long p2) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; in[0] = p1; @@ -549,7 +549,7 @@ static inline long epapr_hypercall2(unsigned int nr, unsigned long p1, static inline long epapr_hypercall3(unsigned int nr, unsigned long p1, unsigned long p2, unsigned long p3) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; in[0] = p1; @@ -562,7 +562,7 @@ static inline long epapr_hypercall4(unsigned int nr, unsigned long p1, unsigned long p2, unsigned long p3, unsigned long p4) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; in[0] = p1; diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 9a318973af05..b27205297e1d 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -55,6 +55,11 @@ #endif /* + * 
maximum recursive depth of MCE exceptions + */ +#define MAX_MCE_DEPTH 4 + +/* * EX_LR is only used in EXSLB and where it does not overlap with EX_DAR * EX_CCR similarly with DSISR, but being 4 byte registers there is a hole * in the save area so it's not necessary to overlap them. Could be used diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index b8a0fb442c64..795d825c2edd 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -40,12 +40,6 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma, return radix__flush_hugetlb_page(vma, vmaddr); } -static inline void __local_flush_hugetlb_page(struct vm_area_struct *vma, - unsigned long vmaddr) -{ - if (radix_enabled()) - return radix__local_flush_hugetlb_page(vma, vmaddr); -} #else static inline pte_t *hugepd_page(hugepd_t hpd) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index c1dd1929342d..d2ef36111518 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -31,6 +31,7 @@ #ifndef __ASSEMBLY__ +extern void replay_system_reset(void); extern void __replay_interrupt(unsigned int vector); extern void timer_interrupt(struct pt_regs *); diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h index 8814a7249ceb..9f3be5c8a4a3 100644 --- a/arch/powerpc/include/asm/kprobes.h +++ b/arch/powerpc/include/asm/kprobes.h @@ -103,8 +103,8 @@ extern int kprobe_exceptions_notify(struct notifier_block *self, extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); extern int kprobe_handler(struct pt_regs *regs); extern int kprobe_post_handler(struct pt_regs *regs); -extern int is_current_kprobe_addr(unsigned long addr); #ifdef CONFIG_KPROBES_ON_FTRACE +extern int __is_active_jprobe(unsigned long addr); extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb); #else diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 83596f32f50b..7cea76f11c26 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -104,10 +104,6 @@ struct kvmppc_host_state { u8 napping; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - /* - * hwthread_req/hwthread_state pair is used to pull sibling threads - * out of guest on pre-ISAv3.0B CPUs where threads share MMU. 
- */ u8 hwthread_req; u8 hwthread_state; u8 host_ipi; diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 190d69a7f701..3a1226e9b465 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -204,12 +204,10 @@ struct mce_error_info { extern void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, uint64_t nip, - uint64_t addr); + uint64_t addr, uint64_t phys_addr); extern int get_mce_event(struct machine_check_event *mce, bool release); extern void release_mce_event(void); extern void machine_check_queue_event(void); extern void machine_check_print_event_info(struct machine_check_event *evt, bool user_mode); -extern uint64_t get_mce_fault_addr(struct machine_check_event *evt); - #endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 309592589e30..20eae6f76247 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -77,6 +77,52 @@ extern void switch_cop(struct mm_struct *next); extern int use_cop(unsigned long acop, struct mm_struct *mm); extern void drop_cop(unsigned long acop, struct mm_struct *mm); +#ifdef CONFIG_PPC_BOOK3S_64 +static inline void inc_mm_active_cpus(struct mm_struct *mm) +{ + atomic_inc(&mm->context.active_cpus); +} + +static inline void dec_mm_active_cpus(struct mm_struct *mm) +{ + atomic_dec(&mm->context.active_cpus); +} + +static inline void mm_context_add_copro(struct mm_struct *mm) +{ + /* + * On hash, should only be called once over the lifetime of + * the context, as we can't decrement the active cpus count + * and flush properly for the time being. + */ + inc_mm_active_cpus(mm); +} + +static inline void mm_context_remove_copro(struct mm_struct *mm) +{ + /* + * Need to broadcast a global flush of the full mm before + * decrementing active_cpus count, as the next TLBI may be + * local and the nMMU and/or PSL need to be cleaned up. + * Should be rare enough so that it's acceptable. + * + * Skip on hash, as we don't know how to do the proper flush + * for the time being. Invalidations will remain global if + * used on hash. 
+ */ + if (radix_enabled()) { + flush_all_mm(mm); + dec_mm_active_cpus(mm); + } +} +#else +static inline void inc_mm_active_cpus(struct mm_struct *mm) { } +static inline void dec_mm_active_cpus(struct mm_struct *mm) { } +static inline void mm_context_add_copro(struct mm_struct *mm) { } +static inline void mm_context_remove_copro(struct mm_struct *mm) { } +#endif + + extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk); @@ -118,9 +164,13 @@ static inline void arch_dup_mmap(struct mm_struct *oldmm, { } +#ifndef CONFIG_PPC_BOOK3S_64 static inline void arch_exit_mmap(struct mm_struct *mm) { } +#else +extern void arch_exit_mmap(struct mm_struct *mm); +#endif static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index f0ff384d4ca5..b1c42ac54d96 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -203,7 +203,7 @@ static inline unsigned long pte_update(struct mm_struct *mm, if (!huge) assert_pte_locked(mm, addr); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (old & _PAGE_HASHPTE) hpte_need_flush(mm, addr, ptep, old, huge); #endif diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 450a60b81d2a..233c7504b1f2 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -188,6 +188,7 @@ #define OPAL_XIVE_DUMP 142 #define OPAL_XIVE_RESERVED3 143 #define OPAL_XIVE_RESERVED4 144 +#define OPAL_SIGNAL_SYSTEM_RESET 145 #define OPAL_NPU_INIT_CONTEXT 146 #define OPAL_NPU_DESTROY_CONTEXT 147 #define OPAL_NPU_MAP_LPAR 148 @@ -895,6 +896,8 @@ enum { */ OPAL_REINIT_CPUS_MMU_HASH = (1 << 2), OPAL_REINIT_CPUS_MMU_RADIX = (1 << 3), + + OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED = (1 << 4), }; typedef struct oppanel_line { diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 726c23304a57..0c545f7fc77b 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -281,6 +281,8 @@ int opal_get_power_shift_ratio(u32 handle, int token, u32 *psr); int opal_set_power_shift_ratio(u32 handle, int token, u32 psr); int opal_sensor_group_clear(u32 group_hndl, int token); +s64 opal_signal_system_reset(s32 cpu); + /* Internal functions */ extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data); @@ -304,11 +306,11 @@ extern void opal_notifier_enable(void); extern void opal_notifier_disable(void); extern void opal_notifier_update_evt(uint64_t evt_mask, uint64_t evt_val); -extern int __opal_async_get_token(void); extern int opal_async_get_token_interruptible(void); -extern int __opal_async_release_token(int token); extern int opal_async_release_token(int token); extern int opal_async_wait_response(uint64_t token, struct opal_msg *msg); +extern int opal_async_wait_response_interruptible(uint64_t token, + struct opal_msg *msg); extern int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data); struct rtc_time; diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 04b60af027ae..c907ae23c956 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -91,14 +91,14 @@ struct paca_struct { u8 cpu_start; /* At startup, processor spins until */ /* this becomes non-zero. 
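The mm_context_add_copro()/mm_context_remove_copro() pair introduced above is the hook coprocessor drivers use to keep TLB invalidations global while a nest-MMU context is attached. A rough usage sketch from a hypothetical driver; the attach/detach function names are illustrative only:

	/* Hypothetical coprocessor attach/detach paths. */
	static void copro_attach(struct mm_struct *mm)
	{
		/*
		 * From here on, TLB invalidations for this mm must be
		 * broadcast so the nMMU/PSL observe them too.
		 */
		mm_context_add_copro(mm);
	}

	static void copro_detach(struct mm_struct *mm)
	{
		/*
		 * On radix this does a final global flush_all_mm()
		 * before dropping the active_cpus reference, so nothing
		 * stale survives once invalidations may go local again.
		 */
		mm_context_remove_copro(mm);
	}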
*/ u8 kexec_state; /* set when kexec down has irqs off */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 struct slb_shadow *slb_shadow_ptr; struct dtl_entry *dispatch_log; struct dtl_entry *dispatch_log_end; -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif u64 dscr_default; /* per-CPU default DSCR */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * Now, starting in cacheline 2, the exception save areas */ @@ -110,7 +110,7 @@ struct paca_struct { u16 vmalloc_sllp; u16 slb_cache_ptr; u32 slb_cache[SLB_CACHE_ENTRIES]; -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_BOOK3E u64 exgen[8] __aligned(0x40); @@ -192,7 +192,7 @@ struct paca_struct { struct stop_sprs stop_sprs; #endif -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* Non-maskable exceptions that are not performance critical */ u64 exnmi[EX_SIZE]; /* used for system reset (nmi) */ u64 exmc[EX_SIZE]; /* used for machine checks */ @@ -210,6 +210,7 @@ struct paca_struct { */ u16 in_mce; u8 hmi_event_available; /* HMI event is available */ + u8 hmi_p9_special_emu; /* HMI P9 special emulation */ #endif /* Stuff for accurate time accounting */ diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index c4d9654bd637..56234c6fcd61 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -117,21 +117,21 @@ extern void slice_set_range_psize(struct mm_struct *mm, unsigned long start, #endif /* __ASSEMBLY__ */ #else #define slice_init() -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 #define get_slice_psize(mm, addr) ((mm)->context.user_psize) #define slice_set_user_psize(mm, psize) \ do { \ (mm)->context.user_psize = (psize); \ (mm)->context.sllp = SLB_VSID_USER | mmu_psize_defs[(psize)].sllp; \ } while (0) -#else /* CONFIG_PPC_STD_MMU_64 */ +#else /* !CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_64K_PAGES #define get_slice_psize(mm, addr) MMU_PAGE_64K #else /* CONFIG_PPC_64K_PAGES */ #define get_slice_psize(mm, addr) MMU_PAGE_4K #endif /* !CONFIG_PPC_64K_PAGES */ #define slice_set_user_psize(mm, psize) do { BUG(); } while(0) -#endif /* !CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ #define slice_set_range_psize(mm, start, len, psize) \ slice_set_user_psize((mm), (psize)) diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 0b8aa1fe2d5f..62ed83db04ae 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -218,6 +218,7 @@ struct pci_dn { #endif struct list_head child_list; struct list_head list; + struct resource holes[PCI_SRIOV_NUM_BARS]; }; /* Get the pointer to a device_node's pci_dn */ diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h index 67e7e3d990f4..03ab638593e1 100644 --- a/arch/powerpc/include/asm/pgtable-be-types.h +++ b/arch/powerpc/include/asm/pgtable-be-types.h @@ -76,7 +76,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; * With hash config 64k pages additionally define a bigger "real PTE" type that * gathers the "second half" part of the PTE for pseudo 64k pages */ -#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64) +#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_BOOK3S_64) typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; #else typedef struct { pte_t pte; } real_pte_t; diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h index 
369a164b545c..5d2c38d26396 100644 --- a/arch/powerpc/include/asm/pgtable-types.h +++ b/arch/powerpc/include/asm/pgtable-types.h @@ -49,13 +49,13 @@ typedef struct { unsigned long pgprot; } pgprot_t; * With hash config 64k pages additionally define a bigger "real PTE" type that * gathers the "second half" part of the PTE for pseudo 64k pages */ -#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64) +#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_BOOK3S_64) typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; #else typedef struct { pte_t pte; } real_pte_t; #endif -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 #include <asm/cmpxchg.h> static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h index f62797702300..dc5f6a5d4575 100644 --- a/arch/powerpc/include/asm/powernv.h +++ b/arch/powerpc/include/asm/powernv.h @@ -22,6 +22,8 @@ extern void pnv_npu2_destroy_context(struct npu_context *context, extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, unsigned long *flags, unsigned long *status, int count); + +void pnv_tm_init(void); #else static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { } static inline struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, @@ -36,6 +38,8 @@ static inline int pnv_npu2_handle_fault(struct npu_context *context, unsigned long *status, int count) { return -ENODEV; } + +static inline void pnv_tm_init(void) { } #endif #endif /* _ASM_POWERNV_H */ diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 36f3e41c9fbe..ae94b3626b6c 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -774,9 +774,13 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) #ifdef CONFIG_PPC_BOOK3E #define FIXUP_ENDIAN #else +/* + * This version may be used in HV or non-HV context. + * MSR[EE] must be disabled. + */ #define FIXUP_ENDIAN \ tdi 0,0,0x48; /* Reverse endian of b . + 8 */ \ - b $+44; /* Skip trampoline if endian is good */ \ + b 191f; /* Skip trampoline if endian is good */ \ .long 0xa600607d; /* mfmsr r11 */ \ .long 0x01006b69; /* xori r11,r11,1 */ \ .long 0x00004039; /* li r10,0 */ \ @@ -786,7 +790,26 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) .long 0x14004a39; /* addi r10,r10,20 */ \ .long 0xa6035a7d; /* mtsrr0 r10 */ \ .long 0xa6037b7d; /* mtsrr1 r11 */ \ - .long 0x2400004c /* rfid */ + .long 0x2400004c; /* rfid */ \ +191: + +/* + * This version may only be used with MSR[HV]=1 + * - Does not clear MSR[RI], so more robust. + * - Slightly smaller and faster. + */ +#define FIXUP_ENDIAN_HV \ + tdi 0,0,0x48; /* Reverse endian of b . 
+ 8 */ \ + b 191f; /* Skip trampoline if endian is good */ \ + .long 0xa600607d; /* mfmsr r11 */ \ + .long 0x01006b69; /* xori r11,r11,1 */ \ + .long 0x05009f42; /* bcl 20,31,$+4 */ \ + .long 0xa602487d; /* mflr r10 */ \ + .long 0x14004a39; /* addi r10,r10,20 */ \ + .long 0xa64b5a7d; /* mthsrr0 r10 */ \ + .long 0xa64b7b7d; /* mthsrr1 r11 */ \ + .long 0x2402004c; /* hrfid */ \ +191: #endif /* !CONFIG_PPC_BOOK3E */ diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h index 13dbcd41885e..7d5a157c7832 100644 --- a/arch/powerpc/include/asm/tlbflush.h +++ b/arch/powerpc/include/asm/tlbflush.h @@ -77,7 +77,7 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm) flush_tlb_mm(mm); } -#elif defined(CONFIG_PPC_STD_MMU_64) +#elif defined(CONFIG_PPC_BOOK3S_64) #include <asm/book3s/64/tlbflush.h> #else #error Unsupported MMU type diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h index 82e06ca3a49b..79d4156554b4 100644 --- a/arch/powerpc/include/asm/tm.h +++ b/arch/powerpc/include/asm/tm.h @@ -11,12 +11,13 @@ extern void tm_enable(void); extern void tm_reclaim(struct thread_struct *thread, - unsigned long orig_msr, uint8_t cause); + uint8_t cause); extern void tm_reclaim_current(uint8_t cause); -extern void tm_recheckpoint(struct thread_struct *thread, - unsigned long orig_msr); +extern void tm_recheckpoint(struct thread_struct *thread); extern void tm_abort(uint8_t cause); extern void tm_save_sprs(struct thread_struct *thread); extern void tm_restore_sprs(struct thread_struct *thread); +extern bool tm_suspend_disabled; + #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 2d84bca8d053..34da2c5fd1df 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -96,6 +96,14 @@ static inline int prrn_is_enabled(void) } #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */ +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_NEED_MULTIPLE_NODES) +#if defined(CONFIG_PPC_SPLPAR) +extern int timed_topology_update(int nsecs); +#else +#define timed_topology_update(nsecs) +#endif /* CONFIG_PPC_SPLPAR */ +#endif /* CONFIG_HOTPLUG_CPU || CONFIG_NEED_MULTIPLE_NODES */ + #include <asm-generic/topology.h> #ifdef CONFIG_SMP diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 9c0e60ca1666..e34f15e727d9 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -173,6 +173,23 @@ do { \ extern long __get_user_bad(void); +/* + * This does an atomic 128-bit (16-byte aligned) load from userspace. + * Up to the caller to do enable_kernel_vmx() before calling! 
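A sketch of how the __get_user_atomic_128_aligned() macro being defined here might be called; the wrapper below is made up for illustration, and enable_kernel_altivec()/disable_kernel_altivec() are used on the assumption that this is what the comment's enable_kernel_vmx() refers to:

	#include <asm/switch_to.h>	/* enable_kernel_altivec() */

	/* Copy 16 aligned bytes from userspace with a single lvx. */
	static int copy_user_16b_atomic(void *kaddr, const void __user *uaddr)
	{
		int err = 0;	/* the macro requires err pre-initialised */

		preempt_disable();
		enable_kernel_altivec();	/* VR0 is clobbered below */
		__get_user_atomic_128_aligned(kaddr, uaddr, err);
		disable_kernel_altivec();
		preempt_enable();

		return err;	/* 0 on success, -EFAULT on fault */
	}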
+ */ +#define __get_user_atomic_128_aligned(kaddr, uaddr, err) \ + __asm__ __volatile__( \ + "1: lvx 0,0,%1 # get user\n" \ + " stvx 0,0,%2 # put kernel\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: li %0,%3\n" \ + " b 2b\n" \ + ".previous\n" \ + EX_TABLE(1b, 3b) \ + : "=r" (err) \ + : "b" (uaddr), "b" (kaddr), "i" (-EFAULT), "0" (err)) + #define __get_user_asm(x, addr, err, op) \ __asm__ __volatile__( \ "1: "op" %1,0(%2) # get_user\n" \ diff --git a/arch/powerpc/include/uapi/asm/cputable.h b/arch/powerpc/include/uapi/asm/cputable.h index 4d877144f377..b3b64cba71ec 100644 --- a/arch/powerpc/include/uapi/asm/cputable.h +++ b/arch/powerpc/include/uapi/asm/cputable.h @@ -48,6 +48,7 @@ #define PPC_FEATURE2_HAS_IEEE128 0x00400000 /* VSX IEEE Binary Float 128-bit */ #define PPC_FEATURE2_DARN 0x00200000 /* darn random number insn */ #define PPC_FEATURE2_SCV 0x00100000 /* scv syscall */ +#define PPC_FEATURE2_HTM_NO_SUSPEND 0x00080000 /* TM w/out suspended state */ /* * IMPORTANT! diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 8cfb20e38cfe..200623e71474 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -208,7 +208,7 @@ int main(void) OFFSET(TCD_ESEL_FIRST, tlb_core_data, esel_first); #endif /* CONFIG_PPC_BOOK3E */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 OFFSET(PACASLBCACHE, paca_struct, slb_cache); OFFSET(PACASLBCACHEPTR, paca_struct, slb_cache_ptr); OFFSET(PACAVMALLOCSLLP, paca_struct, vmalloc_sllp); @@ -230,7 +230,7 @@ int main(void) OFFSET(LPPACA_DTLIDX, lppaca, dtl_idx); OFFSET(LPPACA_YIELDCOUNT, lppaca, yield_count); OFFSET(PACA_DTL_RIDX, paca_struct, dtl_ridx); -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ OFFSET(PACAEMERGSP, paca_struct, emergency_sp); #ifdef CONFIG_PPC_BOOK3S_64 OFFSET(PACAMCEMERGSP, paca_struct, mc_emergency_sp); diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 760872916013..171820190de7 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -547,6 +547,26 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check_early = __machine_check_early_realmode_p9, .platform = "power9", }, + { /* Power9 DD2.0 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0200, + .cpu_name = "POWER9 (raw)", + .cpu_features = CPU_FTRS_POWER9_DD20, + .cpu_user_features = COMMON_USER_POWER9, + .cpu_user_features2 = COMMON_USER2_POWER9, + .mmu_features = MMU_FTRS_POWER9, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power9", + .oprofile_type = PPC_OPROFILE_INVALID, + .cpu_setup = __setup_cpu_power9, + .cpu_restore = __restore_cpu_power9, + .flush_tlb = __flush_tlb_power9, + .machine_check_early = __machine_check_early_realmode_p9, + .platform = "power9", + }, { /* Power9 */ .pvr_mask = 0xffff0000, .pvr_value = 0x004e0000, diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 7275fed271af..58e3b51de31c 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -634,7 +634,7 @@ static struct dt_cpu_feature_match __initdata {"no-execute", feat_enable, 0}, {"strong-access-ordering", feat_enable, CPU_FTR_SAO}, {"cache-inhibited-large-page", feat_enable_large_ci, 0}, - {"coprocessor-icswx", feat_enable, CPU_FTR_ICSWX}, + {"coprocessor-icswx", feat_enable, 0}, {"hypervisor-virtualization-interrupt", feat_enable_hvi, 0}, {"program-priority-register", feat_enable, 
CPU_FTR_HAS_PPR}, {"wait", feat_enable, 0}, @@ -735,6 +735,8 @@ static __init void cpufeatures_cpu_quirks(void) */ if ((version & 0xffffff00) == 0x004e0100) cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD1; + else if ((version & 0xffffefff) == 0x004e0200) + cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD20; } static void __init cpufeatures_setup_finished(void) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 116000b45531..cbca0a667682 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -972,6 +972,18 @@ static struct notifier_block eeh_reboot_nb = { .notifier_call = eeh_reboot_notifier, }; +void eeh_probe_devices(void) +{ + struct pci_controller *hose, *tmp; + struct pci_dn *pdn; + + /* Enable EEH for all adapters */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + pdn = hose->pci_data; + traverse_pci_dn(pdn, eeh_ops->probe, NULL); + } +} + /** * eeh_init - EEH initialization * @@ -987,22 +999,11 @@ static struct notifier_block eeh_reboot_nb = { * Even if force-off is set, the EEH hardware is still enabled, so that * newer systems can boot. */ -int eeh_init(void) +static int eeh_init(void) { struct pci_controller *hose, *tmp; - struct pci_dn *pdn; - static int cnt = 0; int ret = 0; - /* - * We have to delay the initialization on PowerNV after - * the PCI hierarchy tree has been built because the PEs - * are figured out based on PCI devices instead of device - * tree nodes - */ - if (machine_is(powernv) && cnt++ <= 0) - return ret; - /* Register reboot notifier */ ret = register_reboot_notifier(&eeh_reboot_nb); if (ret) { @@ -1028,22 +1029,7 @@ int eeh_init(void) if (ret) return ret; - /* Enable EEH for all adapters */ - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { - pdn = hose->pci_data; - traverse_pci_dn(pdn, eeh_ops->probe, NULL); - } - - /* - * Call platform post-initialization. Actually, It's good chance - * to inform platform that EEH is ready to supply service if the - * I/O cache stuff has been built up. 
- */ - if (eeh_ops->post_init) { - ret = eeh_ops->post_init(); - if (ret) - return ret; - } + eeh_probe_devices(); if (eeh_enabled()) pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); @@ -1757,10 +1743,6 @@ static int eeh_enable_dbgfs_set(void *data, u64 val) else eeh_add_flag(EEH_FORCE_DISABLED); - /* Notify the backend */ - if (eeh_ops->post_init) - eeh_ops->post_init(); - return 0; } diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 8b840191df59..57d54163bf94 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -623,7 +623,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, struct eeh_rmv_data *rmv_data) { struct pci_bus *frozen_bus = eeh_pe_bus_get(pe); - struct timeval tstamp; + time64_t tstamp; int cnt, rc; struct eeh_dev *edev; diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 2e8d1b2b5af4..2d4956e97aa9 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -526,16 +526,16 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) */ void eeh_pe_update_time_stamp(struct eeh_pe *pe) { - struct timeval tstamp; + time64_t tstamp; if (!pe) return; if (pe->freeze_count <= 0) { pe->freeze_count = 0; - do_gettimeofday(&pe->tstamp); + pe->tstamp = ktime_get_seconds(); } else { - do_gettimeofday(&tstamp); - if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) { + tstamp = ktime_get_seconds(); + if (tstamp - pe->tstamp > 3600) { pe->tstamp = tstamp; pe->freeze_count = 0; } diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 4a0fd4f40245..3320bcac7192 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -539,7 +539,7 @@ _GLOBAL(_switch) std r6,PACACURRENT(r13) /* Set new 'current' */ ld r8,KSP(r4) /* new stack pointer */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 BEGIN_MMU_FTR_SECTION b 2f END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) @@ -588,7 +588,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) slbmte r7,r0 isync 2: -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ CURRENT_THREAD_INFO(r7, r8) /* base of new stack */ /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 48da0f5d2f7f..6879aed47377 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -113,6 +113,7 @@ EXC_VIRT_NONE(0x4000, 0x100) cmpwi cr3,r10,2 ; \ BRANCH_TO_C000(r10, system_reset_idle_common) ; \ 1: \ + KVMTEST_PR(n) ; \ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #else #define IDLETEST NOTEST @@ -129,6 +130,7 @@ EXC_REAL_BEGIN(system_reset, 0x100, 0x100) EXC_REAL_END(system_reset, 0x100, 0x100) EXC_VIRT_NONE(0x4100, 0x100) +TRAMP_KVM(PACA_EXNMI, 0x100) #ifdef CONFIG_PPC_P7_NAP EXC_COMMON_BEGIN(system_reset_idle_common) @@ -232,7 +234,7 @@ BEGIN_FTR_SECTION addi r10,r10,1 /* increment paca->in_mce */ sth r10,PACA_IN_MCE(r13) /* Limit nested MCE to level 4 to avoid stack overflow */ - cmpwi r10,4 + cmpwi r10,MAX_MCE_DEPTH bgt 2f /* Check if we hit limit of 4 */ std r11,GPR1(r1) /* Save r1 on the stack. 
*/ std r11,0(r1) /* make stack chain pointer */ @@ -605,7 +607,7 @@ EXC_COMMON_BEGIN(slb_miss_common) cmpdi cr5,r11,MSR_RI crset 4*cr0+eq -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 BEGIN_MMU_FTR_SECTION bl slb_allocate END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) @@ -865,12 +867,6 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) #define LOAD_SYSCALL_HANDLER(reg) \ __LOAD_HANDLER(reg, system_call_common) -#define SYSCALL_FASTENDIAN_TEST \ -BEGIN_FTR_SECTION \ - cmpdi r0,0x1ebe ; \ - beq- 1f ; \ -END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ - /* * After SYSCALL_KVMTEST, we reach here with PACA in r13, r13 in r9, * and HMT_MEDIUM. @@ -885,6 +881,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ rfid ; \ b . ; /* prevent speculative execution */ +#ifdef CONFIG_PPC_FAST_ENDIAN_SWITCH +#define SYSCALL_FASTENDIAN_TEST \ +BEGIN_FTR_SECTION \ + cmpdi r0,0x1ebe ; \ + beq- 1f ; \ +END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ + #define SYSCALL_FASTENDIAN \ /* Fast LE/BE switch system call */ \ 1: mfspr r12,SPRN_SRR1 ; \ @@ -893,6 +896,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ mr r13,r9 ; \ rfid ; /* return to userspace */ \ b . ; /* prevent speculative execution */ +#else +#define SYSCALL_FASTENDIAN_TEST +#define SYSCALL_FASTENDIAN +#endif /* CONFIG_PPC_FAST_ENDIAN_SWITCH */ #if defined(CONFIG_RELOCATABLE) /* @@ -1010,6 +1017,8 @@ TRAMP_REAL_BEGIN(hmi_exception_early) EXCEPTION_PROLOG_COMMON_3(0xe60) addi r3,r1,STACK_FRAME_OVERHEAD BRANCH_LINK_TO_FAR(hmi_exception_realmode) /* Function call ABI */ + cmpdi cr0,r3,0 + /* Windup the stack. */ /* Move original HSRR0 and HSRR1 into the respective regs */ ld r9,_MSR(r1) @@ -1026,10 +1035,15 @@ TRAMP_REAL_BEGIN(hmi_exception_early) REST_8GPRS(2, r1) REST_GPR(10, r1) ld r11,_CCR(r1) + REST_2GPRS(12, r1) + bne 1f mtcr r11 REST_GPR(11, r1) - REST_2GPRS(12, r1) - /* restore original r1. */ + ld r1,GPR1(r1) + hrfid + +1: mtcr r11 + REST_GPR(11, r1) ld r1,GPR1(r1) /* @@ -1042,8 +1056,9 @@ hmi_exception_after_realmode: EXCEPTION_PROLOG_0(PACA_EXGEN) b tramp_real_hmi_exception -EXC_COMMON_ASYNC(hmi_exception_common, 0xe60, handle_hmi_exception) - +EXC_COMMON_BEGIN(hmi_exception_common) +EXCEPTION_COMMON(PACA_EXGEN, 0xe60, hmi_exception_common, handle_hmi_exception, + ret_from_except, FINISH_NAP;ADD_NVGPRS;ADD_RECONCILE;RUNLATCH_ON) EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0x20) EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x20, 0xe80) @@ -1482,7 +1497,7 @@ USE_TEXT_SECTION() */ .balign IFETCH_ALIGN_BYTES do_hash_page: - #ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 lis r0,DSISR_BAD_FAULT_64S@h ori r0,r0,DSISR_BAD_FAULT_64S@l and. r0,r4,r0 /* weird error? */ @@ -1513,7 +1528,7 @@ do_hash_page: /* Reload DSISR into r4 for the DABR check below */ ld r4,_DSISR(r1) -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ /* Here we have a page fault that hash_page can't handle. 
*/ handle_page_fault: @@ -1542,7 +1557,7 @@ handle_dabr_fault: 12: b ret_from_except_lite -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* We have a page fault that hash_page could handle but HV refused * the PTE insertion */ diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 1125c9be9e06..59657783d1d5 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -112,12 +112,14 @@ power9_save_additional_sprs: std r4, STOP_HFSCR(r13) mfspr r3, SPRN_MMCRA - mfspr r4, SPRN_MMCR1 + mfspr r4, SPRN_MMCR0 std r3, STOP_MMCRA(r13) - std r4, STOP_MMCR1(r13) + std r4, _MMCR0(r1) - mfspr r3, SPRN_MMCR2 - std r3, STOP_MMCR2(r13) + mfspr r3, SPRN_MMCR1 + mfspr r4, SPRN_MMCR2 + std r3, STOP_MMCR1(r13) + std r4, STOP_MMCR2(r13) blr power9_restore_additional_sprs: @@ -135,11 +137,14 @@ power9_restore_additional_sprs: ld r4, STOP_MMCRA(r13) mtspr SPRN_HFSCR, r3 mtspr SPRN_MMCRA, r4 - /* We have already restored PACA_MMCR0 */ - ld r3, STOP_MMCR1(r13) - ld r4, STOP_MMCR2(r13) - mtspr SPRN_MMCR1, r3 - mtspr SPRN_MMCR2, r4 + + ld r3, _MMCR0(r1) + ld r4, STOP_MMCR1(r13) + mtspr SPRN_MMCR0, r3 + mtspr SPRN_MMCR1, r4 + + ld r3, STOP_MMCR2(r13) + mtspr SPRN_MMCR2, r3 blr /* @@ -319,20 +324,13 @@ enter_winkle: /* * r3 - PSSCR value corresponding to the requested stop state. */ +power_enter_stop: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -power_enter_stop_kvm_rm: - /* - * This is currently unused because POWER9 KVM does not have to - * gather secondary threads into sibling mode, but the code is - * here in case that function is required. - * - * Tell KVM we're entering idle. - */ + /* Tell KVM we're entering idle */ li r4,KVM_HWTHREAD_IN_IDLE /* DO THIS IN REAL MODE! See comment above. */ stb r4,HSTATE_HWTHREAD_STATE(r13) #endif -power_enter_stop: /* * Check if we are executing the lite variant with ESL=EC=0 */ @@ -357,6 +355,7 @@ power_enter_stop: b pnv_wakeup_noloss .Lhandle_esl_ec_set: +BEGIN_FTR_SECTION /* * POWER9 DD2 can incorrectly set PMAO when waking up after a * state-loss idle. Saving and restoring MMCR0 over idle is a @@ -364,6 +363,7 @@ power_enter_stop: */ mfspr r4,SPRN_MMCR0 std r4,_MMCR0(r1) +END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1 | CPU_FTR_POWER9_DD20) /* * Check if the requested state is a deep idle state. @@ -496,18 +496,6 @@ pnv_powersave_wakeup_mce: b pnv_powersave_wakeup -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -kvm_start_guest_check: - li r0,KVM_HWTHREAD_IN_KERNEL - stb r0,HSTATE_HWTHREAD_STATE(r13) - /* Order setting hwthread_state vs. testing hwthread_req */ - sync - lbz r0,HSTATE_HWTHREAD_REQ(r13) - cmpwi r0,0 - beqlr - b kvm_start_guest -#endif - /* * Called from reset vector for powersave wakeups. * cr3 - set to gt if waking up with partial/complete hypervisor state loss @@ -532,9 +520,15 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) mr r3,r12 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -BEGIN_FTR_SECTION - bl kvm_start_guest_check -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + li r0,KVM_HWTHREAD_IN_KERNEL + stb r0,HSTATE_HWTHREAD_STATE(r13) + /* Order setting hwthread_state vs. testing hwthread_req */ + sync + lbz r0,HSTATE_HWTHREAD_REQ(r13) + cmpwi r0,0 + beq 1f + b kvm_start_guest +1: #endif /* Return SRR1 from power7_nap() */ @@ -555,15 +549,17 @@ pnv_restore_hyp_resource_arch300: * then clear bit 60 in MMCRA to ensure the PMU starts running. 
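The "bit 60 in MMCRA" mentioned above is toggled in the code that follows via (1 << (63-60)): Power documentation numbers bits from the MSB (bit 0 is the top bit of a 64-bit register), so architectural bit 60 is 1UL << 3 in LSB-first C terms. The kernel expresses this with PPC_BIT() from asm/bitops.h; a small illustrative equivalent:

	#include <asm/bitops.h>	/* PPC_BIT() */

	/* IBM-numbered "bit 60" of a 64-bit SPR, i.e. 1UL << (63 - 60) */
	static const unsigned long mmcra_bit60 = PPC_BIT(60);	/* == 0x8 */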
*/ blt cr3,1f +BEGIN_FTR_SECTION PPC_INVALIDATE_ERAT ld r1,PACAR1(r13) + ld r4,_MMCR0(r1) + mtspr SPRN_MMCR0,r4 +END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1 | CPU_FTR_POWER9_DD20) mfspr r4,SPRN_MMCRA ori r4,r4,(1 << (63-60)) mtspr SPRN_MMCRA,r4 xori r4,r4,(1 << (63-60)) mtspr SPRN_MMCRA,r4 - ld r4,_MMCR0(r1) - mtspr SPRN_MMCR0,r4 1: /* * POWER ISA 3. Use PSSCR to determine if we diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 4e65bf82f5e0..b7a84522e652 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -143,6 +143,13 @@ notrace unsigned int __check_irq_replay(void) */ unsigned char happened = local_paca->irq_happened; + /* + * We are responding to the next interrupt, so interrupt-off + * latencies should be reset here. + */ + trace_hardirqs_on(); + trace_hardirqs_off(); + if (happened & PACA_IRQ_HARD_DIS) { /* Clear bit 0 which we wouldn't clear otherwise */ local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; @@ -270,6 +277,7 @@ notrace void arch_local_irq_restore(unsigned long en) #endif /* CONFIG_TRACE_IRQFLAGS */ set_soft_enabled(0); + trace_hardirqs_off(); /* * Check if anything needs to be re-emitted. We haven't @@ -279,6 +287,7 @@ notrace void arch_local_irq_restore(unsigned long en) replay = __check_irq_replay(); /* We can soft-enable now */ + trace_hardirqs_on(); set_soft_enabled(1); /* @@ -394,11 +403,19 @@ bool prep_irq_for_idle_irqsoff(void) /* * Take the SRR1 wakeup reason, index into this table to find the * appropriate irq_happened bit. + * + * System reset exceptions taken in idle state also come through here, + * but they are NMI interrupts so do not need to wait for IRQs to be + * restored, and should be taken as early as practical. These are marked + * with 0xff in the table. The Power ISA specifies 0100b as the system + * reset interrupt reason. */ +#define IRQ_SYSTEM_RESET 0xff + static const u8 srr1_to_lazyirq[0x10] = { 0, 0, 0, PACA_IRQ_DBELL, - 0, + IRQ_SYSTEM_RESET, PACA_IRQ_DBELL, PACA_IRQ_DEC, 0, @@ -407,15 +424,43 @@ static const u8 srr1_to_lazyirq[0x10] = { PACA_IRQ_HMI, 0, 0, 0, 0, 0 }; +void replay_system_reset(void) +{ + struct pt_regs regs; + + ppc_save_regs(&regs); + regs.trap = 0x100; + get_paca()->in_nmi = 1; + system_reset_exception(&regs); + get_paca()->in_nmi = 0; +} +EXPORT_SYMBOL_GPL(replay_system_reset); + void irq_set_pending_from_srr1(unsigned long srr1) { unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18; + u8 reason = srr1_to_lazyirq[idx]; + + /* + * Take the system reset now, which is immediately after registers + * are restored from idle. It's an NMI, so interrupts need not be + * re-enabled before it is taken. + */ + if (unlikely(reason == IRQ_SYSTEM_RESET)) { + replay_system_reset(); + return; + } /* * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0, - * so this can be called unconditionally with srr1 wake reason. + * so this can be called unconditionally with the SRR1 wake + * reason as returned by the idle code, which uses 0 to mean no + * interrupt. + * + * If a future CPU were to designate this as an interrupt reason, + * then a new index for no interrupt must be assigned. 
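To make the decode above concrete: SRR1[42:45] (IBM numbering) occupies bits 21:18 in LSB-first terms, hence the shift by 18; wake reason 0b0100 indexes slot 4 (system reset) and 0b0110 indexes slot 6 (decrementer). A worked restatement of irq_set_pending_from_srr1(), for illustration only:

	/* Sketch of the lookup above; all names come from the patch. */
	static void decode_wake_reason(unsigned long srr1)
	{
		unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18;
		u8 reason = srr1_to_lazyirq[idx];

		if (reason == IRQ_SYSTEM_RESET)	/* idx 4: NMI */
			replay_system_reset();	/* taken immediately */
		else			/* e.g. idx 6: PACA_IRQ_DEC */
			local_paca->irq_happened |= reason;
	}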
*/ - local_paca->irq_happened |= srr1_to_lazyirq[idx]; + local_paca->irq_happened |= reason; } #endif /* CONFIG_PPC_BOOK3S */ diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 6c089d9757c9..4b1f34f685b1 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -25,6 +25,21 @@ #include <linux/preempt.h> #include <linux/ftrace.h> +/* + * This is called from ftrace code after invoking registered handlers to + * disambiguate regs->nip changes done by jprobes and livepatch. We check if + * there is an active jprobe at the provided address (mcount location). + */ +int __is_active_jprobe(unsigned long addr) +{ + if (!preemptible()) { + struct kprobe *p = raw_cpu_read(current_kprobe); + return (p && (unsigned long)p->addr == addr) ? 1 : 0; + } + + return 0; +} + static nokprobe_inline int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, unsigned long orig_nip) @@ -65,6 +80,7 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, /* Disable irq for emulating a breakpoint and avoiding preempt */ local_irq_save(flags); hard_irq_disable(); + preempt_disable(); p = get_kprobe((kprobe_opcode_t *)nip); if (unlikely(!p) || kprobe_disabled(p)) @@ -86,12 +102,18 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) __skip_singlestep(p, regs, kcb, orig_nip); - /* - * If pre_handler returns !0, it sets regs->nip and - * resets current kprobe. - */ + else { + /* + * If pre_handler returns !0, it sets regs->nip and + * resets current kprobe. In this case, we still need + * to restore irq, but not preemption. + */ + local_irq_restore(flags); + return; + } } end: + preempt_enable_no_resched(); local_irq_restore(flags); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index bebc3007a793..a20ce12adab1 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -43,12 +43,6 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}}; -int is_current_kprobe_addr(unsigned long addr) -{ - struct kprobe *p = kprobe_running(); - return (p && (unsigned long)p->addr == addr) ? 1 : 0; -} - bool arch_within_kprobe_blacklist(unsigned long addr) { return (addr >= (unsigned long)__kprobes_text_start && @@ -239,7 +233,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) } NOKPROBE_SYMBOL(arch_prepare_kretprobe); -int try_to_emulate(struct kprobe *p, struct pt_regs *regs) +static int try_to_emulate(struct kprobe *p, struct pt_regs *regs) { int ret; unsigned int insn = *p->ainsn.insn; @@ -261,9 +255,20 @@ int try_to_emulate(struct kprobe *p, struct pt_regs *regs) */ printk("Can't step on instruction %x\n", insn); BUG(); - } else if (ret == 0) - /* This instruction can't be boosted */ - p->ainsn.boostable = -1; + } else { + /* + * If we haven't previously emulated this instruction, then it + * can't be boosted. Note it down so we don't try to do so again. + * + * If, however, we had emulated this instruction in the past, + * then this is just an error with the current run (for + * instance, exceptions due to a load/store). We return 0 so + * that this is now single-stepped, but continue to try + * emulating it in subsequent probe hits. 
+ */ + if (unlikely(p->ainsn.boostable != 1)) + p->ainsn.boostable = -1; + } return ret; } @@ -639,24 +644,22 @@ NOKPROBE_SYMBOL(setjmp_pre_handler); void __used jprobe_return(void) { - asm volatile("trap" ::: "memory"); + asm volatile("jprobe_return_trap:\n" + "trap\n" + ::: "memory"); } NOKPROBE_SYMBOL(jprobe_return); -static void __used jprobe_return_end(void) -{ -} -NOKPROBE_SYMBOL(jprobe_return_end); - int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - /* - * FIXME - we should ideally be validating that we got here 'cos - * of the "trap" in jprobe_return() above, before restoring the - * saved regs... - */ + if (regs->nip != ppc_kallsyms_lookup_name("jprobe_return_trap")) { + pr_debug("longjmp_break_handler NIP (0x%lx) does not match jprobe_return_trap (0x%lx)\n", + regs->nip, ppc_kallsyms_lookup_name("jprobe_return_trap")); + return 0; + } + memcpy(regs, &kcb->jprobe_saved_regs, sizeof(struct pt_regs)); /* It's OK to start function graph tracing again */ unpause_graph_tracing(); diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 5c12e21d0d1a..49d34d7271e7 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -360,7 +360,7 @@ void default_machine_kexec(struct kimage *image) /* NOTREACHED */ } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* Values we need to export to the second kernel via the device tree. */ static unsigned long htab_base; static unsigned long htab_size; @@ -402,4 +402,4 @@ static int __init export_htab_values(void) return 0; } late_initcall(export_htab_values); -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 9b2ea7e71c06..742e4658c5dc 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -39,11 +39,21 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); static DEFINE_PER_CPU(int, mce_queue_count); static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); +/* Queue for delayed MCE UE events. 
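+ * UE errors want memory_failure(), which cannot be called from the
+ * machine check handler itself (it runs in real mode, NMI-like
+ * context), so events are copied into this per-cpu queue and drained
+ * later from a workqueue, in process context.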
*/ +static DEFINE_PER_CPU(int, mce_ue_count); +static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], + mce_ue_event_queue); + static void machine_check_process_queued_event(struct irq_work *work); +void machine_check_ue_event(struct machine_check_event *evt); +static void machine_process_ue_event(struct work_struct *work); + static struct irq_work mce_event_process_work = { .func = machine_check_process_queued_event, }; +DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); + static void mce_set_error_info(struct machine_check_event *mce, struct mce_error_info *mce_err) { @@ -82,7 +92,7 @@ static void mce_set_error_info(struct machine_check_event *mce, */ void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, - uint64_t nip, uint64_t addr) + uint64_t nip, uint64_t addr, uint64_t phys_addr) { int index = __this_cpu_inc_return(mce_nest_count) - 1; struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]); @@ -140,6 +150,11 @@ void save_mce_event(struct pt_regs *regs, long handled, } else if (mce->error_type == MCE_ERROR_TYPE_UE) { mce->u.ue_error.effective_address_provided = true; mce->u.ue_error.effective_address = addr; + if (phys_addr != ULONG_MAX) { + mce->u.ue_error.physical_address_provided = true; + mce->u.ue_error.physical_address = phys_addr; + machine_check_ue_event(mce); + } } return; } @@ -193,6 +208,26 @@ void release_mce_event(void) get_mce_event(NULL, true); } + +/* + * Queue up the MCE event which then can be handled later. + */ +void machine_check_ue_event(struct machine_check_event *evt) +{ + int index; + + index = __this_cpu_inc_return(mce_ue_count) - 1; + /* If queue is full, just return for now. */ + if (index >= MAX_MC_EVT) { + __this_cpu_dec(mce_ue_count); + return; + } + memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt)); + + /* Queue work to process this event later. */ + schedule_work(&mce_ue_event_work); +} + /* * Queue up the MCE event which then can be handled later. */ @@ -215,7 +250,39 @@ void machine_check_queue_event(void) /* Queue irq work to process this event later. */ irq_work_queue(&mce_event_process_work); } - +/* + * Process pending MCE UE events from the queue. This work handler runs + * in process context, scheduled from machine_check_ue_event(). + */ +static void machine_process_ue_event(struct work_struct *work) +{ + int index; + struct machine_check_event *evt; + + while (__this_cpu_read(mce_ue_count) > 0) { + index = __this_cpu_read(mce_ue_count) - 1; + evt = this_cpu_ptr(&mce_ue_event_queue[index]); +#ifdef CONFIG_MEMORY_FAILURE + /* + * This should probably be queued elsewhere, but + * oh well + */ + if (evt->error_type == MCE_ERROR_TYPE_UE) { + if (evt->u.ue_error.physical_address_provided) { + unsigned long pfn; + + pfn = evt->u.ue_error.physical_address >> + PAGE_SHIFT; + memory_failure(pfn, SIGBUS, 0); + } else + pr_warn("Failed to identify bad address from " "where the uncorrectable error (UE) " "was generated\n"); + } +#endif + __this_cpu_dec(mce_ue_count); + } +} /* * process pending MCE event from the mce event queue. This function will be * called during syscall exit.
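 *
 * A rough sketch of the two queues above (simplified, not verbatim
 * kernel code):
 *
 *	machine_check_queue_event();	  // irq_work: print queued events
 *	machine_check_ue_event(evt);	  // workqueue: delayed UE handling
 *	  ...
 *	  memory_failure(pfn, SIGBUS, 0); // later, in process context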
@@ -223,6 +290,7 @@ void machine_check_queue_event(void) static void machine_check_process_queued_event(struct irq_work *work) { int index; + struct machine_check_event *evt; add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); @@ -232,8 +300,8 @@ static void machine_check_process_queued_event(struct irq_work *work) */ while (__this_cpu_read(mce_queue_count) > 0) { index = __this_cpu_read(mce_queue_count) - 1; - machine_check_print_event_info( - this_cpu_ptr(&mce_event_queue[index]), false); + evt = this_cpu_ptr(&mce_event_queue[index]); + machine_check_print_event_info(evt, false); __this_cpu_dec(mce_queue_count); } } @@ -340,7 +408,7 @@ void machine_check_print_event_info(struct machine_check_event *evt, printk("%s Effective address: %016llx\n", level, evt->u.ue_error.effective_address); if (evt->u.ue_error.physical_address_provided) - printk("%s Physical address: %016llx\n", + printk("%s Physical address: %016llx\n", level, evt->u.ue_error.physical_address); break; case MCE_ERROR_TYPE_SLB: @@ -411,45 +479,6 @@ void machine_check_print_event_info(struct machine_check_event *evt, } EXPORT_SYMBOL_GPL(machine_check_print_event_info); -uint64_t get_mce_fault_addr(struct machine_check_event *evt) -{ - switch (evt->error_type) { - case MCE_ERROR_TYPE_UE: - if (evt->u.ue_error.effective_address_provided) - return evt->u.ue_error.effective_address; - break; - case MCE_ERROR_TYPE_SLB: - if (evt->u.slb_error.effective_address_provided) - return evt->u.slb_error.effective_address; - break; - case MCE_ERROR_TYPE_ERAT: - if (evt->u.erat_error.effective_address_provided) - return evt->u.erat_error.effective_address; - break; - case MCE_ERROR_TYPE_TLB: - if (evt->u.tlb_error.effective_address_provided) - return evt->u.tlb_error.effective_address; - break; - case MCE_ERROR_TYPE_USER: - if (evt->u.user_error.effective_address_provided) - return evt->u.user_error.effective_address; - break; - case MCE_ERROR_TYPE_RA: - if (evt->u.ra_error.effective_address_provided) - return evt->u.ra_error.effective_address; - break; - case MCE_ERROR_TYPE_LINK: - if (evt->u.link_error.effective_address_provided) - return evt->u.link_error.effective_address; - break; - default: - case MCE_ERROR_TYPE_UNKNOWN: - break; - } - return 0; -} -EXPORT_SYMBOL(get_mce_fault_addr); - /* * This function is called in real mode. Strictly no printk's please. * @@ -470,6 +499,34 @@ long hmi_exception_realmode(struct pt_regs *regs) { __this_cpu_inc(irq_stat.hmi_exceptions); +#ifdef CONFIG_PPC_BOOK3S_64 + /* Workaround for P9 vector CI loads (see p9_hmi_special_emu) */ + if (pvr_version_is(PVR_POWER9)) { + unsigned long hmer = mfspr(SPRN_HMER); + + /* Do we have the debug bit set */ + if (hmer & PPC_BIT(17)) { + hmer &= ~PPC_BIT(17); + mtspr(SPRN_HMER, hmer); + + /* + * Now to avoid problems with soft-disable we + * only do the emulation if we are coming from + * user space + */ + if (user_mode(regs)) + local_paca->hmi_p9_special_emu = 1; + + /* + * Don't bother going to OPAL if that's the + * only relevant bit. 
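+	 * (The debug bit, PPC_BIT(17) in HMER, is what flags a vector
+	 * CI load needing emulation; it is cleared above so it is not
+	 * reported twice, and masking HMER against HMEER tells us
+	 * whether any other, genuine HMI cause remains for OPAL.)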
+ */ + if (!(hmer & mfspr(SPRN_HMEER))) + return local_paca->hmi_p9_special_emu; + } + } +#endif /* CONFIG_PPC_BOOK3S_64 */ + wait_for_subcore_guest_exit(); if (ppc_md.hmi_exception_early) @@ -477,5 +534,5 @@ long hmi_exception_realmode(struct pt_regs *regs) wait_for_tb_resync(); - return 0; + return 1; } diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 72f153c6f3fa..644f7040b91c 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -27,6 +27,36 @@ #include <asm/mmu.h> #include <asm/mce.h> #include <asm/machdep.h> +#include <asm/pgtable.h> +#include <asm/pte-walk.h> +#include <asm/sstep.h> +#include <asm/exception-64s.h> + +/* + * Convert an address related to an mm to a PFN. NOTE: we are in real + * mode, we could potentially race with page table updates. + */ +static unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) +{ + pte_t *ptep; + unsigned long flags; + struct mm_struct *mm; + + if (user_mode(regs)) + mm = current->mm; + else + mm = &init_mm; + + local_irq_save(flags); + if (mm == current->mm) + ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL); + else + ptep = find_init_mm_pte(addr, NULL); + local_irq_restore(flags); + if (!ptep || pte_special(*ptep)) + return ULONG_MAX; + return pte_pfn(*ptep); +} static void flush_tlb_206(unsigned int num_sets, unsigned int action) { @@ -128,7 +158,7 @@ void __flush_tlb_power9(unsigned int action) { unsigned int num_sets; - if (radix_enabled()) + if (early_radix_enabled()) num_sets = POWER9_TLB_SETS_RADIX; else num_sets = POWER9_TLB_SETS_HASH; @@ -138,7 +168,7 @@ void __flush_tlb_power9(unsigned int action) /* flush SLBs and reload */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 static void flush_and_reload_slb(void) { struct slb_shadow *slb; @@ -185,7 +215,7 @@ static void flush_erat(void) static int mce_flush(int what) { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (what == MCE_FLUSH_SLB) { flush_and_reload_slb(); return 1; @@ -421,9 +451,45 @@ static const struct mce_derror_table mce_p9_derror_table[] = { MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0, false, 0, 0, 0, 0 } }; +static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr, + uint64_t *phys_addr) +{ + /* + * Carefully look at the NIP to determine + * the instruction to analyse. 
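+ * (The approach: translate the NIP to a physical address, fetch
+ * the instruction, run analyse_instr() on a scratch copy of the
+ * regs to recover the effective address of the access, then
+ * translate that EA to the failing physical page.)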
Reading the NIP + * in real-mode is tricky and can lead to recursive + * faults + */ + int instr; + unsigned long pfn, instr_addr; + struct instruction_op op; + struct pt_regs tmp = *regs; + + pfn = addr_to_pfn(regs, regs->nip); + if (pfn != ULONG_MAX) { + instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK); + instr = *(unsigned int *)(instr_addr); + if (!analyse_instr(&op, &tmp, instr)) { + pfn = addr_to_pfn(regs, op.ea); + *addr = op.ea; + *phys_addr = (pfn << PAGE_SHIFT); + return 0; + } + /* + * analyse_instr() might fail if the instruction + * is not a load/store, although this is unexpected + * for load/store errors or if we got the NIP + * wrong + */ + } + *addr = 0; + return -1; +} + static int mce_handle_ierror(struct pt_regs *regs, const struct mce_ierror_table table[], - struct mce_error_info *mce_err, uint64_t *addr) + struct mce_error_info *mce_err, uint64_t *addr, + uint64_t *phys_addr) { uint64_t srr1 = regs->msr; int handled = 0; @@ -475,8 +541,22 @@ static int mce_handle_ierror(struct pt_regs *regs, } mce_err->severity = table[i].severity; mce_err->initiator = table[i].initiator; - if (table[i].nip_valid) + if (table[i].nip_valid) { *addr = regs->nip; + if (mce_err->severity == MCE_SEV_ERROR_SYNC && + table[i].error_type == MCE_ERROR_TYPE_UE) { + unsigned long pfn; + + if (get_paca()->in_mce < MAX_MCE_DEPTH) { + pfn = addr_to_pfn(regs, regs->nip); + if (pfn != ULONG_MAX) { + *phys_addr = + (pfn << PAGE_SHIFT); + handled = 1; + } + } + } + } return handled; } @@ -489,7 +569,8 @@ static int mce_handle_ierror(struct pt_regs *regs, static int mce_handle_derror(struct pt_regs *regs, const struct mce_derror_table table[], - struct mce_error_info *mce_err, uint64_t *addr) + struct mce_error_info *mce_err, uint64_t *addr, + uint64_t *phys_addr) { uint64_t dsisr = regs->dsisr; int handled = 0; @@ -555,7 +636,17 @@ static int mce_handle_derror(struct pt_regs *regs, mce_err->initiator = table[i].initiator; if (table[i].dar_valid) *addr = regs->dar; - + else if (mce_err->severity == MCE_SEV_ERROR_SYNC && + table[i].error_type == MCE_ERROR_TYPE_UE) { + /* + * We do a maximum of 4 nested MCE calls, see + * kernel/exception-64s.h + */ + if (get_paca()->in_mce < MAX_MCE_DEPTH) + if (!mce_find_instr_ea_and_pfn(regs, addr, + phys_addr)) + handled = 1; + } found = 1; } @@ -592,19 +683,21 @@ static long mce_handle_error(struct pt_regs *regs, const struct mce_ierror_table itable[]) { struct mce_error_info mce_err = { 0 }; - uint64_t addr; + uint64_t addr, phys_addr; uint64_t srr1 = regs->msr; long handled; if (SRR1_MC_LOADSTORE(srr1)) - handled = mce_handle_derror(regs, dtable, &mce_err, &addr); + handled = mce_handle_derror(regs, dtable, &mce_err, &addr, + &phys_addr); else - handled = mce_handle_ierror(regs, itable, &mce_err, &addr); + handled = mce_handle_ierror(regs, itable, &mce_err, &addr, + &phys_addr); if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE) handled = mce_handle_ue_error(regs); - save_mce_event(regs, handled, &mce_err, regs->nip, addr); + save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr); return handled; } diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 0b0f89685b67..759104b99f9f 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -429,7 +429,8 @@ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs, /* Find this stub, or if that fails, the next avail. 
entry */ stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr; for (i = 0; stub_func_addr(stubs[i].funcdata); i++) { - BUG_ON(i >= num_stubs); + if (WARN_ON(i >= num_stubs)) + return 0; if (stub_func_addr(stubs[i].funcdata) == func_addr(addr)) return (unsigned long)&stubs[i]; diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 2ff2b8a19f71..5d38d5ea9a24 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -90,7 +90,7 @@ static inline void free_lppacas(void) { } #endif /* CONFIG_PPC_BOOK3S */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * 3 persistent SLBs are registered here. The buffer will be zero @@ -135,11 +135,11 @@ static struct slb_shadow * __init init_slb_shadow(int cpu) return s; } -#else /* CONFIG_PPC_STD_MMU_64 */ +#else /* !CONFIG_PPC_BOOK3S_64 */ static void __init allocate_slb_shadows(int nr_cpus, int limit) { } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ /* The Paca is an array with one entry per processor. Each contains an * lppaca, which contains the information shared between the @@ -170,9 +170,9 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu) new_paca->kexec_state = KEXEC_STATE_NONE; new_paca->__current = &init_task; new_paca->data_offset = 0xfeeeeeeeeeeeeeeeULL; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 new_paca->slb_shadow_ptr = init_slb_shadow(cpu); -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif #ifdef CONFIG_PPC_BOOK3E /* For now -- if we have threads this will be adjusted later */ @@ -271,7 +271,7 @@ void copy_mm_to_paca(struct mm_struct *mm) get_paca()->mm_ctx_user_psize = context->user_psize; get_paca()->mm_ctx_sllp = context->sllp; #endif -#else /* CONFIG_PPC_BOOK3S */ +#else /* !CONFIG_PPC_BOOK3S */ return; #endif } diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 932b9741aa8f..15ce0306b092 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -90,14 +90,14 @@ int pcibios_unmap_io_space(struct pci_bus *bus) * to do an appropriate TLB flush here too */ if (bus->self) { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 struct resource *res = bus->resource[0]; #endif pr_debug("IO unmapping for PCI-PCI bridge %s\n", pci_name(bus->self)); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 __flush_hash_table_range(&init_mm, res->start + _IO_BASE, res->end + _IO_BASE + 1); #endif diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index a0c74bbf3454..4083b737decb 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -77,6 +77,13 @@ extern unsigned long _get_SP(void); #ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* + * Are we running in "Suspend disabled" mode? If so we have to block any + * sigreturn that would get us into suspended state, and we also warn in some + * other paths that we should never reach with suspend disabled. 
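+ * (See the tm_suspend_disabled checks added to save_tm_user_regs(),
+ * restore_tm_user_regs() and restore_tm_sigcontexts() below: the
+ * restore paths fail the sigreturn outright, while the save paths
+ * only WARN.)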
+ */ +bool tm_suspend_disabled __ro_after_init = false; + static void check_if_tm_restore_required(struct task_struct *tsk) { /* @@ -97,9 +104,23 @@ static inline bool msr_tm_active(unsigned long msr) { return MSR_TM_ACTIVE(msr); } + +static bool tm_active_with_fp(struct task_struct *tsk) +{ + return msr_tm_active(tsk->thread.regs->msr) && + (tsk->thread.ckpt_regs.msr & MSR_FP); +} + +static bool tm_active_with_altivec(struct task_struct *tsk) +{ + return msr_tm_active(tsk->thread.regs->msr) && + (tsk->thread.ckpt_regs.msr & MSR_VEC); +} #else static inline bool msr_tm_active(unsigned long msr) { return false; } static inline void check_if_tm_restore_required(struct task_struct *tsk) { } +static inline bool tm_active_with_fp(struct task_struct *tsk) { return false; } +static inline bool tm_active_with_altivec(struct task_struct *tsk) { return false; } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ bool strict_msr_control; @@ -232,7 +253,7 @@ EXPORT_SYMBOL(enable_kernel_fp); static int restore_fp(struct task_struct *tsk) { - if (tsk->thread.load_fp || msr_tm_active(tsk->thread.regs->msr)) { + if (tsk->thread.load_fp || tm_active_with_fp(tsk)) { load_fp_state(&current->thread.fp_state); current->thread.load_fp++; return 1; @@ -314,7 +335,7 @@ EXPORT_SYMBOL_GPL(flush_altivec_to_thread); static int restore_altivec(struct task_struct *tsk) { if (cpu_has_feature(CPU_FTR_ALTIVEC) && - (tsk->thread.load_vec || msr_tm_active(tsk->thread.regs->msr))) { + (tsk->thread.load_vec || tm_active_with_altivec(tsk))) { load_vr_state(&tsk->thread.vr_state); tsk->thread.used_vr = 1; tsk->thread.load_vec++; @@ -853,6 +874,10 @@ static void tm_reclaim_thread(struct thread_struct *thr, if (!MSR_TM_SUSPENDED(mfmsr())) return; + giveup_all(container_of(thr, struct task_struct, thread)); + + tm_reclaim(thr, cause); + /* * If we are in a transaction and FP is off then we can't have * used FP inside that transaction. Hence the checkpointed @@ -871,10 +896,6 @@ static void tm_reclaim_thread(struct thread_struct *thr, if ((thr->ckpt_regs.msr & MSR_VEC) == 0) memcpy(&thr->ckvr_state, &thr->vr_state, sizeof(struct thread_vr_state)); - - giveup_all(container_of(thr, struct task_struct, thread)); - - tm_reclaim(thr, thr->ckpt_regs.msr, cause); } void tm_reclaim_current(uint8_t cause) @@ -903,6 +924,8 @@ static inline void tm_reclaim_task(struct task_struct *tsk) if (!MSR_TM_ACTIVE(thr->regs->msr)) goto out_and_saveregs; + WARN_ON(tm_suspend_disabled); + TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, " "ccr=%lx, msr=%lx, trap=%lx)\n", tsk->pid, thr->regs->nip, @@ -923,11 +946,9 @@ out_and_saveregs: tm_save_sprs(thr); } -extern void __tm_recheckpoint(struct thread_struct *thread, - unsigned long orig_msr); +extern void __tm_recheckpoint(struct thread_struct *thread); -void tm_recheckpoint(struct thread_struct *thread, - unsigned long orig_msr) +void tm_recheckpoint(struct thread_struct *thread) { unsigned long flags; @@ -946,15 +967,13 @@ void tm_recheckpoint(struct thread_struct *thread, */ tm_restore_sprs(thread); - __tm_recheckpoint(thread, orig_msr); + __tm_recheckpoint(thread); local_irq_restore(flags); } static inline void tm_recheckpoint_new_task(struct task_struct *new) { - unsigned long msr; - if (!cpu_has_feature(CPU_FTR_TM)) return; @@ -973,13 +992,11 @@ static inline void tm_recheckpoint_new_task(struct task_struct *new) tm_restore_sprs(&new->thread); return; } - msr = new->thread.ckpt_regs.msr; /* Recheckpoint to restore original checkpointed register state.
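 * (With the MSR argument gone, __tm_recheckpoint() unconditionally
 * reloads both the checkpointed FP and VEC state; see the tm.S
 * changes below. There is no longer an orig_msr to carry here.)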
*/ - TM_DEBUG("*** tm_recheckpoint of pid %d " - "(new->msr 0x%lx, new->origmsr 0x%lx)\n", - new->pid, new->thread.regs->msr, msr); + TM_DEBUG("*** tm_recheckpoint of pid %d (new->msr 0x%lx)\n", + new->pid, new->thread.regs->msr); - tm_recheckpoint(&new->thread, msr); + tm_recheckpoint(&new->thread); /* * The checkpointed state has been restored but the live state has @@ -1155,7 +1172,7 @@ struct task_struct *__switch_to(struct task_struct *prev, } #endif /* CONFIG_PPC64 */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 batch = this_cpu_ptr(&ppc64_tlb_batch); if (batch->active) { current_thread_info()->local_flags |= _TLF_LAZY_MMU; @@ -1163,7 +1180,7 @@ struct task_struct *__switch_to(struct task_struct *prev, __flush_tlb_pending(batch); batch->active = 0; } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_ADV_DEBUG_REGS switch_booke_debug_regs(&new->thread.debug); @@ -1209,7 +1226,7 @@ struct task_struct *__switch_to(struct task_struct *prev, last = _switch(old_thread, new_thread); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (current_thread_info()->local_flags & _TLF_LAZY_MMU) { current_thread_info()->local_flags &= ~_TLF_LAZY_MMU; batch = this_cpu_ptr(&ppc64_tlb_batch); @@ -1238,7 +1255,7 @@ struct task_struct *__switch_to(struct task_struct *prev, : : "r"(dummy_copy_buffer), "r"(0)); } } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ return last; } @@ -1467,7 +1484,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) static void setup_ksp_vsid(struct task_struct *p, unsigned long sp) { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 unsigned long sp_vsid; unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp; @@ -1898,7 +1915,8 @@ unsigned long get_wchan(struct task_struct *p) do { sp = *(unsigned long *)sp; - if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD)) + if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD) || + p->state == TASK_RUNNING) return 0; if (count > 0) { ip = ((unsigned long *)sp)[STACK_FRAME_LR_SAVE]; @@ -2046,7 +2064,7 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) unsigned long base = mm->brk; unsigned long ret; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * If we are using 1TB segments and we are allowed to randomise * the heap, we can put it above 1TB so it is backed by a 1TB diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index f83056297441..b15bae265c90 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -47,6 +47,7 @@ #include <asm/mmu.h> #include <asm/paca.h> #include <asm/pgtable.h> +#include <asm/powernv.h> #include <asm/iommu.h> #include <asm/btext.h> #include <asm/sections.h> @@ -228,7 +229,7 @@ static void __init check_cpu_pa_features(unsigned long node) ibm_pa_features, ARRAY_SIZE(ibm_pa_features)); } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 static void __init init_mmu_slb_size(unsigned long node) { const __be32 *slb_size_ptr; @@ -658,6 +659,38 @@ static void __init early_reserve_mem(void) #endif } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static bool tm_disabled __initdata; + +static int __init parse_ppc_tm(char *str) +{ + bool res; + + if (kstrtobool(str, &res)) + return -EINVAL; + + tm_disabled = !res; + + return 0; +} +early_param("ppc_tm", parse_ppc_tm); + +static void __init tm_init(void) +{ + if (tm_disabled) { + pr_info("Disabling hardware transactional memory (HTM)\n"); + cur_cpu_spec->cpu_user_features2 &= + ~(PPC_FEATURE2_HTM_NOSC | 
PPC_FEATURE2_HTM); + cur_cpu_spec->cpu_features &= ~CPU_FTR_TM; + return; + } + + pnv_tm_init(); +} +#else +static void tm_init(void) { } +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + void __init early_init_devtree(void *params) { phys_addr_t limit; @@ -767,6 +800,8 @@ void __init early_init_devtree(void *params) powerpc_firmware_features |= FW_FEATURE_PS3_POSSIBLE; #endif + tm_init(); + DBG(" <- early_init_devtree()\n"); } diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 2e3bc16d02b2..fa661ed616f5 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -773,7 +773,7 @@ void arch_setup_pdev_archdata(struct platform_device *pdev) static __init void print_system_info(void) { pr_info("-----------------------------------------------------\n"); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size); #endif #ifdef CONFIG_PPC_STD_MMU_32 @@ -800,7 +800,7 @@ static __init void print_system_info(void) pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features); #endif -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (htab_address) pr_info("htab_address = 0x%p\n", htab_address); if (htab_hash_mask) diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 92fb1c8dbbd8..16d16583cf11 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -519,6 +519,8 @@ static int save_tm_user_regs(struct pt_regs *regs, { unsigned long msr = regs->msr; + WARN_ON(tm_suspend_disabled); + /* Remove TM bits from thread's MSR. The MSR in the sigcontext * just indicates to userland that we were doing a transaction, but we * don't want to return in transactional state. This also ensures @@ -769,6 +771,8 @@ static long restore_tm_user_regs(struct pt_regs *regs, int i; #endif + if (tm_suspend_disabled) + return 1; /* * restore general registers but not including MSR or SOFTE. Also * take care of keeping r2 (TLS) intact if not a signal. @@ -876,7 +880,7 @@ static long restore_tm_user_regs(struct pt_regs *regs, /* Make sure the transaction is marked as failed */ current->thread.tm_texasr |= TEXASR_FS; /* This loads the checkpointed FP/VEC state, if used */ - tm_recheckpoint(&current->thread, msr); + tm_recheckpoint(&current->thread); /* This loads the speculative FP/VEC state, if used */ msr_check_and_set(msr & (MSR_FP | MSR_VEC)); diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index c83c115858c1..528ea744911c 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -214,6 +214,8 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc, BUG_ON(!MSR_TM_ACTIVE(regs->msr)); + WARN_ON(tm_suspend_disabled); + /* Remove TM bits from thread's MSR. The MSR in the sigcontext * just indicates to userland that we were doing a transaction, but we * don't want to return in transactional state.
This also ensures @@ -430,6 +432,9 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, BUG_ON(tsk != current); + if (tm_suspend_disabled) + return -EINVAL; + /* copy the GPRs */ err |= __copy_from_user(regs->gpr, tm_sc->gp_regs, sizeof(regs->gpr)); err |= __copy_from_user(&tsk->thread.ckpt_regs, sc->gp_regs, @@ -547,7 +552,7 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, /* Make sure the transaction is marked as failed */ tsk->thread.tm_texasr |= TEXASR_FS; /* This loads the checkpointed FP/VEC state, if used */ - tm_recheckpoint(&tsk->thread, msr); + tm_recheckpoint(&tsk->thread); msr_check_and_set(msr & (MSR_FP | MSR_VEC)); if (msr & MSR_FP) { diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index a753b72efbc0..726c2d683f93 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -229,8 +229,7 @@ int __init TAU_init(void) /* first, set up the window shrinking timer */ - init_timer(&tau_timer); - tau_timer.function = tau_timeout_smp; + setup_timer(&tau_timer, tau_timeout_smp, 0UL); tau_timer.expires = jiffies + shrink_timer; add_timer(&tau_timer); diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index c4ba37822ba0..d89fb0e6f9ed 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -79,15 +79,12 @@ _GLOBAL(tm_abort) blr /* void tm_reclaim(struct thread_struct *thread, - * unsigned long orig_msr, * uint8_t cause) * * - Performs a full reclaim. This destroys outstanding * transactions and updates thread->regs.tm_ckpt_* with the * original checkpointed state. Note that thread->regs is * unchanged. - * - FP regs are written back to thread->transact_fpr before - * reclaiming. These are the transactional (current) versions. * * Purpose is to both abort transactions of, and preserve the state of, * a transactions at a context switch. We preserve/restore both sets of process @@ -98,9 +95,9 @@ _GLOBAL(tm_abort) * Call with IRQs off, stacks get all out of sync for some periods in here! */ _GLOBAL(tm_reclaim) - mfcr r6 + mfcr r5 mflr r0 - stw r6, 8(r1) + stw r5, 8(r1) std r0, 16(r1) std r2, STK_GOT(r1) stdu r1, -TM_FRAME_SIZE(r1) @@ -108,7 +105,6 @@ _GLOBAL(tm_reclaim) /* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. */ std r3, STK_PARAM(R3)(r1) - std r4, STK_PARAM(R4)(r1) SAVE_NVGPRS(r1) /* We need to setup MSR for VSX register save instructions. */ @@ -138,8 +134,8 @@ _GLOBAL(tm_reclaim) std r1, PACAR1(r13) /* Clear MSR RI since we are about to change r1, EE is already off. */ - li r4, 0 - mtmsrd r4, 1 + li r5, 0 + mtmsrd r5, 1 /* * BE CAREFUL HERE: @@ -151,7 +147,7 @@ _GLOBAL(tm_reclaim) * to user register state. (FPRs, CCR etc. also!) * Use an sprg and a tm_scratch in the PACA to shuffle. */ - TRECLAIM(R5) /* Cause in r5 */ + TRECLAIM(R4) /* Cause in r4 */ /* ******************** GPRs ******************** */ /* Stash the checkpointed r13 away in the scratch SPR and get the real @@ -242,40 +238,30 @@ _GLOBAL(tm_reclaim) /* ******************** FPR/VR/VSRs ************ - * After reclaiming, capture the checkpointed FPRs/VRs /if used/. - * - * (If VSX used, FP and VMX are implied. Or, we don't need to look - * at MSR.VSX as copying FP regs if .FP, vector regs if .VMX covers it.) - * - * We're passed the thread's MSR as the second parameter + * After reclaiming, capture the checkpointed FPRs/VRs. * * We enabled VEC/FP/VSX in the msr above, so we can execute these * instructions! */ - ld r4, STK_PARAM(R4)(r1) /* Second parameter, MSR * */ mr r3, r12 - andis. 
r0, r4, MSR_VEC@h - beq dont_backup_vec + /* Altivec (VEC/VMX/VR)*/ addi r7, r3, THREAD_CKVRSTATE SAVE_32VRS(0, r6, r7) /* r6 scratch, r7 transact vr state */ mfvscr v0 li r6, VRSTATE_VSCR stvx v0, r7, r6 -dont_backup_vec: + + /* VRSAVE */ mfspr r0, SPRN_VRSAVE std r0, THREAD_CKVRSAVE(r3) - andi. r0, r4, MSR_FP - beq dont_backup_fp - + /* Floating Point (FP) */ addi r7, r3, THREAD_CKFPSTATE SAVE_32FPRS_VSRS(0, R6, R7) /* r6 scratch, r7 transact fp state */ - mffs fr0 stfd fr0,FPSTATE_FPSCR(r7) -dont_backup_fp: /* TM regs, incl TEXASR -- these live in thread_struct. Note they've * been updated by the treclaim, to explain to userland the failure @@ -343,22 +329,19 @@ _GLOBAL(__tm_recheckpoint) */ subi r7, r7, STACK_FRAME_OVERHEAD + /* We need to setup MSR for FP/VMX/VSX register save instructions. */ mfmsr r6 - /* R4 = original MSR to indicate whether thread used FP/Vector etc. */ - - /* Enable FP/vec in MSR if necessary! */ - lis r5, MSR_VEC@h + mr r5, r6 ori r5, r5, MSR_FP - and. r5, r4, r5 - beq restore_gprs /* if neither, skip both */ - +#ifdef CONFIG_ALTIVEC + oris r5, r5, MSR_VEC@h +#endif #ifdef CONFIG_VSX BEGIN_FTR_SECTION - oris r5, r5, MSR_VSX@h + oris r5,r5, MSR_VSX@h END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif - or r5, r6, r5 /* Set MSR.FP+.VSX/.VEC */ - mtmsr r5 + mtmsrd r5 #ifdef CONFIG_ALTIVEC /* @@ -367,28 +350,20 @@ _GLOBAL(__tm_recheckpoint) * thread.fp_state[] version holds the 'live' (transactional) * and will be loaded subsequently by any FPUnavailable trap. */ - andis. r0, r4, MSR_VEC@h - beq dont_restore_vec - addi r8, r3, THREAD_CKVRSTATE li r5, VRSTATE_VSCR lvx v0, r8, r5 mtvscr v0 REST_32VRS(0, r5, r8) /* r5 scratch, r8 ptr */ -dont_restore_vec: ld r5, THREAD_CKVRSAVE(r3) mtspr SPRN_VRSAVE, r5 #endif - andi. r0, r4, MSR_FP - beq dont_restore_fp - addi r8, r3, THREAD_CKFPSTATE lfd fr0, FPSTATE_FPSCR(r8) MTFSF_L(fr0) REST_32FPRS_VSRS(0, R4, R8) -dont_restore_fp: mtmsr r6 /* FP/Vec off again! 
*/ restore_gprs: diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index b4e2b7165f79..3f3e81852422 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -110,9 +110,9 @@ ftrace_call: /* NIP has not been altered, skip over further checks */ beq 1f - /* Check if there is an active kprobe on us */ + /* Check if there is an active jprobe on us */ subi r3, r14, 4 - bl is_current_kprobe_addr + bl __is_active_jprobe nop /* diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 13c9dcdcba69..f3eb61be0d30 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -37,6 +37,7 @@ #include <linux/kdebug.h> #include <linux/ratelimit.h> #include <linux/context_tracking.h> +#include <linux/smp.h> #include <asm/emulated_ops.h> #include <asm/pgtable.h> @@ -699,6 +700,187 @@ void SMIException(struct pt_regs *regs) die("System Management Interrupt", regs, SIGABRT); } +#ifdef CONFIG_VSX +static void p9_hmi_special_emu(struct pt_regs *regs) +{ + unsigned int ra, rb, t, i, sel, instr, rc; + const void __user *addr; + u8 vbuf[16], *vdst; + unsigned long ea, msr, msr_mask; + bool swap; + + if (__get_user_inatomic(instr, (unsigned int __user *)regs->nip)) + return; + + /* + * lxvb16x opcode: 0x7c0006d8 + * lxvd2x opcode: 0x7c000698 + * lxvh8x opcode: 0x7c000658 + * lxvw4x opcode: 0x7c000618 + */ + if ((instr & 0xfc00073e) != 0x7c000618) { + pr_devel("HMI vec emu: not vector CI %i:%s[%d] nip=%016lx" " instr=%08x\n", smp_processor_id(), current->comm, current->pid, regs->nip, instr); + return; + } + + /* Grab vector registers into the task struct */ + msr = regs->msr; /* Grab msr before we flush the bits */ + flush_vsx_to_thread(current); + enable_kernel_altivec(); + + /* + * Is userspace running with a different endian (this is rare but + * not impossible) + */ + swap = (msr & MSR_LE) != (MSR_KERNEL & MSR_LE); + + /* Decode the instruction */ + ra = (instr >> 16) & 0x1f; + rb = (instr >> 11) & 0x1f; + t = (instr >> 21) & 0x1f; + if (instr & 1) + vdst = (u8 *)&current->thread.vr_state.vr[t]; + else + vdst = (u8 *)&current->thread.fp_state.fpr[t][0]; + + /* Grab the vector address */ + ea = regs->gpr[rb] + (ra ? regs->gpr[ra] : 0); + if (is_32bit_task()) + ea &= 0xfffffffful; + addr = (__force const void __user *)ea; + + /* Check it */ + if (!access_ok(VERIFY_READ, addr, 16)) { + pr_devel("HMI vec emu: bad access %i:%s[%d] nip=%016lx" " instr=%08x addr=%016lx\n", smp_processor_id(), current->comm, current->pid, regs->nip, instr, (unsigned long)addr); + return; + } + + /* Read the vector */ + rc = 0; + if ((unsigned long)addr & 0xfUL) + /* unaligned case */ + rc = __copy_from_user_inatomic(vbuf, addr, 16); + else + __get_user_atomic_128_aligned(vbuf, addr, rc); + if (rc) { + pr_devel("HMI vec emu: page fault %i:%s[%d] nip=%016lx" " instr=%08x addr=%016lx\n", smp_processor_id(), current->comm, current->pid, regs->nip, instr, (unsigned long)addr); + return; + } + + pr_devel("HMI vec emu: emulated vector CI %i:%s[%d] nip=%016lx" " instr=%08x addr=%016lx\n", smp_processor_id(), current->comm, current->pid, regs->nip, instr, (unsigned long) addr); + + /* Grab instruction "selector" */ + sel = (instr >> 6) & 3; + + /* + * Check to make sure the facility is actually enabled. This + * could happen if we get a false positive hit.
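+	 * (A quick check against the encodings listed at the top of
+	 * this function: the four opcodes are 0x7c000618 | (sel << 6)
+	 * for sel = 0..3, i.e. lxvw4x, lxvh8x, lxvd2x and lxvb16x,
+	 * which is why one compare under the 0xfc00073e mask above
+	 * recognises all four.)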
+ * + * lxvd2x/lxvw4x always check MSR VSX sel = 0,2 + * lxvh8x/lxvb16x check MSR VSX or VEC depending on VSR used sel = 1,3 + */ + msr_mask = MSR_VSX; + if ((sel & 1) && (instr & 1)) /* lxvh8x & lxvb16x + VSR >= 32 */ + msr_mask = MSR_VEC; + if (!(msr & msr_mask)) { + pr_devel("HMI vec emu: MSR fac clear %i:%s[%d] nip=%016lx" + " instr=%08x msr:%016lx\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr, msr); + return; + } + + /* Do logging here before we modify sel based on endian */ + switch (sel) { + case 0: /* lxvw4x */ + PPC_WARN_EMULATED(lxvw4x, regs); + break; + case 1: /* lxvh8x */ + PPC_WARN_EMULATED(lxvh8x, regs); + break; + case 2: /* lxvd2x */ + PPC_WARN_EMULATED(lxvd2x, regs); + break; + case 3: /* lxvb16x */ + PPC_WARN_EMULATED(lxvb16x, regs); + break; + } + +#ifdef __LITTLE_ENDIAN__ + /* + * An LE kernel stores the vector in the task struct as an LE + * byte array (effectively swapping both the components and + * the content of the components). Those instructions expect + * the components to remain in ascending address order, so we + * swap them back. + * + * If we are running a BE user space, the expectation is that + * of a simple memcpy, so forcing the emulation to look like + * a lxvb16x should do the trick. + */ + if (swap) + sel = 3; + + switch (sel) { + case 0: /* lxvw4x */ + for (i = 0; i < 4; i++) + ((u32 *)vdst)[i] = ((u32 *)vbuf)[3-i]; + break; + case 1: /* lxvh8x */ + for (i = 0; i < 8; i++) + ((u16 *)vdst)[i] = ((u16 *)vbuf)[7-i]; + break; + case 2: /* lxvd2x */ + for (i = 0; i < 2; i++) + ((u64 *)vdst)[i] = ((u64 *)vbuf)[1-i]; + break; + case 3: /* lxvb16x */ + for (i = 0; i < 16; i++) + vdst[i] = vbuf[15-i]; + break; + } +#else /* __LITTLE_ENDIAN__ */ + /* On a big endian kernel, a BE userspace only needs a memcpy */ + if (!swap) + sel = 3; + + /* Otherwise, we need to swap the content of the components */ + switch (sel) { + case 0: /* lxvw4x */ + for (i = 0; i < 4; i++) + ((u32 *)vdst)[i] = cpu_to_le32(((u32 *)vbuf)[i]); + break; + case 1: /* lxvh8x */ + for (i = 0; i < 8; i++) + ((u16 *)vdst)[i] = cpu_to_le16(((u16 *)vbuf)[i]); + break; + case 2: /* lxvd2x */ + for (i = 0; i < 2; i++) + ((u64 *)vdst)[i] = cpu_to_le64(((u64 *)vbuf)[i]); + break; + case 3: /* lxvb16x */ + memcpy(vdst, vbuf, 16); + break; + } +#endif /* !__LITTLE_ENDIAN__ */ + + /* Go to next instruction */ + regs->nip += 4; +} +#endif /* CONFIG_VSX */ + void handle_hmi_exception(struct pt_regs *regs) { struct pt_regs *old_regs; @@ -706,6 +888,21 @@ void handle_hmi_exception(struct pt_regs *regs) old_regs = set_irq_regs(regs); irq_enter(); +#ifdef CONFIG_VSX + /* Real mode flagged P9 special emu is needed */ + if (local_paca->hmi_p9_special_emu) { + local_paca->hmi_p9_special_emu = 0; + + /* + * We don't want to take page faults while doing the + * emulation, we just replay the instruction if necessary. + */ + pagefault_disable(); + p9_hmi_special_emu(regs); + pagefault_enable(); + } +#endif /* CONFIG_VSX */ + if (ppc_md.handle_hmi_exception) ppc_md.handle_hmi_exception(regs); @@ -1140,13 +1337,8 @@ void program_check_exception(struct pt_regs *regs) * - A treclaim is attempted when non transactional. * - A tend is illegally attempted. * - writing a TM SPR when transactional. - */ - if (!user_mode(regs) && - report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) { - regs->nip += 4; - goto bail; - } - /* If usermode caused this, it's done something illegal and + * + * If usermode caused this, it's done something illegal and * gets a SIGILL slap on the wrist. 
We call it an illegal * operand to distinguish from the instruction just being bad * (e.g. executing a 'tend' on a CPU without TM!); it's an @@ -1487,7 +1679,7 @@ void fp_unavailable_tm(struct pt_regs *regs) /* Reclaim didn't save out any FPRs to transact_fprs. */ /* Enable FP for the task: */ - regs->msr |= (MSR_FP | current->thread.fpexc_mode); + current->thread.load_fp = 1; /* This loads and recheckpoints the FP registers from * thread.fpr[]. They will remain in registers after the @@ -1495,15 +1687,7 @@ * If VMX is in use, the VRs now hold checkpointed values, * so we don't want to load the VRs from the thread_struct. */ - tm_recheckpoint(&current->thread, MSR_FP); - - /* If VMX is in use, get the transactional values back */ - if (regs->msr & MSR_VEC) { - msr_check_and_set(MSR_VEC); - load_vr_state(&current->thread.vr_state); - /* At this point all the VSX state is loaded, so enable it */ - regs->msr |= MSR_VSX; - } + tm_recheckpoint(&current->thread); } void altivec_unavailable_tm(struct pt_regs *regs) @@ -1516,21 +1700,13 @@ "MSR=%lx\n", regs->nip, regs->msr); tm_reclaim_current(TM_CAUSE_FAC_UNAV); - regs->msr |= MSR_VEC; - tm_recheckpoint(&current->thread, MSR_VEC); + current->thread.load_vec = 1; + tm_recheckpoint(&current->thread); current->thread.used_vr = 1; - - if (regs->msr & MSR_FP) { - msr_check_and_set(MSR_FP); - load_fp_state(&current->thread.fp_state); - regs->msr |= MSR_VSX; - } } void vsx_unavailable_tm(struct pt_regs *regs) { - unsigned long orig_msr = regs->msr; - /* See the comments in fp_unavailable_tm(). This works similarly, * though we're loading both FP and VEC registers in here. * @@ -1544,29 +1720,13 @@ void vsx_unavailable_tm(struct pt_regs *regs) current->thread.used_vsr = 1; - /* If FP and VMX are already loaded, we have all the state we need */ - if ((orig_msr & (MSR_FP | MSR_VEC)) == (MSR_FP | MSR_VEC)) { - regs->msr |= MSR_VSX; - return; - } - /* This reclaims FP and/or VR regs if they're already enabled */ tm_reclaim_current(TM_CAUSE_FAC_UNAV); - regs->msr |= MSR_VEC | MSR_FP | current->thread.fpexc_mode | - MSR_VSX; - - /* This loads & recheckpoints FP and VRs; but we have - * to be sure not to overwrite previously-valid state.
- */ - tm_recheckpoint(&current->thread, regs->msr & ~orig_msr); - - msr_check_and_set(orig_msr & (MSR_FP | MSR_VEC)); + current->thread.load_vec = 1; + current->thread.load_fp = 1; - if (orig_msr & MSR_FP) - load_fp_state(&current->thread.fp_state); - if (orig_msr & MSR_VEC) - load_vr_state(&current->thread.vr_state); + tm_recheckpoint(&current->thread); } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ @@ -1924,6 +2084,10 @@ struct ppc_emulated ppc_emulated = { WARN_EMULATED_SETUP(mfdscr), WARN_EMULATED_SETUP(mtdscr), WARN_EMULATED_SETUP(lq_stq), + WARN_EMULATED_SETUP(lxvw4x), + WARN_EMULATED_SETUP(lxvh8x), + WARN_EMULATED_SETUP(lxvd2x), + WARN_EMULATED_SETUP(lxvb16x), #endif }; diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 2f6eadd9408d..1fb9379dc683 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -97,8 +97,7 @@ static void wd_lockup_ipi(struct pt_regs *regs) else dump_stack(); - if (hardlockup_panic) - nmi_panic(regs, "Hard LOCKUP"); + /* Do not panic from here because that can recurse into NMI IPI layer */ } static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb) @@ -134,15 +133,18 @@ static void watchdog_smp_panic(int cpu, u64 tb) pr_emerg("Watchdog CPU:%d detected Hard LOCKUP other CPUS:%*pbl\n", cpu, cpumask_pr_args(&wd_smp_cpus_pending)); - /* - * Try to trigger the stuck CPUs. - */ - for_each_cpu(c, &wd_smp_cpus_pending) { - if (c == cpu) - continue; - smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); + if (!sysctl_hardlockup_all_cpu_backtrace) { + /* + * Try to trigger the stuck CPUs, unless we are going to + * get a backtrace on all of them anyway. + */ + for_each_cpu(c, &wd_smp_cpus_pending) { + if (c == cpu) + continue; + smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); + } + smp_flush_nmi_ipi(1000000); } - smp_flush_nmi_ipi(1000000); /* Take the stuck CPUs out of the watch group */ set_cpumask_stuck(&wd_smp_cpus_pending, tb); @@ -275,9 +277,12 @@ void arch_touch_nmi_watchdog(void) { unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000; int cpu = smp_processor_id(); + u64 tb = get_tb(); - if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks) - watchdog_timer_interrupt(cpu); + if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) { + per_cpu(wd_timer_tb, cpu) = tb; + wd_smp_clear_cpu_pending(cpu, tb); + } } EXPORT_SYMBOL(arch_touch_nmi_watchdog); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 73bf1ebfa78f..31a362669fea 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -47,6 +47,7 @@ #include <asm/reg.h> #include <asm/ppc-opcode.h> +#include <asm/asm-prototypes.h> #include <asm/disassemble.h> #include <asm/cputable.h> #include <asm/cacheflush.h> @@ -1089,9 +1090,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->stat.ext_intr_exits++; r = RESUME_GUEST; break; - /* HMI is hypervisor interrupt and host has handled it. Resume guest.*/ + /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/ case BOOK3S_INTERRUPT_HMI: case BOOK3S_INTERRUPT_PERFMON: + case BOOK3S_INTERRUPT_SYSTEM_RESET: r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_MACHINE_CHECK: @@ -2117,15 +2119,6 @@ static int kvmppc_grab_hwthread(int cpu) struct paca_struct *tpaca; long timeout = 10000; - /* - * ISA v3.0 idle routines do not set hwthread_state or test - * hwthread_req, so they can not grab idle threads.
- */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - WARN(1, "KVM: can not control sibling threads\n"); - return -EBUSY; - } - tpaca = &paca[cpu]; /* Ensure the thread won't go into the kernel if it wakes */ @@ -2160,12 +2153,10 @@ static void kvmppc_release_hwthread(int cpu) struct paca_struct *tpaca; tpaca = &paca[cpu]; + tpaca->kvm_hstate.hwthread_req = 0; tpaca->kvm_hstate.kvm_vcpu = NULL; tpaca->kvm_hstate.kvm_vcore = NULL; tpaca->kvm_hstate.kvm_split_mode = NULL; - if (!cpu_has_feature(CPU_FTR_ARCH_300)) - tpaca->kvm_hstate.hwthread_req = 0; - } static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) @@ -2615,6 +2606,9 @@ static void set_irq_happened(int trap) case BOOK3S_INTERRUPT_HMI: local_paca->irq_happened |= PACA_IRQ_HMI; break; + case BOOK3S_INTERRUPT_SYSTEM_RESET: + replay_system_reset(); + break; } } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 17936f82d3c7..fb02209ff782 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -149,11 +149,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) subf r4, r4, r3 mtspr SPRN_DEC, r4 -BEGIN_FTR_SECTION /* hwthread_req may have got set by cede or no vcpu, so clear it */ li r0, 0 stb r0, HSTATE_HWTHREAD_REQ(r13) -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) /* * For external interrupts we need to call the Linux @@ -316,7 +314,6 @@ kvm_novcpu_exit: * Relocation is off and most register values are lost. * r13 points to the PACA. * r3 contains the SRR1 wakeup value, SRR1 is trashed. - * This is not used by ISAv3.0B processors. */ .globl kvm_start_guest kvm_start_guest: @@ -435,9 +432,6 @@ kvm_secondary_got_guest: * While waiting we also need to check if we get given a vcpu to run. */ kvm_no_guest: -BEGIN_FTR_SECTION - twi 31,0,0 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) lbz r3, HSTATE_HWTHREAD_REQ(r13) cmpwi r3, 0 bne 53f @@ -2531,10 +2525,8 @@ kvm_do_nap: clrrdi r0, r0, 1 mtspr SPRN_CTRLT, r0 -BEGIN_FTR_SECTION li r0,1 stb r0,HSTATE_HWTHREAD_REQ(r13) -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) mfspr r5,SPRN_LPCR ori r5,r5,LPCR_PECE0 | LPCR_PECE1 BEGIN_FTR_SECTION diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 3480faaf1ef8..a3746b98ec11 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -644,8 +644,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; #endif case KVM_CAP_PPC_HTM: - r = cpu_has_feature(CPU_FTR_TM_COMP) && - is_kvmppc_hv_enabled(kvm); + r = is_kvmppc_hv_enabled(kvm) && + (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_HTM_COMP); break; default: r = 0; diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index f208f560aecd..8c3955e183d4 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -31,6 +31,8 @@ extern char system_call_common[]; #define XER_SO 0x80000000U #define XER_OV 0x40000000U #define XER_CA 0x20000000U +#define XER_OV32 0x00080000U +#define XER_CA32 0x00040000U #ifdef CONFIG_PPC_FPU /* @@ -962,6 +964,16 @@ static nokprobe_inline void set_cr0(const struct pt_regs *regs, op->ccval |= 0x20000000; } +static nokprobe_inline void set_ca32(struct instruction_op *op, bool val) +{ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (val) + op->xerval |= XER_CA32; + else + op->xerval &= ~XER_CA32; + } +} + static nokprobe_inline void add_with_carry(const struct pt_regs *regs, struct instruction_op *op, int rd, unsigned long val1, unsigned long val2, @@ -985,6 +997,9 @@ static nokprobe_inline void add_with_carry(const 
struct pt_regs *regs, op->xerval |= XER_CA; else op->xerval &= ~XER_CA; + + set_ca32(op, (unsigned int)val < (unsigned int)val1 || + (carry_in && (unsigned int)val == (unsigned int)val1)); } static nokprobe_inline void do_cmp_signed(const struct pt_regs *regs, @@ -1791,6 +1806,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->xerval |= XER_CA; else op->xerval &= ~XER_CA; + set_ca32(op, op->xerval & XER_CA); goto logical_done; case 824: /* srawi */ @@ -1803,6 +1819,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->xerval |= XER_CA; else op->xerval &= ~XER_CA; + set_ca32(op, op->xerval & XER_CA); goto logical_done; #ifdef __powerpc64__ @@ -1832,6 +1849,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->xerval |= XER_CA; else op->xerval &= ~XER_CA; + set_ca32(op, op->xerval & XER_CA); goto logical_done; case 826: /* sradi with sh_5 = 0 */ @@ -1845,6 +1863,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->xerval |= XER_CA; else op->xerval &= ~XER_CA; + set_ca32(op, op->xerval & XER_CA); goto logical_done; #endif /* __powerpc64__ */ diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index fb844d2f266e..915178423c3a 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -14,11 +14,11 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o -obj-$(CONFIG_PPC_STD_MMU_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o +obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(BITS).o -ifeq ($(CONFIG_PPC_STD_MMU_64),y) +ifeq ($(CONFIG_PPC_BOOK3S_64),y) obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o endif @@ -31,7 +31,7 @@ obj-$(CONFIG_PPC_SPLPAR) += vphn.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-y += hugetlbpage.o ifeq ($(CONFIG_HUGETLB_PAGE),y) -obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o +obj-$(CONFIG_PPC_BOOK3S_64) += hugetlbpage-hash64.o obj-$(CONFIG_PPC_RADIX_MMU) += hugetlbpage-radix.o obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o endif diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c index 5c4c93dcff19..14cfb11b09d0 100644 --- a/arch/powerpc/mm/dump_hashpagetable.c +++ b/arch/powerpc/mm/dump_hashpagetable.c @@ -500,7 +500,7 @@ static void populate_markers(void) address_markers[6].start_address = PHB_IO_END; address_markers[7].start_address = IOREMAP_BASE; address_markers[8].start_address = IOREMAP_END; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 address_markers[9].start_address = H_VMEMMAP_BASE; #else address_markers[9].start_address = VMEMMAP_BASE; diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c index c9282d27b203..c2e7dea59490 100644 --- a/arch/powerpc/mm/dump_linuxpagetables.c +++ b/arch/powerpc/mm/dump_linuxpagetables.c @@ -112,7 +112,7 @@ struct flag_info { static const struct flag_info flag_array[] = { { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 .mask = _PAGE_PRIVILEGED, .val = 0, #else @@ -147,7 
+147,7 @@ static const struct flag_info flag_array[] = { .set = "present", .clear = " ", }, { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 .mask = H_PAGE_HASHPTE, .val = H_PAGE_HASHPTE, #else @@ -157,7 +157,7 @@ static const struct flag_info flag_array[] = { .set = "hpte", .clear = " ", }, { -#ifndef CONFIG_PPC_STD_MMU_64 +#ifndef CONFIG_PPC_BOOK3S_64 .mask = _PAGE_GUARDED, .val = _PAGE_GUARDED, .set = "guarded", @@ -174,7 +174,7 @@ static const struct flag_info flag_array[] = { .set = "accessed", .clear = " ", }, { -#ifndef CONFIG_PPC_STD_MMU_64 +#ifndef CONFIG_PPC_BOOK3S_64 .mask = _PAGE_WRITETHRU, .val = _PAGE_WRITETHRU, .set = "write through", @@ -450,7 +450,7 @@ static void populate_markers(void) address_markers[i++].start_address = PHB_IO_END; address_markers[i++].start_address = IOREMAP_BASE; address_markers[i++].start_address = IOREMAP_END; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 address_markers[i++].start_address = H_VMEMMAP_BASE; #else address_markers[i++].start_address = VMEMMAP_BASE; diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 67ec2e927253..655a5a9a183d 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -21,6 +21,7 @@ #undef DEBUG #undef DEBUG_LOW +#define pr_fmt(fmt) "hash-mmu: " fmt #include <linux/spinlock.h> #include <linux/errno.h> #include <linux/sched/mm.h> diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 588a521966ec..a07722531b32 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -68,11 +68,11 @@ #include "mmu_decl.h" -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 #if H_PGTABLE_RANGE > USER_VSID_RANGE #warning Limited user VSID range means pagetable space is wasted #endif -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ phys_addr_t memstart_addr = ~0; EXPORT_SYMBOL_GPL(memstart_addr); @@ -367,11 +367,20 @@ EXPORT_SYMBOL_GPL(realmode_pfn_to_page); #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -#ifdef CONFIG_PPC_STD_MMU_64 -static bool disable_radix; +#ifdef CONFIG_PPC_BOOK3S_64 +static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); + static int __init parse_disable_radix(char *p) { - disable_radix = true; + bool val; + + if (strlen(p) == 0) + val = true; + else if (kstrtobool(p, &val)) + return -EINVAL; + + disable_radix = val; + return 0; } early_param("disable_radix", parse_disable_radix); @@ -444,4 +453,4 @@ void __init mmu_early_init_devtree(void) else hash__early_init_devtree(); } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c index 0f613bc63c50..d60a62bf4fc7 100644 --- a/arch/powerpc/mm/mmu_context.c +++ b/arch/powerpc/mm/mmu_context.c @@ -34,15 +34,6 @@ static inline void switch_mm_pgdir(struct task_struct *tsk, struct mm_struct *mm) { } #endif -#ifdef CONFIG_PPC_BOOK3S_64 -static inline void inc_mm_active_cpus(struct mm_struct *mm) -{ - atomic_inc(&mm->context.active_cpus); -} -#else -static inline void inc_mm_active_cpus(struct mm_struct *mm) { } -#endif - void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 05e15386d4cb..6d724dab27c2 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -216,19 +216,34 @@ void destroy_context(struct mm_struct *mm) #ifdef CONFIG_SPAPR_TCE_IOMMU 
WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); #endif + if (radix_enabled()) + WARN_ON(process_tb[mm->context.id].prtb0 != 0); + else + subpage_prot_free(mm); + destroy_pagetable_page(mm); + __destroy_context(mm->context.id); + mm->context.id = MMU_NO_CONTEXT; +} + +void arch_exit_mmap(struct mm_struct *mm) +{ if (radix_enabled()) { /* * Radix doesn't have a valid bit in the process table * entries. However we know that at least P9 implementation * will avoid caching an entry with an invalid RTS field, * and 0 is invalid. So this will do. + * + * This runs before the "fullmm" tlb flush in exit_mmap, + * which does a RIC=2 tlbie to clear the process table + * entry. See the "fullmm" comments in tlb-radix.c. + * + * No barrier required here after the store because + * this process will do the invalidate, which starts with + * ptesync. */ process_tb[mm->context.id].prtb0 = 0; - } else - subpage_prot_free(mm); - destroy_pagetable_page(mm); - __destroy_context(mm->context.id); - mm->context.id = MMU_NO_CONTEXT; + } } #ifdef CONFIG_PPC_RADIX_MMU diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index a51df9ef529d..eb604b3574fa 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1148,11 +1148,33 @@ struct topology_update_data { int new_nid; }; +#define TOPOLOGY_DEF_TIMER_SECS 60 + static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS]; static cpumask_t cpu_associativity_changes_mask; static int vphn_enabled; static int prrn_enabled; static void reset_topology_timer(void); +static int topology_timer_secs = 1; +static int topology_inited; +static int topology_update_needed; + +/* + * Change polling interval for associativity changes. + */ +int timed_topology_update(int nsecs) +{ + if (vphn_enabled) { + if (nsecs > 0) + topology_timer_secs = nsecs; + else + topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS; + + reset_topology_timer(); + } + + return 0; +} /* * Store the current values of the associativity change counters in the @@ -1246,6 +1268,11 @@ static long vphn_get_associativity(unsigned long cpu, "hcall_vphn() experienced a hardware fault " "preventing VPHN. Disabling polling...\n"); stop_topology_update(); + break; + case H_SUCCESS: + dbg("VPHN hcall succeeded. Reset polling...\n"); + timed_topology_update(0); + break; } return rc; @@ -1323,8 +1350,11 @@ int numa_update_cpu_topology(bool cpus_locked) struct device *dev; int weight, new_nid, i = 0; - if (!prrn_enabled && !vphn_enabled) + if (!prrn_enabled && !vphn_enabled) { + if (!topology_inited) + topology_update_needed = 1; return 0; + } weight = cpumask_weight(&cpu_associativity_changes_mask); if (!weight) @@ -1363,22 +1393,30 @@ int numa_update_cpu_topology(bool cpus_locked) cpumask_andnot(&cpu_associativity_changes_mask, &cpu_associativity_changes_mask, cpu_sibling_mask(cpu)); + dbg("Assoc chg gives same node %d for cpu%d\n", + new_nid, cpu); cpu = cpu_last_thread_sibling(cpu); continue; } for_each_cpu(sibling, cpu_sibling_mask(cpu)) { ud = &updates[i++]; + ud->next = &updates[i]; ud->cpu = sibling; ud->new_nid = new_nid; ud->old_nid = numa_cpu_lookup_table[sibling]; cpumask_set_cpu(sibling, &updated_cpus); - if (i < weight) - ud->next = &updates[i]; } cpu = cpu_last_thread_sibling(cpu); } + /* + * Prevent processing of 'updates' from overflowing array + * where last entry filled in a 'next' pointer. 
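+	 * (Each entry's 'next' pointer is set as the entry is filled
+	 * in, so the last used entry still points one slot past itself;
+	 * writing NULL into updates[i-1].next bounds the traversal to
+	 * the i entries actually populated.)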
+ */ + if (i) + updates[i-1].next = NULL; + pr_debug("Topology update for the following CPUs:\n"); if (cpumask_weight(&updated_cpus)) { for (ud = &updates[0]; ud; ud = ud->next) { @@ -1433,6 +1471,7 @@ int numa_update_cpu_topology(bool cpus_locked) out: kfree(updates); + topology_update_needed = 0; return changed; } @@ -1468,7 +1507,7 @@ static struct timer_list topology_timer = static void reset_topology_timer(void) { topology_timer.data = 0; - topology_timer.expires = jiffies + 60 * HZ; + topology_timer.expires = jiffies + topology_timer_secs * HZ; mod_timer(&topology_timer, topology_timer.expires); } @@ -1518,15 +1557,14 @@ int start_topology_update(void) if (firmware_has_feature(FW_FEATURE_PRRN)) { if (!prrn_enabled) { prrn_enabled = 1; - vphn_enabled = 0; #ifdef CONFIG_SMP rc = of_reconfig_notifier_register(&dt_update_nb); #endif } - } else if (firmware_has_feature(FW_FEATURE_VPHN) && + } + if (firmware_has_feature(FW_FEATURE_VPHN) && lppaca_shared_proc(get_lppaca())) { if (!vphn_enabled) { - prrn_enabled = 0; vphn_enabled = 1; setup_cpu_associativity_change_counters(); init_timer_deferrable(&topology_timer); @@ -1549,7 +1587,8 @@ int stop_topology_update(void) #ifdef CONFIG_SMP rc = of_reconfig_notifier_unregister(&dt_update_nb); #endif - } else if (vphn_enabled) { + } + if (vphn_enabled) { vphn_enabled = 0; rc = del_timer_sync(&topology_timer); } @@ -1612,9 +1651,17 @@ static int topology_update_init(void) if (topology_updates_enabled) start_topology_update(); + if (vphn_enabled) + topology_schedule_update(); + if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops)) return -ENOMEM; + topology_inited = 1; + if (topology_update_needed) + bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask), + nr_cpumask_bits); + return 0; } device_initcall(topology_update_init); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index ac0717a90ca6..2851282edcd4 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -57,7 +57,7 @@ #include "mmu_decl.h" -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 #if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT)) #error TASK_SIZE_USER64 exceeds user VSID range #endif diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 906a86fe457b..ed60ad861dfa 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -309,10 +309,6 @@ slb_compare_rr_to_size: srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */ rldimi r10,r9,ESID_BITS_1T,0 ASM_VSID_SCRAMBLE(r10,r9,r11,1T) - /* - * bits above VSID_BITS_1T need to be ignored from r10 - * also combine VSID and flags - */ li r10,MMU_SEGSIZE_1T rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index d304028641a2..63e277b6e60c 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -144,7 +144,7 @@ void radix__local_flush_tlb_mm(struct mm_struct *mm) EXPORT_SYMBOL(radix__local_flush_tlb_mm); #ifndef CONFIG_SMP -static void radix__local_flush_all_mm(struct mm_struct *mm) +void radix__local_flush_all_mm(struct mm_struct *mm) { unsigned long pid; @@ -154,6 +154,7 @@ static void radix__local_flush_all_mm(struct mm_struct *mm) _tlbiel_pid(pid, RIC_FLUSH_ALL); preempt_enable(); } +EXPORT_SYMBOL(radix__local_flush_all_mm); #endif /* CONFIG_SMP */ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, @@ -163,7 +164,7 @@ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned 
long vmadd unsigned long ap = mmu_get_ap(psize); preempt_disable(); - pid = mm ? mm->context.id : 0; + pid = mm->context.id; if (pid != MMU_NO_CONTEXT) _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB); preempt_enable(); @@ -173,11 +174,10 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd { #ifdef CONFIG_HUGETLB_PAGE /* need the return fix for nohash.c */ - if (vma && is_vm_hugetlb_page(vma)) - return __local_flush_hugetlb_page(vma, vmaddr); + if (is_vm_hugetlb_page(vma)) + return radix__local_flush_hugetlb_page(vma, vmaddr); #endif - radix__local_flush_tlb_page_psize(vma ? vma->vm_mm : NULL, vmaddr, - mmu_virtual_psize); + radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); } EXPORT_SYMBOL(radix__local_flush_tlb_page); @@ -186,36 +186,35 @@ void radix__flush_tlb_mm(struct mm_struct *mm) { unsigned long pid; - preempt_disable(); pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) - goto no_context; + return; + preempt_disable(); if (!mm_is_thread_local(mm)) _tlbie_pid(pid, RIC_FLUSH_TLB); else _tlbiel_pid(pid, RIC_FLUSH_TLB); -no_context: preempt_enable(); } EXPORT_SYMBOL(radix__flush_tlb_mm); -static void radix__flush_all_mm(struct mm_struct *mm) +void radix__flush_all_mm(struct mm_struct *mm) { unsigned long pid; - preempt_disable(); pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) - goto no_context; + return; + preempt_disable(); if (!mm_is_thread_local(mm)) _tlbie_pid(pid, RIC_FLUSH_ALL); else _tlbiel_pid(pid, RIC_FLUSH_ALL); -no_context: preempt_enable(); } +EXPORT_SYMBOL(radix__flush_all_mm); void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) { @@ -229,26 +228,25 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, unsigned long pid; unsigned long ap = mmu_get_ap(psize); - preempt_disable(); - pid = mm ? mm->context.id : 0; + pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) - goto bail; + return; + + preempt_disable(); if (!mm_is_thread_local(mm)) _tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB); else _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB); -bail: preempt_enable(); } void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { #ifdef CONFIG_HUGETLB_PAGE - if (vma && is_vm_hugetlb_page(vma)) - return flush_hugetlb_page(vma, vmaddr); + if (is_vm_hugetlb_page(vma)) + return radix__flush_hugetlb_page(vma, vmaddr); #endif - radix__flush_tlb_page_psize(vma ? vma->vm_mm : NULL, vmaddr, - mmu_virtual_psize); + radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); } EXPORT_SYMBOL(radix__flush_tlb_page); @@ -300,10 +298,14 @@ void radix__tlb_flush(struct mmu_gather *tlb) psize = radix_get_mmu_psize(page_size); /* * if page size is not something we understand, do a full mm flush + * + * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush + * that flushes the process table entry cache upon process teardown. + * See the comment for radix in arch_exit_mmap(). */ if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all) radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize); - else if (tlb->need_flush_all) { + else if (tlb->fullmm || tlb->need_flush_all) { tlb->need_flush_all = 0; radix__flush_all_mm(mm); } else @@ -322,55 +324,53 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, { unsigned long pid; unsigned long addr; - int local = mm_is_thread_local(mm); + bool local; unsigned long ap = mmu_get_ap(psize); unsigned long page_size = 1UL << mmu_psize_defs[psize].shift; - - preempt_disable(); - pid = mm ? 
mm->context.id : 0;
+	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
-		goto err_out;
+		return;
 
+	preempt_disable();
+	local = mm_is_thread_local(mm);
 	if (end == TLB_FLUSH_ALL ||
 	    (end - start) > tlb_single_page_flush_ceiling * page_size) {
 		if (local)
 			_tlbiel_pid(pid, RIC_FLUSH_TLB);
 		else
 			_tlbie_pid(pid, RIC_FLUSH_TLB);
-		goto err_out;
-	}
-	for (addr = start; addr < end; addr += page_size) {
+	} else {
+		for (addr = start; addr < end; addr += page_size) {
 
-		if (local)
-			_tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
-		else
-			_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+			if (local)
+				_tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+			else
+				_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+		}
 	}
-err_out:
 	preempt_enable();
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 {
-	int local = mm_is_thread_local(mm);
 	unsigned long ap = mmu_get_ap(mmu_virtual_psize);
 	unsigned long pid, end;
+	bool local;
 
-
-	pid = mm ? mm->context.id : 0;
-	preempt_disable();
+	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
-		goto no_context;
+		return;
 
 	/* 4k page size, just blow the world */
 	if (PAGE_SIZE == 0x1000) {
 		radix__flush_all_mm(mm);
-		preempt_enable();
 		return;
 	}
 
+	preempt_disable();
+	local = mm_is_thread_local(mm);
 	/* Otherwise first do the PWC */
 	if (local)
 		_tlbiel_pid(pid, RIC_FLUSH_PWC);
@@ -385,7 +385,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 	else
 		_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
 	}
-no_context:
+
 	preempt_enable();
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/net/bpf_jit64.h b/arch/powerpc/net/bpf_jit64.h
index 62fa7589db2b..8bdef7ed28a8 100644
--- a/arch/powerpc/net/bpf_jit64.h
+++ b/arch/powerpc/net/bpf_jit64.h
@@ -23,7 +23,7 @@
  *		[   nv gpr save area    ] 8*8		|
  *		[    tail_call_cnt      ] 8		|
  *		[    local_tmp_var      ] 8		|
- * fp (r31) -->	[   ebpf stack space    ] 512		|
+ * fp (r31) -->	[   ebpf stack space    ] up to 512	|
  *		[     frame header      ] 32/112	|
  * sp (r1) --->	[    stack pointer      ] --------------
  */
@@ -32,8 +32,8 @@
 #define BPF_PPC_STACK_SAVE	(8*8)
 /* for bpf JIT code internal usage */
 #define BPF_PPC_STACK_LOCALS	16
-/* Ensure this is quadword aligned */
-#define BPF_PPC_STACKFRAME	(STACK_FRAME_MIN_SIZE + MAX_BPF_STACK + \
+/* stack frame excluding BPF stack, ensure this is quadword aligned */
+#define BPF_PPC_STACKFRAME	(STACK_FRAME_MIN_SIZE + \
 				 BPF_PPC_STACK_LOCALS + BPF_PPC_STACK_SAVE)
 
 #ifndef __ASSEMBLY__
@@ -103,6 +103,7 @@ struct codegen_context {
 	 */
 	unsigned int seen;
 	unsigned int idx;
+	unsigned int stack_size;
 };
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index a66e64b0b251..46d74e81aff1 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -69,7 +69,7 @@ static inline bool bpf_has_stack_frame(struct codegen_context *ctx)
 static int bpf_jit_stack_local(struct codegen_context *ctx)
 {
 	if (bpf_has_stack_frame(ctx))
-		return STACK_FRAME_MIN_SIZE + MAX_BPF_STACK;
+		return STACK_FRAME_MIN_SIZE + ctx->stack_size;
 	else
 		return -(BPF_PPC_STACK_SAVE + 16);
 }
@@ -82,8 +82,9 @@ static int bpf_jit_stack_tailcallcnt(struct codegen_context *ctx)
 
 static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg)
 {
 	if (reg >= BPF_PPC_NVR_MIN && reg < 32)
-		return (bpf_has_stack_frame(ctx) ? BPF_PPC_STACKFRAME : 0)
-						- (8 * (32 - reg));
+		return (bpf_has_stack_frame(ctx) ?
+ (BPF_PPC_STACKFRAME + ctx->stack_size) : 0) + - (8 * (32 - reg)); pr_err("BPF JIT is asking about unknown registers"); BUG(); @@ -134,7 +135,7 @@ static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) PPC_BPF_STL(0, 1, PPC_LR_STKOFF); } - PPC_BPF_STLU(1, 1, -BPF_PPC_STACKFRAME); + PPC_BPF_STLU(1, 1, -(BPF_PPC_STACKFRAME + ctx->stack_size)); } /* @@ -161,7 +162,7 @@ static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) /* Setup frame pointer to point to the bpf stack area */ if (bpf_is_seen_register(ctx, BPF_REG_FP)) PPC_ADDI(b2p[BPF_REG_FP], 1, - STACK_FRAME_MIN_SIZE + MAX_BPF_STACK); + STACK_FRAME_MIN_SIZE + ctx->stack_size); } static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx) @@ -183,7 +184,7 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx /* Tear down our stack frame */ if (bpf_has_stack_frame(ctx)) { - PPC_ADDI(1, 1, BPF_PPC_STACKFRAME); + PPC_ADDI(1, 1, BPF_PPC_STACKFRAME + ctx->stack_size); if (ctx->seen & SEEN_FUNC) { PPC_BPF_LL(0, 1, PPC_LR_STKOFF); PPC_MTLR(0); @@ -1013,6 +1014,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) memset(&cgctx, 0, sizeof(struct codegen_context)); + /* Make sure that the stack is quadword aligned. */ + cgctx.stack_size = round_up(fp->aux->stack_depth, 16); + /* Scouting faux-generate pass 0 */ if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) { /* We hit something illegal or unsupported. */ diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c index c82497a31c54..264b6ab11978 100644 --- a/arch/powerpc/oprofile/op_model_cell.c +++ b/arch/powerpc/oprofile/op_model_cell.c @@ -555,9 +555,7 @@ static void cell_virtual_cntr(unsigned long data) static void start_virt_cntrs(void) { - init_timer(&timer_virt_cntr); - timer_virt_cntr.function = cell_virtual_cntr; - timer_virt_cntr.data = 0UL; + setup_timer(&timer_virt_cntr, cell_virtual_cntr, 0UL); timer_virt_cntr.expires = jiffies + HZ / 10; add_timer(&timer_virt_cntr); } @@ -679,9 +677,7 @@ static void spu_evnt_swap(unsigned long data) static void start_spu_event_swap(void) { - init_timer(&timer_spu_event_swap); - timer_spu_event_swap.function = spu_evnt_swap; - timer_spu_event_swap.data = 0UL; + setup_timer(&timer_spu_event_swap, spu_evnt_swap, 0UL); timer_spu_event_swap.expires = jiffies + HZ / 25; add_timer(&timer_spu_event_swap); } diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 9c88b82f6229..72238eedc360 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -540,7 +540,7 @@ static int memord(const void *d1, size_t s1, const void *d2, size_t s2) { if (s1 < s2) return 1; - if (s2 > s1) + if (s1 > s2) return -1; return memcmp(d1, d2, s1); diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 13663efc1d31..596bd9091478 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -294,10 +294,6 @@ config PPC_STD_MMU_32 def_bool y depends on PPC_STD_MMU && PPC32 -config PPC_STD_MMU_64 - def_bool y - depends on PPC_STD_MMU && PPC64 - config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 @@ -308,6 +304,19 @@ config PPC_RADIX_MMU is only implemented by IBM Power9 CPUs, if you don't have one of them you can probably disable this. 
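
The hv-24x7 memord() fix above deserves a note: the old pair of tests, "s1 < s2" followed by "s2 > s1", are the same condition, so the -1 branch was unreachable and a larger first buffer fell straight through to memcmp() over s1 bytes, reading past the end of the smaller buffer. The corrected size-then-content three-way comparator, as a standalone sketch:

    #include <stdio.h>
    #include <string.h>

    /*
     * Three-way compare of two buffers, ordered first by size and then by
     * content. Testing s1 > s2 (not s2 > s1) makes the -1 branch reachable
     * and guarantees memcmp() only runs on equal-sized buffers.
     */
    static int memord(const void *d1, size_t s1, const void *d2, size_t s2)
    {
        if (s1 < s2)
            return 1;
        if (s1 > s2)
            return -1;
        return memcmp(d1, d2, s1);
    }

    int main(void)
    {
        printf("%d\n", memord("ab", 2, "abc", 3));  /* 1: shorter sorts after */
        printf("%d\n", memord("abc", 3, "ab", 2));  /* -1: reachable now */
        printf("%d\n", memord("abc", 3, "abd", 3)); /* <0: content decides */
        return 0;
    }
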
+config PPC_RADIX_MMU_DEFAULT + bool "Default to using the Radix MMU when possible" + depends on PPC_RADIX_MMU + default y + help + When the hardware supports the Radix MMU, default to using it unless + "disable_radix[=yes]" is specified on the kernel command line. + + If this option is disabled, the Hash MMU will be used by default, + unless "disable_radix=no" is specified on the kernel command line. + + If you're unsure, say Y. + config ARCH_ENABLE_HUGEPAGE_MIGRATION def_bool y depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION @@ -323,7 +332,7 @@ config PPC_BOOK3E_MMU config PPC_MM_SLICES bool - default y if PPC_STD_MMU_64 + default y if PPC_BOOK3S_64 default n config PPC_HAVE_PMU_SUPPORT diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c index 70183eb3d5c8..39a1d4225e0f 100644 --- a/arch/powerpc/platforms/powermac/low_i2c.c +++ b/arch/powerpc/platforms/powermac/low_i2c.c @@ -513,9 +513,7 @@ static struct pmac_i2c_host_kw *__init kw_i2c_host_init(struct device_node *np) mutex_init(&host->mutex); init_completion(&host->complete); spin_lock_init(&host->lock); - init_timer(&host->timeout_timer); - host->timeout_timer.function = kw_i2c_timeout; - host->timeout_timer.data = (unsigned long)host; + setup_timer(&host->timeout_timer, kw_i2c_timeout, (unsigned long)host); psteps = of_get_property(np, "AAPL,address-step", NULL); steps = psteps ? (*psteps) : 0x10; diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 8864065eba22..4650fb294e7a 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -41,7 +41,6 @@ #include "powernv.h" #include "pci.h" -static bool pnv_eeh_nb_init = false; static int eeh_event_irq = -EINVAL; static int pnv_eeh_init(void) @@ -197,31 +196,31 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10); * been built. If the I/O cache staff has been built, EEH is * ready to supply service. 
*/ -static int pnv_eeh_post_init(void) +int pnv_eeh_post_init(void) { struct pci_controller *hose; struct pnv_phb *phb; int ret = 0; - /* Register OPAL event notifier */ - if (!pnv_eeh_nb_init) { - eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); - if (eeh_event_irq < 0) { - pr_err("%s: Can't register OPAL event interrupt (%d)\n", - __func__, eeh_event_irq); - return eeh_event_irq; - } + /* Probe devices & build address cache */ + eeh_probe_devices(); + eeh_addr_cache_build(); - ret = request_irq(eeh_event_irq, pnv_eeh_event, - IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL); - if (ret < 0) { - irq_dispose_mapping(eeh_event_irq); - pr_err("%s: Can't request OPAL event interrupt (%d)\n", - __func__, eeh_event_irq); - return ret; - } + /* Register OPAL event notifier */ + eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); + if (eeh_event_irq < 0) { + pr_err("%s: Can't register OPAL event interrupt (%d)\n", + __func__, eeh_event_irq); + return eeh_event_irq; + } - pnv_eeh_nb_init = true; + ret = request_irq(eeh_event_irq, pnv_eeh_event, + IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL); + if (ret < 0) { + irq_dispose_mapping(eeh_event_irq); + pr_err("%s: Can't request OPAL event interrupt (%d)\n", + __func__, eeh_event_irq); + return ret; } if (!eeh_enabled()) @@ -367,6 +366,10 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; + /* Skip if we haven't probed yet */ + if (phb->ioda.pe_rmap[config_addr] == IODA_INVALID_PE) + return NULL; + /* Initialize eeh device */ edev->class_code = pdn->class_code; edev->mode &= 0xFFFFFF00; @@ -1731,7 +1734,6 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn) static struct eeh_ops pnv_eeh_ops = { .name = "powernv", .init = pnv_eeh_init, - .post_init = pnv_eeh_post_init, .probe = pnv_eeh_probe, .set_option = pnv_eeh_set_option, .get_pe_addr = pnv_eeh_get_pe_addr, diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index cf33769a7b72..18a355fa15e8 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -1,7 +1,7 @@ /* * PowerNV OPAL asynchronous completion interfaces * - * Copyright 2013 IBM Corp. + * Copyright 2013-2017 IBM Corp. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -23,40 +23,50 @@ #include <asm/machdep.h> #include <asm/opal.h> -#define N_ASYNC_COMPLETIONS 64 +enum opal_async_token_state { + ASYNC_TOKEN_UNALLOCATED = 0, + ASYNC_TOKEN_ALLOCATED, + ASYNC_TOKEN_DISPATCHED, + ASYNC_TOKEN_ABANDONED, + ASYNC_TOKEN_COMPLETED +}; + +struct opal_async_token { + enum opal_async_token_state state; + struct opal_msg response; +}; -static DECLARE_BITMAP(opal_async_complete_map, N_ASYNC_COMPLETIONS) = {~0UL}; -static DECLARE_BITMAP(opal_async_token_map, N_ASYNC_COMPLETIONS); static DECLARE_WAIT_QUEUE_HEAD(opal_async_wait); static DEFINE_SPINLOCK(opal_async_comp_lock); static struct semaphore opal_async_sem; -static struct opal_msg *opal_async_responses; static unsigned int opal_max_async_tokens; +static struct opal_async_token *opal_async_tokens; -int __opal_async_get_token(void) +static int __opal_async_get_token(void) { unsigned long flags; - int token; + int i, token = -EBUSY; spin_lock_irqsave(&opal_async_comp_lock, flags); - token = find_first_bit(opal_async_complete_map, opal_max_async_tokens); - if (token >= opal_max_async_tokens) { - token = -EBUSY; - goto out; - } - if (__test_and_set_bit(token, opal_async_token_map)) { - token = -EBUSY; - goto out; + for (i = 0; i < opal_max_async_tokens; i++) { + if (opal_async_tokens[i].state == ASYNC_TOKEN_UNALLOCATED) { + opal_async_tokens[i].state = ASYNC_TOKEN_ALLOCATED; + token = i; + break; + } } - __clear_bit(token, opal_async_complete_map); - -out: spin_unlock_irqrestore(&opal_async_comp_lock, flags); return token; } +/* + * Note: If the returned token is used in an opal call and opal returns + * OPAL_ASYNC_COMPLETION you MUST call one of opal_async_wait_response() or + * opal_async_wait_response_interruptible() at least once before calling another + * opal_async_* function + */ int opal_async_get_token_interruptible(void) { int token; @@ -73,9 +83,10 @@ int opal_async_get_token_interruptible(void) } EXPORT_SYMBOL_GPL(opal_async_get_token_interruptible); -int __opal_async_release_token(int token) +static int __opal_async_release_token(int token) { unsigned long flags; + int rc; if (token < 0 || token >= opal_max_async_tokens) { pr_err("%s: Passed token is out of range, token %d\n", @@ -84,11 +95,26 @@ int __opal_async_release_token(int token) } spin_lock_irqsave(&opal_async_comp_lock, flags); - __set_bit(token, opal_async_complete_map); - __clear_bit(token, opal_async_token_map); + switch (opal_async_tokens[token].state) { + case ASYNC_TOKEN_COMPLETED: + case ASYNC_TOKEN_ALLOCATED: + opal_async_tokens[token].state = ASYNC_TOKEN_UNALLOCATED; + rc = 0; + break; + /* + * DISPATCHED and ABANDONED tokens must wait for OPAL to respond. + * Mark a DISPATCHED token as ABANDONED so that the response handling + * code knows no one cares and that it can free it then. 
+ */ + case ASYNC_TOKEN_DISPATCHED: + opal_async_tokens[token].state = ASYNC_TOKEN_ABANDONED; + /* Fall through */ + default: + rc = 1; + } spin_unlock_irqrestore(&opal_async_comp_lock, flags); - return 0; + return rc; } int opal_async_release_token(int token) @@ -96,12 +122,10 @@ int opal_async_release_token(int token) int ret; ret = __opal_async_release_token(token); - if (ret) - return ret; - - up(&opal_async_sem); + if (!ret) + up(&opal_async_sem); - return 0; + return ret; } EXPORT_SYMBOL_GPL(opal_async_release_token); @@ -117,22 +141,83 @@ int opal_async_wait_response(uint64_t token, struct opal_msg *msg) return -EINVAL; } - /* Wakeup the poller before we wait for events to speed things + /* + * There is no need to mark the token as dispatched, wait_event() + * will block until the token completes. + * + * Wakeup the poller before we wait for events to speed things * up on platforms or simulators where the interrupts aren't * functional. */ opal_wake_poller(); - wait_event(opal_async_wait, test_bit(token, opal_async_complete_map)); - memcpy(msg, &opal_async_responses[token], sizeof(*msg)); + wait_event(opal_async_wait, opal_async_tokens[token].state + == ASYNC_TOKEN_COMPLETED); + memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg)); return 0; } EXPORT_SYMBOL_GPL(opal_async_wait_response); +int opal_async_wait_response_interruptible(uint64_t token, struct opal_msg *msg) +{ + unsigned long flags; + int ret; + + if (token >= opal_max_async_tokens) { + pr_err("%s: Invalid token passed\n", __func__); + return -EINVAL; + } + + if (!msg) { + pr_err("%s: Invalid message pointer passed\n", __func__); + return -EINVAL; + } + + /* + * The first time this gets called we mark the token as DISPATCHED + * so that if wait_event_interruptible() returns not zero and the + * caller frees the token, we know not to actually free the token + * until the response comes. + * + * Only change if the token is ALLOCATED - it may have been + * completed even before the caller gets around to calling this + * the first time. + * + * There is also a dirty great comment at the token allocation + * function that if the opal call returns OPAL_ASYNC_COMPLETION to + * the caller then the caller *must* call this or the not + * interruptible version before doing anything else with the + * token. + */ + if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED) { + spin_lock_irqsave(&opal_async_comp_lock, flags); + if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED) + opal_async_tokens[token].state = ASYNC_TOKEN_DISPATCHED; + spin_unlock_irqrestore(&opal_async_comp_lock, flags); + } + + /* + * Wakeup the poller before we wait for events to speed things + * up on platforms or simulators where the interrupts aren't + * functional. 
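
The opal-async rework above replaces the pair of token bitmaps with an explicit per-token state machine, and ASYNC_TOKEN_ABANDONED is the subtle state: a token released while OPAL still owns it must not be recycled, so release only marks it and the completion path frees it on OPAL's behalf. A compact sketch of the transitions; the kernel holds opal_async_comp_lock around each one, this toy version is single-threaded:

    #include <stdio.h>

    enum token_state { UNALLOCATED, ALLOCATED, DISPATCHED, ABANDONED, COMPLETED };

    /* Caller-side release: a DISPATCHED token is only marked ABANDONED. */
    static int release_token(enum token_state *st)
    {
        switch (*st) {
        case COMPLETED:
        case ALLOCATED:
            *st = UNALLOCATED;
            return 0;   /* freed immediately */
        case DISPATCHED:
            *st = ABANDONED;
            /* fall through */
        default:
            return 1;   /* still owned by firmware */
        }
    }

    /* Completion side: an ABANDONED token is freed on OPAL's behalf. */
    static void complete_token(enum token_state *st)
    {
        enum token_state old = *st;

        *st = COMPLETED;
        if (old == ABANDONED)
            *st = UNALLOCATED;
    }

    int main(void)
    {
        enum token_state st = DISPATCHED;

        printf("release while dispatched -> busy=%d\n", release_token(&st));
        complete_token(&st);
        printf("freed by completion: %d\n", st == UNALLOCATED);
        return 0;
    }
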
+ */ + opal_wake_poller(); + ret = wait_event_interruptible(opal_async_wait, + opal_async_tokens[token].state == + ASYNC_TOKEN_COMPLETED); + if (!ret) + memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg)); + + return ret; +} +EXPORT_SYMBOL_GPL(opal_async_wait_response_interruptible); + +/* Called from interrupt context */ static int opal_async_comp_event(struct notifier_block *nb, unsigned long msg_type, void *msg) { struct opal_msg *comp_msg = msg; + enum opal_async_token_state state; unsigned long flags; uint64_t token; @@ -140,11 +225,17 @@ static int opal_async_comp_event(struct notifier_block *nb, return 0; token = be64_to_cpu(comp_msg->params[0]); - memcpy(&opal_async_responses[token], comp_msg, sizeof(*comp_msg)); spin_lock_irqsave(&opal_async_comp_lock, flags); - __set_bit(token, opal_async_complete_map); + state = opal_async_tokens[token].state; + opal_async_tokens[token].state = ASYNC_TOKEN_COMPLETED; spin_unlock_irqrestore(&opal_async_comp_lock, flags); + if (state == ASYNC_TOKEN_ABANDONED) { + /* Free the token, no one else will */ + opal_async_release_token(token); + return 0; + } + memcpy(&opal_async_tokens[token].response, comp_msg, sizeof(*comp_msg)); wake_up(&opal_async_wait); return 0; @@ -178,32 +269,23 @@ int __init opal_async_comp_init(void) } opal_max_async_tokens = be32_to_cpup(async); - if (opal_max_async_tokens > N_ASYNC_COMPLETIONS) - opal_max_async_tokens = N_ASYNC_COMPLETIONS; + opal_async_tokens = kcalloc(opal_max_async_tokens, + sizeof(*opal_async_tokens), GFP_KERNEL); + if (!opal_async_tokens) { + err = -ENOMEM; + goto out_opal_node; + } err = opal_message_notifier_register(OPAL_MSG_ASYNC_COMP, &opal_async_comp_nb); if (err) { pr_err("%s: Can't register OPAL event notifier (%d)\n", __func__, err); + kfree(opal_async_tokens); goto out_opal_node; } - opal_async_responses = kzalloc( - sizeof(*opal_async_responses) * opal_max_async_tokens, - GFP_KERNEL); - if (!opal_async_responses) { - pr_err("%s: Out of memory, failed to do asynchronous " - "completion init\n", __func__); - err = -ENOMEM; - goto out_opal_node; - } - - /* Initialize to 1 less than the maximum tokens available, as we may - * require to pop one during emergency through synchronous call to - * __opal_async_get_token() - */ - sema_init(&opal_async_sem, opal_max_async_tokens - 1); + sema_init(&opal_async_sem, opal_max_async_tokens); out_opal_node: of_node_put(opal_node); diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c index d78fed728cdf..c9e1a4ff295c 100644 --- a/arch/powerpc/platforms/powernv/opal-hmi.c +++ b/arch/powerpc/platforms/powernv/opal-hmi.c @@ -1,5 +1,5 @@ /* - * OPAL hypervisor Maintenance interrupt handling support in PowreNV. + * OPAL hypervisor Maintenance interrupt handling support in PowerNV. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index ecdcba9d1220..9d1b8c0aaf93 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -174,8 +174,14 @@ void opal_event_shutdown(void) /* First free interrupts, which will also mask them */ for (i = 0; i < opal_irq_count; i++) { - if (opal_irqs[i]) + if (!opal_irqs[i]) + continue; + + if (in_interrupt()) + disable_irq_nosync(opal_irqs[i]); + else free_irq(opal_irqs[i], NULL); + opal_irqs[i] = 0; } } diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c index 4495f428b500..d9916ea62305 100644 --- a/arch/powerpc/platforms/powernv/opal-memory-errors.c +++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c @@ -1,5 +1,5 @@ /* - * OPAL asynchronus Memory error handling support in PowreNV. + * OPAL asynchronus Memory error handling support in PowerNV. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c index aa267f120033..0a7074bb91dc 100644 --- a/arch/powerpc/platforms/powernv/opal-sensor.c +++ b/arch/powerpc/platforms/powernv/opal-sensor.c @@ -19,13 +19,10 @@ */ #include <linux/delay.h> -#include <linux/mutex.h> #include <linux/of_platform.h> #include <asm/opal.h> #include <asm/machdep.h> -static DEFINE_MUTEX(opal_sensor_mutex); - /* * This will return sensor information to driver based on the requested sensor * handle. A handle is an opaque id for the powernv, read by the driver from the @@ -38,13 +35,9 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) __be32 data; token = opal_async_get_token_interruptible(); - if (token < 0) { - pr_err("%s: Couldn't get the token, returning\n", __func__); - ret = token; - goto out; - } + if (token < 0) + return token; - mutex_lock(&opal_sensor_mutex); ret = opal_sensor_read(sensor_hndl, token, &data); switch (ret) { case OPAL_ASYNC_COMPLETION: @@ -52,7 +45,7 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) if (ret) { pr_err("%s: Failed to wait for the async response, %d\n", __func__, ret); - goto out_token; + goto out; } ret = opal_error_code(opal_get_async_rc(msg)); @@ -73,10 +66,8 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) break; } -out_token: - mutex_unlock(&opal_sensor_mutex); - opal_async_release_token(token); out: + opal_async_release_token(token); return ret; } EXPORT_SYMBOL_GPL(opal_get_sensor_data); diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 8c1ede2d3f7e..6f4b00a2ac46 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -94,7 +94,7 @@ opal_return: * bytes (always BE) since MSR:LE will end up fixed up as a side * effect of the rfid. 
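
The opal-sensor change above is a good template for the reworked async API: get a token, issue the OPAL call, wait for the response only if the call returned OPAL_ASYNC_COMPLETION, and release the token on every path. Because responses now live in per-token buffers, the old opal_sensor_mutex serialising all readers can go away. The shape of such a call, with trivial stand-ins for the opal_* functions:

    /* The opal_* functions below are stand-ins, not the real firmware calls. */
    #include <stdio.h>

    #define OPAL_SUCCESS          0
    #define OPAL_ASYNC_COMPLETION 1

    static int opal_async_get_token_interruptible(void) { return 7; }
    static int opal_sensor_read(int hndl, int token, int *data) { *data = 42; return OPAL_ASYNC_COMPLETION; }
    static int opal_async_wait_response(int token, int *msg) { *msg = OPAL_SUCCESS; return 0; }
    static void opal_async_release_token(int token) { }

    static int get_sensor(int hndl, int *out)
    {
        int token, msg, ret;

        token = opal_async_get_token_interruptible();
        if (token < 0)
            return token;

        ret = opal_sensor_read(hndl, token, out);
        if (ret == OPAL_ASYNC_COMPLETION)
            ret = opal_async_wait_response(token, &msg);

        opal_async_release_token(token);    /* always release, even on error */
        return ret;
    }

    int main(void)
    {
        int v = 0;
        int rc = get_sensor(3, &v);

        printf("rc=%d data=%d\n", rc, v);
        return 0;
    }
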
*/ - FIXUP_ENDIAN + FIXUP_ENDIAN_HV ld r2,PACATOC(r13); lwz r4,8(r1); ld r5,PPC_LR_STKOFF(r1); @@ -120,7 +120,7 @@ opal_real_call: hrfid opal_return_realmode: - FIXUP_ENDIAN + FIXUP_ENDIAN_HV ld r2,PACATOC(r13); lwz r11,8(r1); ld r12,PPC_LR_STKOFF(r1) @@ -307,6 +307,7 @@ OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); +OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET); OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 65c79ecf5a4d..041ddbd1fc57 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -998,6 +998,7 @@ int opal_error_code(int rc) case OPAL_PARAMETER: return -EINVAL; case OPAL_ASYNC_COMPLETION: return -EINPROGRESS; + case OPAL_BUSY: case OPAL_BUSY_EVENT: return -EBUSY; case OPAL_NO_MEM: return -ENOMEM; case OPAL_PERMISSION: return -EPERM; @@ -1037,3 +1038,4 @@ EXPORT_SYMBOL_GPL(opal_write_oppanel_async); /* Export this for KVM */ EXPORT_SYMBOL_GPL(opal_int_set_mfrr); EXPORT_SYMBOL_GPL(opal_int_eoi); +EXPORT_SYMBOL_GPL(opal_error_code); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 57f9e55f4352..749055553064 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1002,9 +1002,12 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) } /* - * After doing so, there would be a "hole" in the /proc/iomem when - * offset is a positive value. It looks like the device return some - * mmio back to the system, which actually no one could use it. + * Since M64 BAR shares segments among all possible 256 PEs, + * we have to shift the beginning of PF IOV BAR to make it start from + * the segment which belongs to the PE number assigned to the first VF. + * This creates a "hole" in the /proc/iomem which could be used for + * allocating other resources so we reserve this area below and + * release when IOV is released. */ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { res = &dev->resource[i + PCI_IOV_RESOURCES]; @@ -1018,7 +1021,22 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n", i, &res2, res, (offset > 0) ? 
"En" : "Dis", num_vfs, offset); + + if (offset < 0) { + devm_release_resource(&dev->dev, &pdn->holes[i]); + memset(&pdn->holes[i], 0, sizeof(pdn->holes[i])); + } + pci_update_resource(dev, i + PCI_IOV_RESOURCES); + + if (offset > 0) { + pdn->holes[i].start = res2.start; + pdn->holes[i].end = res2.start + size * offset - 1; + pdn->holes[i].flags = IORESOURCE_BUS; + pdn->holes[i].name = "pnv_iov_reserved"; + devm_request_resource(&dev->dev, res->parent, + &pdn->holes[i]); + } } return 0; } @@ -2779,7 +2797,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) return -EINVAL; - if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size)) + if (!is_power_of_2(window_size)) return -EINVAL; /* Adjust direct table size from window_size and levels */ @@ -3293,8 +3311,7 @@ static void pnv_pci_ioda_fixup(void) pnv_pci_ioda_create_dbgfs(); #ifdef CONFIG_EEH - eeh_init(); - eeh_addr_cache_build(); + pnv_eeh_post_init(); #endif } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index a95273c524f6..56d1f272d4ad 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -234,6 +234,7 @@ extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); extern bool pnv_pci_enable_device_hook(struct pci_dev *dev); extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); +extern int pnv_eeh_post_init(void); extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index bbb73aa0eb8f..1edfbc1e40f4 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -36,6 +36,7 @@ #include <asm/opal.h> #include <asm/kexec.h> #include <asm/smp.h> +#include <asm/tm.h> #include "powernv.h" @@ -290,6 +291,7 @@ static void __init pnv_setup_machdep_opal(void) ppc_md.restart = pnv_restart; pm_power_off = pnv_power_off; ppc_md.halt = pnv_halt; + /* ppc_md.system_reset_exception gets filled in by pnv_smp_init() */ ppc_md.machine_check_exception = opal_machine_check; ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery; ppc_md.hmi_exception_early = opal_hmi_exception_early; @@ -311,6 +313,28 @@ static int __init pnv_probe(void) return 1; } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +void __init pnv_tm_init(void) +{ + if (!firmware_has_feature(FW_FEATURE_OPAL) || + !pvr_version_is(PVR_POWER9) || + early_cpu_has_feature(CPU_FTR_TM)) + return; + + if (opal_reinit_cpus(OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED) != OPAL_SUCCESS) + return; + + pr_info("Enabling TM (Transactional Memory) with Suspend Disabled\n"); + cur_cpu_spec->cpu_features |= CPU_FTR_TM; + /* Make sure "normal" HTM is off (it should be) */ + cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_HTM; + /* Turn on no suspend mode, and HTM no SC */ + cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_HTM_NO_SUSPEND | \ + PPC_FEATURE2_HTM_NOSC; + tm_suspend_disabled = true; +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + /* * Returns the cpu frequency for 'cpu' in Hz. 
This is used by * /proc/cpuinfo @@ -319,7 +343,7 @@ static unsigned long pnv_get_proc_freq(unsigned int cpu) { unsigned long ret_freq; - ret_freq = cpufreq_quick_get(cpu) * 1000ul; + ret_freq = cpufreq_get(cpu) * 1000ul; /* * If the backend cpufreq driver does not exist, diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index c17f81e433f7..ba030669eca1 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -49,6 +49,13 @@ static void pnv_smp_setup_cpu(int cpu) { + /* + * P9 workaround for CI vector load (see traps.c), + * enable the corresponding HMI interrupt + */ + if (pvr_version_is(PVR_POWER9)) + mtspr(SPRN_HMEER, mfspr(SPRN_HMEER) | PPC_BIT(17)); + if (xive_enabled()) xive_smp_setup_cpu(); else if (cpu != boot_cpuid) @@ -290,6 +297,54 @@ static void __init pnv_smp_probe(void) } } +static int pnv_system_reset_exception(struct pt_regs *regs) +{ + if (smp_handle_nmi_ipi(regs)) + return 1; + return 0; +} + +static int pnv_cause_nmi_ipi(int cpu) +{ + int64_t rc; + + if (cpu >= 0) { + rc = opal_signal_system_reset(get_hard_smp_processor_id(cpu)); + if (rc != OPAL_SUCCESS) + return 0; + return 1; + + } else if (cpu == NMI_IPI_ALL_OTHERS) { + bool success = true; + int c; + + + /* + * We do not use broadcasts (yet), because it's not clear + * exactly what semantics Linux wants or the firmware should + * provide. + */ + for_each_online_cpu(c) { + if (c == smp_processor_id()) + continue; + + rc = opal_signal_system_reset( + get_hard_smp_processor_id(c)); + if (rc != OPAL_SUCCESS) + success = false; + } + if (success) + return 1; + + /* + * Caller will fall back to doorbells, which may pick + * up the remainders. + */ + } + + return 0; +} + static struct smp_ops_t pnv_smp_ops = { .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ .cause_ipi = NULL, /* Filled at runtime by pnv_smp_probe() */ @@ -308,6 +363,10 @@ static struct smp_ops_t pnv_smp_ops = { /* This is called very early during platform setup_arch */ void __init pnv_smp_init(void) { + if (opal_check_token(OPAL_SIGNAL_SYSTEM_RESET)) { + ppc_md.system_reset_exception = pnv_system_reset_exception; + pnv_smp_ops.cause_nmi_ipi = pnv_cause_nmi_ipi; + } smp_ops = &pnv_smp_ops; #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index fadb95efbb9e..a7d14aa7bb7c 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -363,6 +363,7 @@ static int dlpar_online_cpu(struct device_node *dn) BUG_ON(get_cpu_current_state(cpu) != CPU_STATE_OFFLINE); cpu_maps_update_done(); + timed_topology_update(1); rc = device_online(get_cpu_device(cpu)); if (rc) goto out; @@ -533,6 +534,7 @@ static int dlpar_offline_cpu(struct device_node *dn) set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); cpu_maps_update_done(); + timed_topology_update(1); rc = device_offline(get_cpu_device(cpu)); if (rc) goto out; diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 7c181467d0ad..69921f72e2da 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -55,23 +55,23 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) { - struct iommu_table_group *table_group = NULL; - struct iommu_table *tbl = NULL; - struct iommu_table_group_link *tgl = NULL; + struct iommu_table_group *table_group; + struct iommu_table *tbl; + struct iommu_table_group_link *tgl; 
table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL, node); if (!table_group) - goto fail_exit; + return NULL; tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); if (!tbl) - goto fail_exit; + goto free_group; tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, node); if (!tgl) - goto fail_exit; + goto free_table; INIT_LIST_HEAD_RCU(&tbl->it_group_list); kref_init(&tbl->it_kref); @@ -82,11 +82,10 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) return table_group; -fail_exit: - kfree(tgl); - kfree(table_group); +free_table: kfree(tbl); - +free_group: + kfree(table_group); return NULL; } diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 495ba4e7336d..0ee4a469a4ae 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -93,7 +93,7 @@ void vpa_init(int cpu) return; } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * PAPR says this feature is SLB-Buffer but firmware never * reports that. All SPLPAR support SLB shadow buffer. @@ -106,7 +106,7 @@ void vpa_init(int cpu) "cpu %d (hw %d) of area %lx failed with %ld\n", cpu, hwcpu, addr, ret); } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ /* * Register dispatch trace log, if one has been allocated. @@ -129,7 +129,7 @@ void vpa_init(int cpu) } } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 static long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long vpn, unsigned long pa, @@ -824,7 +824,7 @@ void arch_free_page(struct page *page, int order) EXPORT_SYMBOL(arch_free_page); #endif /* CONFIG_PPC_SMLPAR */ -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_TRACEPOINTS #ifdef HAVE_JUMP_LABEL diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 779fc2a1c8f7..b2706c483067 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -485,7 +485,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) seq_printf(m, "shared_processor_mode=%d\n", lppaca_shared_proc(get_lppaca())); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 seq_printf(m, "slb_size=%d\n", mmu_slb_size); #endif parse_em_data(m); diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 12277bc9fd9e..d86938260a86 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -1592,6 +1592,8 @@ ATTRIBUTE_GROUPS(vio_dev); void vio_unregister_device(struct vio_dev *viodev) { device_unregister(&viodev->dev); + if (viodev->family == VDEVICE) + irq_dispose_mapping(viodev->irq); } EXPORT_SYMBOL(vio_unregister_device); diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index c60e84e4558d..1b307c80b401 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -184,7 +184,7 @@ static int axon_ram_probe(struct platform_device *device) static int axon_ram_bank_id = -1; struct axon_ram_bank *bank; struct resource resource; - int rc = 0; + int rc; axon_ram_bank_id++; diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 16f1edd78c40..535cf1f6941c 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -846,12 +846,12 @@ void ipic_disable_mcp(enum ipic_mcp_irq mcp_irq) u32 ipic_get_mcp_status(void) { - return ipic_read(primary_ipic->regs, IPIC_SERMR); + return 
ipic_read(primary_ipic->regs, IPIC_SERSR); } void ipic_clear_mcp_status(u32 mask) { - ipic_write(primary_ipic->regs, IPIC_SERMR, mask); + ipic_write(primary_ipic->regs, IPIC_SERSR, mask); } /* Return an interrupt vector or 0 if no interrupt is pending. */ diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 33351c6704b1..1b2d8cb49abb 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -28,6 +28,7 @@ #include <linux/bug.h> #include <linux/nmi.h> #include <linux/ctype.h> +#include <linux/highmem.h> #include <asm/debugfs.h> #include <asm/ptrace.h> @@ -127,6 +128,7 @@ static void byterev(unsigned char *, int); static void memex(void); static int bsesc(void); static void dump(void); +static void show_pte(unsigned long); static void prdump(unsigned long, long); static int ppc_inst_dump(unsigned long, long, int); static void dump_log_buf(void); @@ -234,6 +236,7 @@ Commands:\n\ #endif "\ dr dump stream of raw bytes\n\ + dv dump virtual address translation \n\ dt dump the tracing buffers (uses printk)\n\ dtc dump the tracing buffers for current CPU (uses printk)\n\ " @@ -278,6 +281,7 @@ Commands:\n\ #elif defined(CONFIG_44x) || defined(CONFIG_PPC_BOOK3E) " u dump TLB\n" #endif +" U show uptime information\n" " ? help\n" " # n limit output to n lines per page (for dp, dpa, dl)\n" " zr reboot\n\ @@ -530,14 +534,19 @@ static int xmon_core(struct pt_regs *regs, int fromipi) waiting: secondary = 1; + spin_begin(); while (secondary && !xmon_gate) { if (in_xmon == 0) { - if (fromipi) + if (fromipi) { + spin_end(); goto leave; + } secondary = test_and_set_bit(0, &in_xmon); } - barrier(); + spin_cpu_relax(); + touch_nmi_watchdog(); } + spin_end(); if (!secondary && !xmon_gate) { /* we are the first cpu to come in */ @@ -568,21 +577,25 @@ static int xmon_core(struct pt_regs *regs, int fromipi) mb(); xmon_gate = 1; barrier(); + touch_nmi_watchdog(); } cmdloop: while (in_xmon) { if (secondary) { + spin_begin(); if (cpu == xmon_owner) { if (!test_and_set_bit(0, &xmon_taken)) { secondary = 0; + spin_end(); continue; } /* missed it */ while (cpu == xmon_owner) - barrier(); + spin_cpu_relax(); } - barrier(); + spin_cpu_relax(); + touch_nmi_watchdog(); } else { cmd = cmds(regs); if (cmd != 0) { @@ -896,6 +909,26 @@ static void remove_cpu_bpts(void) write_ciabr(0); } +/* Based on uptime_proc_show(). 
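
The xmon changes above convert every secondary-CPU busy-wait to the spin_begin()/spin_cpu_relax()/spin_end() helpers and pat the NMI watchdog inside the loops, so parked CPUs sit at low SMT priority and a long debugging session no longer trips hard-lockup warnings. The loop shape, with no-op stand-ins for the powerpc-specific helpers:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int gate;

    /* Stand-ins: spin_begin()/spin_end() drop to and restore from low SMT
     * priority on powerpc; spin_cpu_relax() pauses inside the loop body. */
    static void spin_begin(void) { }
    static void spin_end(void) { }
    static void spin_cpu_relax(void) { }
    static void touch_nmi_watchdog(void) { }

    /* The whole busy-wait runs at low priority, and the watchdog is patted
     * on every spin so waiting never looks like a hard lockup. */
    static void wait_for_gate(void)
    {
        spin_begin();
        while (!atomic_load(&gate)) {
            spin_cpu_relax();
            touch_nmi_watchdog();
        }
        spin_end();
    }

    int main(void)
    {
        atomic_store(&gate, 1); /* pretend the primary opened the gate */
        wait_for_gate();
        puts("gate open");
        return 0;
    }
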
*/
+static void
+show_uptime(void)
+{
+	struct timespec uptime;
+
+	if (setjmp(bus_error_jmp) == 0) {
+		catch_memory_errors = 1;
+		sync();
+
+		get_monotonic_boottime(&uptime);
+		printf("Uptime: %lu.%.2lu seconds\n", (unsigned long)uptime.tv_sec,
+			((unsigned long)uptime.tv_nsec / (NSEC_PER_SEC/100)));
+
+		sync();
+		__delay(200);
+	}
+	catch_memory_errors = 0;
+}
+
 static void set_lpp_cmd(void)
 {
 	unsigned long lpp;
@@ -1031,6 +1064,9 @@ cmds(struct pt_regs *excp)
 			dump_tlb_book3e();
 			break;
 #endif
+		case 'U':
+			show_uptime();
+			break;
 		default:
 			printf("Unrecognized command: ");
 			do {
@@ -2279,7 +2315,7 @@ static void dump_tracing(void)
 static void dump_one_paca(int cpu)
 {
 	struct paca_struct *p;
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	int i = 0;
 #endif
 
@@ -2320,7 +2356,7 @@ static void dump_one_paca(int cpu)
 	DUMP(p, hw_cpu_id, "x");
 	DUMP(p, cpu_start, "x");
 	DUMP(p, kexec_state, "x");
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	for (i = 0; i < SLB_NUM_BOLTED; i++) {
 		u64 esid, vsid;
 
@@ -2351,6 +2387,7 @@ static void dump_one_paca(int cpu)
 #endif
 	DUMP(p, __current, "p");
 	DUMP(p, kstack, "lx");
+	printf(" kstack_base = 0x%016lx\n", p->kstack & ~(THREAD_SIZE - 1));
 	DUMP(p, stab_rr, "lx");
 	DUMP(p, saved_r1, "lx");
 	DUMP(p, trap_save, "x");
@@ -2475,6 +2512,11 @@ static void dump_xives(void)
 	unsigned long num;
 	int c;
 
+	if (!xive_enabled()) {
+		printf("Xive disabled on this system\n");
+		return;
+	}
+
 	c = inchar();
 	if (c == 'a') {
 		dump_all_xives();
@@ -2574,6 +2616,9 @@ dump(void)
 		dump_log_buf();
 	} else if (c == 'o') {
 		dump_opal_msglog();
+	} else if (c == 'v') {
+		/* dump virtual to physical translation */
+		show_pte(adrs);
 	} else if (c == 'r') {
 		scanhex(&ndump);
 		if (ndump == 0)
@@ -2907,6 +2952,116 @@ static void show_task(struct task_struct *tsk)
 		tsk->comm);
 }
 
+#ifdef CONFIG_PPC_BOOK3S_64
+void format_pte(void *ptep, unsigned long pte)
+{
+	printf("ptep @ 0x%016lx = 0x%016lx\n", (unsigned long)ptep, pte);
+	printf("Maps physical address = 0x%016lx\n", pte & PTE_RPN_MASK);
+
+	printf("Flags = %s%s%s%s%s\n",
+		(pte & _PAGE_ACCESSED) ? "Accessed " : "",
+		(pte & _PAGE_DIRTY) ? "Dirty " : "",
+		(pte & _PAGE_READ) ? "Read " : "",
+		(pte & _PAGE_WRITE) ? "Write " : "",
+		(pte & _PAGE_EXEC) ?
"Exec " : ""); +} + +static void show_pte(unsigned long addr) +{ + unsigned long tskv = 0; + struct task_struct *tsk = NULL; + struct mm_struct *mm; + pgd_t *pgdp, *pgdir; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + if (!scanhex(&tskv)) + mm = &init_mm; + else + tsk = (struct task_struct *)tskv; + + if (tsk == NULL) + mm = &init_mm; + else + mm = tsk->active_mm; + + if (setjmp(bus_error_jmp) != 0) { + catch_memory_errors = 0; + printf("*** Error dumping pte for task %p\n", tsk); + return; + } + + catch_memory_errors = 1; + sync(); + + if (mm == &init_mm) { + pgdp = pgd_offset_k(addr); + pgdir = pgd_offset_k(0); + } else { + pgdp = pgd_offset(mm, addr); + pgdir = pgd_offset(mm, 0); + } + + if (pgd_none(*pgdp)) { + printf("no linux page table for address\n"); + return; + } + + printf("pgd @ 0x%016lx\n", pgdir); + + if (pgd_huge(*pgdp)) { + format_pte(pgdp, pgd_val(*pgdp)); + return; + } + printf("pgdp @ 0x%016lx = 0x%016lx\n", pgdp, pgd_val(*pgdp)); + + pudp = pud_offset(pgdp, addr); + + if (pud_none(*pudp)) { + printf("No valid PUD\n"); + return; + } + + if (pud_huge(*pudp)) { + format_pte(pudp, pud_val(*pudp)); + return; + } + + printf("pudp @ 0x%016lx = 0x%016lx\n", pudp, pud_val(*pudp)); + + pmdp = pmd_offset(pudp, addr); + + if (pmd_none(*pmdp)) { + printf("No valid PMD\n"); + return; + } + + if (pmd_huge(*pmdp)) { + format_pte(pmdp, pmd_val(*pmdp)); + return; + } + printf("pmdp @ 0x%016lx = 0x%016lx\n", pmdp, pmd_val(*pmdp)); + + ptep = pte_offset_map(pmdp, addr); + if (pte_none(*ptep)) { + printf("no valid PTE\n"); + return; + } + + format_pte(ptep, pte_val(*ptep)); + + sync(); + __delay(200); + catch_memory_errors = 0; +} +#else +static void show_pte(unsigned long addr) +{ + printf("show_pte not yet implemented\n"); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ + static void show_tasks(void) { unsigned long tskv; @@ -3224,7 +3379,7 @@ static void xmon_print_symbol(unsigned long address, const char *mid, printf("%s", after); } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 void dump_segments(void) { int i; diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c index a0c44d16bf30..7c11bad5cded 100644 --- a/drivers/misc/cxl/api.c +++ b/drivers/misc/cxl/api.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/mount.h> #include <linux/sched/mm.h> +#include <linux/mmu_context.h> #include "cxl.h" @@ -331,9 +332,12 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed, /* ensure this mm_struct can't be freed */ cxl_context_mm_count_get(ctx); - /* decrement the use count */ - if (ctx->mm) + if (ctx->mm) { + /* decrement the use count from above */ mmput(ctx->mm); + /* make TLBIs for this context global */ + mm_context_add_copro(ctx->mm); + } } /* @@ -342,13 +346,19 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed, */ cxl_ctx_get(); + /* See the comment in afu_ioctl_start_work() */ + smp_mb(); + if ((rc = cxl_ops->attach_process(ctx, kernel, wed, 0))) { put_pid(ctx->pid); ctx->pid = NULL; cxl_adapter_context_put(ctx->afu->adapter); cxl_ctx_put(); - if (task) + if (task) { cxl_context_mm_count_put(ctx); + if (ctx->mm) + mm_context_remove_copro(ctx->mm); + } goto out; } diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 8c32040b9c09..12a41b2753f0 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <linux/idr.h> #include <linux/sched/mm.h> +#include <linux/mmu_context.h> #include <asm/cputable.h> #include <asm/current.h> #include <asm/copro.h> @@ -267,6 +268,8 @@ 
int __detach_context(struct cxl_context *ctx) /* Decrease the mm count on the context */ cxl_context_mm_count_put(ctx); + if (ctx->mm) + mm_context_remove_copro(ctx->mm); ctx->mm = NULL; return 0; diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index b1afeccbb97f..e46a4062904a 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -100,9 +100,12 @@ static const cxl_p1_reg_t CXL_XSL_FEC = {0x0158}; static const cxl_p1_reg_t CXL_XSL_DSNCTL = {0x0168}; /* PSL registers - CAIA 2 */ static const cxl_p1_reg_t CXL_PSL9_CONTROL = {0x0020}; +static const cxl_p1_reg_t CXL_XSL9_INV = {0x0110}; +static const cxl_p1_reg_t CXL_XSL9_DBG = {0x0130}; +static const cxl_p1_reg_t CXL_XSL9_DEF = {0x0140}; static const cxl_p1_reg_t CXL_XSL9_DSNCTL = {0x0168}; static const cxl_p1_reg_t CXL_PSL9_FIR1 = {0x0300}; -static const cxl_p1_reg_t CXL_PSL9_FIR2 = {0x0308}; +static const cxl_p1_reg_t CXL_PSL9_FIR_MASK = {0x0308}; static const cxl_p1_reg_t CXL_PSL9_Timebase = {0x0310}; static const cxl_p1_reg_t CXL_PSL9_DEBUG = {0x0320}; static const cxl_p1_reg_t CXL_PSL9_FIR_CNTL = {0x0348}; @@ -112,6 +115,7 @@ static const cxl_p1_reg_t CXL_PSL9_TRACECFG = {0x0368}; static const cxl_p1_reg_t CXL_PSL9_APCDEDALLOC = {0x0378}; static const cxl_p1_reg_t CXL_PSL9_APCDEDTYPE = {0x0380}; static const cxl_p1_reg_t CXL_PSL9_TNR_ADDR = {0x0388}; +static const cxl_p1_reg_t CXL_PSL9_CTCCFG = {0x0390}; static const cxl_p1_reg_t CXL_PSL9_GP_CT = {0x0398}; static const cxl_p1_reg_t CXL_XSL9_IERAT = {0x0588}; static const cxl_p1_reg_t CXL_XSL9_ILPP = {0x0590}; @@ -414,6 +418,9 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0}; #define CXL_CARD_MINOR(adapter) (adapter->adapter_num * CXL_DEV_MINORS) #define CXL_DEVT_ADAPTER(dev) (MINOR(dev) / CXL_DEV_MINORS) +#define CXL_PSL9_TRACEID_MAX 0xAU +#define CXL_PSL9_TRACESTATE_FIN 0x3U + enum cxl_context_status { CLOSED, OPENED, @@ -938,8 +945,6 @@ int cxl_debugfs_adapter_add(struct cxl *adapter); void cxl_debugfs_adapter_remove(struct cxl *adapter); int cxl_debugfs_afu_add(struct cxl_afu *afu); void cxl_debugfs_afu_remove(struct cxl_afu *afu); -void cxl_stop_trace_psl9(struct cxl *cxl); -void cxl_stop_trace_psl8(struct cxl *cxl); void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter, struct dentry *dir); void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter, struct dentry *dir); void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter, struct dentry *dir); @@ -975,14 +980,6 @@ static inline void cxl_debugfs_afu_remove(struct cxl_afu *afu) { } -static inline void cxl_stop_trace_psl9(struct cxl *cxl) -{ -} - -static inline void cxl_stop_trace_psl8(struct cxl *cxl) -{ -} - static inline void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter, struct dentry *dir) { @@ -1070,7 +1067,8 @@ u64 cxl_calculate_sr(bool master, bool kernel, bool real_mode, bool p9); void cxl_native_irq_dump_regs_psl9(struct cxl_context *ctx); void cxl_native_irq_dump_regs_psl8(struct cxl_context *ctx); -void cxl_native_err_irq_dump_regs(struct cxl *adapter); +void cxl_native_err_irq_dump_regs_psl8(struct cxl *adapter); +void cxl_native_err_irq_dump_regs_psl9(struct cxl *adapter); int cxl_pci_vphb_add(struct cxl_afu *afu); void cxl_pci_vphb_remove(struct cxl_afu *afu); void cxl_release_mapping(struct cxl_context *ctx); diff --git a/drivers/misc/cxl/debugfs.c b/drivers/misc/cxl/debugfs.c index eae9d749f967..1643850d2302 100644 --- a/drivers/misc/cxl/debugfs.c +++ b/drivers/misc/cxl/debugfs.c @@ -15,28 +15,6 @@ static struct dentry *cxl_debugfs; -void cxl_stop_trace_psl9(struct cxl 
*adapter) -{ - /* Stop the trace */ - cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x4480000000000000ULL); -} - -void cxl_stop_trace_psl8(struct cxl *adapter) -{ - int slice; - - /* Stop the trace */ - cxl_p1_write(adapter, CXL_PSL_TRACE, 0x8000000000000017LL); - - /* Stop the slice traces */ - spin_lock(&adapter->afu_list_lock); - for (slice = 0; slice < adapter->slices; slice++) { - if (adapter->afu[slice]) - cxl_p1n_write(adapter->afu[slice], CXL_PSL_SLICE_TRACE, 0x8000000000000000LL); - } - spin_unlock(&adapter->afu_list_lock); -} - /* Helpers to export CXL mmaped IO registers via debugfs */ static int debugfs_io_u64_get(void *data, u64 *val) { @@ -62,9 +40,14 @@ static struct dentry *debugfs_create_io_x64(const char *name, umode_t mode, void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter, struct dentry *dir) { debugfs_create_io_x64("fir1", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR1)); - debugfs_create_io_x64("fir2", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR2)); + debugfs_create_io_x64("fir_mask", 0400, dir, + _cxl_p1_addr(adapter, CXL_PSL9_FIR_MASK)); debugfs_create_io_x64("fir_cntl", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR_CNTL)); debugfs_create_io_x64("trace", S_IRUSR | S_IWUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_TRACECFG)); + debugfs_create_io_x64("debug", 0600, dir, + _cxl_p1_addr(adapter, CXL_PSL9_DEBUG)); + debugfs_create_io_x64("xsl-debug", 0600, dir, + _cxl_p1_addr(adapter, CXL_XSL9_DBG)); } void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter, struct dentry *dir) diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index f17f72ea0545..70dbb6de102c 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -220,22 +220,11 @@ static bool cxl_is_segment_miss(struct cxl_context *ctx, u64 dsisr) static bool cxl_is_page_fault(struct cxl_context *ctx, u64 dsisr) { - u64 crs; /* Translation Checkout Response Status */ - if ((cxl_is_power8()) && (dsisr & CXL_PSL_DSISR_An_DM)) return true; - if (cxl_is_power9()) { - crs = (dsisr & CXL_PSL9_DSISR_An_CO_MASK); - if ((crs == CXL_PSL9_DSISR_An_PF_SLR) || - (crs == CXL_PSL9_DSISR_An_PF_RGC) || - (crs == CXL_PSL9_DSISR_An_PF_RGP) || - (crs == CXL_PSL9_DSISR_An_PF_HRH) || - (crs == CXL_PSL9_DSISR_An_PF_STEG) || - (crs == CXL_PSL9_DSISR_An_URTCH)) { - return true; - } - } + if (cxl_is_power9()) + return true; return false; } diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c index 4bfad9f6dc9f..76c0b0ca9388 100644 --- a/drivers/misc/cxl/file.c +++ b/drivers/misc/cxl/file.c @@ -19,6 +19,7 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/sched/mm.h> +#include <linux/mmu_context.h> #include <asm/cputable.h> #include <asm/current.h> #include <asm/copro.h> @@ -220,9 +221,12 @@ static long afu_ioctl_start_work(struct cxl_context *ctx, /* ensure this mm_struct can't be freed */ cxl_context_mm_count_get(ctx); - /* decrement the use count */ - if (ctx->mm) + if (ctx->mm) { + /* decrement the use count from above */ mmput(ctx->mm); + /* make TLBIs for this context global */ + mm_context_add_copro(ctx->mm); + } /* * Increment driver use count. Enables global TLBIs for hash @@ -230,6 +234,20 @@ static long afu_ioctl_start_work(struct cxl_context *ctx, */ cxl_ctx_get(); + /* + * A barrier is needed to make sure all TLBIs are global + * before we attach and the context starts being used by the + * adapter. + * + * Needed after mm_context_add_copro() for radix and + * cxl_ctx_get() for hash/p8. + * + * The barrier should really be mb(), since it involves a + * device. 
However, it's only useful when we have local + vs. global TLBIs, i.e. SMP=y. So keep smp_mb(). + */ + smp_mb(); + trace_cxl_attach(ctx, work.work_element_descriptor, work.num_interrupts, amr); if ((rc = cxl_ops->attach_process(ctx, false, work.work_element_descriptor, @@ -240,6 +258,8 @@ static long afu_ioctl_start_work(struct cxl_context *ctx, ctx->pid = NULL; cxl_ctx_put(); cxl_context_mm_count_put(ctx); + if (ctx->mm) + mm_context_remove_copro(ctx->mm); goto out; } diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index 4a82c313cf71..02b6b45b4c20 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -897,6 +897,14 @@ int cxl_attach_dedicated_process_psl9(struct cxl_context *ctx, u64 wed, u64 amr) if (ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes) afu->adapter->native->sl_ops->update_dedicated_ivtes(ctx); + ctx->elem->software_state = cpu_to_be32(CXL_PE_SOFTWARE_STATE_V); + /* + * Ideally we should do a wmb() here to make sure the changes to the + * PE are visible to the card before we call afu_enable. + * On ppc64 though all mmios are preceded by a 'sync' instruction hence + * we don't need one here. + */ + result = cxl_ops->afu_reset(afu); if (result) return result; @@ -1077,13 +1085,11 @@ static int native_get_irq_info(struct cxl_afu *afu, struct cxl_irq_info *info) void cxl_native_irq_dump_regs_psl9(struct cxl_context *ctx) { - u64 fir1, fir2, serr; + u64 fir1, serr; fir1 = cxl_p1_read(ctx->afu->adapter, CXL_PSL9_FIR1); - fir2 = cxl_p1_read(ctx->afu->adapter, CXL_PSL9_FIR2); dev_crit(&ctx->afu->dev, "PSL_FIR1: 0x%016llx\n", fir1); - dev_crit(&ctx->afu->dev, "PSL_FIR2: 0x%016llx\n", fir2); if (ctx->afu->adapter->native->sl_ops->register_serr_irq) { serr = cxl_p1n_read(ctx->afu, CXL_PSL_SERR_An); cxl_afu_decode_psl_serr(ctx->afu, serr); @@ -1257,14 +1263,23 @@ static irqreturn_t native_slice_irq_err(int irq, void *data) return IRQ_HANDLED; } -void cxl_native_err_irq_dump_regs(struct cxl *adapter) +void cxl_native_err_irq_dump_regs_psl9(struct cxl *adapter) +{ + u64 fir1; + + fir1 = cxl_p1_read(adapter, CXL_PSL9_FIR1); + dev_crit(&adapter->dev, "PSL_FIR: 0x%016llx\n", fir1); +} + +void cxl_native_err_irq_dump_regs_psl8(struct cxl *adapter) { u64 fir1, fir2; fir1 = cxl_p1_read(adapter, CXL_PSL_FIR1); fir2 = cxl_p1_read(adapter, CXL_PSL_FIR2); - - dev_crit(&adapter->dev, "PSL_FIR1: 0x%016llx\nPSL_FIR2: 0x%016llx\n", fir1, fir2); + dev_crit(&adapter->dev, + "PSL_FIR1: 0x%016llx\nPSL_FIR2: 0x%016llx\n", + fir1, fir2); } static irqreturn_t native_irq_err(int irq, void *data)
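The two ordering comments in the hunks above (the smp_mb() before attach_process(), and the implicit 'sync' preceding each ppc64 MMIO) both follow the usual publish-then-signal pattern: make every side effect visible before the store or MMIO that lets another agent start observing them. A minimal userspace analogue, using C11 fences in place of the kernel barriers; all names here are hypothetical and are not part of the cxl API:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Hypothetical stand-in for the state published to the device. */
struct ctx_state {
	int tlbi_global;	/* mm_context_add_copro() side effect */
	atomic_int attached;	/* stands in for the attach MMIO */
};

static struct ctx_state ctx;

static void *attacher(void *arg)
{
	ctx.tlbi_global = 1;
	/* Full fence: analogue of the smp_mb() before attaching. */
	atomic_thread_fence(memory_order_seq_cst);
	atomic_store_explicit(&ctx.attached, 1, memory_order_relaxed);
	return NULL;
}

static void *observer(void *arg)
{
	/* Spin until the "attach" is visible, then look at the state. */
	while (!atomic_load_explicit(&ctx.attached, memory_order_relaxed))
		;
	atomic_thread_fence(memory_order_seq_cst);
	printf("tlbi_global=%d\n", ctx.tlbi_global);	/* must print 1 */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, attacher, NULL);
	pthread_create(&b, NULL, observer, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}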
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 3ba04f371380..bb7fd3f4edab 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -401,7 +401,8 @@ int cxl_calc_capp_routing(struct pci_dev *dev, u64 *chipid, *capp_unit_id = get_capp_unit_id(np, *phb_index); of_node_put(np); if (!*capp_unit_id) { - pr_err("cxl: invalid capp unit id\n"); + pr_err("cxl: invalid capp unit id (phb_index: %d)\n", + *phb_index); return -ENODEV; } @@ -475,37 +476,37 @@ static int init_implementation_adapter_regs_psl9(struct cxl *adapter, psl_fircntl |= 0x1ULL; /* ce_thresh */ cxl_p1_write(adapter, CXL_PSL9_FIR_CNTL, psl_fircntl); - /* vccredits=0x1 pcklat=0x4 */ - cxl_p1_write(adapter, CXL_PSL9_DSNDCTL, 0x0000000000001810ULL); - - /* - * For debugging with trace arrays. - * Configure RX trace 0 segmented mode. - * Configure CT trace 0 segmented mode. - * Configure LA0 trace 0 segmented mode. - * Configure LA1 trace 0 segmented mode. + /* Setup the PSL to transmit packets on the PCIe before the + CAPP is enabled */ - cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000000ULL); - cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000003ULL); - cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000005ULL); - cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000006ULL); + cxl_p1_write(adapter, CXL_PSL9_DSNDCTL, 0x0001001000002A10ULL); /* * A response to an ASB_Notify request is returned by the * system as an MMIO write to the address defined in - * the PSL_TNR_ADDR register + * the PSL_TNR_ADDR register. + * keep the Reset Value: 0x00020000E0000000 */ - /* PSL_TNR_ADDR */ - /* NORST */ - cxl_p1_write(adapter, CXL_PSL9_DEBUG, 0x8000000000000000ULL); + /* Enable XSL rty limit */ + cxl_p1_write(adapter, CXL_XSL9_DEF, 0x51F8000000000005ULL); + + /* Change XSL_INV dummy read threshold */ + cxl_p1_write(adapter, CXL_XSL9_INV, 0x0000040007FFC200ULL); + + if (phb_index == 3) { + /* disable machines 31-47 and 20-27 for DMA */ + cxl_p1_write(adapter, CXL_PSL9_APCDEDTYPE, 0x40000FF3FFFF0000ULL); + } - /* allocate the apc machines */ - cxl_p1_write(adapter, CXL_PSL9_APCDEDTYPE, 0x40000003FFFF0000ULL); + /* Snoop machines */ + cxl_p1_write(adapter, CXL_PSL9_APCDEDALLOC, 0x800F000200000000ULL); - /* Disable vc dd1 fix */ - if (cxl_is_power9_dd1()) - cxl_p1_write(adapter, CXL_PSL9_GP_CT, 0x0400000000000001ULL); + if (cxl_is_power9_dd1()) { + /* Disabling deadlock counter CAR */ + cxl_p1_write(adapter, CXL_PSL9_GP_CT, 0x0020000000000001ULL); + } else + cxl_p1_write(adapter, CXL_PSL9_DEBUG, 0x4000000000000000ULL); return 0; } @@ -1746,6 +1747,44 @@ static void cxl_deconfigure_adapter(struct cxl *adapter) pci_disable_device(pdev); } +static void cxl_stop_trace_psl9(struct cxl *adapter) +{ + int traceid; + u64 trace_state, trace_mask; + struct pci_dev *dev = to_pci_dev(adapter->dev.parent); + + /* read each trace array state and issue an mmio to stop it if needed */ + for (traceid = 0; traceid <= CXL_PSL9_TRACEID_MAX; ++traceid) { + trace_state = cxl_p1_read(adapter, CXL_PSL9_CTCCFG); + trace_mask = (0x3ULL << (62 - traceid * 2)); + trace_state = (trace_state & trace_mask) >> (62 - traceid * 2); + dev_dbg(&dev->dev, "cxl: Traceid-%d trace_state=0x%0llX\n", + traceid, trace_state); + + /* issue mmio if the trace array isn't in FIN state */ + if (trace_state != CXL_PSL9_TRACESTATE_FIN) + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, + 0x8400000000000000ULL | traceid); + } +} + +static void cxl_stop_trace_psl8(struct cxl *adapter) +{ + int slice; + + /* Stop the trace */ + cxl_p1_write(adapter, CXL_PSL_TRACE, 0x8000000000000017LL); + + /* Stop the slice traces */ + spin_lock(&adapter->afu_list_lock); + for (slice = 0; slice < adapter->slices; slice++) { + if (adapter->afu[slice]) + cxl_p1n_write(adapter->afu[slice], CXL_PSL_SLICE_TRACE, + 0x8000000000000000LL); + } + spin_unlock(&adapter->afu_list_lock); +} +
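For reference, the per-trace-array state that cxl_stop_trace_psl9() reads back from CXL_PSL9_CTCCFG is a packed array of 2-bit fields, with traceid 0 in the top bits. A standalone sketch of just the extraction arithmetic; the register value below is made up for illustration:

#include <stdint.h>
#include <stdio.h>

#define CXL_PSL9_TRACEID_MAX	0xAU
#define CXL_PSL9_TRACESTATE_FIN	0x3U

/* Extract the 2-bit state for a given trace array from a CTCCFG value. */
static unsigned int trace_state(uint64_t ctccfg, unsigned int traceid)
{
	unsigned int shift = 62 - traceid * 2;

	return (ctccfg >> shift) & 0x3U;
}

int main(void)
{
	/* Made-up value: traceid 0 is FIN (0x3), traceid 1 is 0x1. */
	uint64_t ctccfg = (0x3ULL << 62) | (0x1ULL << 60);
	unsigned int id;

	for (id = 0; id <= CXL_PSL9_TRACEID_MAX; id++)
		printf("traceid %u: state=0x%x%s\n", id,
		       trace_state(ctccfg, id),
		       trace_state(ctccfg, id) == CXL_PSL9_TRACESTATE_FIN
			       ? " (FIN, nothing to stop)" : "");
	return 0;
}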
static const struct cxl_service_layer_ops psl9_ops = { .adapter_regs_init = init_implementation_adapter_regs_psl9, .invalidate_all = cxl_invalidate_all_psl9, @@ -1762,6 +1801,7 @@ static const struct cxl_service_layer_ops psl9_ops = { .debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_psl9, .debugfs_add_afu_regs = cxl_debugfs_add_afu_regs_psl9, .psl_irq_dump_registers = cxl_native_irq_dump_regs_psl9, + .err_irq_dump_registers = cxl_native_err_irq_dump_regs_psl9, .debugfs_stop_trace = cxl_stop_trace_psl9, .write_timebase_ctrl = write_timebase_ctrl_psl9, .timebase_read = timebase_read_psl9, @@ -1785,7 +1825,7 @@ static const struct cxl_service_layer_ops psl8_ops = { .debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_psl8, .debugfs_add_afu_regs = cxl_debugfs_add_afu_regs_psl8, .psl_irq_dump_registers = cxl_native_irq_dump_regs_psl8, - .err_irq_dump_registers = cxl_native_err_irq_dump_regs, + .err_irq_dump_registers = cxl_native_err_irq_dump_regs_psl8, .debugfs_stop_trace = cxl_stop_trace_psl8, .write_timebase_ctrl = write_timebase_ctrl_psl8, .timebase_read = timebase_read_psl8, diff --git a/drivers/mtd/devices/powernv_flash.c b/drivers/mtd/devices/powernv_flash.c index f5396f26ddb4..26f9feaa5d17 100644 --- a/drivers/mtd/devices/powernv_flash.c +++ b/drivers/mtd/devices/powernv_flash.c @@ -47,6 +47,11 @@ enum flash_op { FLASH_OP_ERASE, }; +/* + * Don't return -ERESTARTSYS if we can't get a token: the MTD core + * might have split up the call from userspace and called into the + * driver more than once, so we'll already have done some amount of work. + */ static int powernv_flash_async_op(struct mtd_info *mtd, enum flash_op op, loff_t offset, size_t len, size_t *retlen, u_char *buf) { @@ -63,7 +68,8 @@ static int powernv_flash_async_op(struct mtd_info *mtd, enum flash_op op, if (token < 0) { if (token != -ERESTARTSYS) dev_err(dev, "Failed to get an async token\n"); - + else + token = -EINTR; return token; } @@ -78,32 +84,53 @@ static int powernv_flash_async_op(struct mtd_info *mtd, enum flash_op op, rc = opal_flash_erase(info->id, offset, len, token); break; default: - BUG_ON(1); - } - - if (rc != OPAL_ASYNC_COMPLETION) { - dev_err(dev, "opal_flash_async_op(op=%d) failed (rc %d)\n", - op, rc); + WARN_ON_ONCE(1); opal_async_release_token(token); return -EIO; } - rc = opal_async_wait_response(token, &msg); - opal_async_release_token(token); - if (rc) { - dev_err(dev, "opal async wait failed (rc %d)\n", rc); - return -EIO; + if (rc == OPAL_ASYNC_COMPLETION) { + rc = opal_async_wait_response_interruptible(token, &msg); + if (rc) { + /* + * If we return, the mtd core will free the + * buffer we've just passed to OPAL, but OPAL + * will continue to read or write from that + * memory. + * It may be tempting to ultimately return 0 + * if we're doing a read or a write since we + * are going to end up waiting until OPAL is + * done. However, because the MTD core sends + * us the userspace request in chunks, we need + * it to know we've been interrupted. + */ + rc = -EINTR; + if (opal_async_wait_response(token, &msg)) + dev_err(dev, "opal_async_wait_response() failed\n"); + goto out; + } + rc = opal_get_async_rc(msg); } - rc = opal_get_async_rc(msg); - if (rc == OPAL_SUCCESS) { - rc = 0; - if (retlen) - *retlen = len; - } else { - rc = -EIO; - } + /* + * OPAL does mutual exclusion on the flash; it will return + * OPAL_BUSY. + * During firmware updates by the service processor OPAL may + * be (temporarily) prevented from accessing the flash; in + * this case OPAL will also return OPAL_BUSY. + * Neither case is exactly an error, but the flash could have + * changed, so userspace should be informed.
+ */ + if (rc != OPAL_SUCCESS && rc != OPAL_BUSY) + dev_err(dev, "opal_flash_async_op(op=%d) failed (rc %d)\n", + op, rc); + + if (rc == OPAL_SUCCESS && retlen) + *retlen = len; + rc = opal_error_code(rc); +out: + opal_async_release_token(token); return rc; } @@ -220,21 +247,20 @@ static int powernv_flash_probe(struct platform_device *pdev) int ret; data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); - if (!data) { - ret = -ENOMEM; - goto out; - } + if (!data) + return -ENOMEM; + data->mtd.priv = data; ret = of_property_read_u32(dev->of_node, "ibm,opal-id", &(data->id)); if (ret) { dev_err(dev, "no device property 'ibm,opal-id'\n"); - goto out; + return ret; } ret = powernv_flash_set_driver_info(dev, &data->mtd); if (ret) - goto out; + return ret; dev_set_drvdata(dev, data); @@ -243,10 +269,7 @@ static int powernv_flash_probe(struct platform_device *pdev) * with an ffs partition at the start, it should prove easier for users * to deal with partitions or not as they see fit */ - ret = mtd_device_register(&data->mtd, NULL, 0); - -out: - return ret; + return mtd_device_register(&data->mtd, NULL, 0); } /** diff --git a/tools/testing/selftests/powerpc/benchmarks/context_switch.c b/tools/testing/selftests/powerpc/benchmarks/context_switch.c index f4241339edd2..87f1f0252299 100644 --- a/tools/testing/selftests/powerpc/benchmarks/context_switch.c +++ b/tools/testing/selftests/powerpc/benchmarks/context_switch.c @@ -10,6 +10,7 @@ */ #define _GNU_SOURCE +#include <errno.h> #include <sched.h> #include <string.h> #include <stdio.h> @@ -75,6 +76,7 @@ static void touch(void) static void start_thread_on(void *(*fn)(void *), void *arg, unsigned long cpu) { + int rc; pthread_t tid; cpu_set_t cpuset; pthread_attr_t attr; @@ -82,14 +84,23 @@ static void start_thread_on(void *(*fn)(void *), void *arg, unsigned long cpu) CPU_ZERO(&cpuset); CPU_SET(cpu, &cpuset); - pthread_attr_init(&attr); + rc = pthread_attr_init(&attr); + if (rc) { + errno = rc; + perror("pthread_attr_init"); + exit(1); + } - if (pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset)) { + rc = pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset); + if (rc) { + errno = rc; perror("pthread_attr_setaffinity_np"); exit(1); } - if (pthread_create(&tid, &attr, fn, arg)) { + rc = pthread_create(&tid, &attr, fn, arg); + if (rc) { + errno = rc; perror("pthread_create"); exit(1); } diff --git a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c index 17fb1b43c320..1899bd85121f 100644 --- a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c +++ b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c @@ -53,6 +53,8 @@ static int check_all_cpu_dscr_defaults(unsigned long val) } while ((dp = readdir(sysfs))) { + int len; + if (!(dp->d_type & DT_DIR)) continue; if (!strcmp(dp->d_name, "cpuidle")) @@ -60,7 +62,9 @@ static int check_all_cpu_dscr_defaults(unsigned long val) if (!strstr(dp->d_name, "cpu")) continue; - sprintf(file, "%s%s/dscr", CPU_PATH, dp->d_name); + len = snprintf(file, LEN_MAX, "%s%s/dscr", CPU_PATH, dp->d_name); + if (len >= LEN_MAX) + continue; if (access(file, F_OK)) continue; diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore index 2f1f7b013293..241a4a4ee0e4 100644 --- a/tools/testing/selftests/powerpc/tm/.gitignore +++ b/tools/testing/selftests/powerpc/tm/.gitignore @@ -12,3 +12,4 @@ tm-signal-context-chk-gpr tm-signal-context-chk-vmx tm-signal-context-chk-vsx tm-vmx-unavail +tm-unavailable diff 
--git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index 7bfcd454fb2a..24855c0eee53 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -2,7 +2,7 @@ SIGNAL_CONTEXT_CHK_TESTS := tm-signal-context-chk-gpr tm-signal-context-chk-fpu tm-signal-context-chk-vmx tm-signal-context-chk-vsx TEST_GEN_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack \ - tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail \ + tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail tm-unavailable \ $(SIGNAL_CONTEXT_CHK_TESTS) include ../../lib.mk @@ -16,6 +16,7 @@ $(OUTPUT)/tm-syscall: CFLAGS += -I../../../../../usr/include $(OUTPUT)/tm-tmspr: CFLAGS += -pthread $(OUTPUT)/tm-vmx-unavail: CFLAGS += -pthread -m64 $(OUTPUT)/tm-resched-dscr: ../pmu/lib.o +$(OUTPUT)/tm-unavailable: CFLAGS += -O0 -pthread -m64 -Wno-error=uninitialized -mvsx SIGNAL_CONTEXT_CHK_TESTS := $(patsubst %,$(OUTPUT)/%,$(SIGNAL_CONTEXT_CHK_TESTS)) $(SIGNAL_CONTEXT_CHK_TESTS): tm-signal.S diff --git a/tools/testing/selftests/powerpc/tm/tm-unavailable.c b/tools/testing/selftests/powerpc/tm/tm-unavailable.c new file mode 100644 index 000000000000..96c37f84ce54 --- /dev/null +++ b/tools/testing/selftests/powerpc/tm/tm-unavailable.c @@ -0,0 +1,371 @@ +/* + * Copyright 2017, Gustavo Romero, Breno Leitao, Cyril Bur, IBM Corp. + * Licensed under GPLv2. + * + * Force FP, VEC and VSX unavailable exceptions during a transaction in all + * possible scenarios regarding the MSR.FP and MSR.VEC state, e.g. when FP + * is enabled and VEC is disabled, when FP is disabled and VEC is enabled, + * and so on. Then we check if the restored state of the FP and VEC registers + * matches the state we set just before we entered TM, i.e. we check whether + * the recheckpointed FP and VEC/Altivec registers somehow got corrupted on an + * abort due to an unavailable exception in TM. + * N.B. In this test we do not check all the FP/Altivec/VSX registers for + * corruption, but only registers vs0 and vs32, which are respectively + * representatives of the FP and VEC/Altivec reg sets. + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <inttypes.h> +#include <stdbool.h> +#include <pthread.h> +#include <sched.h> + +#include "tm.h" + +#define DEBUG 0 + +/* Unavailable exceptions to test in HTM */ +#define FP_UNA_EXCEPTION 0 +#define VEC_UNA_EXCEPTION 1 +#define VSX_UNA_EXCEPTION 2 + +#define NUM_EXCEPTIONS 3 + +struct Flags { + int touch_fp; + int touch_vec; + int result; + int exception; +} flags; + +bool expecting_failure(void) +{ + if (flags.touch_fp && flags.exception == FP_UNA_EXCEPTION) + return false; + + if (flags.touch_vec && flags.exception == VEC_UNA_EXCEPTION) + return false; + + /* + * If both FP and VEC are touched it does not mean that touching VSX + * won't raise an exception. However since FP and VEC state are already + * correctly loaded, the transaction is not aborted (i.e. + * treclaimed/trecheckpointed) and MSR.VSX is just set as 1, so a TM + * failure is not expected in this case either. + */ + if ((flags.touch_fp && flags.touch_vec) && + flags.exception == VSX_UNA_EXCEPTION) + return false; + + return true; +} + +/* Check if failure occurred whilst in transaction. */ +bool is_failure(uint64_t condition_reg) +{ + /* + * When failure handling occurs, CR0 is set to 0b1010 (0xa). Otherwise + * the transaction completes without failure and hence reaches 'tend.', + * which sets CR0 to 0b0100 (0x4).
+ */ + return ((condition_reg >> 28) & 0xa) == 0xa; +} + +void *ping(void *input) +{ + + /* + * Expected values for vs0 and vs32 after a TM failure. They must never + * change, otherwise they have been corrupted. + */ + uint64_t high_vs0 = 0x5555555555555555; + uint64_t low_vs0 = 0xffffffffffffffff; + uint64_t high_vs32 = 0x5555555555555555; + uint64_t low_vs32 = 0xffffffffffffffff; + + /* Counter for busy wait */ + uint64_t counter = 0x1ff000000; + + /* + * Variable to keep a copy of CR register content taken just after we + * leave the transactional state. + */ + uint64_t cr_ = 0; + + /* + * Wait a bit so the thread can get its name "ping". This is not important + * to reproduce the issue but it's nice to have for systemtap debugging. + */ + if (DEBUG) + sleep(1); + + printf("If MSR.FP=%d MSR.VEC=%d: ", flags.touch_fp, flags.touch_vec); + + if (flags.exception != FP_UNA_EXCEPTION && + flags.exception != VEC_UNA_EXCEPTION && + flags.exception != VSX_UNA_EXCEPTION) { + printf("No valid exception specified to test.\n"); + return NULL; + } + + asm ( + /* Prepare to merge low and high. */ + " mtvsrd 33, %[high_vs0] ;" + " mtvsrd 34, %[low_vs0] ;" + + /* + * Adjust VS0 expected value after a TM failure, + * i.e. vs0 = 0x5555555555555555FFFFFFFFFFFFFFFF + */ + " xxmrghd 0, 33, 34 ;" + + /* + * Adjust VS32 expected value after a TM failure, + * i.e. vs32 = 0x5555555555555555FFFFFFFFFFFFFFFF + */ + " xxmrghd 32, 33, 34 ;" + + /* + * Wait for enough context switches so that load_fp and load_vec + * overflow and MSR.FP, MSR.VEC, and MSR.VSX become zero (off). + */ + " mtctr %[counter] ;" + + /* Decrement CTR; branch if CTR is non-zero. */ + "1: bdnz 1b ;" + + /* + * Check if we want to touch FP prior to the test in order + * to set MSR.FP = 1 before provoking an unavailable + * exception in TM. + */ + " cmpldi %[touch_fp], 0 ;" + " beq no_fp ;" + " fadd 10, 10, 10 ;" + "no_fp: ;" + + /* + * Check if we want to touch VEC prior to the test in order + * to set MSR.VEC = 1 before provoking an unavailable + * exception in TM. + */ + " cmpldi %[touch_vec], 0 ;" + " beq no_vec ;" + " vaddcuw 10, 10, 10 ;" + "no_vec: ;" + + /* + * Perhaps it would be a better idea to do the + * compares outside transactional context and simply + * duplicate code. + */ + " tbegin. ;" + " beq trans_fail ;" + + /* Do we do FP Unavailable? */ + " cmpldi %[exception], %[ex_fp] ;" + " bne 1f ;" + " fadd 10, 10, 10 ;" + " b done ;" + + /* Do we do VEC Unavailable? */ + "1: cmpldi %[exception], %[ex_vec] ;" + " bne 2f ;" + " vaddcuw 10, 10, 10 ;" + " b done ;" + + /* + * Not FP or VEC, therefore VSX. Ensure this + * instruction always generates a VSX Unavailable. + * ISA 3.0 is tricky here. + * (xxmrghd will generate one on both ISA 2.07 and ISA 3.0) + */ + "2: xxmrghd 10, 10, 10 ;" + + "done: tend. ;" + + "trans_fail: ;" + + /* Give values back to C. */ + " mfvsrd %[high_vs0], 0 ;" + " xxsldwi 3, 0, 0, 2 ;" + " mfvsrd %[low_vs0], 3 ;" + " mfvsrd %[high_vs32], 32 ;" + " xxsldwi 3, 32, 32, 2 ;" + " mfvsrd %[low_vs32], 3 ;" + + /* Give CR back to C so that it can check what happened. */ + " mfcr %[cr_] ;" + + : [high_vs0] "+r" (high_vs0), + [low_vs0] "+r" (low_vs0), + [high_vs32] "=r" (high_vs32), + [low_vs32] "=r" (low_vs32), + [cr_] "+r" (cr_) + : [touch_fp] "r" (flags.touch_fp), + [touch_vec] "r" (flags.touch_vec), + [exception] "r" (flags.exception), + [ex_fp] "i" (FP_UNA_EXCEPTION), + [ex_vec] "i" (VEC_UNA_EXCEPTION), + [ex_vsx] "i" (VSX_UNA_EXCEPTION), + [counter] "r" (counter) + + : "cr0", "ctr", "v10", "vs0", "vs10", "vs3", "vs32", "vs33", + "vs34", "fr10" + + ); + + /* + * Check if we were expecting a failure and it did not occur by checking + * CR0 state just after we leave the transaction. Either way we check if + * vs0 or vs32 got corrupted. + */ + if (expecting_failure() && !is_failure(cr_)) { + printf("\n\tExpecting the transaction to fail, %s", + "but it didn't\n\t"); + flags.result++; + } + + /* Check if we were not expecting a failure and it occurred. */ + if (!expecting_failure() && is_failure(cr_)) { + printf("\n\tUnexpected transaction failure 0x%02lx\n\t", + failure_code()); + return (void *) -1; + } + + /* + * Check if TM failed due to the cause we were expecting. 0xda is a + * TM_CAUSE_FAC_UNAV cause, otherwise it's an unexpected cause. + */ + if (is_failure(cr_) && !failure_is_unavailable()) { + printf("\n\tUnexpected failure cause 0x%02lx\n\t", + failure_code()); + return (void *) -1; + } + + /* 0x4 is a success and 0xa is a fail. See comment in is_failure(). */ + if (DEBUG) + printf("CR0: 0x%1lx ", cr_ >> 28); + + /* Check FP (vs0) for the expected value. */ + if (high_vs0 != 0x5555555555555555 || low_vs0 != 0xFFFFFFFFFFFFFFFF) { + printf("FP corrupted!"); + printf(" high = %#16" PRIx64 " low = %#16" PRIx64 " ", + high_vs0, low_vs0); + flags.result++; + } else + printf("FP ok "); + + /* Check VEC (vs32) for the expected value. */ + if (high_vs32 != 0x5555555555555555 || low_vs32 != 0xFFFFFFFFFFFFFFFF) { + printf("VEC corrupted!"); + printf(" high = %#16" PRIx64 " low = %#16" PRIx64, + high_vs32, low_vs32); + flags.result++; + } else + printf("VEC ok"); + + putchar('\n'); + + return NULL; +}
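The CR0 convention that is_failure() relies on can be exercised in isolation: per the comment in the test, the failure handler sets CR0 to 0b1010 and the success path through 'tend.' leaves 0b0100. A small sketch with sample values, no TM hardware needed:

#include <stdint.h>
#include <stdio.h>

/* Same check as is_failure() in tm-unavailable.c: bits 0b1010 set in
 * CR0 mean the failure handler ran; tend. on the success path leaves
 * CR0 as 0b0100. CR0 lives in the top nibble of the mfcr result. */
static int cr0_is_failure(uint64_t cr)
{
	return ((cr >> 28) & 0xa) == 0xa;
}

int main(void)
{
	uint64_t cr_fail = 0xaULL << 28;	/* CR0 = 0b1010: aborted */
	uint64_t cr_ok   = 0x4ULL << 28;	/* CR0 = 0b0100: committed */

	printf("0xa -> %d (expect 1)\n", cr0_is_failure(cr_fail));
	printf("0x4 -> %d (expect 0)\n", cr0_is_failure(cr_ok));
	return 0;
}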
*/ + " mfcr %[cr_] ;" + + : [high_vs0] "+r" (high_vs0), + [low_vs0] "+r" (low_vs0), + [high_vs32] "=r" (high_vs32), + [low_vs32] "=r" (low_vs32), + [cr_] "+r" (cr_) + : [touch_fp] "r" (flags.touch_fp), + [touch_vec] "r" (flags.touch_vec), + [exception] "r" (flags.exception), + [ex_fp] "i" (FP_UNA_EXCEPTION), + [ex_vec] "i" (VEC_UNA_EXCEPTION), + [ex_vsx] "i" (VSX_UNA_EXCEPTION), + [counter] "r" (counter) + + : "cr0", "ctr", "v10", "vs0", "vs10", "vs3", "vs32", "vs33", + "vs34", "fr10" + + ); + + /* + * Check if we were expecting a failure and it did not occur by checking + * CR0 state just after we leave the transaction. Either way we check if + * vs0 or vs32 got corrupted. + */ + if (expecting_failure() && !is_failure(cr_)) { + printf("\n\tExpecting the transaction to fail, %s", + "but it didn't\n\t"); + flags.result++; + } + + /* Check if we were not expecting a failure and a it occurred. */ + if (!expecting_failure() && is_failure(cr_)) { + printf("\n\tUnexpected transaction failure 0x%02lx\n\t", + failure_code()); + return (void *) -1; + } + + /* + * Check if TM failed due to the cause we were expecting. 0xda is a + * TM_CAUSE_FAC_UNAV cause, otherwise it's an unexpected cause. + */ + if (is_failure(cr_) && !failure_is_unavailable()) { + printf("\n\tUnexpected failure cause 0x%02lx\n\t", + failure_code()); + return (void *) -1; + } + + /* 0x4 is a success and 0xa is a fail. See comment in is_failure(). */ + if (DEBUG) + printf("CR0: 0x%1lx ", cr_ >> 28); + + /* Check FP (vs0) for the expected value. */ + if (high_vs0 != 0x5555555555555555 || low_vs0 != 0xFFFFFFFFFFFFFFFF) { + printf("FP corrupted!"); + printf(" high = %#16" PRIx64 " low = %#16" PRIx64 " ", + high_vs0, low_vs0); + flags.result++; + } else + printf("FP ok "); + + /* Check VEC (vs32) for the expected value. */ + if (high_vs32 != 0x5555555555555555 || low_vs32 != 0xFFFFFFFFFFFFFFFF) { + printf("VEC corrupted!"); + printf(" high = %#16" PRIx64 " low = %#16" PRIx64, + high_vs32, low_vs32); + flags.result++; + } else + printf("VEC ok"); + + putchar('\n'); + + return NULL; +} + +/* Thread to force context switch */ +void *pong(void *not_used) +{ + /* Wait thread get its name "pong". */ + if (DEBUG) + sleep(1); + + /* Classed as an interactive-like thread. */ + while (1) + sched_yield(); +} + +/* Function that creates a thread and launches the "ping" task. */ +void test_fp_vec(int fp, int vec, pthread_attr_t *attr) +{ + int retries = 2; + void *ret_value; + pthread_t t0; + + flags.touch_fp = fp; + flags.touch_vec = vec; + + /* + * Without luck it's possible that the transaction is aborted not due to + * the unavailable exception caught in the middle as we expect but also, + * for instance, due to a context switch or due to a KVM reschedule (if + * it's running on a VM). Thus we try a few times before giving up, + * checking if the failure cause is the one we expect. + */ + do { + /* Bind 'ping' to CPU 0, as specified in 'attr'. */ + pthread_create(&t0, attr, ping, (void *) &flags); + pthread_setname_np(t0, "ping"); + pthread_join(t0, &ret_value); + retries--; + } while (ret_value != NULL && retries); + + if (!retries) { + flags.result = 1; + if (DEBUG) + printf("All transactions failed unexpectedly\n"); + + } +} + +int main(int argc, char **argv) +{ + int exception; /* FP = 0, VEC = 1, VSX = 2 */ + pthread_t t1; + pthread_attr_t attr; + cpu_set_t cpuset; + + /* Set only CPU 0 in the mask. Both threads will be bound to CPU 0. */ + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + + /* Init pthread attribute. 
+ +int main(int argc, char **argv) +{ + int exception; /* FP = 0, VEC = 1, VSX = 2 */ + pthread_t t1; + pthread_attr_t attr; + cpu_set_t cpuset; + + /* Set only CPU 0 in the mask. Both threads will be bound to CPU 0. */ + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + + /* Init pthread attribute. */ + pthread_attr_init(&attr); + + /* Set CPU 0 mask into the pthread attribute. */ + pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset); + + pthread_create(&t1, &attr /* Bind 'pong' to CPU 0 */, pong, NULL); + pthread_setname_np(t1, "pong"); /* Name it for systemtap convenience */ + + flags.result = 0; + + for (exception = 0; exception < NUM_EXCEPTIONS; exception++) { + printf("Checking if FP/VEC registers are sane after"); + + if (exception == FP_UNA_EXCEPTION) + printf(" an FP unavailable exception...\n"); + + else if (exception == VEC_UNA_EXCEPTION) + printf(" a VEC unavailable exception...\n"); + + else + printf(" a VSX unavailable exception...\n"); + + flags.exception = exception; + + test_fp_vec(0, 0, &attr); + test_fp_vec(1, 0, &attr); + test_fp_vec(0, 1, &attr); + test_fp_vec(1, 1, &attr); + + } + + if (flags.result > 0) { + printf("result: failed!\n"); + exit(1); + } else { + printf("result: success\n"); + exit(0); + } +} diff --git a/tools/testing/selftests/powerpc/tm/tm.h b/tools/testing/selftests/powerpc/tm/tm.h index 0ffff04433c5..df4204247d45 100644 --- a/tools/testing/selftests/powerpc/tm/tm.h +++ b/tools/testing/selftests/powerpc/tm/tm.h @@ -47,6 +47,11 @@ static inline bool failure_is_syscall(void) return (failure_code() & TM_CAUSE_SYSCALL) == TM_CAUSE_SYSCALL; } +static inline bool failure_is_unavailable(void) +{ + return (failure_code() & TM_CAUSE_FAC_UNAV) == TM_CAUSE_FAC_UNAV; +} + static inline bool failure_is_nesting(void) { return (__builtin_get_texasru() & 0x400000); }
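failure_is_unavailable() follows the same mask-and-compare idiom as the other tm.h helpers. The comment in tm-unavailable.c pins TM_CAUSE_FAC_UNAV at 0xda; a sketch of the idiom with sample cause codes (on real hardware failure_code() derives the cause from TEXASR, here we just pass in values):

#include <stdint.h>
#include <stdio.h>

/* Per the tm-unavailable.c comment above, TM_CAUSE_FAC_UNAV is 0xda. */
#define TM_CAUSE_FAC_UNAV 0xda

/* Same mask-and-compare check as the tm.h helper, on a supplied code. */
static int failure_is_unavailable(uint64_t failure_code)
{
	return (failure_code & TM_CAUSE_FAC_UNAV) == TM_CAUSE_FAC_UNAV;
}

int main(void)
{
	printf("0xda -> %d (expect 1)\n", failure_is_unavailable(0xda));
	printf("0x02 -> %d (expect 0)\n", failure_is_unavailable(0x02));
	return 0;
}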