From 81c22041d9f19df07b9cba95e3cd02e0f41bc1e1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 9 Dec 2019 16:08:03 +0100 Subject: bpf, x86, arm64: Enable jit by default when not built as always-on After Spectre 2 fix via 290af86629b2 ("bpf: introduce BPF_JIT_ALWAYS_ON config") most major distros use BPF_JIT_ALWAYS_ON configuration these days which compiles out the BPF interpreter entirely and always enables the JIT. Also given recent fix in e1608f3fa857 ("bpf: Avoid setting bpf insns pages read-only when prog is jited"), we additionally avoid fragmenting the direct map for the BPF insns pages sitting in the general data heap since they are not used during execution. Latter is only needed when run through the interpreter. Since both x86 and arm64 JITs have seen a lot of exposure over the years, are generally most up to date and maintained, there is more downside in !BPF_JIT_ALWAYS_ON configurations to have the interpreter enabled by default rather than the JIT. Add a ARCH_WANT_DEFAULT_BPF_JIT config which archs can use to set the bpf_jit_{enable,kallsyms} to 1. Back in the days the bpf_jit_kallsyms knob was set to 0 by default since major distros still had /proc/kallsyms addresses exposed to unprivileged user space which is not the case anymore. Hence both knobs are set via BPF_JIT_DEFAULT_ON which is set to 'y' in case of BPF_JIT_ALWAYS_ON or ARCH_WANT_DEFAULT_BPF_JIT. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Will Deacon Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/f78ad24795c2966efcc2ee19025fa3459f622185.1575903816.git.daniel@iogearbox.net --- init/Kconfig | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'init') diff --git a/init/Kconfig b/init/Kconfig index a34064a031a5..890aaa62efde 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1604,6 +1604,9 @@ config BPF_SYSCALL Enable the bpf() system call that allows to manipulate eBPF programs and maps via file descriptors. +config ARCH_WANT_DEFAULT_BPF_JIT + bool + config BPF_JIT_ALWAYS_ON bool "Permanently enable BPF JIT and remove BPF interpreter" depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT @@ -1611,6 +1614,10 @@ config BPF_JIT_ALWAYS_ON Enables BPF JIT and removes BPF interpreter to avoid speculative execution of BPF instructions by the interpreter +config BPF_JIT_DEFAULT_ON + def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON + depends on HAVE_EBPF_JIT && BPF_JIT + config USERFAULTFD bool "Enable userfaultfd() system call" depends on MMU -- cgit v1.2.3 From 1091670637be8bd34a39dd1ddcc0a10a7c88d4e2 Mon Sep 17 00:00:00 2001 From: Shile Zhang Date: Wed, 4 Dec 2019 08:46:31 +0800 Subject: scripts/sorttable: Rename 'sortextable' to 'sorttable' Use a more generic name for additional table sorting usecases, such as the upcoming ORC table sorting feature. This tool is not tied to exception table sorting anymore. No functional changes intended. [ mingo: Rewrote the changelog. ] Signed-off-by: Shile Zhang Acked-by: Peter Zijlstra (Intel) Cc: Josh Poimboeuf Cc: Masahiro Yamada Cc: Michal Marek Cc: linux-kbuild@vger.kernel.org Link: https://lkml.kernel.org/r/20191204004633.88660-6-shile.zhang@linux.alibaba.com Signed-off-by: Ingo Molnar --- arch/arc/Kconfig | 2 +- arch/arm/Kconfig | 2 +- arch/arm64/Kconfig | 2 +- arch/microblaze/Kconfig | 2 +- arch/mips/Kconfig | 2 +- arch/parisc/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- arch/s390/Kconfig | 2 +- arch/x86/Kconfig | 2 +- arch/xtensa/Kconfig | 2 +- init/Kconfig | 2 +- scripts/.gitignore | 2 +- scripts/Makefile | 4 +- scripts/link-vmlinux.sh | 10 +- scripts/sortextable.c | 373 ------------------------------------------------ scripts/sortextable.h | 210 --------------------------- scripts/sorttable.c | 373 ++++++++++++++++++++++++++++++++++++++++++++++++ scripts/sorttable.h | 210 +++++++++++++++++++++++++++ 18 files changed, 602 insertions(+), 602 deletions(-) delete mode 100644 scripts/sortextable.c delete mode 100644 scripts/sortextable.h create mode 100644 scripts/sorttable.c create mode 100644 scripts/sorttable.h (limited to 'init') diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 26108ea785c2..5f448201955b 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -13,7 +13,7 @@ config ARC select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC select ARCH_32BIT_OFF_T - select BUILDTIME_EXTABLE_SORT + select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS select COMMON_CLK select DMA_DIRECT_REMAP diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index ba75e3661a41..cb3493b4db83 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -36,7 +36,7 @@ config ARM select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_IPC_PARSE_VERSION select BINFMT_FLAT_ARGVP_ENVP_ON_STACK - select BUILDTIME_EXTABLE_SORT if MMU + select BUILDTIME_TABLE_SORT if MMU select CLONE_BACKWARDS select CPU_PM if SUSPEND || CPU_IDLE select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b1b4476ddb83..6f360dd2b1ad 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -81,7 +81,7 @@ config ARM64 select ARM_GIC_V3 select ARM_GIC_V3_ITS if PCI select ARM_PSCI_FW - select BUILDTIME_EXTABLE_SORT + select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS select COMMON_CLK select CPU_PM if (SUSPEND || CPU_IDLE) diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 5f46ebe7bfe3..a105f113fd67 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -11,7 +11,7 @@ config MICROBLAZE select ARCH_HAS_UNCACHED_SEGMENT if !MMU select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_WANT_IPC_PARSE_VERSION - select BUILDTIME_EXTABLE_SORT + select BUILDTIME_TABLE_SORT select TIMER_OF select CLONE_BACKWARDS3 select COMMON_CLK diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index add388236f4e..5531942baf5d 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -15,7 +15,7 @@ config MIPS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_IPC_PARSE_VERSION - select BUILDTIME_EXTABLE_SORT + select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS select CPU_NO_EFFICIENT_FFS if (TARGET_ISA_REV < 1) select CPU_PM if CPU_IDLE diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index b16237c95ea3..e1ef610a5a2b 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -18,7 +18,7 @@ config PARISC select RTC_DRV_GENERIC select INIT_ALL_POSSIBLE select BUG - select BUILDTIME_EXTABLE_SORT + select BUILDTIME_TABLE_SORT select HAVE_PCI select HAVE_PERF_EVENTS select HAVE_KERNEL_BZIP2 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1ec34e16ed65..18a21b4f3711 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -149,7 +149,7 @@ config PPC select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WEAK_RELEASE_ACQUIRE select BINFMT_ELF - select BUILDTIME_EXTABLE_SORT + select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS select DCACHE_WORD_ACCESS if PPC64 && CPU_LITTLE_ENDIAN select DYNAMIC_FTRACE if FUNCTION_TRACER diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d4051e88e625..884194fab698 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -110,7 +110,7 @@ config S390 select ARCH_USE_CMPXCHG_LOCKREF select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_IPC_PARSE_VERSION - select BUILDTIME_EXTABLE_SORT + select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS2 select DYNAMIC_FTRACE if FUNCTION_TRACER select GENERIC_CLOCKEVENTS diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5e8949953660..e71158af8dd5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -96,7 +96,7 @@ config X86 select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_HUGE_PMD_SHARE select ARCH_WANTS_THP_SWAP if X86_64 - select BUILDTIME_EXTABLE_SORT + select BUILDTIME_TABLE_SORT select CLKEVT_I8253 select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CLOCKSOURCE_WATCHDOG diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 4a3fa295d8fe..fb9746db0a6d 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -11,7 +11,7 @@ config XTENSA select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_IPC_PARSE_VERSION - select BUILDTIME_EXTABLE_SORT + select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS select COMMON_CLK select DMA_REMAP if MMU diff --git a/init/Kconfig b/init/Kconfig index a34064a031a5..ad9b6c5eaf1d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -58,7 +58,7 @@ config CONSTRUCTORS config IRQ_WORK bool -config BUILDTIME_EXTABLE_SORT +config BUILDTIME_TABLE_SORT bool config THREAD_INFO_IN_TASK diff --git a/scripts/.gitignore b/scripts/.gitignore index 4aa1806c59c2..306054ef340f 100644 --- a/scripts/.gitignore +++ b/scripts/.gitignore @@ -6,7 +6,7 @@ conmakehash kallsyms unifdef recordmcount -sortextable +sorttable asn1_compiler extract-cert sign-file diff --git a/scripts/Makefile b/scripts/Makefile index 00c47901cb06..7491241e3a0d 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -13,13 +13,13 @@ hostprogs-$(CONFIG_BUILD_BIN2C) += bin2c hostprogs-$(CONFIG_KALLSYMS) += kallsyms hostprogs-$(CONFIG_VT) += conmakehash hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount -hostprogs-$(CONFIG_BUILDTIME_EXTABLE_SORT) += sortextable +hostprogs-$(CONFIG_BUILDTIME_TABLE_SORT) += sorttable hostprogs-$(CONFIG_ASN1) += asn1_compiler hostprogs-$(CONFIG_MODULE_SIG_FORMAT) += sign-file hostprogs-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += extract-cert hostprogs-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert -HOSTCFLAGS_sortextable.o = -I$(srctree)/tools/include +HOSTCFLAGS_sorttable.o = -I$(srctree)/tools/include HOSTCFLAGS_asn1_compiler.o = -I$(srctree)/include HOSTLDLIBS_sign-file = -lcrypto HOSTLDLIBS_extract-cert = -lcrypto diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 436379940356..a81aa76bbe18 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -180,9 +180,9 @@ mksysmap() ${CONFIG_SHELL} "${srctree}/scripts/mksysmap" ${1} ${2} } -sortextable() +sorttable() { - ${objtree}/scripts/sortextable ${1} + ${objtree}/scripts/sorttable ${1} } # Delete output files in case of error @@ -304,9 +304,9 @@ fi vmlinux_link vmlinux "${kallsymso}" ${btf_vmlinux_bin_o} -if [ -n "${CONFIG_BUILDTIME_EXTABLE_SORT}" ]; then - info SORTEX vmlinux - sortextable vmlinux +if [ -n "${CONFIG_BUILDTIME_TABLE_SORT}" ]; then + info SORTTAB vmlinux + sorttable vmlinux fi info SYSMAP System.map diff --git a/scripts/sortextable.c b/scripts/sortextable.c deleted file mode 100644 index efa2839865cd..000000000000 --- a/scripts/sortextable.c +++ /dev/null @@ -1,373 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * sortextable.c: Sort the kernel's exception table - * - * Copyright 2011 - 2012 Cavium, Inc. - * - * Based on code taken from recortmcount.c which is: - * - * Copyright 2009 John F. Reiser . All rights reserved. - * - * Restructured to fit Linux format, as well as other updates: - * Copyright 2010 Steven Rostedt , Red Hat Inc. - */ - -/* - * Strategy: alter the vmlinux file in-place. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#ifndef EM_ARCOMPACT -#define EM_ARCOMPACT 93 -#endif - -#ifndef EM_XTENSA -#define EM_XTENSA 94 -#endif - -#ifndef EM_AARCH64 -#define EM_AARCH64 183 -#endif - -#ifndef EM_MICROBLAZE -#define EM_MICROBLAZE 189 -#endif - -#ifndef EM_ARCV2 -#define EM_ARCV2 195 -#endif - -static uint32_t (*r)(const uint32_t *); -static uint16_t (*r2)(const uint16_t *); -static uint64_t (*r8)(const uint64_t *); -static void (*w)(uint32_t, uint32_t *); -static void (*w2)(uint16_t, uint16_t *); -static void (*w8)(uint64_t, uint64_t *); -typedef void (*table_sort_t)(char *, int); - -/* - * Get the whole file as a programming convenience in order to avoid - * malloc+lseek+read+free of many pieces. If successful, then mmap - * avoids copying unused pieces; else just read the whole file. - * Open for both read and write. - */ -static void *mmap_file(char const *fname, size_t *size) -{ - int fd; - struct stat sb; - void *addr = NULL; - - fd = open(fname, O_RDWR); - if (fd < 0) { - perror(fname); - return NULL; - } - if (fstat(fd, &sb) < 0) { - perror(fname); - goto out; - } - if (!S_ISREG(sb.st_mode)) { - fprintf(stderr, "not a regular file: %s\n", fname); - goto out; - } - - addr = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); - if (addr == MAP_FAILED) { - fprintf(stderr, "Could not mmap file: %s\n", fname); - goto out; - } - - *size = sb.st_size; - -out: - close(fd); - return addr; -} - -static uint32_t rbe(const uint32_t *x) -{ - return get_unaligned_be32(x); -} - -static uint16_t r2be(const uint16_t *x) -{ - return get_unaligned_be16(x); -} - -static uint64_t r8be(const uint64_t *x) -{ - return get_unaligned_be64(x); -} - -static uint32_t rle(const uint32_t *x) -{ - return get_unaligned_le32(x); -} - -static uint16_t r2le(const uint16_t *x) -{ - return get_unaligned_le16(x); -} - -static uint64_t r8le(const uint64_t *x) -{ - return get_unaligned_le64(x); -} - -static void wbe(uint32_t val, uint32_t *x) -{ - put_unaligned_be32(val, x); -} - -static void w2be(uint16_t val, uint16_t *x) -{ - put_unaligned_be16(val, x); -} - -static void w8be(uint64_t val, uint64_t *x) -{ - put_unaligned_be64(val, x); -} - -static void wle(uint32_t val, uint32_t *x) -{ - put_unaligned_le32(val, x); -} - -static void w2le(uint16_t val, uint16_t *x) -{ - put_unaligned_le16(val, x); -} - -static void w8le(uint64_t val, uint64_t *x) -{ - put_unaligned_le64(val, x); -} - -/* - * Move reserved section indices SHN_LORESERVE..SHN_HIRESERVE out of - * the way to -256..-1, to avoid conflicting with real section - * indices. - */ -#define SPECIAL(i) ((i) - (SHN_HIRESERVE + 1)) - -static inline int is_shndx_special(unsigned int i) -{ - return i != SHN_XINDEX && i >= SHN_LORESERVE && i <= SHN_HIRESERVE; -} - -/* Accessor for sym->st_shndx, hides ugliness of "64k sections" */ -static inline unsigned int get_secindex(unsigned int shndx, - unsigned int sym_offs, - const Elf32_Word *symtab_shndx_start) -{ - if (is_shndx_special(shndx)) - return SPECIAL(shndx); - if (shndx != SHN_XINDEX) - return shndx; - return r(&symtab_shndx_start[sym_offs]); -} - -/* 32 bit and 64 bit are very similar */ -#include "sortextable.h" -#define SORTEXTABLE_64 -#include "sortextable.h" - -static int compare_relative_table(const void *a, const void *b) -{ - int32_t av = (int32_t)r(a); - int32_t bv = (int32_t)r(b); - - if (av < bv) - return -1; - if (av > bv) - return 1; - return 0; -} - -static void sort_relative_table(char *extab_image, int image_size) -{ - int i = 0; - - /* - * Do the same thing the runtime sort does, first normalize to - * being relative to the start of the section. - */ - while (i < image_size) { - uint32_t *loc = (uint32_t *)(extab_image + i); - w(r(loc) + i, loc); - i += 4; - } - - qsort(extab_image, image_size / 8, 8, compare_relative_table); - - /* Now denormalize. */ - i = 0; - while (i < image_size) { - uint32_t *loc = (uint32_t *)(extab_image + i); - w(r(loc) - i, loc); - i += 4; - } -} - -static void x86_sort_relative_table(char *extab_image, int image_size) -{ - int i = 0; - - while (i < image_size) { - uint32_t *loc = (uint32_t *)(extab_image + i); - - w(r(loc) + i, loc); - w(r(loc + 1) + i + 4, loc + 1); - w(r(loc + 2) + i + 8, loc + 2); - - i += sizeof(uint32_t) * 3; - } - - qsort(extab_image, image_size / 12, 12, compare_relative_table); - - i = 0; - while (i < image_size) { - uint32_t *loc = (uint32_t *)(extab_image + i); - - w(r(loc) - i, loc); - w(r(loc + 1) - (i + 4), loc + 1); - w(r(loc + 2) - (i + 8), loc + 2); - - i += sizeof(uint32_t) * 3; - } -} - -static int do_file(char const *const fname, void *addr) -{ - int rc = -1; - Elf32_Ehdr *ehdr = addr; - table_sort_t custom_sort = NULL; - - switch (ehdr->e_ident[EI_DATA]) { - case ELFDATA2LSB: - r = rle; - r2 = r2le; - r8 = r8le; - w = wle; - w2 = w2le; - w8 = w8le; - break; - case ELFDATA2MSB: - r = rbe; - r2 = r2be; - r8 = r8be; - w = wbe; - w2 = w2be; - w8 = w8be; - break; - default: - fprintf(stderr, "unrecognized ELF data encoding %d: %s\n", - ehdr->e_ident[EI_DATA], fname); - return -1; - } - - if (memcmp(ELFMAG, ehdr->e_ident, SELFMAG) != 0 || - (r2(&ehdr->e_type) != ET_EXEC && r2(&ehdr->e_type) != ET_DYN) || - ehdr->e_ident[EI_VERSION] != EV_CURRENT) { - fprintf(stderr, "unrecognized ET_EXEC/ET_DYN file %s\n", fname); - return -1; - } - - switch (r2(&ehdr->e_machine)) { - case EM_386: - case EM_X86_64: - custom_sort = x86_sort_relative_table; - break; - case EM_S390: - case EM_AARCH64: - case EM_PARISC: - case EM_PPC: - case EM_PPC64: - custom_sort = sort_relative_table; - break; - case EM_ARCOMPACT: - case EM_ARCV2: - case EM_ARM: - case EM_MICROBLAZE: - case EM_MIPS: - case EM_XTENSA: - break; - default: - fprintf(stderr, "unrecognized e_machine %d %s\n", - r2(&ehdr->e_machine), fname); - return -1; - } - - switch (ehdr->e_ident[EI_CLASS]) { - case ELFCLASS32: - if (r2(&ehdr->e_ehsize) != sizeof(Elf32_Ehdr) || - r2(&ehdr->e_shentsize) != sizeof(Elf32_Shdr)) { - fprintf(stderr, - "unrecognized ET_EXEC/ET_DYN file: %s\n", fname); - break; - } - rc = do_sort_32(ehdr, fname, custom_sort); - break; - case ELFCLASS64: - { - Elf64_Ehdr *const ghdr = (Elf64_Ehdr *)ehdr; - if (r2(&ghdr->e_ehsize) != sizeof(Elf64_Ehdr) || - r2(&ghdr->e_shentsize) != sizeof(Elf64_Shdr)) { - fprintf(stderr, - "unrecognized ET_EXEC/ET_DYN file: %s\n", - fname); - break; - } - rc = do_sort_64(ghdr, fname, custom_sort); - } - break; - default: - fprintf(stderr, "unrecognized ELF class %d %s\n", - ehdr->e_ident[EI_CLASS], fname); - break; - } - - return rc; -} - -int main(int argc, char *argv[]) -{ - int i, n_error = 0; /* gcc-4.3.0 false positive complaint */ - size_t size = 0; - void *addr = NULL; - - if (argc < 2) { - fprintf(stderr, "usage: sortextable vmlinux...\n"); - return 0; - } - - /* Process each file in turn, allowing deep failure. */ - for (i = 1; i < argc; i++) { - addr = mmap_file(argv[i], &size); - if (!addr) { - ++n_error; - continue; - } - - if (do_file(argv[i], addr)) - ++n_error; - - munmap(addr, size); - } - - return !!n_error; -} diff --git a/scripts/sortextable.h b/scripts/sortextable.h deleted file mode 100644 index 6485513f7cae..000000000000 --- a/scripts/sortextable.h +++ /dev/null @@ -1,210 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * sortextable.h - * - * Copyright 2011 - 2012 Cavium, Inc. - * - * Some of this code was taken out of recordmcount.h written by: - * - * Copyright 2009 John F. Reiser . All rights reserved. - * Copyright 2010 Steven Rostedt , Red Hat Inc. - */ - -#undef extable_ent_size -#undef compare_extable -#undef do_sort -#undef Elf_Addr -#undef Elf_Ehdr -#undef Elf_Shdr -#undef Elf_Rel -#undef Elf_Rela -#undef Elf_Sym -#undef ELF_R_SYM -#undef Elf_r_sym -#undef ELF_R_INFO -#undef Elf_r_info -#undef ELF_ST_BIND -#undef ELF_ST_TYPE -#undef fn_ELF_R_SYM -#undef fn_ELF_R_INFO -#undef uint_t -#undef _r -#undef _w - -#ifdef SORTEXTABLE_64 -# define extable_ent_size 16 -# define compare_extable compare_extable_64 -# define do_sort do_sort_64 -# define Elf_Addr Elf64_Addr -# define Elf_Ehdr Elf64_Ehdr -# define Elf_Shdr Elf64_Shdr -# define Elf_Rel Elf64_Rel -# define Elf_Rela Elf64_Rela -# define Elf_Sym Elf64_Sym -# define ELF_R_SYM ELF64_R_SYM -# define Elf_r_sym Elf64_r_sym -# define ELF_R_INFO ELF64_R_INFO -# define Elf_r_info Elf64_r_info -# define ELF_ST_BIND ELF64_ST_BIND -# define ELF_ST_TYPE ELF64_ST_TYPE -# define fn_ELF_R_SYM fn_ELF64_R_SYM -# define fn_ELF_R_INFO fn_ELF64_R_INFO -# define uint_t uint64_t -# define _r r8 -# define _w w8 -#else -# define extable_ent_size 8 -# define compare_extable compare_extable_32 -# define do_sort do_sort_32 -# define Elf_Addr Elf32_Addr -# define Elf_Ehdr Elf32_Ehdr -# define Elf_Shdr Elf32_Shdr -# define Elf_Rel Elf32_Rel -# define Elf_Rela Elf32_Rela -# define Elf_Sym Elf32_Sym -# define ELF_R_SYM ELF32_R_SYM -# define Elf_r_sym Elf32_r_sym -# define ELF_R_INFO ELF32_R_INFO -# define Elf_r_info Elf32_r_info -# define ELF_ST_BIND ELF32_ST_BIND -# define ELF_ST_TYPE ELF32_ST_TYPE -# define fn_ELF_R_SYM fn_ELF32_R_SYM -# define fn_ELF_R_INFO fn_ELF32_R_INFO -# define uint_t uint32_t -# define _r r -# define _w w -#endif - -static int compare_extable(const void *a, const void *b) -{ - Elf_Addr av = _r(a); - Elf_Addr bv = _r(b); - - if (av < bv) - return -1; - if (av > bv) - return 1; - return 0; -} - -static int do_sort(Elf_Ehdr *ehdr, - char const *const fname, - table_sort_t custom_sort) -{ - Elf_Shdr *s, *shdr = (Elf_Shdr *)((char *)ehdr + _r(&ehdr->e_shoff)); - Elf_Shdr *strtab_sec = NULL; - Elf_Shdr *symtab_sec = NULL; - Elf_Shdr *extab_sec = NULL; - Elf_Sym *sym; - const Elf_Sym *symtab; - Elf32_Word *symtab_shndx = NULL; - Elf_Sym *sort_needed_sym = NULL; - Elf_Shdr *sort_needed_sec; - Elf_Rel *relocs = NULL; - int relocs_size = 0; - uint32_t *sort_needed_loc; - const char *secstrings; - const char *strtab; - char *extab_image; - int extab_index = 0; - int i; - int idx; - unsigned int shnum; - unsigned int shstrndx; - - shstrndx = r2(&ehdr->e_shstrndx); - if (shstrndx == SHN_XINDEX) - shstrndx = r(&shdr[0].sh_link); - secstrings = (const char *)ehdr + _r(&shdr[shstrndx].sh_offset); - - shnum = r2(&ehdr->e_shnum); - if (shnum == SHN_UNDEF) - shnum = _r(&shdr[0].sh_size); - - for (i = 0, s = shdr; s < shdr + shnum; i++, s++) { - idx = r(&s->sh_name); - if (!strcmp(secstrings + idx, "__ex_table")) { - extab_sec = s; - extab_index = i; - } - if (!strcmp(secstrings + idx, ".symtab")) - symtab_sec = s; - if (!strcmp(secstrings + idx, ".strtab")) - strtab_sec = s; - - if ((r(&s->sh_type) == SHT_REL || - r(&s->sh_type) == SHT_RELA) && - r(&s->sh_info) == extab_index) { - relocs = (void *)ehdr + _r(&s->sh_offset); - relocs_size = _r(&s->sh_size); - } - if (r(&s->sh_type) == SHT_SYMTAB_SHNDX) - symtab_shndx = (Elf32_Word *)((const char *)ehdr + - _r(&s->sh_offset)); - } - - if (!extab_sec) { - fprintf(stderr, "no __ex_table in file: %s\n", fname); - return -1; - } - - if (!symtab_sec) { - fprintf(stderr, "no .symtab in file: %s\n", fname); - return -1; - } - - if (!strtab_sec) { - fprintf(stderr, "no .strtab in file: %s\n", fname); - return -1; - } - - extab_image = (void *)ehdr + _r(&extab_sec->sh_offset); - strtab = (const char *)ehdr + _r(&strtab_sec->sh_offset); - symtab = (const Elf_Sym *)((const char *)ehdr + - _r(&symtab_sec->sh_offset)); - - if (custom_sort) { - custom_sort(extab_image, _r(&extab_sec->sh_size)); - } else { - int num_entries = _r(&extab_sec->sh_size) / extable_ent_size; - qsort(extab_image, num_entries, - extable_ent_size, compare_extable); - } - - /* If there were relocations, we no longer need them. */ - if (relocs) - memset(relocs, 0, relocs_size); - - /* find the flag main_extable_sort_needed */ - for (sym = (void *)ehdr + _r(&symtab_sec->sh_offset); - sym < sym + _r(&symtab_sec->sh_size) / sizeof(Elf_Sym); - sym++) { - if (ELF_ST_TYPE(sym->st_info) != STT_OBJECT) - continue; - if (!strcmp(strtab + r(&sym->st_name), - "main_extable_sort_needed")) { - sort_needed_sym = sym; - break; - } - } - - if (!sort_needed_sym) { - fprintf(stderr, - "no main_extable_sort_needed symbol in file: %s\n", - fname); - return -1; - } - - sort_needed_sec = &shdr[get_secindex(r2(&sym->st_shndx), - sort_needed_sym - symtab, - symtab_shndx)]; - sort_needed_loc = (void *)ehdr + - _r(&sort_needed_sec->sh_offset) + - _r(&sort_needed_sym->st_value) - - _r(&sort_needed_sec->sh_addr); - - /* extable has been sorted, clear the flag */ - w(0, sort_needed_loc); - - return 0; -} diff --git a/scripts/sorttable.c b/scripts/sorttable.c new file mode 100644 index 000000000000..ff98b7db20c6 --- /dev/null +++ b/scripts/sorttable.c @@ -0,0 +1,373 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sorttable.c: Sort the kernel's table + * + * Copyright 2011 - 2012 Cavium, Inc. + * + * Based on code taken from recortmcount.c which is: + * + * Copyright 2009 John F. Reiser . All rights reserved. + * + * Restructured to fit Linux format, as well as other updates: + * Copyright 2010 Steven Rostedt , Red Hat Inc. + */ + +/* + * Strategy: alter the vmlinux file in-place. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifndef EM_ARCOMPACT +#define EM_ARCOMPACT 93 +#endif + +#ifndef EM_XTENSA +#define EM_XTENSA 94 +#endif + +#ifndef EM_AARCH64 +#define EM_AARCH64 183 +#endif + +#ifndef EM_MICROBLAZE +#define EM_MICROBLAZE 189 +#endif + +#ifndef EM_ARCV2 +#define EM_ARCV2 195 +#endif + +static uint32_t (*r)(const uint32_t *); +static uint16_t (*r2)(const uint16_t *); +static uint64_t (*r8)(const uint64_t *); +static void (*w)(uint32_t, uint32_t *); +static void (*w2)(uint16_t, uint16_t *); +static void (*w8)(uint64_t, uint64_t *); +typedef void (*table_sort_t)(char *, int); + +/* + * Get the whole file as a programming convenience in order to avoid + * malloc+lseek+read+free of many pieces. If successful, then mmap + * avoids copying unused pieces; else just read the whole file. + * Open for both read and write. + */ +static void *mmap_file(char const *fname, size_t *size) +{ + int fd; + struct stat sb; + void *addr = NULL; + + fd = open(fname, O_RDWR); + if (fd < 0) { + perror(fname); + return NULL; + } + if (fstat(fd, &sb) < 0) { + perror(fname); + goto out; + } + if (!S_ISREG(sb.st_mode)) { + fprintf(stderr, "not a regular file: %s\n", fname); + goto out; + } + + addr = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + fprintf(stderr, "Could not mmap file: %s\n", fname); + goto out; + } + + *size = sb.st_size; + +out: + close(fd); + return addr; +} + +static uint32_t rbe(const uint32_t *x) +{ + return get_unaligned_be32(x); +} + +static uint16_t r2be(const uint16_t *x) +{ + return get_unaligned_be16(x); +} + +static uint64_t r8be(const uint64_t *x) +{ + return get_unaligned_be64(x); +} + +static uint32_t rle(const uint32_t *x) +{ + return get_unaligned_le32(x); +} + +static uint16_t r2le(const uint16_t *x) +{ + return get_unaligned_le16(x); +} + +static uint64_t r8le(const uint64_t *x) +{ + return get_unaligned_le64(x); +} + +static void wbe(uint32_t val, uint32_t *x) +{ + put_unaligned_be32(val, x); +} + +static void w2be(uint16_t val, uint16_t *x) +{ + put_unaligned_be16(val, x); +} + +static void w8be(uint64_t val, uint64_t *x) +{ + put_unaligned_be64(val, x); +} + +static void wle(uint32_t val, uint32_t *x) +{ + put_unaligned_le32(val, x); +} + +static void w2le(uint16_t val, uint16_t *x) +{ + put_unaligned_le16(val, x); +} + +static void w8le(uint64_t val, uint64_t *x) +{ + put_unaligned_le64(val, x); +} + +/* + * Move reserved section indices SHN_LORESERVE..SHN_HIRESERVE out of + * the way to -256..-1, to avoid conflicting with real section + * indices. + */ +#define SPECIAL(i) ((i) - (SHN_HIRESERVE + 1)) + +static inline int is_shndx_special(unsigned int i) +{ + return i != SHN_XINDEX && i >= SHN_LORESERVE && i <= SHN_HIRESERVE; +} + +/* Accessor for sym->st_shndx, hides ugliness of "64k sections" */ +static inline unsigned int get_secindex(unsigned int shndx, + unsigned int sym_offs, + const Elf32_Word *symtab_shndx_start) +{ + if (is_shndx_special(shndx)) + return SPECIAL(shndx); + if (shndx != SHN_XINDEX) + return shndx; + return r(&symtab_shndx_start[sym_offs]); +} + +/* 32 bit and 64 bit are very similar */ +#include "sorttable.h" +#define SORTTABLE_64 +#include "sorttable.h" + +static int compare_relative_table(const void *a, const void *b) +{ + int32_t av = (int32_t)r(a); + int32_t bv = (int32_t)r(b); + + if (av < bv) + return -1; + if (av > bv) + return 1; + return 0; +} + +static void sort_relative_table(char *extab_image, int image_size) +{ + int i = 0; + + /* + * Do the same thing the runtime sort does, first normalize to + * being relative to the start of the section. + */ + while (i < image_size) { + uint32_t *loc = (uint32_t *)(extab_image + i); + w(r(loc) + i, loc); + i += 4; + } + + qsort(extab_image, image_size / 8, 8, compare_relative_table); + + /* Now denormalize. */ + i = 0; + while (i < image_size) { + uint32_t *loc = (uint32_t *)(extab_image + i); + w(r(loc) - i, loc); + i += 4; + } +} + +static void x86_sort_relative_table(char *extab_image, int image_size) +{ + int i = 0; + + while (i < image_size) { + uint32_t *loc = (uint32_t *)(extab_image + i); + + w(r(loc) + i, loc); + w(r(loc + 1) + i + 4, loc + 1); + w(r(loc + 2) + i + 8, loc + 2); + + i += sizeof(uint32_t) * 3; + } + + qsort(extab_image, image_size / 12, 12, compare_relative_table); + + i = 0; + while (i < image_size) { + uint32_t *loc = (uint32_t *)(extab_image + i); + + w(r(loc) - i, loc); + w(r(loc + 1) - (i + 4), loc + 1); + w(r(loc + 2) - (i + 8), loc + 2); + + i += sizeof(uint32_t) * 3; + } +} + +static int do_file(char const *const fname, void *addr) +{ + int rc = -1; + Elf32_Ehdr *ehdr = addr; + table_sort_t custom_sort = NULL; + + switch (ehdr->e_ident[EI_DATA]) { + case ELFDATA2LSB: + r = rle; + r2 = r2le; + r8 = r8le; + w = wle; + w2 = w2le; + w8 = w8le; + break; + case ELFDATA2MSB: + r = rbe; + r2 = r2be; + r8 = r8be; + w = wbe; + w2 = w2be; + w8 = w8be; + break; + default: + fprintf(stderr, "unrecognized ELF data encoding %d: %s\n", + ehdr->e_ident[EI_DATA], fname); + return -1; + } + + if (memcmp(ELFMAG, ehdr->e_ident, SELFMAG) != 0 || + (r2(&ehdr->e_type) != ET_EXEC && r2(&ehdr->e_type) != ET_DYN) || + ehdr->e_ident[EI_VERSION] != EV_CURRENT) { + fprintf(stderr, "unrecognized ET_EXEC/ET_DYN file %s\n", fname); + return -1; + } + + switch (r2(&ehdr->e_machine)) { + case EM_386: + case EM_X86_64: + custom_sort = x86_sort_relative_table; + break; + case EM_S390: + case EM_AARCH64: + case EM_PARISC: + case EM_PPC: + case EM_PPC64: + custom_sort = sort_relative_table; + break; + case EM_ARCOMPACT: + case EM_ARCV2: + case EM_ARM: + case EM_MICROBLAZE: + case EM_MIPS: + case EM_XTENSA: + break; + default: + fprintf(stderr, "unrecognized e_machine %d %s\n", + r2(&ehdr->e_machine), fname); + return -1; + } + + switch (ehdr->e_ident[EI_CLASS]) { + case ELFCLASS32: + if (r2(&ehdr->e_ehsize) != sizeof(Elf32_Ehdr) || + r2(&ehdr->e_shentsize) != sizeof(Elf32_Shdr)) { + fprintf(stderr, + "unrecognized ET_EXEC/ET_DYN file: %s\n", fname); + break; + } + rc = do_sort_32(ehdr, fname, custom_sort); + break; + case ELFCLASS64: + { + Elf64_Ehdr *const ghdr = (Elf64_Ehdr *)ehdr; + if (r2(&ghdr->e_ehsize) != sizeof(Elf64_Ehdr) || + r2(&ghdr->e_shentsize) != sizeof(Elf64_Shdr)) { + fprintf(stderr, + "unrecognized ET_EXEC/ET_DYN file: %s\n", + fname); + break; + } + rc = do_sort_64(ghdr, fname, custom_sort); + } + break; + default: + fprintf(stderr, "unrecognized ELF class %d %s\n", + ehdr->e_ident[EI_CLASS], fname); + break; + } + + return rc; +} + +int main(int argc, char *argv[]) +{ + int i, n_error = 0; /* gcc-4.3.0 false positive complaint */ + size_t size = 0; + void *addr = NULL; + + if (argc < 2) { + fprintf(stderr, "usage: sorttable vmlinux...\n"); + return 0; + } + + /* Process each file in turn, allowing deep failure. */ + for (i = 1; i < argc; i++) { + addr = mmap_file(argv[i], &size); + if (!addr) { + ++n_error; + continue; + } + + if (do_file(argv[i], addr)) + ++n_error; + + munmap(addr, size); + } + + return !!n_error; +} diff --git a/scripts/sorttable.h b/scripts/sorttable.h new file mode 100644 index 000000000000..82589ff90e25 --- /dev/null +++ b/scripts/sorttable.h @@ -0,0 +1,210 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sorttable.h + * + * Copyright 2011 - 2012 Cavium, Inc. + * + * Some of this code was taken out of recordmcount.h written by: + * + * Copyright 2009 John F. Reiser . All rights reserved. + * Copyright 2010 Steven Rostedt , Red Hat Inc. + */ + +#undef extable_ent_size +#undef compare_extable +#undef do_sort +#undef Elf_Addr +#undef Elf_Ehdr +#undef Elf_Shdr +#undef Elf_Rel +#undef Elf_Rela +#undef Elf_Sym +#undef ELF_R_SYM +#undef Elf_r_sym +#undef ELF_R_INFO +#undef Elf_r_info +#undef ELF_ST_BIND +#undef ELF_ST_TYPE +#undef fn_ELF_R_SYM +#undef fn_ELF_R_INFO +#undef uint_t +#undef _r +#undef _w + +#ifdef SORTTABLE_64 +# define extable_ent_size 16 +# define compare_extable compare_extable_64 +# define do_sort do_sort_64 +# define Elf_Addr Elf64_Addr +# define Elf_Ehdr Elf64_Ehdr +# define Elf_Shdr Elf64_Shdr +# define Elf_Rel Elf64_Rel +# define Elf_Rela Elf64_Rela +# define Elf_Sym Elf64_Sym +# define ELF_R_SYM ELF64_R_SYM +# define Elf_r_sym Elf64_r_sym +# define ELF_R_INFO ELF64_R_INFO +# define Elf_r_info Elf64_r_info +# define ELF_ST_BIND ELF64_ST_BIND +# define ELF_ST_TYPE ELF64_ST_TYPE +# define fn_ELF_R_SYM fn_ELF64_R_SYM +# define fn_ELF_R_INFO fn_ELF64_R_INFO +# define uint_t uint64_t +# define _r r8 +# define _w w8 +#else +# define extable_ent_size 8 +# define compare_extable compare_extable_32 +# define do_sort do_sort_32 +# define Elf_Addr Elf32_Addr +# define Elf_Ehdr Elf32_Ehdr +# define Elf_Shdr Elf32_Shdr +# define Elf_Rel Elf32_Rel +# define Elf_Rela Elf32_Rela +# define Elf_Sym Elf32_Sym +# define ELF_R_SYM ELF32_R_SYM +# define Elf_r_sym Elf32_r_sym +# define ELF_R_INFO ELF32_R_INFO +# define Elf_r_info Elf32_r_info +# define ELF_ST_BIND ELF32_ST_BIND +# define ELF_ST_TYPE ELF32_ST_TYPE +# define fn_ELF_R_SYM fn_ELF32_R_SYM +# define fn_ELF_R_INFO fn_ELF32_R_INFO +# define uint_t uint32_t +# define _r r +# define _w w +#endif + +static int compare_extable(const void *a, const void *b) +{ + Elf_Addr av = _r(a); + Elf_Addr bv = _r(b); + + if (av < bv) + return -1; + if (av > bv) + return 1; + return 0; +} + +static int do_sort(Elf_Ehdr *ehdr, + char const *const fname, + table_sort_t custom_sort) +{ + Elf_Shdr *s, *shdr = (Elf_Shdr *)((char *)ehdr + _r(&ehdr->e_shoff)); + Elf_Shdr *strtab_sec = NULL; + Elf_Shdr *symtab_sec = NULL; + Elf_Shdr *extab_sec = NULL; + Elf_Sym *sym; + const Elf_Sym *symtab; + Elf32_Word *symtab_shndx = NULL; + Elf_Sym *sort_needed_sym = NULL; + Elf_Shdr *sort_needed_sec; + Elf_Rel *relocs = NULL; + int relocs_size = 0; + uint32_t *sort_needed_loc; + const char *secstrings; + const char *strtab; + char *extab_image; + int extab_index = 0; + int i; + int idx; + unsigned int shnum; + unsigned int shstrndx; + + shstrndx = r2(&ehdr->e_shstrndx); + if (shstrndx == SHN_XINDEX) + shstrndx = r(&shdr[0].sh_link); + secstrings = (const char *)ehdr + _r(&shdr[shstrndx].sh_offset); + + shnum = r2(&ehdr->e_shnum); + if (shnum == SHN_UNDEF) + shnum = _r(&shdr[0].sh_size); + + for (i = 0, s = shdr; s < shdr + shnum; i++, s++) { + idx = r(&s->sh_name); + if (!strcmp(secstrings + idx, "__ex_table")) { + extab_sec = s; + extab_index = i; + } + if (!strcmp(secstrings + idx, ".symtab")) + symtab_sec = s; + if (!strcmp(secstrings + idx, ".strtab")) + strtab_sec = s; + + if ((r(&s->sh_type) == SHT_REL || + r(&s->sh_type) == SHT_RELA) && + r(&s->sh_info) == extab_index) { + relocs = (void *)ehdr + _r(&s->sh_offset); + relocs_size = _r(&s->sh_size); + } + if (r(&s->sh_type) == SHT_SYMTAB_SHNDX) + symtab_shndx = (Elf32_Word *)((const char *)ehdr + + _r(&s->sh_offset)); + } + + if (!extab_sec) { + fprintf(stderr, "no __ex_table in file: %s\n", fname); + return -1; + } + + if (!symtab_sec) { + fprintf(stderr, "no .symtab in file: %s\n", fname); + return -1; + } + + if (!strtab_sec) { + fprintf(stderr, "no .strtab in file: %s\n", fname); + return -1; + } + + extab_image = (void *)ehdr + _r(&extab_sec->sh_offset); + strtab = (const char *)ehdr + _r(&strtab_sec->sh_offset); + symtab = (const Elf_Sym *)((const char *)ehdr + + _r(&symtab_sec->sh_offset)); + + if (custom_sort) { + custom_sort(extab_image, _r(&extab_sec->sh_size)); + } else { + int num_entries = _r(&extab_sec->sh_size) / extable_ent_size; + qsort(extab_image, num_entries, + extable_ent_size, compare_extable); + } + + /* If there were relocations, we no longer need them. */ + if (relocs) + memset(relocs, 0, relocs_size); + + /* find the flag main_extable_sort_needed */ + for (sym = (void *)ehdr + _r(&symtab_sec->sh_offset); + sym < sym + _r(&symtab_sec->sh_size) / sizeof(Elf_Sym); + sym++) { + if (ELF_ST_TYPE(sym->st_info) != STT_OBJECT) + continue; + if (!strcmp(strtab + r(&sym->st_name), + "main_extable_sort_needed")) { + sort_needed_sym = sym; + break; + } + } + + if (!sort_needed_sym) { + fprintf(stderr, + "no main_extable_sort_needed symbol in file: %s\n", + fname); + return -1; + } + + sort_needed_sec = &shdr[get_secindex(r2(&sym->st_shndx), + sort_needed_sym - symtab, + symtab_shndx)]; + sort_needed_loc = (void *)ehdr + + _r(&sort_needed_sec->sh_offset) + + _r(&sort_needed_sym->st_value) - + _r(&sort_needed_sec->sh_addr); + + /* extable has been sorted, clear the flag */ + w(0, sort_needed_loc); + + return 0; +} -- cgit v1.2.3 From 4c002c978b7f2f2306d53de051c054504af920a9 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 9 Dec 2019 20:33:03 +0100 Subject: device.h: move 'struct driver' stuff out to device/driver.h device.h has everything and the kitchen sink when it comes to struct device things, so split out the struct driver things things to a separate .h file to make things easier to maintain and manage over time. Cc: "Rafael J. Wysocki" Cc: Suzuki K Poulose Cc: Saravana Kannan Cc: Heikki Krogerus Link: https://lore.kernel.org/r/20191209193303.1694546-7-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- drivers/base/driver.c | 1 + include/linux/device.h | 272 +-------------------------------------- include/linux/device/driver.h | 292 ++++++++++++++++++++++++++++++++++++++++++ init/main.c | 2 +- 4 files changed, 295 insertions(+), 272 deletions(-) create mode 100644 include/linux/device/driver.h (limited to 'init') diff --git a/drivers/base/driver.c b/drivers/base/driver.c index 4e5ca632f35e..57c68769e157 100644 --- a/drivers/base/driver.c +++ b/drivers/base/driver.c @@ -8,6 +8,7 @@ * Copyright (c) 2007 Novell Inc. */ +#include #include #include #include diff --git a/include/linux/device.h b/include/linux/device.h index 4c261fbd7053..0cd7c647c16c 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -28,6 +28,7 @@ #include #include #include +#include #include struct device; @@ -45,227 +46,6 @@ struct iommu_fwspec; struct dev_pin_info; struct iommu_param; -/** - * enum probe_type - device driver probe type to try - * Device drivers may opt in for special handling of their - * respective probe routines. This tells the core what to - * expect and prefer. - * - * @PROBE_DEFAULT_STRATEGY: Used by drivers that work equally well - * whether probed synchronously or asynchronously. - * @PROBE_PREFER_ASYNCHRONOUS: Drivers for "slow" devices which - * probing order is not essential for booting the system may - * opt into executing their probes asynchronously. - * @PROBE_FORCE_SYNCHRONOUS: Use this to annotate drivers that need - * their probe routines to run synchronously with driver and - * device registration (with the exception of -EPROBE_DEFER - * handling - re-probing always ends up being done asynchronously). - * - * Note that the end goal is to switch the kernel to use asynchronous - * probing by default, so annotating drivers with - * %PROBE_PREFER_ASYNCHRONOUS is a temporary measure that allows us - * to speed up boot process while we are validating the rest of the - * drivers. - */ -enum probe_type { - PROBE_DEFAULT_STRATEGY, - PROBE_PREFER_ASYNCHRONOUS, - PROBE_FORCE_SYNCHRONOUS, -}; - -/** - * struct device_driver - The basic device driver structure - * @name: Name of the device driver. - * @bus: The bus which the device of this driver belongs to. - * @owner: The module owner. - * @mod_name: Used for built-in modules. - * @suppress_bind_attrs: Disables bind/unbind via sysfs. - * @probe_type: Type of the probe (synchronous or asynchronous) to use. - * @of_match_table: The open firmware table. - * @acpi_match_table: The ACPI match table. - * @probe: Called to query the existence of a specific device, - * whether this driver can work with it, and bind the driver - * to a specific device. - * @sync_state: Called to sync device state to software state after all the - * state tracking consumers linked to this device (present at - * the time of late_initcall) have successfully bound to a - * driver. If the device has no consumers, this function will - * be called at late_initcall_sync level. If the device has - * consumers that are never bound to a driver, this function - * will never get called until they do. - * @remove: Called when the device is removed from the system to - * unbind a device from this driver. - * @shutdown: Called at shut-down time to quiesce the device. - * @suspend: Called to put the device to sleep mode. Usually to a - * low power state. - * @resume: Called to bring a device from sleep mode. - * @groups: Default attributes that get created by the driver core - * automatically. - * @dev_groups: Additional attributes attached to device instance once the - * it is bound to the driver. - * @pm: Power management operations of the device which matched - * this driver. - * @coredump: Called when sysfs entry is written to. The device driver - * is expected to call the dev_coredump API resulting in a - * uevent. - * @p: Driver core's private data, no one other than the driver - * core can touch this. - * - * The device driver-model tracks all of the drivers known to the system. - * The main reason for this tracking is to enable the driver core to match - * up drivers with new devices. Once drivers are known objects within the - * system, however, a number of other things become possible. Device drivers - * can export information and configuration variables that are independent - * of any specific device. - */ -struct device_driver { - const char *name; - struct bus_type *bus; - - struct module *owner; - const char *mod_name; /* used for built-in modules */ - - bool suppress_bind_attrs; /* disables bind/unbind via sysfs */ - enum probe_type probe_type; - - const struct of_device_id *of_match_table; - const struct acpi_device_id *acpi_match_table; - - int (*probe) (struct device *dev); - void (*sync_state)(struct device *dev); - int (*remove) (struct device *dev); - void (*shutdown) (struct device *dev); - int (*suspend) (struct device *dev, pm_message_t state); - int (*resume) (struct device *dev); - const struct attribute_group **groups; - const struct attribute_group **dev_groups; - - const struct dev_pm_ops *pm; - void (*coredump) (struct device *dev); - - struct driver_private *p; -}; - - -extern int __must_check driver_register(struct device_driver *drv); -extern void driver_unregister(struct device_driver *drv); - -extern struct device_driver *driver_find(const char *name, - struct bus_type *bus); -extern int driver_probe_done(void); -extern void wait_for_device_probe(void); - -/* sysfs interface for exporting driver attributes */ - -struct driver_attribute { - struct attribute attr; - ssize_t (*show)(struct device_driver *driver, char *buf); - ssize_t (*store)(struct device_driver *driver, const char *buf, - size_t count); -}; - -#define DRIVER_ATTR_RW(_name) \ - struct driver_attribute driver_attr_##_name = __ATTR_RW(_name) -#define DRIVER_ATTR_RO(_name) \ - struct driver_attribute driver_attr_##_name = __ATTR_RO(_name) -#define DRIVER_ATTR_WO(_name) \ - struct driver_attribute driver_attr_##_name = __ATTR_WO(_name) - -extern int __must_check driver_create_file(struct device_driver *driver, - const struct driver_attribute *attr); -extern void driver_remove_file(struct device_driver *driver, - const struct driver_attribute *attr); - -extern int __must_check driver_for_each_device(struct device_driver *drv, - struct device *start, - void *data, - int (*fn)(struct device *dev, - void *)); -struct device *driver_find_device(struct device_driver *drv, - struct device *start, const void *data, - int (*match)(struct device *dev, const void *data)); - -/** - * driver_find_device_by_name - device iterator for locating a particular device - * of a specific name. - * @drv: the driver we're iterating - * @name: name of the device to match - */ -static inline struct device *driver_find_device_by_name(struct device_driver *drv, - const char *name) -{ - return driver_find_device(drv, NULL, name, device_match_name); -} - -/** - * driver_find_device_by_of_node- device iterator for locating a particular device - * by of_node pointer. - * @drv: the driver we're iterating - * @np: of_node pointer to match. - */ -static inline struct device * -driver_find_device_by_of_node(struct device_driver *drv, - const struct device_node *np) -{ - return driver_find_device(drv, NULL, np, device_match_of_node); -} - -/** - * driver_find_device_by_fwnode- device iterator for locating a particular device - * by fwnode pointer. - * @drv: the driver we're iterating - * @fwnode: fwnode pointer to match. - */ -static inline struct device * -driver_find_device_by_fwnode(struct device_driver *drv, - const struct fwnode_handle *fwnode) -{ - return driver_find_device(drv, NULL, fwnode, device_match_fwnode); -} - -/** - * driver_find_device_by_devt- device iterator for locating a particular device - * by devt. - * @drv: the driver we're iterating - * @devt: devt pointer to match. - */ -static inline struct device *driver_find_device_by_devt(struct device_driver *drv, - dev_t devt) -{ - return driver_find_device(drv, NULL, &devt, device_match_devt); -} - -static inline struct device *driver_find_next_device(struct device_driver *drv, - struct device *start) -{ - return driver_find_device(drv, start, NULL, device_match_any); -} - -#ifdef CONFIG_ACPI -/** - * driver_find_device_by_acpi_dev : device iterator for locating a particular - * device matching the ACPI_COMPANION device. - * @drv: the driver we're iterating - * @adev: ACPI_COMPANION device to match. - */ -static inline struct device * -driver_find_device_by_acpi_dev(struct device_driver *drv, - const struct acpi_device *adev) -{ - return driver_find_device(drv, NULL, adev, device_match_acpi_dev); -} -#else -static inline struct device * -driver_find_device_by_acpi_dev(struct device_driver *drv, const void *adev) -{ - return NULL; -} -#endif - -void driver_deferred_probe_add(struct device *dev); -int driver_deferred_probe_check_state(struct device *dev); -int driver_deferred_probe_check_state_continue(struct device *dev); - /** * struct subsys_interface - interfaces to device functions * @name: name of the device function @@ -1018,8 +798,6 @@ static inline struct device_node *dev_of_node(struct device *dev) return dev->of_node; } -void driver_init(void); - /* * High level routines for use by the bus drivers */ @@ -1193,52 +971,4 @@ extern long sysfs_deprecated; #define sysfs_deprecated 0 #endif -/** - * module_driver() - Helper macro for drivers that don't do anything - * special in module init/exit. This eliminates a lot of boilerplate. - * Each module may only use this macro once, and calling it replaces - * module_init() and module_exit(). - * - * @__driver: driver name - * @__register: register function for this driver type - * @__unregister: unregister function for this driver type - * @...: Additional arguments to be passed to __register and __unregister. - * - * Use this macro to construct bus specific macros for registering - * drivers, and do not use it on its own. - */ -#define module_driver(__driver, __register, __unregister, ...) \ -static int __init __driver##_init(void) \ -{ \ - return __register(&(__driver) , ##__VA_ARGS__); \ -} \ -module_init(__driver##_init); \ -static void __exit __driver##_exit(void) \ -{ \ - __unregister(&(__driver) , ##__VA_ARGS__); \ -} \ -module_exit(__driver##_exit); - -/** - * builtin_driver() - Helper macro for drivers that don't do anything - * special in init and have no exit. This eliminates some boilerplate. - * Each driver may only use this macro once, and calling it replaces - * device_initcall (or in some cases, the legacy __initcall). This is - * meant to be a direct parallel of module_driver() above but without - * the __exit stuff that is not used for builtin cases. - * - * @__driver: driver name - * @__register: register function for this driver type - * @...: Additional arguments to be passed to __register - * - * Use this macro to construct bus specific macros for registering - * drivers, and do not use it on its own. - */ -#define builtin_driver(__driver, __register, ...) \ -static int __init __driver##_init(void) \ -{ \ - return __register(&(__driver) , ##__VA_ARGS__); \ -} \ -device_initcall(__driver##_init); - #endif /* _DEVICE_H_ */ diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h new file mode 100644 index 000000000000..1188260f9a02 --- /dev/null +++ b/include/linux/device/driver.h @@ -0,0 +1,292 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The driver-specific portions of the driver model + * + * Copyright (c) 2001-2003 Patrick Mochel + * Copyright (c) 2004-2009 Greg Kroah-Hartman + * Copyright (c) 2008-2009 Novell Inc. + * Copyright (c) 2012-2019 Greg Kroah-Hartman + * Copyright (c) 2012-2019 Linux Foundation + * + * See Documentation/driver-api/driver-model/ for more information. + */ + +#ifndef _DEVICE_DRIVER_H_ +#define _DEVICE_DRIVER_H_ + +#include +#include +#include +#include + +/** + * enum probe_type - device driver probe type to try + * Device drivers may opt in for special handling of their + * respective probe routines. This tells the core what to + * expect and prefer. + * + * @PROBE_DEFAULT_STRATEGY: Used by drivers that work equally well + * whether probed synchronously or asynchronously. + * @PROBE_PREFER_ASYNCHRONOUS: Drivers for "slow" devices which + * probing order is not essential for booting the system may + * opt into executing their probes asynchronously. + * @PROBE_FORCE_SYNCHRONOUS: Use this to annotate drivers that need + * their probe routines to run synchronously with driver and + * device registration (with the exception of -EPROBE_DEFER + * handling - re-probing always ends up being done asynchronously). + * + * Note that the end goal is to switch the kernel to use asynchronous + * probing by default, so annotating drivers with + * %PROBE_PREFER_ASYNCHRONOUS is a temporary measure that allows us + * to speed up boot process while we are validating the rest of the + * drivers. + */ +enum probe_type { + PROBE_DEFAULT_STRATEGY, + PROBE_PREFER_ASYNCHRONOUS, + PROBE_FORCE_SYNCHRONOUS, +}; + +/** + * struct device_driver - The basic device driver structure + * @name: Name of the device driver. + * @bus: The bus which the device of this driver belongs to. + * @owner: The module owner. + * @mod_name: Used for built-in modules. + * @suppress_bind_attrs: Disables bind/unbind via sysfs. + * @probe_type: Type of the probe (synchronous or asynchronous) to use. + * @of_match_table: The open firmware table. + * @acpi_match_table: The ACPI match table. + * @probe: Called to query the existence of a specific device, + * whether this driver can work with it, and bind the driver + * to a specific device. + * @sync_state: Called to sync device state to software state after all the + * state tracking consumers linked to this device (present at + * the time of late_initcall) have successfully bound to a + * driver. If the device has no consumers, this function will + * be called at late_initcall_sync level. If the device has + * consumers that are never bound to a driver, this function + * will never get called until they do. + * @remove: Called when the device is removed from the system to + * unbind a device from this driver. + * @shutdown: Called at shut-down time to quiesce the device. + * @suspend: Called to put the device to sleep mode. Usually to a + * low power state. + * @resume: Called to bring a device from sleep mode. + * @groups: Default attributes that get created by the driver core + * automatically. + * @dev_groups: Additional attributes attached to device instance once the + * it is bound to the driver. + * @pm: Power management operations of the device which matched + * this driver. + * @coredump: Called when sysfs entry is written to. The device driver + * is expected to call the dev_coredump API resulting in a + * uevent. + * @p: Driver core's private data, no one other than the driver + * core can touch this. + * + * The device driver-model tracks all of the drivers known to the system. + * The main reason for this tracking is to enable the driver core to match + * up drivers with new devices. Once drivers are known objects within the + * system, however, a number of other things become possible. Device drivers + * can export information and configuration variables that are independent + * of any specific device. + */ +struct device_driver { + const char *name; + struct bus_type *bus; + + struct module *owner; + const char *mod_name; /* used for built-in modules */ + + bool suppress_bind_attrs; /* disables bind/unbind via sysfs */ + enum probe_type probe_type; + + const struct of_device_id *of_match_table; + const struct acpi_device_id *acpi_match_table; + + int (*probe) (struct device *dev); + void (*sync_state)(struct device *dev); + int (*remove) (struct device *dev); + void (*shutdown) (struct device *dev); + int (*suspend) (struct device *dev, pm_message_t state); + int (*resume) (struct device *dev); + const struct attribute_group **groups; + const struct attribute_group **dev_groups; + + const struct dev_pm_ops *pm; + void (*coredump) (struct device *dev); + + struct driver_private *p; +}; + + +extern int __must_check driver_register(struct device_driver *drv); +extern void driver_unregister(struct device_driver *drv); + +extern struct device_driver *driver_find(const char *name, + struct bus_type *bus); +extern int driver_probe_done(void); +extern void wait_for_device_probe(void); + +/* sysfs interface for exporting driver attributes */ + +struct driver_attribute { + struct attribute attr; + ssize_t (*show)(struct device_driver *driver, char *buf); + ssize_t (*store)(struct device_driver *driver, const char *buf, + size_t count); +}; + +#define DRIVER_ATTR_RW(_name) \ + struct driver_attribute driver_attr_##_name = __ATTR_RW(_name) +#define DRIVER_ATTR_RO(_name) \ + struct driver_attribute driver_attr_##_name = __ATTR_RO(_name) +#define DRIVER_ATTR_WO(_name) \ + struct driver_attribute driver_attr_##_name = __ATTR_WO(_name) + +extern int __must_check driver_create_file(struct device_driver *driver, + const struct driver_attribute *attr); +extern void driver_remove_file(struct device_driver *driver, + const struct driver_attribute *attr); + +extern int __must_check driver_for_each_device(struct device_driver *drv, + struct device *start, + void *data, + int (*fn)(struct device *dev, + void *)); +struct device *driver_find_device(struct device_driver *drv, + struct device *start, const void *data, + int (*match)(struct device *dev, const void *data)); + +/** + * driver_find_device_by_name - device iterator for locating a particular device + * of a specific name. + * @drv: the driver we're iterating + * @name: name of the device to match + */ +static inline struct device *driver_find_device_by_name(struct device_driver *drv, + const char *name) +{ + return driver_find_device(drv, NULL, name, device_match_name); +} + +/** + * driver_find_device_by_of_node- device iterator for locating a particular device + * by of_node pointer. + * @drv: the driver we're iterating + * @np: of_node pointer to match. + */ +static inline struct device * +driver_find_device_by_of_node(struct device_driver *drv, + const struct device_node *np) +{ + return driver_find_device(drv, NULL, np, device_match_of_node); +} + +/** + * driver_find_device_by_fwnode- device iterator for locating a particular device + * by fwnode pointer. + * @drv: the driver we're iterating + * @fwnode: fwnode pointer to match. + */ +static inline struct device * +driver_find_device_by_fwnode(struct device_driver *drv, + const struct fwnode_handle *fwnode) +{ + return driver_find_device(drv, NULL, fwnode, device_match_fwnode); +} + +/** + * driver_find_device_by_devt- device iterator for locating a particular device + * by devt. + * @drv: the driver we're iterating + * @devt: devt pointer to match. + */ +static inline struct device *driver_find_device_by_devt(struct device_driver *drv, + dev_t devt) +{ + return driver_find_device(drv, NULL, &devt, device_match_devt); +} + +static inline struct device *driver_find_next_device(struct device_driver *drv, + struct device *start) +{ + return driver_find_device(drv, start, NULL, device_match_any); +} + +#ifdef CONFIG_ACPI +/** + * driver_find_device_by_acpi_dev : device iterator for locating a particular + * device matching the ACPI_COMPANION device. + * @drv: the driver we're iterating + * @adev: ACPI_COMPANION device to match. + */ +static inline struct device * +driver_find_device_by_acpi_dev(struct device_driver *drv, + const struct acpi_device *adev) +{ + return driver_find_device(drv, NULL, adev, device_match_acpi_dev); +} +#else +static inline struct device * +driver_find_device_by_acpi_dev(struct device_driver *drv, const void *adev) +{ + return NULL; +} +#endif + +void driver_deferred_probe_add(struct device *dev); +int driver_deferred_probe_check_state(struct device *dev); +int driver_deferred_probe_check_state_continue(struct device *dev); +void driver_init(void); + +/** + * module_driver() - Helper macro for drivers that don't do anything + * special in module init/exit. This eliminates a lot of boilerplate. + * Each module may only use this macro once, and calling it replaces + * module_init() and module_exit(). + * + * @__driver: driver name + * @__register: register function for this driver type + * @__unregister: unregister function for this driver type + * @...: Additional arguments to be passed to __register and __unregister. + * + * Use this macro to construct bus specific macros for registering + * drivers, and do not use it on its own. + */ +#define module_driver(__driver, __register, __unregister, ...) \ +static int __init __driver##_init(void) \ +{ \ + return __register(&(__driver) , ##__VA_ARGS__); \ +} \ +module_init(__driver##_init); \ +static void __exit __driver##_exit(void) \ +{ \ + __unregister(&(__driver) , ##__VA_ARGS__); \ +} \ +module_exit(__driver##_exit); + +/** + * builtin_driver() - Helper macro for drivers that don't do anything + * special in init and have no exit. This eliminates some boilerplate. + * Each driver may only use this macro once, and calling it replaces + * device_initcall (or in some cases, the legacy __initcall). This is + * meant to be a direct parallel of module_driver() above but without + * the __exit stuff that is not used for builtin cases. + * + * @__driver: driver name + * @__register: register function for this driver type + * @...: Additional arguments to be passed to __register + * + * Use this macro to construct bus specific macros for registering + * drivers, and do not use it on its own. + */ +#define builtin_driver(__driver, __register, ...) \ +static int __init __driver##_init(void) \ +{ \ + return __register(&(__driver) , ##__VA_ARGS__); \ +} \ +device_initcall(__driver##_init); + +#endif /* _DEVICE_DRIVER_H_ */ diff --git a/init/main.c b/init/main.c index ec3a1463ac69..22840d3048b3 100644 --- a/init/main.c +++ b/init/main.c @@ -63,7 +63,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From 76db5a27a827c2c89e5120a3d486472da847863b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 11 Jan 2020 01:03:32 +0900 Subject: bootconfig: Add Extra Boot Config support Extra Boot Config (XBC) allows admin to pass a tree-structured boot configuration file when boot up the kernel. This extends the kernel command line in an efficient way. Boot config will contain some key-value commands, e.g. key.word = value1 another.key.word = value2 It can fold same keys with braces, also you can write array data. For example, key { word1 { setting1 = data setting2 } word2.array = "val1", "val2" } User can access these key-value pair and tree structure via SKC APIs. Link: http://lkml.kernel.org/r/157867221257.17873.1775090991929862549.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- MAINTAINERS | 6 + include/linux/bootconfig.h | 224 +++++++++++++ init/Kconfig | 11 + lib/Kconfig | 3 + lib/Makefile | 2 + lib/bootconfig.c | 803 +++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 1049 insertions(+) create mode 100644 include/linux/bootconfig.h create mode 100644 lib/bootconfig.c (limited to 'init') diff --git a/MAINTAINERS b/MAINTAINERS index 4017e6b760be..8597285eb7c8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15770,6 +15770,12 @@ W: http://www.stlinux.com S: Supported F: drivers/net/ethernet/stmicro/stmmac/ +EXTRA BOOT CONFIG +M: Masami Hiramatsu +S: Maintained +F: lib/bootconfig.c +F: include/linux/bootconfig.h + SUN3/3X M: Sam Creasey W: http://sammy.net/sun3/ diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h new file mode 100644 index 000000000000..7e18c939663e --- /dev/null +++ b/include/linux/bootconfig.h @@ -0,0 +1,224 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_XBC_H +#define _LINUX_XBC_H +/* + * Extra Boot Config + * Copyright (C) 2019 Linaro Ltd. + * Author: Masami Hiramatsu + */ + +#include +#include + +/* XBC tree node */ +struct xbc_node { + u16 next; + u16 child; + u16 parent; + u16 data; +} __attribute__ ((__packed__)); + +#define XBC_KEY 0 +#define XBC_VALUE (1 << 15) +/* Maximum size of boot config is 32KB - 1 */ +#define XBC_DATA_MAX (XBC_VALUE - 1) + +#define XBC_NODE_MAX 1024 +#define XBC_KEYLEN_MAX 256 +#define XBC_DEPTH_MAX 16 + +/* Node tree access raw APIs */ +struct xbc_node * __init xbc_root_node(void); +int __init xbc_node_index(struct xbc_node *node); +struct xbc_node * __init xbc_node_get_parent(struct xbc_node *node); +struct xbc_node * __init xbc_node_get_child(struct xbc_node *node); +struct xbc_node * __init xbc_node_get_next(struct xbc_node *node); +const char * __init xbc_node_get_data(struct xbc_node *node); + +/** + * xbc_node_is_value() - Test the node is a value node + * @node: An XBC node. + * + * Test the @node is a value node and return true if a value node, false if not. + */ +static inline __init bool xbc_node_is_value(struct xbc_node *node) +{ + return node->data & XBC_VALUE; +} + +/** + * xbc_node_is_key() - Test the node is a key node + * @node: An XBC node. + * + * Test the @node is a key node and return true if a key node, false if not. + */ +static inline __init bool xbc_node_is_key(struct xbc_node *node) +{ + return !xbc_node_is_value(node); +} + +/** + * xbc_node_is_array() - Test the node is an arraied value node + * @node: An XBC node. + * + * Test the @node is an arraied value node. + */ +static inline __init bool xbc_node_is_array(struct xbc_node *node) +{ + return xbc_node_is_value(node) && node->next != 0; +} + +/** + * xbc_node_is_leaf() - Test the node is a leaf key node + * @node: An XBC node. + * + * Test the @node is a leaf key node which is a key node and has a value node + * or no child. Returns true if it is a leaf node, or false if not. + */ +static inline __init bool xbc_node_is_leaf(struct xbc_node *node) +{ + return xbc_node_is_key(node) && + (!node->child || xbc_node_is_value(xbc_node_get_child(node))); +} + +/* Tree-based key-value access APIs */ +struct xbc_node * __init xbc_node_find_child(struct xbc_node *parent, + const char *key); + +const char * __init xbc_node_find_value(struct xbc_node *parent, + const char *key, + struct xbc_node **vnode); + +struct xbc_node * __init xbc_node_find_next_leaf(struct xbc_node *root, + struct xbc_node *leaf); + +const char * __init xbc_node_find_next_key_value(struct xbc_node *root, + struct xbc_node **leaf); + +/** + * xbc_find_value() - Find a value which matches the key + * @key: Search key + * @vnode: A container pointer of XBC value node. + * + * Search a value whose key matches @key from whole of XBC tree and return + * the value if found. Found value node is stored in *@vnode. + * Note that this can return 0-length string and store NULL in *@vnode for + * key-only (non-value) entry. + */ +static inline const char * __init +xbc_find_value(const char *key, struct xbc_node **vnode) +{ + return xbc_node_find_value(NULL, key, vnode); +} + +/** + * xbc_find_node() - Find a node which matches the key + * @key: Search key + * + * Search a (key) node whose key matches @key from whole of XBC tree and + * return the node if found. If not found, returns NULL. + */ +static inline struct xbc_node * __init xbc_find_node(const char *key) +{ + return xbc_node_find_child(NULL, key); +} + +/** + * xbc_array_for_each_value() - Iterate value nodes on an array + * @anode: An XBC arraied value node + * @value: A value + * + * Iterate array value nodes and values starts from @anode. This is expected to + * be used with xbc_find_value() and xbc_node_find_value(), so that user can + * process each array entry node. + */ +#define xbc_array_for_each_value(anode, value) \ + for (value = xbc_node_get_data(anode); anode != NULL ; \ + anode = xbc_node_get_next(anode), \ + value = anode ? xbc_node_get_data(anode) : NULL) + +/** + * xbc_node_for_each_child() - Iterate child nodes + * @parent: An XBC node. + * @child: Iterated XBC node. + * + * Iterate child nodes of @parent. Each child nodes are stored to @child. + */ +#define xbc_node_for_each_child(parent, child) \ + for (child = xbc_node_get_child(parent); child != NULL ; \ + child = xbc_node_get_next(child)) + +/** + * xbc_node_for_each_array_value() - Iterate array entries of geven key + * @node: An XBC node. + * @key: A key string searched under @node + * @anode: Iterated XBC node of array entry. + * @value: Iterated value of array entry. + * + * Iterate array entries of given @key under @node. Each array entry node + * is stroed to @anode and @value. If the @node doesn't have @key node, + * it does nothing. + * Note that even if the found key node has only one value (not array) + * this executes block once. Hoever, if the found key node has no value + * (key-only node), this does nothing. So don't use this for testing the + * key-value pair existence. + */ +#define xbc_node_for_each_array_value(node, key, anode, value) \ + for (value = xbc_node_find_value(node, key, &anode); value != NULL; \ + anode = xbc_node_get_next(anode), \ + value = anode ? xbc_node_get_data(anode) : NULL) + +/** + * xbc_node_for_each_key_value() - Iterate key-value pairs under a node + * @node: An XBC node. + * @knode: Iterated key node + * @value: Iterated value string + * + * Iterate key-value pairs under @node. Each key node and value string are + * stored in @knode and @value respectively. + */ +#define xbc_node_for_each_key_value(node, knode, value) \ + for (knode = NULL, value = xbc_node_find_next_key_value(node, &knode);\ + knode != NULL; value = xbc_node_find_next_key_value(node, &knode)) + +/** + * xbc_for_each_key_value() - Iterate key-value pairs + * @knode: Iterated key node + * @value: Iterated value string + * + * Iterate key-value pairs in whole XBC tree. Each key node and value string + * are stored in @knode and @value respectively. + */ +#define xbc_for_each_key_value(knode, value) \ + xbc_node_for_each_key_value(NULL, knode, value) + +/* Compose partial key */ +int __init xbc_node_compose_key_after(struct xbc_node *root, + struct xbc_node *node, char *buf, size_t size); + +/** + * xbc_node_compose_key() - Compose full key string of the XBC node + * @node: An XBC node. + * @buf: A buffer to store the key. + * @size: The size of the @buf. + * + * Compose the full-length key of the @node into @buf. Returns the total + * length of the key stored in @buf. Or returns -EINVAL if @node is NULL, + * and -ERANGE if the key depth is deeper than max depth. + */ +static inline int __init xbc_node_compose_key(struct xbc_node *node, + char *buf, size_t size) +{ + return xbc_node_compose_key_after(NULL, node, buf, size); +} + +/* XBC node initializer */ +int __init xbc_init(char *buf); + +/* XBC cleanup data structures */ +void __init xbc_destroy_all(void); + +/* Debug dump functions */ +void __init xbc_debug_dump(void); + +#endif diff --git a/init/Kconfig b/init/Kconfig index a34064a031a5..63450d3bbf12 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1215,6 +1215,17 @@ source "usr/Kconfig" endif +config BOOT_CONFIG + bool "Boot config support" + select LIBXBC + default y + help + Extra boot config allows system admin to pass a config file as + complemental extension of kernel cmdline when booting. + The boot config file is usually attached at the end of initramfs. + + If unsure, say Y. + choice prompt "Compiler optimization level" default CC_OPTIMIZE_FOR_PERFORMANCE diff --git a/lib/Kconfig b/lib/Kconfig index 6e790dc55c5b..10012b646009 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -566,6 +566,9 @@ config DIMLIB config LIBFDT bool +config LIBXBC + bool + config OID_REGISTRY tristate help diff --git a/lib/Makefile b/lib/Makefile index 93217d44237f..75a64d2552a2 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -228,6 +228,8 @@ $(foreach file, $(libfdt_files), \ $(eval CFLAGS_$(file) = -I $(srctree)/scripts/dtc/libfdt)) lib-$(CONFIG_LIBFDT) += $(libfdt_files) +lib-$(CONFIG_LIBXBC) += bootconfig.o + obj-$(CONFIG_RBTREE_TEST) += rbtree_test.o obj-$(CONFIG_INTERVAL_TREE_TEST) += interval_tree_test.o diff --git a/lib/bootconfig.c b/lib/bootconfig.c new file mode 100644 index 000000000000..055014e233a5 --- /dev/null +++ b/lib/bootconfig.c @@ -0,0 +1,803 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Extra Boot Config + * Masami Hiramatsu + */ + +#define pr_fmt(fmt) "bootconfig: " fmt + +#include +#include +#include +#include +#include +#include +#include + +/* + * Extra Boot Config (XBC) is given as tree-structured ascii text of + * key-value pairs on memory. + * xbc_parse() parses the text to build a simple tree. Each tree node is + * simply a key word or a value. A key node may have a next key node or/and + * a child node (both key and value). A value node may have a next value + * node (for array). + */ + +static struct xbc_node xbc_nodes[XBC_NODE_MAX] __initdata; +static int xbc_node_num __initdata; +static char *xbc_data __initdata; +static size_t xbc_data_size __initdata; +static struct xbc_node *last_parent __initdata; + +static int __init xbc_parse_error(const char *msg, const char *p) +{ + int pos = p - xbc_data; + + pr_err("Parse error at pos %d: %s\n", pos, msg); + return -EINVAL; +} + +/** + * xbc_root_node() - Get the root node of extended boot config + * + * Return the address of root node of extended boot config. If the + * extended boot config is not initiized, return NULL. + */ +struct xbc_node * __init xbc_root_node(void) +{ + if (unlikely(!xbc_data)) + return NULL; + + return xbc_nodes; +} + +/** + * xbc_node_index() - Get the index of XBC node + * @node: A target node of getting index. + * + * Return the index number of @node in XBC node list. + */ +int __init xbc_node_index(struct xbc_node *node) +{ + return node - &xbc_nodes[0]; +} + +/** + * xbc_node_get_parent() - Get the parent XBC node + * @node: An XBC node. + * + * Return the parent node of @node. If the node is top node of the tree, + * return NULL. + */ +struct xbc_node * __init xbc_node_get_parent(struct xbc_node *node) +{ + return node->parent == XBC_NODE_MAX ? NULL : &xbc_nodes[node->parent]; +} + +/** + * xbc_node_get_child() - Get the child XBC node + * @node: An XBC node. + * + * Return the first child node of @node. If the node has no child, return + * NULL. + */ +struct xbc_node * __init xbc_node_get_child(struct xbc_node *node) +{ + return node->child ? &xbc_nodes[node->child] : NULL; +} + +/** + * xbc_node_get_next() - Get the next sibling XBC node + * @node: An XBC node. + * + * Return the NEXT sibling node of @node. If the node has no next sibling, + * return NULL. Note that even if this returns NULL, it doesn't mean @node + * has no siblings. (You also has to check whether the parent's child node + * is @node or not.) + */ +struct xbc_node * __init xbc_node_get_next(struct xbc_node *node) +{ + return node->next ? &xbc_nodes[node->next] : NULL; +} + +/** + * xbc_node_get_data() - Get the data of XBC node + * @node: An XBC node. + * + * Return the data (which is always a null terminated string) of @node. + * If the node has invalid data, warn and return NULL. + */ +const char * __init xbc_node_get_data(struct xbc_node *node) +{ + int offset = node->data & ~XBC_VALUE; + + if (WARN_ON(offset >= xbc_data_size)) + return NULL; + + return xbc_data + offset; +} + +static bool __init +xbc_node_match_prefix(struct xbc_node *node, const char **prefix) +{ + const char *p = xbc_node_get_data(node); + int len = strlen(p); + + if (strncmp(*prefix, p, len)) + return false; + + p = *prefix + len; + if (*p == '.') + p++; + else if (*p != '\0') + return false; + *prefix = p; + + return true; +} + +/** + * xbc_node_find_child() - Find a child node which matches given key + * @parent: An XBC node. + * @key: A key string. + * + * Search a node under @parent which matches @key. The @key can contain + * several words jointed with '.'. If @parent is NULL, this searches the + * node from whole tree. Return NULL if no node is matched. + */ +struct xbc_node * __init +xbc_node_find_child(struct xbc_node *parent, const char *key) +{ + struct xbc_node *node; + + if (parent) + node = xbc_node_get_child(parent); + else + node = xbc_root_node(); + + while (node && xbc_node_is_key(node)) { + if (!xbc_node_match_prefix(node, &key)) + node = xbc_node_get_next(node); + else if (*key != '\0') + node = xbc_node_get_child(node); + else + break; + } + + return node; +} + +/** + * xbc_node_find_value() - Find a value node which matches given key + * @parent: An XBC node. + * @key: A key string. + * @vnode: A container pointer of found XBC node. + * + * Search a value node under @parent whose (parent) key node matches @key, + * store it in *@vnode, and returns the value string. + * The @key can contain several words jointed with '.'. If @parent is NULL, + * this searches the node from whole tree. Return the value string if a + * matched key found, return NULL if no node is matched. + * Note that this returns 0-length string and stores NULL in *@vnode if the + * key has no value. And also it will return the value of the first entry if + * the value is an array. + */ +const char * __init +xbc_node_find_value(struct xbc_node *parent, const char *key, + struct xbc_node **vnode) +{ + struct xbc_node *node = xbc_node_find_child(parent, key); + + if (!node || !xbc_node_is_key(node)) + return NULL; + + node = xbc_node_get_child(node); + if (node && !xbc_node_is_value(node)) + return NULL; + + if (vnode) + *vnode = node; + + return node ? xbc_node_get_data(node) : ""; +} + +/** + * xbc_node_compose_key_after() - Compose partial key string of the XBC node + * @root: Root XBC node + * @node: Target XBC node. + * @buf: A buffer to store the key. + * @size: The size of the @buf. + * + * Compose the partial key of the @node into @buf, which is starting right + * after @root (@root is not included.) If @root is NULL, this returns full + * key words of @node. + * Returns the total length of the key stored in @buf. Returns -EINVAL + * if @node is NULL or @root is not the ancestor of @node or @root is @node, + * or returns -ERANGE if the key depth is deeper than max depth. + * This is expected to be used with xbc_find_node() to list up all (child) + * keys under given key. + */ +int __init xbc_node_compose_key_after(struct xbc_node *root, + struct xbc_node *node, + char *buf, size_t size) +{ + u16 keys[XBC_DEPTH_MAX]; + int depth = 0, ret = 0, total = 0; + + if (!node || node == root) + return -EINVAL; + + if (xbc_node_is_value(node)) + node = xbc_node_get_parent(node); + + while (node && node != root) { + keys[depth++] = xbc_node_index(node); + if (depth == XBC_DEPTH_MAX) + return -ERANGE; + node = xbc_node_get_parent(node); + } + if (!node && root) + return -EINVAL; + + while (--depth >= 0) { + node = xbc_nodes + keys[depth]; + ret = snprintf(buf, size, "%s%s", xbc_node_get_data(node), + depth ? "." : ""); + if (ret < 0) + return ret; + if (ret > size) { + size = 0; + } else { + size -= ret; + buf += ret; + } + total += ret; + } + + return total; +} + +/** + * xbc_node_find_next_leaf() - Find the next leaf node under given node + * @root: An XBC root node + * @node: An XBC node which starts from. + * + * Search the next leaf node (which means the terminal key node) of @node + * under @root node (including @root node itself). + * Return the next node or NULL if next leaf node is not found. + */ +struct xbc_node * __init xbc_node_find_next_leaf(struct xbc_node *root, + struct xbc_node *node) +{ + if (unlikely(!xbc_data)) + return NULL; + + if (!node) { /* First try */ + node = root; + if (!node) + node = xbc_nodes; + } else { + if (node == root) /* @root was a leaf, no child node. */ + return NULL; + + while (!node->next) { + node = xbc_node_get_parent(node); + if (node == root) + return NULL; + /* User passed a node which is not uder parent */ + if (WARN_ON(!node)) + return NULL; + } + node = xbc_node_get_next(node); + } + + while (node && !xbc_node_is_leaf(node)) + node = xbc_node_get_child(node); + + return node; +} + +/** + * xbc_node_find_next_key_value() - Find the next key-value pair nodes + * @root: An XBC root node + * @leaf: A container pointer of XBC node which starts from. + * + * Search the next leaf node (which means the terminal key node) of *@leaf + * under @root node. Returns the value and update *@leaf if next leaf node + * is found, or NULL if no next leaf node is found. + * Note that this returns 0-length string if the key has no value, or + * the value of the first entry if the value is an array. + */ +const char * __init xbc_node_find_next_key_value(struct xbc_node *root, + struct xbc_node **leaf) +{ + /* tip must be passed */ + if (WARN_ON(!leaf)) + return NULL; + + *leaf = xbc_node_find_next_leaf(root, *leaf); + if (!*leaf) + return NULL; + if ((*leaf)->child) + return xbc_node_get_data(xbc_node_get_child(*leaf)); + else + return ""; /* No value key */ +} + +/* XBC parse and tree build */ + +static struct xbc_node * __init xbc_add_node(char *data, u32 flag) +{ + struct xbc_node *node; + unsigned long offset; + + if (xbc_node_num == XBC_NODE_MAX) + return NULL; + + node = &xbc_nodes[xbc_node_num++]; + offset = data - xbc_data; + node->data = (u16)offset; + if (WARN_ON(offset >= XBC_DATA_MAX)) + return NULL; + node->data |= flag; + node->child = 0; + node->next = 0; + + return node; +} + +static inline __init struct xbc_node *xbc_last_sibling(struct xbc_node *node) +{ + while (node->next) + node = xbc_node_get_next(node); + + return node; +} + +static struct xbc_node * __init xbc_add_sibling(char *data, u32 flag) +{ + struct xbc_node *sib, *node = xbc_add_node(data, flag); + + if (node) { + if (!last_parent) { + node->parent = XBC_NODE_MAX; + sib = xbc_last_sibling(xbc_nodes); + sib->next = xbc_node_index(node); + } else { + node->parent = xbc_node_index(last_parent); + if (!last_parent->child) { + last_parent->child = xbc_node_index(node); + } else { + sib = xbc_node_get_child(last_parent); + sib = xbc_last_sibling(sib); + sib->next = xbc_node_index(node); + } + } + } + + return node; +} + +static inline __init struct xbc_node *xbc_add_child(char *data, u32 flag) +{ + struct xbc_node *node = xbc_add_sibling(data, flag); + + if (node) + last_parent = node; + + return node; +} + +static inline __init bool xbc_valid_keyword(char *key) +{ + if (key[0] == '\0') + return false; + + while (isalnum(*key) || *key == '-' || *key == '_') + key++; + + return *key == '\0'; +} + +static char *skip_comment(char *p) +{ + char *ret; + + ret = strchr(p, '\n'); + if (!ret) + ret = p + strlen(p); + else + ret++; + + return ret; +} + +static char *skip_spaces_until_newline(char *p) +{ + while (isspace(*p) && *p != '\n') + p++; + return p; +} + +static int __init __xbc_open_brace(void) +{ + /* Mark the last key as open brace */ + last_parent->next = XBC_NODE_MAX; + + return 0; +} + +static int __init __xbc_close_brace(char *p) +{ + struct xbc_node *node; + + if (!last_parent || last_parent->next != XBC_NODE_MAX) + return xbc_parse_error("Unexpected closing brace", p); + + node = last_parent; + node->next = 0; + do { + node = xbc_node_get_parent(node); + } while (node && node->next != XBC_NODE_MAX); + last_parent = node; + + return 0; +} + +/* + * Return delimiter or error, no node added. As same as lib/cmdline.c, + * you can use " around spaces, but can't escape " for value. + */ +static int __init __xbc_parse_value(char **__v, char **__n) +{ + char *p, *v = *__v; + int c, quotes = 0; + + v = skip_spaces(v); + while (*v == '#') { + v = skip_comment(v); + v = skip_spaces(v); + } + if (*v == '"' || *v == '\'') { + quotes = *v; + v++; + } + p = v - 1; + while ((c = *++p)) { + if (!isprint(c) && !isspace(c)) + return xbc_parse_error("Non printable value", p); + if (quotes) { + if (c != quotes) + continue; + quotes = 0; + *p++ = '\0'; + p = skip_spaces_until_newline(p); + c = *p; + if (c && !strchr(",;\n#}", c)) + return xbc_parse_error("No value delimiter", p); + if (*p) + p++; + break; + } + if (strchr(",;\n#}", c)) { + v = strim(v); + *p++ = '\0'; + break; + } + } + if (quotes) + return xbc_parse_error("No closing quotes", p); + if (c == '#') { + p = skip_comment(p); + c = '\n'; /* A comment must be treated as a newline */ + } + *__n = p; + *__v = v; + + return c; +} + +static int __init xbc_parse_array(char **__v) +{ + struct xbc_node *node; + char *next; + int c = 0; + + do { + c = __xbc_parse_value(__v, &next); + if (c < 0) + return c; + + node = xbc_add_sibling(*__v, XBC_VALUE); + if (!node) + return -ENOMEM; + *__v = next; + } while (c == ','); + node->next = 0; + + return c; +} + +static inline __init +struct xbc_node *find_match_node(struct xbc_node *node, char *k) +{ + while (node) { + if (!strcmp(xbc_node_get_data(node), k)) + break; + node = xbc_node_get_next(node); + } + return node; +} + +static int __init __xbc_add_key(char *k) +{ + struct xbc_node *node; + + if (!xbc_valid_keyword(k)) + return xbc_parse_error("Invalid keyword", k); + + if (unlikely(xbc_node_num == 0)) + goto add_node; + + if (!last_parent) /* the first level */ + node = find_match_node(xbc_nodes, k); + else + node = find_match_node(xbc_node_get_child(last_parent), k); + + if (node) + last_parent = node; + else { +add_node: + node = xbc_add_child(k, XBC_KEY); + if (!node) + return -ENOMEM; + } + return 0; +} + +static int __init __xbc_parse_keys(char *k) +{ + char *p; + int ret; + + k = strim(k); + while ((p = strchr(k, '.'))) { + *p++ = '\0'; + ret = __xbc_add_key(k); + if (ret) + return ret; + k = p; + } + + return __xbc_add_key(k); +} + +static int __init xbc_parse_kv(char **k, char *v) +{ + struct xbc_node *prev_parent = last_parent; + struct xbc_node *node; + char *next; + int c, ret; + + ret = __xbc_parse_keys(*k); + if (ret) + return ret; + + c = __xbc_parse_value(&v, &next); + if (c < 0) + return c; + + node = xbc_add_sibling(v, XBC_VALUE); + if (!node) + return -ENOMEM; + + if (c == ',') { /* Array */ + c = xbc_parse_array(&next); + if (c < 0) + return c; + } + + last_parent = prev_parent; + + if (c == '}') { + ret = __xbc_close_brace(next - 1); + if (ret < 0) + return ret; + } + + *k = next; + + return 0; +} + +static int __init xbc_parse_key(char **k, char *n) +{ + struct xbc_node *prev_parent = last_parent; + int ret; + + *k = strim(*k); + if (**k != '\0') { + ret = __xbc_parse_keys(*k); + if (ret) + return ret; + last_parent = prev_parent; + } + *k = n; + + return 0; +} + +static int __init xbc_open_brace(char **k, char *n) +{ + int ret; + + ret = __xbc_parse_keys(*k); + if (ret) + return ret; + *k = n; + + return __xbc_open_brace(); +} + +static int __init xbc_close_brace(char **k, char *n) +{ + int ret; + + ret = xbc_parse_key(k, n); + if (ret) + return ret; + /* k is updated in xbc_parse_key() */ + + return __xbc_close_brace(n - 1); +} + +static int __init xbc_verify_tree(void) +{ + int i, depth, len, wlen; + struct xbc_node *n, *m; + + /* Empty tree */ + if (xbc_node_num == 0) + return -ENOENT; + + for (i = 0; i < xbc_node_num; i++) { + if (xbc_nodes[i].next > xbc_node_num) { + return xbc_parse_error("No closing brace", + xbc_node_get_data(xbc_nodes + i)); + } + } + + /* Key tree limitation check */ + n = &xbc_nodes[0]; + depth = 1; + len = 0; + + while (n) { + wlen = strlen(xbc_node_get_data(n)) + 1; + len += wlen; + if (len > XBC_KEYLEN_MAX) + return xbc_parse_error("Too long key length", + xbc_node_get_data(n)); + + m = xbc_node_get_child(n); + if (m && xbc_node_is_key(m)) { + n = m; + depth++; + if (depth > XBC_DEPTH_MAX) + return xbc_parse_error("Too many key words", + xbc_node_get_data(n)); + continue; + } + len -= wlen; + m = xbc_node_get_next(n); + while (!m) { + n = xbc_node_get_parent(n); + if (!n) + break; + len -= strlen(xbc_node_get_data(n)) + 1; + depth--; + m = xbc_node_get_next(n); + } + n = m; + } + + return 0; +} + +/** + * xbc_destroy_all() - Clean up all parsed bootconfig + * + * This clears all data structures of parsed bootconfig on memory. + * If you need to reuse xbc_init() with new boot config, you can + * use this. + */ +void __init xbc_destroy_all(void) +{ + xbc_data = NULL; + xbc_data_size = 0; + xbc_node_num = 0; + memset(xbc_nodes, 0, sizeof(xbc_nodes)); +} + +/** + * xbc_init() - Parse given XBC file and build XBC internal tree + * @buf: boot config text + * + * This parses the boot config text in @buf. @buf must be a + * null terminated string and smaller than XBC_DATA_MAX. + * Return 0 if succeeded, or -errno if there is any error. + */ +int __init xbc_init(char *buf) +{ + char *p, *q; + int ret, c; + + if (xbc_data) + return -EBUSY; + + ret = strlen(buf); + if (ret > XBC_DATA_MAX - 1 || ret == 0) + return -ERANGE; + + xbc_data = buf; + xbc_data_size = ret + 1; + last_parent = NULL; + + p = buf; + do { + q = strpbrk(p, "{}=;\n#"); + if (!q) { + p = skip_spaces(p); + if (*p != '\0') + ret = xbc_parse_error("No delimiter", p); + break; + } + + c = *q; + *q++ = '\0'; + switch (c) { + case '=': + ret = xbc_parse_kv(&p, q); + break; + case '{': + ret = xbc_open_brace(&p, q); + break; + case '#': + q = skip_comment(q); + /* fall through */ + case ';': + case '\n': + ret = xbc_parse_key(&p, q); + break; + case '}': + ret = xbc_close_brace(&p, q); + break; + } + } while (!ret); + + if (!ret) + ret = xbc_verify_tree(); + + if (ret < 0) + xbc_destroy_all(); + + return ret; +} + +/** + * xbc_debug_dump() - Dump current XBC node list + * + * Dump the current XBC node list on printk buffer for debug. + */ +void __init xbc_debug_dump(void) +{ + int i; + + for (i = 0; i < xbc_node_num; i++) { + pr_debug("[%d] %s (%s) .next=%d, .child=%d .parent=%d\n", i, + xbc_node_get_data(xbc_nodes + i), + xbc_node_is_value(xbc_nodes + i) ? "value" : "key", + xbc_nodes[i].next, xbc_nodes[i].child, + xbc_nodes[i].parent); + } +} -- cgit v1.2.3 From 7684b8582c24537dbe079a7d40e1d7e57ca56939 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 11 Jan 2020 01:03:44 +0900 Subject: bootconfig: Load boot config from the tail of initrd Load the extended boot config data from the tail of initrd image. If there is an SKC data there, it has [(u32)size][(u32)checksum] header (in really, this is a footer) at the end of initrd. If the checksum (simple sum of bytes) is match, this starts parsing it from there. Link: http://lkml.kernel.org/r/157867222435.17873.9936667353335606867.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- init/Kconfig | 1 + init/main.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) (limited to 'init') diff --git a/init/Kconfig b/init/Kconfig index 63450d3bbf12..ffd240fb88c3 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1217,6 +1217,7 @@ endif config BOOT_CONFIG bool "Boot config support" + depends on BLK_DEV_INITRD select LIBXBC default y help diff --git a/init/main.c b/init/main.c index 2cd736059416..59c418a57f92 100644 --- a/init/main.c +++ b/init/main.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -245,6 +246,58 @@ static int __init loglevel(char *str) early_param("loglevel", loglevel); +#ifdef CONFIG_BOOT_CONFIG +u32 boot_config_checksum(unsigned char *p, u32 size) +{ + u32 ret = 0; + + while (size--) + ret += *p++; + + return ret; +} + +static void __init setup_boot_config(void) +{ + u32 size, csum; + char *data, *copy; + u32 *hdr; + + if (!initrd_end) + return; + + hdr = (u32 *)(initrd_end - 8); + size = hdr[0]; + csum = hdr[1]; + + if (size >= XBC_DATA_MAX) + return; + + data = ((void *)hdr) - size; + if ((unsigned long)data < initrd_start) + return; + + if (boot_config_checksum((unsigned char *)data, size) != csum) + return; + + copy = memblock_alloc(size + 1, SMP_CACHE_BYTES); + if (!copy) { + pr_err("Failed to allocate memory for boot config\n"); + return; + } + + memcpy(copy, data, size); + copy[size] = '\0'; + + if (xbc_init(copy) < 0) + pr_err("Failed to parse boot config\n"); + else + pr_info("Load boot config: %d bytes\n", size); +} +#else +#define setup_boot_config() do { } while (0) +#endif + /* Change NUL term back to "=", to make "param" the whole string. */ static int __init repair_env_string(char *param, char *val, const char *unused, void *arg) @@ -595,6 +648,7 @@ asmlinkage __visible void __init start_kernel(void) pr_notice("%s", linux_banner); early_security_init(); setup_arch(&command_line); + setup_boot_config(); setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); -- cgit v1.2.3 From 0068c92a92707789b8711e40d584a2433481a29d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 11 Jan 2020 01:04:31 +0900 Subject: init/main.c: Alloc initcall_command_line in do_initcall() and free it Since initcall_command_line is used as a temporary buffer, it could be freed after usage. Allocate it in do_initcall() and free it after used. Link: http://lkml.kernel.org/r/157867227145.17873.17513760552008505454.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- init/main.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'init') diff --git a/init/main.c b/init/main.c index 59c418a57f92..0b4e0c8ccf16 100644 --- a/init/main.c +++ b/init/main.c @@ -137,8 +137,6 @@ char __initdata boot_command_line[COMMAND_LINE_SIZE]; char *saved_command_line; /* Command line for parameter parsing */ static char *static_command_line; -/* Command line for per-initcall parameter parsing */ -static char *initcall_command_line; static char *execute_command; static char *ramdisk_execute_command; @@ -433,10 +431,6 @@ static void __init setup_command_line(char *command_line) if (!saved_command_line) panic("%s: Failed to allocate %zu bytes\n", __func__, len); - initcall_command_line = memblock_alloc(len, SMP_CACHE_BYTES); - if (!initcall_command_line) - panic("%s: Failed to allocate %zu bytes\n", __func__, len); - static_command_line = memblock_alloc(len, SMP_CACHE_BYTES); if (!static_command_line) panic("%s: Failed to allocate %zu bytes\n", __func__, len); @@ -1044,13 +1038,12 @@ static const char *initcall_level_names[] __initdata = { "late", }; -static void __init do_initcall_level(int level) +static void __init do_initcall_level(int level, char *command_line) { initcall_entry_t *fn; - strcpy(initcall_command_line, saved_command_line); parse_args(initcall_level_names[level], - initcall_command_line, __start___param, + command_line, __start___param, __stop___param - __start___param, level, level, NULL, &repair_env_string); @@ -1063,9 +1056,20 @@ static void __init do_initcall_level(int level) static void __init do_initcalls(void) { int level; + size_t len = strlen(saved_command_line) + 1; + char *command_line; + + command_line = kzalloc(len, GFP_KERNEL); + if (!command_line) + panic("%s: Failed to allocate %zu bytes\n", __func__, len); + + for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++) { + /* Parser modifies command_line, restore it each time */ + strcpy(command_line, saved_command_line); + do_initcall_level(level, command_line); + } - for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++) - do_initcall_level(level); + kfree(command_line); } /* -- cgit v1.2.3 From 51887d03aca101a24ab049179d1ab430464a24e6 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 11 Jan 2020 01:04:43 +0900 Subject: bootconfig: init: Allow admin to use bootconfig for kernel command line Since the current kernel command line is too short to describe many options which supported by kernel, allow user to use boot config to setup (add) the command line options. All kernel parameters under "kernel." keywords will be used for setting up extra kernel command line. For example, kernel { audit = on audit_backlog_limit = 256 } Note that you can not specify some early parameters (like console etc.) by this method, since it is loaded after early parameters parsed. Link: http://lkml.kernel.org/r/157867228333.17873.11962796367032622466.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- init/main.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 101 insertions(+), 5 deletions(-) (limited to 'init') diff --git a/init/main.c b/init/main.c index 0b4e0c8ccf16..c0017d9d16e7 100644 --- a/init/main.c +++ b/init/main.c @@ -137,6 +137,8 @@ char __initdata boot_command_line[COMMAND_LINE_SIZE]; char *saved_command_line; /* Command line for parameter parsing */ static char *static_command_line; +/* Untouched extra command line */ +static char *extra_command_line; static char *execute_command; static char *ramdisk_execute_command; @@ -245,6 +247,83 @@ static int __init loglevel(char *str) early_param("loglevel", loglevel); #ifdef CONFIG_BOOT_CONFIG + +char xbc_namebuf[XBC_KEYLEN_MAX] __initdata; + +#define rest(dst, end) ((end) > (dst) ? (end) - (dst) : 0) + +static int __init xbc_snprint_cmdline(char *buf, size_t size, + struct xbc_node *root) +{ + struct xbc_node *knode, *vnode; + char *end = buf + size; + char c = '\"'; + const char *val; + int ret; + + xbc_node_for_each_key_value(root, knode, val) { + ret = xbc_node_compose_key_after(root, knode, + xbc_namebuf, XBC_KEYLEN_MAX); + if (ret < 0) + return ret; + + vnode = xbc_node_get_child(knode); + ret = snprintf(buf, rest(buf, end), "%s%c", xbc_namebuf, + vnode ? '=' : ' '); + if (ret < 0) + return ret; + buf += ret; + if (!vnode) + continue; + + c = '\"'; + xbc_array_for_each_value(vnode, val) { + ret = snprintf(buf, rest(buf, end), "%c%s", c, val); + if (ret < 0) + return ret; + buf += ret; + c = ','; + } + if (rest(buf, end) > 2) + strcpy(buf, "\" "); + buf += 2; + } + + return buf - (end - size); +} +#undef rest + +/* Make an extra command line under given key word */ +static char * __init xbc_make_cmdline(const char *key) +{ + struct xbc_node *root; + char *new_cmdline; + int ret, len = 0; + + root = xbc_find_node(key); + if (!root) + return NULL; + + /* Count required buffer size */ + len = xbc_snprint_cmdline(NULL, 0, root); + if (len <= 0) + return NULL; + + new_cmdline = memblock_alloc(len + 1, SMP_CACHE_BYTES); + if (!new_cmdline) { + pr_err("Failed to allocate memory for extra kernel cmdline.\n"); + return NULL; + } + + ret = xbc_snprint_cmdline(new_cmdline, len + 1, root); + if (ret < 0 || ret > len) { + pr_err("Failed to print extra kernel cmdline.\n"); + return NULL; + } + + return new_cmdline; +} + u32 boot_config_checksum(unsigned char *p, u32 size) { u32 ret = 0; @@ -289,8 +368,11 @@ static void __init setup_boot_config(void) if (xbc_init(copy) < 0) pr_err("Failed to parse boot config\n"); - else + else { pr_info("Load boot config: %d bytes\n", size); + /* keys starting with "kernel." are passed via cmdline */ + extra_command_line = xbc_make_cmdline("kernel"); + } } #else #define setup_boot_config() do { } while (0) @@ -425,7 +507,12 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { } */ static void __init setup_command_line(char *command_line) { - size_t len = strlen(boot_command_line) + 1; + size_t len, xlen = 0; + + if (extra_command_line) + xlen = strlen(extra_command_line); + + len = xlen + strlen(boot_command_line) + 1; saved_command_line = memblock_alloc(len, SMP_CACHE_BYTES); if (!saved_command_line) @@ -435,8 +522,17 @@ static void __init setup_command_line(char *command_line) if (!static_command_line) panic("%s: Failed to allocate %zu bytes\n", __func__, len); - strcpy(saved_command_line, boot_command_line); - strcpy(static_command_line, command_line); + if (xlen) { + /* + * We have to put extra_command_line before boot command + * lines because there could be dashes (separator of init + * command line) in the command lines. + */ + strcpy(saved_command_line, extra_command_line); + strcpy(static_command_line, extra_command_line); + } + strcpy(saved_command_line + xlen, boot_command_line); + strcpy(static_command_line + xlen, command_line); } /* @@ -652,7 +748,7 @@ asmlinkage __visible void __init start_kernel(void) build_all_zonelists(NULL); page_alloc_init(); - pr_notice("Kernel command line: %s\n", boot_command_line); + pr_notice("Kernel command line: %s\n", saved_command_line); /* parameters may set static keys */ jump_label_init(); parse_early_param(); -- cgit v1.2.3 From 1319916209ce8f55a4f4f848e74500633e24bb99 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 11 Jan 2020 01:04:55 +0900 Subject: bootconfig: init: Allow admin to use bootconfig for init command line Since the current kernel command line is too short to describe long and many options for init (e.g. systemd command line options), this allows admin to use boot config for init command line. All init command line under "init." keywords will be passed to init. For example, init.systemd { unified_cgroup_hierarchy = 1 debug_shell default_timeout_start_sec = 60 } Link: http://lkml.kernel.org/r/157867229521.17873.654222294326542349.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- init/main.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'init') diff --git a/init/main.c b/init/main.c index c0017d9d16e7..dd7da62d99a5 100644 --- a/init/main.c +++ b/init/main.c @@ -139,6 +139,8 @@ char *saved_command_line; static char *static_command_line; /* Untouched extra command line */ static char *extra_command_line; +/* Extra init arguments */ +static char *extra_init_args; static char *execute_command; static char *ramdisk_execute_command; @@ -372,6 +374,8 @@ static void __init setup_boot_config(void) pr_info("Load boot config: %d bytes\n", size); /* keys starting with "kernel." are passed via cmdline */ extra_command_line = xbc_make_cmdline("kernel"); + /* Also, "init." keys are init arguments */ + extra_init_args = xbc_make_cmdline("init"); } } #else @@ -507,16 +511,18 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { } */ static void __init setup_command_line(char *command_line) { - size_t len, xlen = 0; + size_t len, xlen = 0, ilen = 0; if (extra_command_line) xlen = strlen(extra_command_line); + if (extra_init_args) + ilen = strlen(extra_init_args) + 4; /* for " -- " */ len = xlen + strlen(boot_command_line) + 1; - saved_command_line = memblock_alloc(len, SMP_CACHE_BYTES); + saved_command_line = memblock_alloc(len + ilen, SMP_CACHE_BYTES); if (!saved_command_line) - panic("%s: Failed to allocate %zu bytes\n", __func__, len); + panic("%s: Failed to allocate %zu bytes\n", __func__, len + ilen); static_command_line = memblock_alloc(len, SMP_CACHE_BYTES); if (!static_command_line) @@ -533,6 +539,22 @@ static void __init setup_command_line(char *command_line) } strcpy(saved_command_line + xlen, boot_command_line); strcpy(static_command_line + xlen, command_line); + + if (ilen) { + /* + * Append supplemental init boot args to saved_command_line + * so that user can check what command line options passed + * to init. + */ + len = strlen(saved_command_line); + if (!strstr(boot_command_line, " -- ")) { + strcpy(saved_command_line + len, " -- "); + len += 4; + } else + saved_command_line[len++] = ' '; + + strcpy(saved_command_line + len, extra_init_args); + } } /* @@ -759,6 +781,9 @@ asmlinkage __visible void __init start_kernel(void) if (!IS_ERR_OR_NULL(after_dashes)) parse_args("Setting init args", after_dashes, NULL, 0, -1, -1, NULL, set_init_arg); + if (extra_init_args) + parse_args("Setting extra init args", extra_init_args, + NULL, 0, -1, -1, NULL, set_init_arg); /* * These use large bootmem allocations and must precede -- cgit v1.2.3 From 769071ac9f20b6a447410c7eaa55d1a5233ef40c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:52 +0000 Subject: ns: Introduce Time Namespace Time Namespace isolates clock values. The kernel provides access to several clocks CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, etc. CLOCK_REALTIME System-wide clock that measures real (i.e., wall-clock) time. CLOCK_MONOTONIC Clock that cannot be set and represents monotonic time since some unspecified starting point. CLOCK_BOOTTIME Identical to CLOCK_MONOTONIC, except it also includes any time that the system is suspended. For many users, the time namespace means the ability to changes date and time in a container (CLOCK_REALTIME). Providing per namespace notions of CLOCK_REALTIME would be complex with a massive overhead, but has a dubious value. But in the context of checkpoint/restore functionality, monotonic and boottime clocks become interesting. Both clocks are monotonic with unspecified starting points. These clocks are widely used to measure time slices and set timers. After restoring or migrating processes, it has to be guaranteed that they never go backward. In an ideal case, the behavior of these clocks should be the same as for a case when a whole system is suspended. All this means that it is required to set CLOCK_MONOTONIC and CLOCK_BOOTTIME clocks, which can be achieved by adding per-namespace offsets for clocks. A time namespace is similar to a pid namespace in the way how it is created: unshare(CLONE_NEWTIME) system call creates a new time namespace, but doesn't set it to the current process. Then all children of the process will be born in the new time namespace, or a process can use the setns() system call to join a namespace. This scheme allows setting clock offsets for a namespace, before any processes appear in it. All available clone flags have been used, so CLONE_NEWTIME uses the highest bit of CSIGNAL. It means that it can be used only with the unshare() and the clone3() system calls. [ tglx: Adjusted paragraph about clone3() to reality and massaged the changelog a bit. ] Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://criu.org/Time_namespace Link: https://lists.openvz.org/pipermail/criu/2018-June/041504.html Link: https://lore.kernel.org/r/20191112012724.250792-4-dima@arista.com --- MAINTAINERS | 2 + fs/proc/namespaces.c | 4 + include/linux/nsproxy.h | 2 + include/linux/proc_ns.h | 3 + include/linux/time_namespace.h | 71 ++++++++++++++ include/linux/user_namespace.h | 1 + include/uapi/linux/sched.h | 6 ++ init/Kconfig | 7 ++ kernel/fork.c | 16 ++- kernel/nsproxy.c | 41 ++++++-- kernel/time/Makefile | 1 + kernel/time/namespace.c | 217 +++++++++++++++++++++++++++++++++++++++++ 12 files changed, 361 insertions(+), 10 deletions(-) create mode 100644 include/linux/time_namespace.h create mode 100644 kernel/time/namespace.c (limited to 'init') diff --git a/MAINTAINERS b/MAINTAINERS index 8982c6e013b3..f6d00023e56f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13214,6 +13214,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core S: Maintained F: fs/timerfd.c F: include/linux/timer* +F: include/linux/time_namespace.h +F: kernel/time_namespace.c F: kernel/time/*timer* POWER MANAGEMENT CORE diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index dd2b35f78b09..8b5c720fe5d7 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -33,6 +33,10 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_CGROUPS &cgroupns_operations, #endif +#ifdef CONFIG_TIME_NS + &timens_operations, + &timens_for_children_operations, +#endif }; static const char *proc_ns_get_link(struct dentry *dentry, diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 2ae1b1a4d84d..074f395b9ad2 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -35,6 +35,8 @@ struct nsproxy { struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns_for_children; struct net *net_ns; + struct time_namespace *time_ns; + struct time_namespace *time_ns_for_children; struct cgroup_namespace *cgroup_ns; }; extern struct nsproxy init_nsproxy; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index d31cb6215905..d312e6281e69 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -32,6 +32,8 @@ extern const struct proc_ns_operations pidns_for_children_operations; extern const struct proc_ns_operations userns_operations; extern const struct proc_ns_operations mntns_operations; extern const struct proc_ns_operations cgroupns_operations; +extern const struct proc_ns_operations timens_operations; +extern const struct proc_ns_operations timens_for_children_operations; /* * We always define these enumerators @@ -43,6 +45,7 @@ enum { PROC_USER_INIT_INO = 0xEFFFFFFDU, PROC_PID_INIT_INO = 0xEFFFFFFCU, PROC_CGROUP_INIT_INO = 0xEFFFFFFBU, + PROC_TIME_INIT_INO = 0xEFFFFFFAU, }; #ifdef CONFIG_PROC_FS diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h new file mode 100644 index 000000000000..8c74cc12ad24 --- /dev/null +++ b/include/linux/time_namespace.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TIMENS_H +#define _LINUX_TIMENS_H + + +#include +#include +#include +#include +#include + +struct user_namespace; +extern struct user_namespace init_user_ns; + +struct time_namespace { + struct kref kref; + struct user_namespace *user_ns; + struct ucounts *ucounts; + struct ns_common ns; +} __randomize_layout; + +extern struct time_namespace init_time_ns; + +#ifdef CONFIG_TIME_NS +static inline struct time_namespace *get_time_ns(struct time_namespace *ns) +{ + kref_get(&ns->kref); + return ns; +} + +struct time_namespace *copy_time_ns(unsigned long flags, + struct user_namespace *user_ns, + struct time_namespace *old_ns); +void free_time_ns(struct kref *kref); +int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); + +static inline void put_time_ns(struct time_namespace *ns) +{ + kref_put(&ns->kref, free_time_ns); +} + +#else +static inline struct time_namespace *get_time_ns(struct time_namespace *ns) +{ + return NULL; +} + +static inline void put_time_ns(struct time_namespace *ns) +{ +} + +static inline +struct time_namespace *copy_time_ns(unsigned long flags, + struct user_namespace *user_ns, + struct time_namespace *old_ns) +{ + if (flags & CLONE_NEWTIME) + return ERR_PTR(-EINVAL); + + return old_ns; +} + +static inline int timens_on_fork(struct nsproxy *nsproxy, + struct task_struct *tsk) +{ + return 0; +} + +#endif + +#endif /* _LINUX_TIMENS_H */ diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index fb9f4f799554..6ef1c7109fc4 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -45,6 +45,7 @@ enum ucount_type { UCOUNT_NET_NAMESPACES, UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, + UCOUNT_TIME_NAMESPACES, #ifdef CONFIG_INOTIFY_USER UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 4a0217832464..2e3bc22c6f20 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -36,6 +36,12 @@ /* Flags for the clone3() syscall. */ #define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ +/* + * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 + * syscalls only: + */ +#define CLONE_NEWTIME 0x00000080 /* New time namespace */ + #ifndef __ASSEMBLY__ /** * struct clone_args - arguments for the clone3 syscall diff --git a/init/Kconfig b/init/Kconfig index a34064a031a5..b34314fc75f7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1080,6 +1080,13 @@ config UTS_NS In this namespace tasks see different info provided with the uname() system call +config TIME_NS + bool "TIME namespace" + default y + help + In this namespace boottime and monotonic clocks can be set. + The time will keep going with the same pace. + config IPC_NS bool "IPC namespace" depends on (SYSVIPC || POSIX_MQUEUE) diff --git a/kernel/fork.c b/kernel/fork.c index 2508a4f238a3..363595815144 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1832,6 +1832,7 @@ static __latent_entropy struct task_struct *copy_process( struct multiprocess_signals delayed; struct file *pidfile = NULL; u64 clone_flags = args->flags; + struct nsproxy *nsp = current->nsproxy; /* * Don't allow sharing the root directory with processes in a different @@ -1874,8 +1875,16 @@ static __latent_entropy struct task_struct *copy_process( */ if (clone_flags & CLONE_THREAD) { if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || - (task_active_pid_ns(current) != - current->nsproxy->pid_ns_for_children)) + (task_active_pid_ns(current) != nsp->pid_ns_for_children)) + return ERR_PTR(-EINVAL); + } + + /* + * If the new process will be in a different time namespace + * do not allow it to share VM or a thread group with the forking task. + */ + if (clone_flags & (CLONE_THREAD | CLONE_VM)) { + if (nsp->time_ns != nsp->time_ns_for_children) return ERR_PTR(-EINVAL); } @@ -2811,7 +2820,8 @@ static int check_unshare_flags(unsigned long unshare_flags) if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| - CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP)) + CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP| + CLONE_NEWTIME)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index c815f58e6bc0..ed9882108cd2 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,10 @@ struct nsproxy init_nsproxy = { #ifdef CONFIG_CGROUPS .cgroup_ns = &init_cgroup_ns, #endif +#ifdef CONFIG_TIME_NS + .time_ns = &init_time_ns, + .time_ns_for_children = &init_time_ns, +#endif }; static inline struct nsproxy *create_nsproxy(void) @@ -106,8 +111,18 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_net; } + new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns, + tsk->nsproxy->time_ns_for_children); + if (IS_ERR(new_nsp->time_ns_for_children)) { + err = PTR_ERR(new_nsp->time_ns_for_children); + goto out_time; + } + new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns); + return new_nsp; +out_time: + put_net(new_nsp->net_ns); out_net: put_cgroup_ns(new_nsp->cgroup_ns); out_cgroup: @@ -136,15 +151,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; + int ret; if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET | - CLONE_NEWCGROUP)))) { - get_nsproxy(old_ns); - return 0; - } - - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + CLONE_NEWCGROUP | CLONE_NEWTIME)))) { + if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) { + get_nsproxy(old_ns); + return 0; + } + } else if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; /* @@ -162,6 +178,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) if (IS_ERR(new_ns)) return PTR_ERR(new_ns); + ret = timens_on_fork(new_ns, tsk); + if (ret) { + free_nsproxy(new_ns); + return ret; + } + tsk->nsproxy = new_ns; return 0; } @@ -176,6 +198,10 @@ void free_nsproxy(struct nsproxy *ns) put_ipc_ns(ns->ipc_ns); if (ns->pid_ns_for_children) put_pid_ns(ns->pid_ns_for_children); + if (ns->time_ns) + put_time_ns(ns->time_ns); + if (ns->time_ns_for_children) + put_time_ns(ns->time_ns_for_children); put_cgroup_ns(ns->cgroup_ns); put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); @@ -192,7 +218,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP))) + CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP | + CLONE_NEWTIME))) return 0; user_ns = new_cred ? new_cred->user_ns : current_user_ns(); diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 1867044800bb..c8f00168afe8 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o +obj-$(CONFIG_TIME_NS) += namespace.o diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c new file mode 100644 index 000000000000..2662a69e0382 --- /dev/null +++ b/kernel/time/namespace.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author: Andrei Vagin + * Author: Dmitry Safonov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct ucounts *inc_time_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); +} + +static void dec_time_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES); +} + +/** + * clone_time_ns - Clone a time namespace + * @user_ns: User namespace which owns a new namespace. + * @old_ns: Namespace to clone + * + * Clone @old_ns and set the clone refcount to 1 + * + * Return: The new namespace or ERR_PTR. + */ +static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, + struct time_namespace *old_ns) +{ + struct time_namespace *ns; + struct ucounts *ucounts; + int err; + + err = -ENOSPC; + ucounts = inc_time_namespaces(user_ns); + if (!ucounts) + goto fail; + + err = -ENOMEM; + ns = kmalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + goto fail_dec; + + kref_init(&ns->kref); + + err = ns_alloc_inum(&ns->ns); + if (err) + goto fail_free; + + ns->ucounts = ucounts; + ns->ns.ops = &timens_operations; + ns->user_ns = get_user_ns(user_ns); + return ns; + +fail_free: + kfree(ns); +fail_dec: + dec_time_namespaces(ucounts); +fail: + return ERR_PTR(err); +} + +/** + * copy_time_ns - Create timens_for_children from @old_ns + * @flags: Cloning flags + * @user_ns: User namespace which owns a new namespace. + * @old_ns: Namespace to clone + * + * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children; + * adds a refcounter to @old_ns otherwise. + * + * Return: timens_for_children namespace or ERR_PTR. + */ +struct time_namespace *copy_time_ns(unsigned long flags, + struct user_namespace *user_ns, struct time_namespace *old_ns) +{ + if (!(flags & CLONE_NEWTIME)) + return get_time_ns(old_ns); + + return clone_time_ns(user_ns, old_ns); +} + +void free_time_ns(struct kref *kref) +{ + struct time_namespace *ns; + + ns = container_of(kref, struct time_namespace, kref); + dec_time_namespaces(ns->ucounts); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} + +static struct time_namespace *to_time_ns(struct ns_common *ns) +{ + return container_of(ns, struct time_namespace, ns); +} + +static struct ns_common *timens_get(struct task_struct *task) +{ + struct time_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->time_ns; + get_time_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static struct ns_common *timens_for_children_get(struct task_struct *task) +{ + struct time_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->time_ns_for_children; + get_time_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static void timens_put(struct ns_common *ns) +{ + put_time_ns(to_time_ns(ns)); +} + +static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) +{ + struct time_namespace *ns = to_time_ns(new); + + if (!current_is_single_threaded()) + return -EUSERS; + + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return -EPERM; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns); + nsproxy->time_ns = ns; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns_for_children); + nsproxy->time_ns_for_children = ns; + return 0; +} + +int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) +{ + struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; + struct time_namespace *ns = to_time_ns(nsc); + + /* create_new_namespaces() already incremented the ref counter */ + if (nsproxy->time_ns == nsproxy->time_ns_for_children) + return 0; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns); + nsproxy->time_ns = ns; + + return 0; +} + +static struct user_namespace *timens_owner(struct ns_common *ns) +{ + return to_time_ns(ns)->user_ns; +} + +const struct proc_ns_operations timens_operations = { + .name = "time", + .type = CLONE_NEWTIME, + .get = timens_get, + .put = timens_put, + .install = timens_install, + .owner = timens_owner, +}; + +const struct proc_ns_operations timens_for_children_operations = { + .name = "time_for_children", + .type = CLONE_NEWTIME, + .get = timens_for_children_get, + .put = timens_put, + .install = timens_install, + .owner = timens_owner, +}; + +struct time_namespace init_time_ns = { + .kref = KREF_INIT(3), + .user_ns = &init_user_ns, + .ns.inum = PROC_TIME_INIT_INO, + .ns.ops = &timens_operations, +}; + +static int __init time_ns_init(void) +{ + return 0; +} +subsys_initcall(time_ns_init); -- cgit v1.2.3 From 660fd04f9317172ae90f414c68b18a26ae88a829 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Nov 2019 01:27:09 +0000 Subject: lib/vdso: Prepare for time namespace support To support time namespaces in the vdso with a minimal impact on regular non time namespace affected tasks, the namespace handling needs to be hidden in a slow path. The most obvious place is vdso_seq_begin(). If a task belongs to a time namespace then the VVAR page which contains the system wide vdso data is replaced with a namespace specific page which has the same layout as the VVAR page. That page has vdso_data->seq set to 1 to enforce the slow path and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce the time namespace handling path. The extra check in the case that vdso_data->seq is odd, e.g. a concurrent update of the vdso data is in progress, is not really affecting regular tasks which are not part of a time namespace as the task is spin waiting for the update to finish and vdso_data->seq to become even again. If a time namespace task hits that code path, it invokes the corresponding time getter function which retrieves the real VVAR page, reads host time and then adds the offset for the requested clock which is stored in the special VVAR page. If VDSO time namespace support is disabled the whole magic is compiled out. Initial testing shows that the disabled case is almost identical to the host case which does not take the slow timens path. With the special timens page installed the performance hit is constant time and in the range of 5-7%. For the vdso functions which are not using the sequence count an unconditional check for vdso_data->clock_mode is added which switches to the real vdso when the clock_mode is VCLOCK_TIMENS. [avagin: Make do_hres_timens() work with raw clocks too: choose vdso_data pointer by CS_RAW offset.] Suggested-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-21-dima@arista.com --- include/linux/time.h | 6 ++ include/vdso/datapage.h | 19 ++++++- init/Kconfig | 1 + lib/vdso/Kconfig | 6 ++ lib/vdso/gettimeofday.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 169 insertions(+), 5 deletions(-) (limited to 'init') diff --git a/include/linux/time.h b/include/linux/time.h index 8e10b9dbd8c2..8ef5e5cc9f57 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -110,4 +110,10 @@ static inline bool itimerspec64_valid(const struct itimerspec64 *its) * Equivalent to !(time_before32(@t, @l) || time_after32(@t, @h)). */ #define time_between32(t, l, h) ((u32)(h) - (u32)(l) >= (u32)(t) - (u32)(l)) + +struct timens_offset { + s64 sec; + u64 nsec; +}; + #endif diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 2e302c0f41f7..c5f347cc5e55 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -21,6 +21,8 @@ #define CS_RAW 1 #define CS_BASES (CS_RAW + 1) +#define VCLOCK_TIMENS UINT_MAX + /** * struct vdso_timestamp - basetime per clock_id * @sec: seconds @@ -48,6 +50,7 @@ struct vdso_timestamp { * @mult: clocksource multiplier * @shift: clocksource shift * @basetime[clock_id]: basetime per clock_id + * @offset[clock_id]: time namespace offset per clock_id * @tz_minuteswest: minutes west of Greenwich * @tz_dsttime: type of DST correction * @hrtimer_res: hrtimer resolution @@ -55,6 +58,17 @@ struct vdso_timestamp { * * vdso_data will be accessed by 64 bit and compat code at the same time * so we should be careful before modifying this structure. + * + * @basetime is used to store the base time for the system wide time getter + * VVAR page. + * + * @offset is used by the special time namespace VVAR pages which are + * installed instead of the real VVAR page. These namespace pages must set + * @seq to 1 and @clock_mode to VLOCK_TIMENS to force the code into the + * time namespace slow path. The namespace aware functions retrieve the + * real system wide VVAR page, read host time and add the per clock offset. + * For clocks which are not affected by time namespace adjustment the + * offset must be zero. */ struct vdso_data { u32 seq; @@ -65,7 +79,10 @@ struct vdso_data { u32 mult; u32 shift; - struct vdso_timestamp basetime[VDSO_BASES]; + union { + struct vdso_timestamp basetime[VDSO_BASES]; + struct timens_offset offset[VDSO_BASES]; + }; s32 tz_minuteswest; s32 tz_dsttime; diff --git a/init/Kconfig b/init/Kconfig index b34314fc75f7..9b7f144a6d35 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1082,6 +1082,7 @@ config UTS_NS config TIME_NS bool "TIME namespace" + depends on GENERIC_VDSO_TIME_NS default y help In this namespace boottime and monotonic clocks can be set. diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig index 9fe698ff62ec..d883ac299508 100644 --- a/lib/vdso/Kconfig +++ b/lib/vdso/Kconfig @@ -24,4 +24,10 @@ config GENERIC_COMPAT_VDSO help This config option enables the compat VDSO layer. +config GENERIC_VDSO_TIME_NS + bool + help + Selected by architectures which support time namespaces in the + VDSO + endif diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index b453d2469b63..f342ac1fce77 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -38,15 +38,89 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) } #endif +#ifdef CONFIG_TIME_NS +static int do_hres_timens(const struct vdso_data *vdns, clockid_t clk, + struct __kernel_timespec *ts) +{ + const struct vdso_data *vd = __arch_get_timens_vdso_data(); + const struct timens_offset *offs = &vdns->offset[clk]; + const struct vdso_timestamp *vdso_ts; + u64 cycles, last, ns; + u32 seq; + s64 sec; + + if (clk != CLOCK_MONOTONIC_RAW) + vd = &vd[CS_HRES_COARSE]; + else + vd = &vd[CS_RAW]; + vdso_ts = &vd->basetime[clk]; + + do { + seq = vdso_read_begin(vd); + cycles = __arch_get_hw_counter(vd->clock_mode); + ns = vdso_ts->nsec; + last = vd->cycle_last; + if (unlikely((s64)cycles < 0)) + return -1; + + ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult); + ns >>= vd->shift; + sec = vdso_ts->sec; + } while (unlikely(vdso_read_retry(vd, seq))); + + /* Add the namespace offset */ + sec += offs->sec; + ns += offs->nsec; + + /* + * Do this outside the loop: a race inside the loop could result + * in __iter_div_u64_rem() being extremely slow. + */ + ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return 0; +} +#else +static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(void) +{ + return NULL; +} + +static int do_hres_timens(const struct vdso_data *vdns, clockid_t clk, + struct __kernel_timespec *ts) +{ + return -EINVAL; +} +#endif + static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, - struct __kernel_timespec *ts) + struct __kernel_timespec *ts) { const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; u64 cycles, last, sec, ns; u32 seq; do { - seq = vdso_read_begin(vd); + /* + * Open coded to handle VCLOCK_TIMENS. Time namespace + * enabled tasks have a special VVAR page installed which + * has vd->seq set to 1 and vd->clock_mode set to + * VCLOCK_TIMENS. For non time namespace affected tasks + * this does not affect performance because if vd->seq is + * odd, i.e. a concurrent update is in progress the extra + * check for vd->clock_mode is just a few extra + * instructions while spin waiting for vd->seq to become + * even again. + */ + while (unlikely((seq = READ_ONCE(vd->seq)) & 1)) { + if (IS_ENABLED(CONFIG_TIME_NS) && + vd->clock_mode == VCLOCK_TIMENS) + return do_hres_timens(vd, clk, ts); + cpu_relax(); + } + smp_rmb(); + cycles = __arch_get_hw_counter(vd->clock_mode); ns = vdso_ts->nsec; last = vd->cycle_last; @@ -68,6 +142,43 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, return 0; } +#ifdef CONFIG_TIME_NS +static int do_coarse_timens(const struct vdso_data *vdns, clockid_t clk, + struct __kernel_timespec *ts) +{ + const struct vdso_data *vd = __arch_get_timens_vdso_data(); + const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; + const struct timens_offset *offs = &vdns->offset[clk]; + u64 nsec; + s64 sec; + s32 seq; + + do { + seq = vdso_read_begin(vd); + sec = vdso_ts->sec; + nsec = vdso_ts->nsec; + } while (unlikely(vdso_read_retry(vd, seq))); + + /* Add the namespace offset */ + sec += offs->sec; + nsec += offs->nsec; + + /* + * Do this outside the loop: a race inside the loop could result + * in __iter_div_u64_rem() being extremely slow. + */ + ts->tv_sec = sec + __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec); + ts->tv_nsec = nsec; + return 0; +} +#else +static int do_coarse_timens(const struct vdso_data *vdns, clockid_t clk, + struct __kernel_timespec *ts) +{ + return -1; +} +#endif + static __always_inline int do_coarse(const struct vdso_data *vd, clockid_t clk, struct __kernel_timespec *ts) { @@ -75,7 +186,18 @@ static __always_inline int do_coarse(const struct vdso_data *vd, clockid_t clk, u32 seq; do { - seq = vdso_read_begin(vd); + /* + * Open coded to handle VCLOCK_TIMENS. See comment in + * do_hres(). + */ + while ((seq = READ_ONCE(vd->seq)) & 1) { + if (IS_ENABLED(CONFIG_TIME_NS) && + vd->clock_mode == VCLOCK_TIMENS) + return do_coarse_timens(vd, clk, ts); + cpu_relax(); + } + smp_rmb(); + ts->tv_sec = vdso_ts->sec; ts->tv_nsec = vdso_ts->nsec; } while (unlikely(vdso_read_retry(vd, seq))); @@ -156,6 +278,10 @@ __cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) } if (unlikely(tz != NULL)) { + if (IS_ENABLED(CONFIG_TIME_NS) && + vd->clock_mode == VCLOCK_TIMENS) + vd = __arch_get_timens_vdso_data(); + tz->tz_minuteswest = vd[CS_HRES_COARSE].tz_minuteswest; tz->tz_dsttime = vd[CS_HRES_COARSE].tz_dsttime; } @@ -167,7 +293,12 @@ __cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) static __maybe_unused __kernel_old_time_t __cvdso_time(__kernel_old_time_t *time) { const struct vdso_data *vd = __arch_get_vdso_data(); - __kernel_old_time_t t = READ_ONCE(vd[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec); + __kernel_old_time_t t; + + if (IS_ENABLED(CONFIG_TIME_NS) && vd->clock_mode == VCLOCK_TIMENS) + vd = __arch_get_timens_vdso_data(); + + t = READ_ONCE(vd[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec); if (time) *time = t; @@ -189,6 +320,9 @@ int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res) if (unlikely((u32) clock >= MAX_CLOCKS)) return -1; + if (IS_ENABLED(CONFIG_TIME_NS) && vd->clock_mode == VCLOCK_TIMENS) + vd = __arch_get_timens_vdso_data(); + hrtimer_res = READ_ONCE(vd[CS_HRES_COARSE].hrtimer_res); /* * Convert the clockid to a bitmask and use it to check which -- cgit v1.2.3 From 87c9366e17259040a9118e06b6dc8de986e5d3d1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 4 Dec 2019 17:43:46 +0100 Subject: Revert "um: Enable CONFIG_CONSTRUCTORS" This reverts commit 786b2384bf1c ("um: Enable CONFIG_CONSTRUCTORS"). There are two issues with this commit, uncovered by Anton in tests on some (Debian) systems: 1) I completely forgot to call any constructors if CONFIG_CONSTRUCTORS isn't set. Don't recall now if it just wasn't needed on my system, or if I never tested this case. 2) With that fixed, it works - with CONFIG_CONSTRUCTORS *unset*. If I set CONFIG_CONSTRUCTORS, it fails again, which isn't totally unexpected since whatever wanted to run is likely to have to run before the kernel init etc. that calls the constructors in this case. Basically, some constructors that gcc emits (libc has?) need to run very early during init; the failure mode otherwise was that the ptrace fork test already failed: ---------------------- $ ./linux mem=512M Core dump limits : soft - 0 hard - NONE Checking that ptrace can change system call numbers...check_ptrace : child exited with exitcode 6, while expecting 0; status 0x67f Aborted ---------------------- Thinking more about this, it's clear that we simply cannot support CONFIG_CONSTRUCTORS in UML. All the cases we need now (gcov, kasan) involve not use of the __attribute__((constructor)), but instead some constructor code/entry generated by gcc. Therefore, we cannot distinguish between kernel constructors and system constructors. Thus, revert this commit. Cc: stable@vger.kernel.org [5.4+] Fixes: 786b2384bf1c ("um: Enable CONFIG_CONSTRUCTORS") Reported-by: Anton Ivanov Signed-off-by: Johannes Berg Acked-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/include/asm/common.lds.S | 2 +- arch/um/kernel/dyn.lds.S | 1 + init/Kconfig | 1 + kernel/gcov/Kconfig | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) (limited to 'init') diff --git a/arch/um/include/asm/common.lds.S b/arch/um/include/asm/common.lds.S index 7145ce699982..eca6c452a41b 100644 --- a/arch/um/include/asm/common.lds.S +++ b/arch/um/include/asm/common.lds.S @@ -82,8 +82,8 @@ __preinit_array_end = .; } .init_array : { - /* dummy - we call this ourselves */ __init_array_start = .; + *(.init_array) __init_array_end = .; } .fini_array : { diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index c69d69ee96be..f5001481010c 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -103,6 +103,7 @@ SECTIONS be empty, which isn't pretty. */ . = ALIGN(32 / 8); .preinit_array : { *(.preinit_array) } + .init_array : { *(.init_array) } .fini_array : { *(.fini_array) } .data : { INIT_TASK_DATA(KERNEL_STACK_SIZE) diff --git a/init/Kconfig b/init/Kconfig index a34064a031a5..47d40f399000 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -54,6 +54,7 @@ config CC_DISABLE_WARN_MAYBE_UNINITIALIZED config CONSTRUCTORS bool + depends on !UML config IRQ_WORK bool diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 060e8e726755..3941a9c48f83 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,7 +4,7 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS - select CONSTRUCTORS + select CONSTRUCTORS if !UML default n ---help--- This option enables gcov-based code profiling (e.g. for code coverage -- cgit v1.2.3 From 0947db01d9b9fd77a627e501fe563bdaf4c724a1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 20 Jan 2020 12:23:00 +0900 Subject: bootconfig: Fix Kconfig help message for BOOT_CONFIG Fix Kconfig help message since the bootconfig file is only available to be appended to initramfs. And also add a reference to the documentation. Link: http://lkml.kernel.org/r/157949058031.25888.18399447161895787505.stgit@devnote2 Reported-by: Randy Dunlap Acked-by: Randy Dunlap Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- init/Kconfig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'init') diff --git a/init/Kconfig b/init/Kconfig index ffd240fb88c3..9506299a53e3 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1223,7 +1223,9 @@ config BOOT_CONFIG help Extra boot config allows system admin to pass a config file as complemental extension of kernel cmdline when booting. - The boot config file is usually attached at the end of initramfs. + The boot config file must be attached at the end of initramfs + with checksum and size. + See for details. If unsure, say Y. -- cgit v1.2.3 From b88c50ac304a14f14e70c4ad22577b6b84632d5e Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Thu, 30 Jan 2020 22:17:13 -0800 Subject: init/main.c: log arguments and environment passed to init Extend logging in `run_init_process` to also show the arguments and environment that we are passing to init. Link: http://lkml.kernel.org/r/20191212180023.24339-2-nivedita@alum.mit.edu Signed-off-by: Arvind Sankar Cc: Chris Metcalf Cc: Krzysztof Mazur Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'init') diff --git a/init/main.c b/init/main.c index db13a76c036e..424d88fef08e 100644 --- a/init/main.c +++ b/init/main.c @@ -1043,8 +1043,16 @@ static void __init do_pre_smp_initcalls(void) static int run_init_process(const char *init_filename) { + const char *const *p; + argv_init[0] = init_filename; pr_info("Run %s as init process\n", init_filename); + pr_debug(" with arguments:\n"); + for (p = argv_init; *p; p++) + pr_debug(" %s\n", *p); + pr_debug(" with environment:\n"); + for (p = envp_init; *p; p++) + pr_debug(" %s\n", *p); return do_execve(getname_kernel(init_filename), (const char __user *const __user *)argv_init, (const char __user *const __user *)envp_init); -- cgit v1.2.3 From 7e2762e1d9f4b9cb67ed6c3b7aff2020c367d4ac Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Thu, 30 Jan 2020 22:17:16 -0800 Subject: init/main.c: remove unnecessary repair_env_string in do_initcall_level Since commit 08746a65c296 ("init: fix in-place parameter modification regression"), parse_args in do_initcall_level is called on a copy of saved_command_line. It is unnecessary to call repair_env_string during this parsing, as this copy is not used for anything later. Remove the now unnecessary arguments from repair_env_string as well. Link: http://lkml.kernel.org/r/20191212180023.24339-3-nivedita@alum.mit.edu Signed-off-by: Arvind Sankar Cc: Krzysztof Mazur Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'init') diff --git a/init/main.c b/init/main.c index 424d88fef08e..51fced4ca87a 100644 --- a/init/main.c +++ b/init/main.c @@ -246,8 +246,7 @@ static int __init loglevel(char *str) early_param("loglevel", loglevel); /* Change NUL term back to "=", to make "param" the whole string. */ -static int __init repair_env_string(char *param, char *val, - const char *unused, void *arg) +static void __init repair_env_string(char *param, char *val) { if (val) { /* param=val or param="val"? */ @@ -260,7 +259,6 @@ static int __init repair_env_string(char *param, char *val, } else BUG(); } - return 0; } /* Anything after -- gets handed straight to init. */ @@ -272,7 +270,7 @@ static int __init set_init_arg(char *param, char *val, if (panic_later) return 0; - repair_env_string(param, val, unused, NULL); + repair_env_string(param, val); for (i = 0; argv_init[i]; i++) { if (i == MAX_INIT_ARGS) { @@ -292,7 +290,7 @@ static int __init set_init_arg(char *param, char *val, static int __init unknown_bootoption(char *param, char *val, const char *unused, void *arg) { - repair_env_string(param, val, unused, NULL); + repair_env_string(param, val); /* Handle obsolete-style parameters */ if (obsolete_checksetup(param)) @@ -991,6 +989,12 @@ static const char *initcall_level_names[] __initdata = { "late", }; +static int __init ignore_unknown_bootoption(char *param, char *val, + const char *unused, void *arg) +{ + return 0; +} + static void __init do_initcall_level(int level) { initcall_entry_t *fn; @@ -1000,7 +1004,7 @@ static void __init do_initcall_level(int level) initcall_command_line, __start___param, __stop___param - __start___param, level, level, - NULL, &repair_env_string); + NULL, ignore_unknown_bootoption); trace_initcall_level(initcall_level_names[level]); for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++) -- cgit v1.2.3 From 283900e82847e10898650c66a249c10b213934fb Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Thu, 30 Jan 2020 22:17:19 -0800 Subject: init/main.c: fix quoted value handling in unknown_bootoption Patch series "init/main.c: minor cleanup/bugfix of envvar handling", v2. unknown_bootoption passes unrecognized command line arguments to init as either environment variables or arguments. Some of the logic in the function is broken for quoted command line arguments. When an argument of the form param="value" is processed by parse_args and passed to unknown_bootoption, the command line has param\0"value\0 with val pointing to the beginning of value. The helper function repair_env_string is then used to restore the '=' character that was removed by parse_args, and strip the quotes off fully. This results in param=value\0\0 and val ends up pointing to the 'a' instead of the 'v' in value. This bug was introduced when repair_env_string was refactored into a separate function, and the decrement of val in repair_env_string became dead code. This causes two problems in unknown_bootoption in the two places where the val pointer is used as a substitute for the length of param: 1. An argument of the form param=".value" is misinterpreted as a potential module parameter, with the result that it will not be placed in init's environment. 2. An argument of the form param="value" is checked to see if param is an existing environment variable that should be overwritten, but the comparison is off-by-one and compares 'param=v' instead of 'param=' against the existing environment. So passing, for example, TERM="vt100" on the command line results in init being passed both TERM=linux and TERM=vt100 in its environment. Patch 1 adds logging for the arguments and environment passed to init and is independent of the rest: it can be dropped if this is unnecessarily verbose. Patch 2 removes repair_env_string from initcall parameter parsing in do_initcall_level, as that uses a separate copy of the command line now and the repairing is no longer necessary. Patch 3 fixes the bug in unknown_bootoption by recording the length of param explicitly instead of implying it from val-param. This patch (of 3): Commit a99cd1125189 ("init: fix bug where environment vars can't be passed via boot args") introduced two minor bugs in unknown_bootoption by factoring out the quoted value handling into a separate function. When value is quoted, repair_env_string will move the value up 1 byte to strip the quotes, so val in unknown_bootoption no longer points to the actual location of the value. The result is that an argument of the form param=".value" is mistakenly treated as a potential module parameter and is not placed in init's environment, and an argument of the form param="value" can result in a duplicate environment variable: eg TERM="vt100" on the command line will result in both TERM=linux and TERM=vt100 being placed into init's environment. Fix this by recording the length of the param before calling repair_env_string instead of relying on val. Link: http://lkml.kernel.org/r/20191212180023.24339-4-nivedita@alum.mit.edu Signed-off-by: Arvind Sankar Cc: Chris Metcalf Cc: Krzysztof Mazur Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'init') diff --git a/init/main.c b/init/main.c index 51fced4ca87a..75eded9aafb0 100644 --- a/init/main.c +++ b/init/main.c @@ -255,7 +255,6 @@ static void __init repair_env_string(char *param, char *val) else if (val == param+strlen(param)+2) { val[-2] = '='; memmove(val-1, val, strlen(val)+1); - val--; } else BUG(); } @@ -290,6 +289,8 @@ static int __init set_init_arg(char *param, char *val, static int __init unknown_bootoption(char *param, char *val, const char *unused, void *arg) { + size_t len = strlen(param); + repair_env_string(param, val); /* Handle obsolete-style parameters */ @@ -297,7 +298,7 @@ static int __init unknown_bootoption(char *param, char *val, return 0; /* Unused module parameter. */ - if (strchr(param, '.') && (!val || strchr(param, '.') < val)) + if (strnchr(param, len, '.')) return 0; if (panic_later) @@ -311,7 +312,7 @@ static int __init unknown_bootoption(char *param, char *val, panic_later = "env"; panic_param = param; } - if (!strncmp(param, envp_init[i], val - param)) + if (!strncmp(param, envp_init[i], len+1)) break; } envp_init[i] = param; -- cgit v1.2.3 From f596ded1a044517afc4e9325f0dea449858450ac Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 30 Jan 2020 22:17:23 -0800 Subject: init/main.c: fix misleading "This architecture does not have kernel memory protection" message This message leads to thinking that memory protection is not implemented for the said architecture, whereas absence of CONFIG_STRICT_KERNEL_RWX only means that memory protection has not been selected at compile time. Don't print this message when CONFIG_ARCH_HAS_STRICT_KERNEL_RWX is selected by the architecture. Instead, print "Kernel memory protection not selected by kernel config." Link: http://lkml.kernel.org/r/62477e446d9685459d4f27d193af6ff1bd69d55f.1578557581.git.christophe.leroy@c-s.fr Signed-off-by: Christophe Leroy Acked-by: Kees Cook Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'init') diff --git a/init/main.c b/init/main.c index 75eded9aafb0..d8c7e86c2d28 100644 --- a/init/main.c +++ b/init/main.c @@ -1104,6 +1104,11 @@ static void mark_readonly(void) } else pr_info("Kernel memory protection disabled.\n"); } +#elif defined(CONFIG_ARCH_HAS_STRICT_KERNEL_RWX) +static inline void mark_readonly(void) +{ + pr_warn("Kernel memory protection not selected by kernel config.\n"); +} #else static inline void mark_readonly(void) { -- cgit v1.2.3 From 7495e0926fdf302cb9e62a49f7c22198815624cd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 4 Feb 2020 07:33:53 -0500 Subject: bootconfig: Only load bootconfig if "bootconfig" is on the kernel cmdline As the bootconfig is appended to the initrd it is not as easy to modify as the kernel command line. If there's some issue with the kernel, and the developer wants to boot a pristine kernel, it should not be needed to modify the initrd to remove the bootconfig for a single boot. As bootconfig is silently added (if the admin does not know where to look they may not know it's being loaded). It should be explicitly added to the kernel cmdline. The loading of the bootconfig is only done if "bootconfig" is on the kernel command line. This will let admins know that the kernel command line is extended. Note, after adding printk()s for when the size is too great or the checksum is wrong, exposed that the current method always looked for the boot config, and if this size and checksum matched, it would parse it (as if either is wrong a printk has been added to show this). It's better to only check this if the boot config is asked to be looked for. Link: https://lore.kernel.org/r/CAHk-=wjfjO+h6bQzrTf=YCZA53Y3EDyAs3Z4gEsT7icA3u_Psw@mail.gmail.com Acked-by: Masami Hiramatsu Suggested-by: Linus Torvalds Signed-off-by: Steven Rostedt (VMware) --- Documentation/admin-guide/bootconfig.rst | 2 ++ Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ init/main.c | 28 ++++++++++++++++++------- 3 files changed, 29 insertions(+), 7 deletions(-) (limited to 'init') diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst index 4d617693c0c8..b342a6796392 100644 --- a/Documentation/admin-guide/bootconfig.rst +++ b/Documentation/admin-guide/bootconfig.rst @@ -123,6 +123,8 @@ To remove the config from the image, you can use -d option as below:: # tools/bootconfig/bootconfig -d /boot/initrd.img-X.Y.Z +Then add "bootconfig" on the normal kernel command line to tell the +kernel to look for the bootconfig at the end of the initrd file. Config File Limitation ====================== diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index ade4e6ec23e0..b48c70ba9841 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -437,6 +437,12 @@ no delay (0). Format: integer + bootconfig [KNL] + Extended command line options can be added to an initrd + and this will cause the kernel to look for it. + + See Documentation/admin-guide/bootconfig.rst + bert_disable [ACPI] Disable BERT OS support on buggy BIOSes. diff --git a/init/main.c b/init/main.c index dd7da62d99a5..f174a59d3903 100644 --- a/init/main.c +++ b/init/main.c @@ -336,28 +336,39 @@ u32 boot_config_checksum(unsigned char *p, u32 size) return ret; } -static void __init setup_boot_config(void) +static void __init setup_boot_config(const char *cmdline) { u32 size, csum; char *data, *copy; + const char *p; u32 *hdr; - if (!initrd_end) + p = strstr(cmdline, "bootconfig"); + if (!p || (p != cmdline && !isspace(*(p-1))) || + (p[10] && !isspace(p[10]))) return; + if (!initrd_end) + goto not_found; + hdr = (u32 *)(initrd_end - 8); size = hdr[0]; csum = hdr[1]; - if (size >= XBC_DATA_MAX) + if (size >= XBC_DATA_MAX) { + pr_err("bootconfig size %d greater than max size %d\n", + size, XBC_DATA_MAX); return; + } data = ((void *)hdr) - size; if ((unsigned long)data < initrd_start) - return; + goto not_found; - if (boot_config_checksum((unsigned char *)data, size) != csum) + if (boot_config_checksum((unsigned char *)data, size) != csum) { + pr_err("bootconfig checksum failed\n"); return; + } copy = memblock_alloc(size + 1, SMP_CACHE_BYTES); if (!copy) { @@ -377,9 +388,12 @@ static void __init setup_boot_config(void) /* Also, "init." keys are init arguments */ extra_init_args = xbc_make_cmdline("init"); } + return; +not_found: + pr_err("'bootconfig' found on command line, but no bootconfig found\n"); } #else -#define setup_boot_config() do { } while (0) +#define setup_boot_config(cmdline) do { } while (0) #endif /* Change NUL term back to "=", to make "param" the whole string. */ @@ -760,7 +774,7 @@ asmlinkage __visible void __init start_kernel(void) pr_notice("%s", linux_banner); early_security_init(); setup_arch(&command_line); - setup_boot_config(); + setup_boot_config(command_line); setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); -- cgit v1.2.3 From e241d14a82910ffa9060d81864760c93b7256195 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 5 Feb 2020 22:49:54 +0900 Subject: bootconfig: Use bootconfig instead of boot config Use "bootconfig" (1 word) instead of "boot config" (2 words) in the boot message. Link: http://lkml.kernel.org/r/158091059459.27924.14414336187441539879.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- init/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'init') diff --git a/init/main.c b/init/main.c index f174a59d3903..2de2f9f7aab9 100644 --- a/init/main.c +++ b/init/main.c @@ -372,7 +372,7 @@ static void __init setup_boot_config(const char *cmdline) copy = memblock_alloc(size + 1, SMP_CACHE_BYTES); if (!copy) { - pr_err("Failed to allocate memory for boot config\n"); + pr_err("Failed to allocate memory for bootconfig\n"); return; } @@ -380,9 +380,9 @@ static void __init setup_boot_config(const char *cmdline) copy[size] = '\0'; if (xbc_init(copy) < 0) - pr_err("Failed to parse boot config\n"); + pr_err("Failed to parse bootconfig\n"); else { - pr_info("Load boot config: %d bytes\n", size); + pr_info("Load bootconfig: %d bytes\n", size); /* keys starting with "kernel." are passed via cmdline */ extra_command_line = xbc_make_cmdline("kernel"); /* Also, "init." keys are init arguments */ -- cgit v1.2.3 From a00574036c261421721fa770ccd21a1012e1fbbd Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 5 Feb 2020 22:50:23 +0900 Subject: bootconfig: Show the number of nodes on boot message Show the number of bootconfig nodes on boot message. Link: http://lkml.kernel.org/r/158091062297.27924.9051634676068550285.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- init/main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'init') diff --git a/init/main.c b/init/main.c index 2de2f9f7aab9..491f1cdb3105 100644 --- a/init/main.c +++ b/init/main.c @@ -342,6 +342,7 @@ static void __init setup_boot_config(const char *cmdline) char *data, *copy; const char *p; u32 *hdr; + int ret; p = strstr(cmdline, "bootconfig"); if (!p || (p != cmdline && !isspace(*(p-1))) || @@ -379,10 +380,11 @@ static void __init setup_boot_config(const char *cmdline) memcpy(copy, data, size); copy[size] = '\0'; - if (xbc_init(copy) < 0) + ret = xbc_init(copy); + if (ret < 0) pr_err("Failed to parse bootconfig\n"); else { - pr_info("Load bootconfig: %d bytes\n", size); + pr_info("Load bootconfig: %d bytes %d nodes\n", size, ret); /* keys starting with "kernel." are passed via cmdline */ extra_command_line = xbc_make_cmdline("kernel"); /* Also, "init." keys are init arguments */ -- cgit v1.2.3 From 26445f98ead38b90423a2deffce2caa3b97a3d78 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 6 Feb 2020 20:14:53 +0900 Subject: bootconfig: Remove unneeded CONFIG_LIBXBC Since there is no user except CONFIG_BOOT_CONFIG and no plan to use it from other functions, CONFIG_LIBXBC can be removed and we can use CONFIG_BOOT_CONFIG directly. Link: http://lkml.kernel.org/r/158098769281.939.16293492056419481105.stgit@devnote2 Suggested-by: Geert Uytterhoeven Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- init/Kconfig | 1 - lib/Kconfig | 3 --- lib/Makefile | 2 +- 3 files changed, 1 insertion(+), 5 deletions(-) (limited to 'init') diff --git a/init/Kconfig b/init/Kconfig index 9506299a53e3..4a672c6629d0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1218,7 +1218,6 @@ endif config BOOT_CONFIG bool "Boot config support" depends on BLK_DEV_INITRD - select LIBXBC default y help Extra boot config allows system admin to pass a config file as diff --git a/lib/Kconfig b/lib/Kconfig index 10012b646009..6e790dc55c5b 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -566,9 +566,6 @@ config DIMLIB config LIBFDT bool -config LIBXBC - bool - config OID_REGISTRY tristate help diff --git a/lib/Makefile b/lib/Makefile index 75a64d2552a2..74c1223828c1 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -228,7 +228,7 @@ $(foreach file, $(libfdt_files), \ $(eval CFLAGS_$(file) = -I $(srctree)/scripts/dtc/libfdt)) lib-$(CONFIG_LIBFDT) += $(libfdt_files) -lib-$(CONFIG_LIBXBC) += bootconfig.o +lib-$(CONFIG_BOOT_CONFIG) += bootconfig.o obj-$(CONFIG_RBTREE_TEST) += rbtree_test.o obj-$(CONFIG_INTERVAL_TREE_TEST) += interval_tree_test.o -- cgit v1.2.3 From f61872bb58a1cd8f0422aab1940eeee8be579d38 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 7 Feb 2020 19:07:37 -0500 Subject: bootconfig: Use parse_args() to find bootconfig and '--' The current implementation does a naive search of "bootconfig" on the kernel command line. But this could find "bootconfig" that is part of another option in quotes (although highly unlikely). But it also needs to find '--' on the kernel command line to know if it should append a '--' or not when a bootconfig in the initrd file has an "init" section. The check uses the naive strstr() to find to see if it exists. But this can return a false positive if it exists in an option and then the "init" section in the initrd will not be appended properly. Using parse_args() to find both of these will solve both of these problems. Link: https://lore.kernel.org/r/202002070954.C18E7F58B@keescook Fixes: 7495e0926fdf3 ("bootconfig: Only load bootconfig if "bootconfig" is on the kernel cmdline") Fixes: 1319916209ce8 ("bootconfig: init: Allow admin to use bootconfig for init command line") Reported-by: Kees Cook Reviewed-by: Kees Cook Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- init/main.c | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) (limited to 'init') diff --git a/init/main.c b/init/main.c index 491f1cdb3105..59248717c925 100644 --- a/init/main.c +++ b/init/main.c @@ -142,6 +142,15 @@ static char *extra_command_line; /* Extra init arguments */ static char *extra_init_args; +#ifdef CONFIG_BOOT_CONFIG +/* Is bootconfig on command line? */ +static bool bootconfig_found; +static bool initargs_found; +#else +# define bootconfig_found false +# define initargs_found false +#endif + static char *execute_command; static char *ramdisk_execute_command; @@ -336,17 +345,30 @@ u32 boot_config_checksum(unsigned char *p, u32 size) return ret; } +static int __init bootconfig_params(char *param, char *val, + const char *unused, void *arg) +{ + if (strcmp(param, "bootconfig") == 0) { + bootconfig_found = true; + } else if (strcmp(param, "--") == 0) { + initargs_found = true; + } + return 0; +} + static void __init setup_boot_config(const char *cmdline) { + static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; u32 size, csum; char *data, *copy; - const char *p; u32 *hdr; int ret; - p = strstr(cmdline, "bootconfig"); - if (!p || (p != cmdline && !isspace(*(p-1))) || - (p[10] && !isspace(p[10]))) + strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); + parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL, + bootconfig_params); + + if (!bootconfig_found) return; if (!initrd_end) @@ -563,11 +585,12 @@ static void __init setup_command_line(char *command_line) * to init. */ len = strlen(saved_command_line); - if (!strstr(boot_command_line, " -- ")) { + if (initargs_found) { + saved_command_line[len++] = ' '; + } else { strcpy(saved_command_line + len, " -- "); len += 4; - } else - saved_command_line[len++] = ' '; + } strcpy(saved_command_line + len, extra_init_args); } -- cgit v1.2.3