Diffstat (limited to 'arch/x86')
305 files changed, 7185 insertions, 4632 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 67745ceab0db..3604074a878b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -69,6 +69,7 @@ config X86 select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_CACHE_LINE_SIZE + select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE @@ -81,6 +82,7 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_PTE_DEVMAP if X86_64 @@ -157,7 +159,7 @@ config X86 select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY select GENERIC_VDSO_TIME_NS - select GUP_GET_PTE_LOW_HIGH if X86_PAE + select GUP_GET_PXX_LOW_HIGH if X86_PAE select HARDIRQS_SW_RESEND select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 select HAVE_ACPI_APEI if ACPI @@ -195,6 +197,7 @@ config X86 select HAVE_CONTEXT_TRACKING_USER_OFFSTACK if HAVE_CONTEXT_TRACKING_USER select HAVE_C_RECORDMCOUNT select HAVE_OBJTOOL_MCOUNT if HAVE_OBJTOOL + select HAVE_OBJTOOL_NOP_MCOUNT if HAVE_OBJTOOL_MCOUNT select HAVE_BUILDTIME_MCOUNT_SORT select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS @@ -290,6 +293,8 @@ config X86 select X86_FEATURE_NAMES if PROC_FS select PROC_PID_ARCH_STATUS if PROC_FS select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX + select FUNCTION_ALIGNMENT_16B if X86_64 || X86_ALIGNMENT_16 + select FUNCTION_ALIGNMENT_4B imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE @@ -357,11 +362,6 @@ config ARCH_HAS_CPU_RELAX config ARCH_HIBERNATION_POSSIBLE def_bool y -config ARCH_NR_GPIO - int - default 1024 if X86_64 - default 512 - config ARCH_SUSPEND_POSSIBLE def_bool y @@ -462,8 +462,8 @@ config X86_X2APIC Some Intel systems circa 2022 and later are locked into x2APIC mode and can not fall back to the legacy APIC modes if SGX or TDX are - enabled in the BIOS. They will be unable to boot without enabling - this option. + enabled in the BIOS. They will boot with very reduced functionality + without enabling this option. If you don't know what to do here, say N. @@ -1109,7 +1109,6 @@ config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI select IRQ_DOMAIN_HIERARCHY - select PCI_MSI_IRQ_DOMAIN if PCI_MSI config X86_IO_APIC def_bool y @@ -1854,7 +1853,7 @@ config CC_HAS_IBT config X86_KERNEL_IBT prompt "Indirect Branch Tracking" - bool + def_bool y depends on X86_64 && CC_HAS_IBT && HAVE_OBJTOOL # https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f depends on !LD_IS_LLD || LLD_VERSION >= 140000 @@ -1980,6 +1979,23 @@ config EFI_STUB See Documentation/admin-guide/efi-stub.rst for more information. +config EFI_HANDOVER_PROTOCOL + bool "EFI handover protocol (DEPRECATED)" + depends on EFI_STUB + default y + help + Select this in order to include support for the deprecated EFI + handover protocol, which defines alternative entry points into the + EFI stub. This is a practice that has no basis in the UEFI + specification, and requires a priori knowledge on the part of the + bootloader about Linux/x86 specific ways of passing the command line + and initrd, and where in memory those assets may be loaded. + + If in doubt, say Y. 
Even though the corresponding support is not + present in upstream GRUB or other bootloaders, most distros build + GRUB with numerous downstream patches applied, and may rely on the + handover protocol as as result. + config EFI_MIXED bool "EFI mixed-mode support" depends on EFI_STUB && X86_64 @@ -1994,6 +2010,37 @@ config EFI_MIXED If unsure, say N. +config EFI_FAKE_MEMMAP + bool "Enable EFI fake memory map" + depends on EFI + help + Saying Y here will enable "efi_fake_mem" boot option. By specifying + this parameter, you can add arbitrary attribute to specific memory + range by updating original (firmware provided) EFI memmap. This is + useful for debugging of EFI memmap related feature, e.g., Address + Range Mirroring feature. + +config EFI_MAX_FAKE_MEM + int "maximum allowable number of ranges in efi_fake_mem boot option" + depends on EFI_FAKE_MEMMAP + range 1 128 + default 8 + help + Maximum allowable number of ranges in efi_fake_mem boot option. + Ranges can be set up to this value using comma-separated list. + The default value is 8. + +config EFI_RUNTIME_MAP + bool "Export EFI runtime maps to sysfs" if EXPERT + depends on EFI + default KEXEC_CORE + help + Export EFI runtime memory regions to /sys/firmware/efi/runtime-map. + That memory map is required by the 2nd kernel to set up EFI virtual + mappings after kexec, but can also be used for debugging purposes. + + See also Documentation/ABI/testing/sysfs-firmware-efi-runtime-map. + source "kernel/Kconfig.hz" config KEXEC @@ -2443,6 +2490,46 @@ config CC_HAS_SLS config CC_HAS_RETURN_THUNK def_bool $(cc-option,-mfunction-return=thunk-extern) +config CC_HAS_ENTRY_PADDING + def_bool $(cc-option,-fpatchable-function-entry=16,16) + +config FUNCTION_PADDING_CFI + int + default 59 if FUNCTION_ALIGNMENT_64B + default 27 if FUNCTION_ALIGNMENT_32B + default 11 if FUNCTION_ALIGNMENT_16B + default 3 if FUNCTION_ALIGNMENT_8B + default 0 + +# Basically: FUNCTION_ALIGNMENT - 5*CFI_CLANG +# except Kconfig can't do arithmetic :/ +config FUNCTION_PADDING_BYTES + int + default FUNCTION_PADDING_CFI if CFI_CLANG + default FUNCTION_ALIGNMENT + +config CALL_PADDING + def_bool n + depends on CC_HAS_ENTRY_PADDING && OBJTOOL + select FUNCTION_ALIGNMENT_16B + +config FINEIBT + def_bool y + depends on X86_KERNEL_IBT && CFI_CLANG && RETPOLINE + select CALL_PADDING + +config HAVE_CALL_THUNKS + def_bool y + depends on CC_HAS_ENTRY_PADDING && RETHUNK && OBJTOOL + +config CALL_THUNKS + def_bool n + select CALL_PADDING + +config PREFIX_SYMBOLS + def_bool y + depends on CALL_PADDING && !CFI_CLANG + menuconfig SPECULATION_MITIGATIONS bool "Mitigations for speculative execution vulnerabilities" default y @@ -2494,6 +2581,37 @@ config CPU_UNRET_ENTRY help Compile the kernel with support for the retbleed=unret mitigation. +config CALL_DEPTH_TRACKING + bool "Mitigate RSB underflow with call depth tracking" + depends on CPU_SUP_INTEL && HAVE_CALL_THUNKS + select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE + select CALL_THUNKS + default y + help + Compile the kernel with call depth tracking to mitigate the Intel + SKL Return-Speculation-Buffer (RSB) underflow issue. The + mitigation is off by default and needs to be enabled on the + kernel command line via the retbleed=stuff option. For + non-affected systems the overhead of this option is marginal as + the call depth tracking is using run-time generated call thunks + in a compiler generated padding area and call patching. This + increases text size by ~5%. For non affected systems this space + is unused. 
On affected SKL systems this results in a significant + performance gain over the IBRS mitigation. + +config CALL_THUNKS_DEBUG + bool "Enable call thunks and call depth tracking debugging" + depends on CALL_DEPTH_TRACKING + select FUNCTION_ALIGNMENT_32B + default n + help + Enable call/ret counters for imbalance detection and build in + a noisy dmesg about callthunks generation and call patching for + trouble shooting. The debug prints need to be enabled on the + kernel command line with 'debug-callthunks'. + Only enable this, when you are debugging call thunks as this + creates a noticable runtime overhead. If unsure say N. + config CPU_IBPB_ENTRY bool "Enable IBPB on kernel entry" depends on CPU_SUP_AMD && X86_64 diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 415a5d138de4..9cf07322875a 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -208,10 +208,16 @@ ifdef CONFIG_SLS KBUILD_CFLAGS += -mharden-sls=all endif +ifdef CONFIG_CALL_PADDING +PADDING_CFLAGS := -fpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES) +KBUILD_CFLAGS += $(PADDING_CFLAGS) +export PADDING_CFLAGS +endif + KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE) ifdef CONFIG_LTO_CLANG -ifeq ($(shell test $(CONFIG_LLD_VERSION) -lt 130000; echo $$?),0) +ifeq ($(call test-lt, $(CONFIG_LLD_VERSION), 130000),y) KBUILD_LDFLAGS += -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8) endif endif diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 9860ca5979f8..9e38ffaadb5d 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -83,7 +83,7 @@ cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \ $(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE $(call if_changed,image) - @$(kecho) 'Kernel: $@ is ready' ' (#'`cat .version`')' + @$(kecho) 'Kernel: $@ is ready' ' (#'$(or $(KBUILD_BUILD_VERSION),`cat .version`)')' OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 3a261abb6d15..1acff356d97a 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -68,7 +68,7 @@ KBUILD_LDFLAGS += $(call ld-option,--no-ld-generated-unwind-info) # address by the bootloader. 
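The FUNCTION_PADDING_* options above and the PADDING_CFLAGS added to arch/x86/Makefile are two views of the same arithmetic: every function gets CONFIG_FUNCTION_ALIGNMENT bytes of padding in front of it, minus the 5 bytes the Kconfig comment subtracts for CFI_CLANG (assumed here to be the kCFI preamble). A stand-alone C check of that table, not part of the patch:

#include <assert.h>

/* Sketch only: reproduces the FUNCTION_PADDING_CFI / FUNCTION_PADDING_BYTES
 * defaults; "5" is assumed to be the size of the kCFI preamble. */
static int function_padding_bytes(int function_alignment, int cfi_clang)
{
	return function_alignment - (cfi_clang ? 5 : 0);
}

int main(void)
{
	assert(function_padding_bytes(64, 1) == 59);	/* FUNCTION_ALIGNMENT_64B */
	assert(function_padding_bytes(32, 1) == 27);	/* FUNCTION_ALIGNMENT_32B */
	assert(function_padding_bytes(16, 1) == 11);	/* FUNCTION_ALIGNMENT_16B */
	assert(function_padding_bytes(8, 1)  == 3);	/* FUNCTION_ALIGNMENT_8B  */
	assert(function_padding_bytes(16, 0) == 16);	/* no CFI: padding == alignment */
	return 0;
}

The N,M form of -fpatchable-function-entry emits N NOPs with M of them before the function symbol, so passing CONFIG_FUNCTION_PADDING_BYTES for both places the whole padding area in front of the entry point, which is the compiler-generated padding area the CALL_DEPTH_TRACKING help text above refers to.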
LDFLAGS_vmlinux := -pie $(call ld-option, --no-dynamic-linker) ifdef CONFIG_LD_ORPHAN_WARN -LDFLAGS_vmlinux += --orphan-handling=warn +LDFLAGS_vmlinux += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) endif LDFLAGS_vmlinux += -z noexecstack ifeq ($(CONFIG_LD_IS_BFD),y) @@ -100,7 +100,7 @@ vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o ifdef CONFIG_X86_64 vmlinux-objs-y += $(obj)/ident_map_64.o vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o - vmlinux-objs-y += $(obj)/mem_encrypt.o + vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/mem_encrypt.o vmlinux-objs-y += $(obj)/pgtable_64.o vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o endif @@ -108,11 +108,11 @@ endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o -vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o -efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a +vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_mixed.o +vmlinux-objs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a -$(obj)/vmlinux: $(vmlinux-objs-y) $(efi-obj-y) FORCE +$(obj)/vmlinux: $(vmlinux-objs-y) FORCE $(call if_changed,ld) OBJCOPYFLAGS_vmlinux.bin := -R .comment -S diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S new file mode 100644 index 000000000000..4ca70bf93dc0 --- /dev/null +++ b/arch/x86/boot/compressed/efi_mixed.S @@ -0,0 +1,345 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming + * + * Early support for invoking 32-bit EFI services from a 64-bit kernel. + * + * Because this thunking occurs before ExitBootServices() we have to + * restore the firmware's 32-bit GDT and IDT before we make EFI service + * calls. + * + * On the plus side, we don't have to worry about mangling 64-bit + * addresses into 32-bits because we're executing with an identity + * mapped pagetable and haven't transitioned to 64-bit virtual addresses + * yet. + */ + +#include <linux/linkage.h> +#include <asm/msr.h> +#include <asm/page_types.h> +#include <asm/processor-flags.h> +#include <asm/segment.h> + + .code64 + .text +/* + * When booting in 64-bit mode on 32-bit EFI firmware, startup_64_mixed_mode() + * is the first thing that runs after switching to long mode. Depending on + * whether the EFI handover protocol or the compat entry point was used to + * enter the kernel, it will either branch to the 64-bit EFI handover + * entrypoint at offset 0x390 in the image, or to the 64-bit EFI PE/COFF + * entrypoint efi_pe_entry(). In the former case, the bootloader must provide a + * struct bootparams pointer as the third argument, so the presence of such a + * pointer is used to disambiguate. 
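The dispatch the comment above describes reduces to a NULL test on the saved boot_params pointer. A C-level sketch of startup_64_mixed_mode(), with illustrative stand-in types and prototypes rather than the patch's own declarations:

#include <stdint.h>

/* Stand-ins for the real entry points that the asm tail-jumps to. */
extern void efi64_stub_entry(uint32_t handle, uint32_t systab, uint32_t boot_params);
extern void efi_pe_entry(uint32_t handle, uint32_t systab);

struct efi32_boot_args {		/* filled in by efi32_entry() below */
	uint32_t image_handle;		/* saved %ecx */
	uint32_t systab;		/* saved %edx */
	uint32_t boot_params;		/* saved %esi; non-zero only for the handover protocol */
};

static void startup_64_mixed_mode_sketch(struct efi32_boot_args *args)
{
	if (args->boot_params)		/* EFI handover protocol was used */
		efi64_stub_entry(args->image_handle, args->systab, args->boot_params);
	else				/* compat PE entry, MS calling convention */
		efi_pe_entry(args->image_handle, args->systab);
}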
+ * + * +--------------+ + * +------------------+ +------------+ +------>| efi_pe_entry | + * | efi32_pe_entry |---->| | | +-----------+--+ + * +------------------+ | | +------+----------------+ | + * | startup_32 |---->| startup_64_mixed_mode | | + * +------------------+ | | +------+----------------+ V + * | efi32_stub_entry |---->| | | +------------------+ + * +------------------+ +------------+ +---->| efi64_stub_entry | + * +-------------+----+ + * +------------+ +----------+ | + * | startup_64 |<----| efi_main |<--------------+ + * +------------+ +----------+ + */ +SYM_FUNC_START(startup_64_mixed_mode) + lea efi32_boot_args(%rip), %rdx + mov 0(%rdx), %edi + mov 4(%rdx), %esi + mov 8(%rdx), %edx // saved bootparams pointer + test %edx, %edx + jnz efi64_stub_entry + /* + * efi_pe_entry uses MS calling convention, which requires 32 bytes of + * shadow space on the stack even if all arguments are passed in + * registers. We also need an additional 8 bytes for the space that + * would be occupied by the return address, and this also results in + * the correct stack alignment for entry. + */ + sub $40, %rsp + mov %rdi, %rcx // MS calling convention + mov %rsi, %rdx + jmp efi_pe_entry +SYM_FUNC_END(startup_64_mixed_mode) + +SYM_FUNC_START(__efi64_thunk) + push %rbp + push %rbx + + movl %ds, %eax + push %rax + movl %es, %eax + push %rax + movl %ss, %eax + push %rax + + /* Copy args passed on stack */ + movq 0x30(%rsp), %rbp + movq 0x38(%rsp), %rbx + movq 0x40(%rsp), %rax + + /* + * Convert x86-64 ABI params to i386 ABI + */ + subq $64, %rsp + movl %esi, 0x0(%rsp) + movl %edx, 0x4(%rsp) + movl %ecx, 0x8(%rsp) + movl %r8d, 0xc(%rsp) + movl %r9d, 0x10(%rsp) + movl %ebp, 0x14(%rsp) + movl %ebx, 0x18(%rsp) + movl %eax, 0x1c(%rsp) + + leaq 0x20(%rsp), %rbx + sgdt (%rbx) + sidt 16(%rbx) + + leaq 1f(%rip), %rbp + + /* + * Switch to IDT and GDT with 32-bit segments. These are the firmware + * GDT and IDT that were installed when the kernel started executing. + * The pointers were saved by the efi32_entry() routine below. + * + * Pass the saved DS selector to the 32-bit code, and use far return to + * restore the saved CS selector. + */ + lidt efi32_boot_idt(%rip) + lgdt efi32_boot_gdt(%rip) + + movzwl efi32_boot_ds(%rip), %edx + movzwq efi32_boot_cs(%rip), %rax + pushq %rax + leaq efi_enter32(%rip), %rax + pushq %rax + lretq + +1: addq $64, %rsp + movq %rdi, %rax + + pop %rbx + movl %ebx, %ss + pop %rbx + movl %ebx, %es + pop %rbx + movl %ebx, %ds + /* Clear out 32-bit selector from FS and GS */ + xorl %ebx, %ebx + movl %ebx, %fs + movl %ebx, %gs + + pop %rbx + pop %rbp + RET +SYM_FUNC_END(__efi64_thunk) + + .code32 +/* + * EFI service pointer must be in %edi. + * + * The stack should represent the 32-bit calling convention. + */ +SYM_FUNC_START_LOCAL(efi_enter32) + /* Load firmware selector into data and stack segment registers */ + movl %edx, %ds + movl %edx, %es + movl %edx, %fs + movl %edx, %gs + movl %edx, %ss + + /* Reload pgtables */ + movl %cr3, %eax + movl %eax, %cr3 + + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* Disable long mode via EFER */ + movl $MSR_EFER, %ecx + rdmsr + btrl $_EFER_LME, %eax + wrmsr + + call *%edi + + /* We must preserve return value */ + movl %eax, %edi + + /* + * Some firmware will return with interrupts enabled. Be sure to + * disable them before we switch GDTs and IDTs. 
+ */ + cli + + lidtl 16(%ebx) + lgdtl (%ebx) + + movl %cr4, %eax + btsl $(X86_CR4_PAE_BIT), %eax + movl %eax, %cr4 + + movl %cr3, %eax + movl %eax, %cr3 + + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + xorl %eax, %eax + lldt %ax + + pushl $__KERNEL_CS + pushl %ebp + + /* Enable paging */ + movl %cr0, %eax + btsl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + lret +SYM_FUNC_END(efi_enter32) + +/* + * This is the common EFI stub entry point for mixed mode. + * + * Arguments: %ecx image handle + * %edx EFI system table pointer + * %esi struct bootparams pointer (or NULL when not using + * the EFI handover protocol) + * + * Since this is the point of no return for ordinary execution, no registers + * are considered live except for the function parameters. [Note that the EFI + * stub may still exit and return to the firmware using the Exit() EFI boot + * service.] + */ +SYM_FUNC_START(efi32_entry) + call 1f +1: pop %ebx + + /* Save firmware GDTR and code/data selectors */ + sgdtl (efi32_boot_gdt - 1b)(%ebx) + movw %cs, (efi32_boot_cs - 1b)(%ebx) + movw %ds, (efi32_boot_ds - 1b)(%ebx) + + /* Store firmware IDT descriptor */ + sidtl (efi32_boot_idt - 1b)(%ebx) + + /* Store boot arguments */ + leal (efi32_boot_args - 1b)(%ebx), %ebx + movl %ecx, 0(%ebx) + movl %edx, 4(%ebx) + movl %esi, 8(%ebx) + movb $0x0, 12(%ebx) // efi_is64 + + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + jmp startup_32 +SYM_FUNC_END(efi32_entry) + +#define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime) +#define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol) +#define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base) + +/* + * efi_status_t efi32_pe_entry(efi_handle_t image_handle, + * efi_system_table_32_t *sys_table) + */ +SYM_FUNC_START(efi32_pe_entry) + pushl %ebp + movl %esp, %ebp + pushl %eax // dummy push to allocate loaded_image + + pushl %ebx // save callee-save registers + pushl %edi + + call verify_cpu // check for long mode support + testl %eax, %eax + movl $0x80000003, %eax // EFI_UNSUPPORTED + jnz 2f + + call 1f +1: pop %ebx + + /* Get the loaded image protocol pointer from the image handle */ + leal -4(%ebp), %eax + pushl %eax // &loaded_image + leal (loaded_image_proto - 1b)(%ebx), %eax + pushl %eax // pass the GUID address + pushl 8(%ebp) // pass the image handle + + /* + * Note the alignment of the stack frame. + * sys_table + * handle <-- 16-byte aligned on entry by ABI + * return address + * frame pointer + * loaded_image <-- local variable + * saved %ebx <-- 16-byte aligned here + * saved %edi + * &loaded_image + * &loaded_image_proto + * handle <-- 16-byte aligned for call to handle_protocol + */ + + movl 12(%ebp), %eax // sys_table + movl ST32_boottime(%eax), %eax // sys_table->boottime + call *BS32_handle_protocol(%eax) // sys_table->boottime->handle_protocol + addl $12, %esp // restore argument space + testl %eax, %eax + jnz 2f + + movl 8(%ebp), %ecx // image_handle + movl 12(%ebp), %edx // sys_table + movl -4(%ebp), %esi // loaded_image + movl LI32_image_base(%esi), %esi // loaded_image->image_base + leal (startup_32 - 1b)(%ebx), %ebp // runtime address of startup_32 + /* + * We need to set the image_offset variable here since startup_32() will + * use it before we get to the 64-bit efi_pe_entry() in C code. 
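A hypothetical C rendering of the two instructions that follow, just to make the image_offset arithmetic explicit:

#include <stdint.h>

/*
 * Sketch only: image_offset is how far the running copy of startup_32 sits
 * from the PE image base reported by the LoadedImage protocol. The two
 * parameters stand in for the values held in %ebp and %esi below.
 */
static uint32_t compute_image_offset(uint32_t startup_32_runtime,
				     uint32_t loaded_image_base)
{
	return startup_32_runtime - loaded_image_base;
}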
+ */ + subl %esi, %ebp // calculate image_offset + movl %ebp, (image_offset - 1b)(%ebx) // save image_offset + xorl %esi, %esi + jmp efi32_entry // pass %ecx, %edx, %esi + // no other registers remain live + +2: popl %edi // restore callee-save registers + popl %ebx + leave + RET +SYM_FUNC_END(efi32_pe_entry) + + .section ".rodata" + /* EFI loaded image protocol GUID */ + .balign 4 +SYM_DATA_START_LOCAL(loaded_image_proto) + .long 0x5b1b31a1 + .word 0x9562, 0x11d2 + .byte 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b +SYM_DATA_END(loaded_image_proto) + + .data + .balign 8 +SYM_DATA_START_LOCAL(efi32_boot_gdt) + .word 0 + .quad 0 +SYM_DATA_END(efi32_boot_gdt) + +SYM_DATA_START_LOCAL(efi32_boot_idt) + .word 0 + .quad 0 +SYM_DATA_END(efi32_boot_idt) + +SYM_DATA_LOCAL(efi32_boot_cs, .word 0) +SYM_DATA_LOCAL(efi32_boot_ds, .word 0) +SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0) +SYM_DATA(efi_is64, .byte 1) diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S deleted file mode 100644 index 67e7edcdfea8..000000000000 --- a/arch/x86/boot/compressed/efi_thunk_64.S +++ /dev/null @@ -1,195 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming - * - * Early support for invoking 32-bit EFI services from a 64-bit kernel. - * - * Because this thunking occurs before ExitBootServices() we have to - * restore the firmware's 32-bit GDT and IDT before we make EFI service - * calls. - * - * On the plus side, we don't have to worry about mangling 64-bit - * addresses into 32-bits because we're executing with an identity - * mapped pagetable and haven't transitioned to 64-bit virtual addresses - * yet. - */ - -#include <linux/linkage.h> -#include <asm/msr.h> -#include <asm/page_types.h> -#include <asm/processor-flags.h> -#include <asm/segment.h> - - .code64 - .text -SYM_FUNC_START(__efi64_thunk) - push %rbp - push %rbx - - movl %ds, %eax - push %rax - movl %es, %eax - push %rax - movl %ss, %eax - push %rax - - /* Copy args passed on stack */ - movq 0x30(%rsp), %rbp - movq 0x38(%rsp), %rbx - movq 0x40(%rsp), %rax - - /* - * Convert x86-64 ABI params to i386 ABI - */ - subq $64, %rsp - movl %esi, 0x0(%rsp) - movl %edx, 0x4(%rsp) - movl %ecx, 0x8(%rsp) - movl %r8d, 0xc(%rsp) - movl %r9d, 0x10(%rsp) - movl %ebp, 0x14(%rsp) - movl %ebx, 0x18(%rsp) - movl %eax, 0x1c(%rsp) - - leaq 0x20(%rsp), %rbx - sgdt (%rbx) - - addq $16, %rbx - sidt (%rbx) - - leaq 1f(%rip), %rbp - - /* - * Switch to IDT and GDT with 32-bit segments. This is the firmware GDT - * and IDT that was installed when the kernel started executing. The - * pointers were saved at the EFI stub entry point in head_64.S. - * - * Pass the saved DS selector to the 32-bit code, and use far return to - * restore the saved CS selector. - */ - leaq efi32_boot_idt(%rip), %rax - lidt (%rax) - leaq efi32_boot_gdt(%rip), %rax - lgdt (%rax) - - movzwl efi32_boot_ds(%rip), %edx - movzwq efi32_boot_cs(%rip), %rax - pushq %rax - leaq efi_enter32(%rip), %rax - pushq %rax - lretq - -1: addq $64, %rsp - movq %rdi, %rax - - pop %rbx - movl %ebx, %ss - pop %rbx - movl %ebx, %es - pop %rbx - movl %ebx, %ds - /* Clear out 32-bit selector from FS and GS */ - xorl %ebx, %ebx - movl %ebx, %fs - movl %ebx, %gs - - /* - * Convert 32-bit status code into 64-bit. - */ - roll $1, %eax - rorq $1, %rax - - pop %rbx - pop %rbp - RET -SYM_FUNC_END(__efi64_thunk) - - .code32 -/* - * EFI service pointer must be in %edi. - * - * The stack should represent the 32-bit calling convention. 
- */ -SYM_FUNC_START_LOCAL(efi_enter32) - /* Load firmware selector into data and stack segment registers */ - movl %edx, %ds - movl %edx, %es - movl %edx, %fs - movl %edx, %gs - movl %edx, %ss - - /* Reload pgtables */ - movl %cr3, %eax - movl %eax, %cr3 - - /* Disable paging */ - movl %cr0, %eax - btrl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - - /* Disable long mode via EFER */ - movl $MSR_EFER, %ecx - rdmsr - btrl $_EFER_LME, %eax - wrmsr - - call *%edi - - /* We must preserve return value */ - movl %eax, %edi - - /* - * Some firmware will return with interrupts enabled. Be sure to - * disable them before we switch GDTs and IDTs. - */ - cli - - lidtl (%ebx) - subl $16, %ebx - - lgdtl (%ebx) - - movl %cr4, %eax - btsl $(X86_CR4_PAE_BIT), %eax - movl %eax, %cr4 - - movl %cr3, %eax - movl %eax, %cr3 - - movl $MSR_EFER, %ecx - rdmsr - btsl $_EFER_LME, %eax - wrmsr - - xorl %eax, %eax - lldt %ax - - pushl $__KERNEL_CS - pushl %ebp - - /* Enable paging */ - movl %cr0, %eax - btsl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - lret -SYM_FUNC_END(efi_enter32) - - .data - .balign 8 -SYM_DATA_START(efi32_boot_gdt) - .word 0 - .quad 0 -SYM_DATA_END(efi32_boot_gdt) - -SYM_DATA_START(efi32_boot_idt) - .word 0 - .quad 0 -SYM_DATA_END(efi32_boot_idt) - -SYM_DATA_START(efi32_boot_cs) - .word 0 -SYM_DATA_END(efi32_boot_cs) - -SYM_DATA_START(efi32_boot_ds) - .word 0 -SYM_DATA_END(efi32_boot_ds) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 3b354eb9516d..6589ddd4cfaf 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -208,10 +208,6 @@ SYM_DATA_START_LOCAL(gdt) .quad 0x00cf92000000ffff /* __KERNEL_DS */ SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) -#ifdef CONFIG_EFI_STUB -SYM_DATA(image_offset, .long 0) -#endif - /* * Stack and heap for uncompression */ diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index d33f060900d2..a75712991df3 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -38,6 +38,14 @@ #include "pgtable.h" /* + * Fix alignment at 16 bytes. Following CONFIG_FUNCTION_ALIGNMENT will result + * in assembly errors due to trying to move .org backward due to the excessive + * alignment. + */ +#undef __ALIGN +#define __ALIGN .balign 16, 0x90 + +/* * Locally defined symbols should be marked hidden: */ .hidden _bss @@ -118,7 +126,9 @@ SYM_FUNC_START(startup_32) 1: /* Setup Exception handling for SEV-ES */ +#ifdef CONFIG_AMD_MEM_ENCRYPT call startup32_load_idt +#endif /* Make sure cpu supports long mode. */ call verify_cpu @@ -178,12 +188,13 @@ SYM_FUNC_START(startup_32) */ /* * If SEV is active then set the encryption mask in the page tables. - * This will insure that when the kernel is copied and decompressed + * This will ensure that when the kernel is copied and decompressed * it will be done so encrypted. 
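For orientation, the mask this comment refers to is built roughly as below; the C-bit position coming from CPUID leaf 0x8000001f EBX[5:0] is taken from the SEV documentation rather than from this hunk, and the helper is hypothetical:

#include <stdint.h>

extern uint32_t cpuid_ebx(uint32_t leaf);	/* hypothetical stand-in for get_sev_encryption_bit() */

/*
 * Sketch: with SEV active, set the C-bit in the high dword of the identity-
 * mapped page-table entries so the copied and decompressed kernel stays
 * encrypted. The bit is always above bit 31, hence the high-dword mask.
 */
static uint32_t sev_pte_mask_high(int sev_active)
{
	uint32_t cbit;

	if (!sev_active)
		return 0;

	cbit = cpuid_ebx(0x8000001f) & 0x3f;	/* C-bit position, e.g. 51 */
	return 1u << (cbit - 32);
}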
*/ - call get_sev_encryption_bit xorl %edx, %edx #ifdef CONFIG_AMD_MEM_ENCRYPT + call get_sev_encryption_bit + xorl %edx, %edx testl %eax, %eax jz 1f subl $32, %eax /* Encryption bit is always above bit 31 */ @@ -249,6 +260,11 @@ SYM_FUNC_START(startup_32) movl $__BOOT_TSS, %eax ltr %ax +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* Check if the C-bit position is correct when SEV is active */ + call startup32_check_sev_cbit +#endif + /* * Setup for the jump to 64bit mode * @@ -261,29 +277,11 @@ SYM_FUNC_START(startup_32) */ leal rva(startup_64)(%ebp), %eax #ifdef CONFIG_EFI_MIXED - movl rva(efi32_boot_args)(%ebp), %edi - testl %edi, %edi - jz 1f - leal rva(efi64_stub_entry)(%ebp), %eax - movl rva(efi32_boot_args+4)(%ebp), %esi - movl rva(efi32_boot_args+8)(%ebp), %edx // saved bootparams pointer - testl %edx, %edx - jnz 1f - /* - * efi_pe_entry uses MS calling convention, which requires 32 bytes of - * shadow space on the stack even if all arguments are passed in - * registers. We also need an additional 8 bytes for the space that - * would be occupied by the return address, and this also results in - * the correct stack alignment for entry. - */ - subl $40, %esp - leal rva(efi_pe_entry)(%ebp), %eax - movl %edi, %ecx // MS calling convention - movl %esi, %edx + cmpb $1, rva(efi_is64)(%ebp) + je 1f + leal rva(startup_64_mixed_mode)(%ebp), %eax 1: #endif - /* Check if the C-bit position is correct when SEV is active */ - call startup32_check_sev_cbit pushl $__KERNEL_CS pushl %eax @@ -296,38 +294,14 @@ SYM_FUNC_START(startup_32) lret SYM_FUNC_END(startup_32) -#ifdef CONFIG_EFI_MIXED +#if IS_ENABLED(CONFIG_EFI_MIXED) && IS_ENABLED(CONFIG_EFI_HANDOVER_PROTOCOL) .org 0x190 SYM_FUNC_START(efi32_stub_entry) add $0x4, %esp /* Discard return address */ popl %ecx popl %edx popl %esi - - call 1f -1: pop %ebp - subl $ rva(1b), %ebp - - movl %esi, rva(efi32_boot_args+8)(%ebp) -SYM_INNER_LABEL(efi32_pe_stub_entry, SYM_L_LOCAL) - movl %ecx, rva(efi32_boot_args)(%ebp) - movl %edx, rva(efi32_boot_args+4)(%ebp) - movb $0, rva(efi_is64)(%ebp) - - /* Save firmware GDTR and code/data selectors */ - sgdtl rva(efi32_boot_gdt)(%ebp) - movw %cs, rva(efi32_boot_cs)(%ebp) - movw %ds, rva(efi32_boot_ds)(%ebp) - - /* Store firmware IDT descriptor */ - sidtl rva(efi32_boot_idt)(%ebp) - - /* Disable paging */ - movl %cr0, %eax - btrl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - - jmp startup_32 + jmp efi32_entry SYM_FUNC_END(efi32_stub_entry) #endif @@ -550,7 +524,9 @@ trampoline_return: SYM_CODE_END(startup_64) #ifdef CONFIG_EFI_STUB +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL .org 0x390 +#endif SYM_FUNC_START(efi64_stub_entry) and $~0xf, %rsp /* realign the stack */ movq %rdx, %rbx /* save boot_params pointer */ @@ -713,6 +689,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode) jmp 1b SYM_FUNC_END(.Lno_longmode) + .globl verify_cpu #include "../../kernel/verify_cpu.S" .data @@ -744,242 +721,6 @@ SYM_DATA_START(boot_idt) .endr SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end) -#ifdef CONFIG_AMD_MEM_ENCRYPT -SYM_DATA_START(boot32_idt_desc) - .word boot32_idt_end - boot32_idt - 1 - .long 0 -SYM_DATA_END(boot32_idt_desc) - .balign 8 -SYM_DATA_START(boot32_idt) - .rept 32 - .quad 0 - .endr -SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end) -#endif - -#ifdef CONFIG_EFI_STUB -SYM_DATA(image_offset, .long 0) -#endif -#ifdef CONFIG_EFI_MIXED -SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0) -SYM_DATA(efi_is64, .byte 1) - -#define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime) -#define BS32_handle_protocol 88 // 
offsetof(efi_boot_services_32_t, handle_protocol) -#define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base) - - __HEAD - .code32 -SYM_FUNC_START(efi32_pe_entry) -/* - * efi_status_t efi32_pe_entry(efi_handle_t image_handle, - * efi_system_table_32_t *sys_table) - */ - - pushl %ebp - movl %esp, %ebp - pushl %eax // dummy push to allocate loaded_image - - pushl %ebx // save callee-save registers - pushl %edi - - call verify_cpu // check for long mode support - testl %eax, %eax - movl $0x80000003, %eax // EFI_UNSUPPORTED - jnz 2f - - call 1f -1: pop %ebx - subl $ rva(1b), %ebx - - /* Get the loaded image protocol pointer from the image handle */ - leal -4(%ebp), %eax - pushl %eax // &loaded_image - leal rva(loaded_image_proto)(%ebx), %eax - pushl %eax // pass the GUID address - pushl 8(%ebp) // pass the image handle - - /* - * Note the alignment of the stack frame. - * sys_table - * handle <-- 16-byte aligned on entry by ABI - * return address - * frame pointer - * loaded_image <-- local variable - * saved %ebx <-- 16-byte aligned here - * saved %edi - * &loaded_image - * &loaded_image_proto - * handle <-- 16-byte aligned for call to handle_protocol - */ - - movl 12(%ebp), %eax // sys_table - movl ST32_boottime(%eax), %eax // sys_table->boottime - call *BS32_handle_protocol(%eax) // sys_table->boottime->handle_protocol - addl $12, %esp // restore argument space - testl %eax, %eax - jnz 2f - - movl 8(%ebp), %ecx // image_handle - movl 12(%ebp), %edx // sys_table - movl -4(%ebp), %esi // loaded_image - movl LI32_image_base(%esi), %esi // loaded_image->image_base - movl %ebx, %ebp // startup_32 for efi32_pe_stub_entry - /* - * We need to set the image_offset variable here since startup_32() will - * use it before we get to the 64-bit efi_pe_entry() in C code. 
- */ - subl %esi, %ebx - movl %ebx, rva(image_offset)(%ebp) // save image_offset - jmp efi32_pe_stub_entry - -2: popl %edi // restore callee-save registers - popl %ebx - leave - RET -SYM_FUNC_END(efi32_pe_entry) - - .section ".rodata" - /* EFI loaded image protocol GUID */ - .balign 4 -SYM_DATA_START_LOCAL(loaded_image_proto) - .long 0x5b1b31a1 - .word 0x9562, 0x11d2 - .byte 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b -SYM_DATA_END(loaded_image_proto) -#endif - -#ifdef CONFIG_AMD_MEM_ENCRYPT - __HEAD - .code32 -/* - * Write an IDT entry into boot32_idt - * - * Parameters: - * - * %eax: Handler address - * %edx: Vector number - * - * Physical offset is expected in %ebp - */ -SYM_FUNC_START(startup32_set_idt_entry) - push %ebx - push %ecx - - /* IDT entry address to %ebx */ - leal rva(boot32_idt)(%ebp), %ebx - shl $3, %edx - addl %edx, %ebx - - /* Build IDT entry, lower 4 bytes */ - movl %eax, %edx - andl $0x0000ffff, %edx # Target code segment offset [15:0] - movl $__KERNEL32_CS, %ecx # Target code segment selector - shl $16, %ecx - orl %ecx, %edx - - /* Store lower 4 bytes to IDT */ - movl %edx, (%ebx) - - /* Build IDT entry, upper 4 bytes */ - movl %eax, %edx - andl $0xffff0000, %edx # Target code segment offset [31:16] - orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate - - /* Store upper 4 bytes to IDT */ - movl %edx, 4(%ebx) - - pop %ecx - pop %ebx - RET -SYM_FUNC_END(startup32_set_idt_entry) -#endif - -SYM_FUNC_START(startup32_load_idt) -#ifdef CONFIG_AMD_MEM_ENCRYPT - /* #VC handler */ - leal rva(startup32_vc_handler)(%ebp), %eax - movl $X86_TRAP_VC, %edx - call startup32_set_idt_entry - - /* Load IDT */ - leal rva(boot32_idt)(%ebp), %eax - movl %eax, rva(boot32_idt_desc+2)(%ebp) - lidt rva(boot32_idt_desc)(%ebp) -#endif - RET -SYM_FUNC_END(startup32_load_idt) - -/* - * Check for the correct C-bit position when the startup_32 boot-path is used. - * - * The check makes use of the fact that all memory is encrypted when paging is - * disabled. The function creates 64 bits of random data using the RDRAND - * instruction. RDRAND is mandatory for SEV guests, so always available. If the - * hypervisor violates that the kernel will crash right here. - * - * The 64 bits of random data are stored to a memory location and at the same - * time kept in the %eax and %ebx registers. Since encryption is always active - * when paging is off the random data will be stored encrypted in main memory. - * - * Then paging is enabled. When the C-bit position is correct all memory is - * still mapped encrypted and comparing the register values with memory will - * succeed. An incorrect C-bit position will map all memory unencrypted, so that - * the compare will use the encrypted random data and fail. - */ -SYM_FUNC_START(startup32_check_sev_cbit) -#ifdef CONFIG_AMD_MEM_ENCRYPT - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - - /* Check for non-zero sev_status */ - movl rva(sev_status)(%ebp), %eax - testl %eax, %eax - jz 4f - - /* - * Get two 32-bit random values - Don't bail out if RDRAND fails - * because it is better to prevent forward progress if no random value - * can be gathered. 
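The comment above (which moves verbatim into mem_encrypt.S further down) describes the whole algorithm; as illustrative C with hypothetical helpers, it is roughly:

#include <stdint.h>

/* All hypothetical stand-ins; the real check must stay in 32-bit assembly. */
extern uint64_t sev_status, sev_check_data;
extern uint64_t rdrand64_retry(void);
extern void enable_paging(void), restore_paging_state(void), halt_forever(void);

static void check_sev_cbit_sketch(void)
{
	uint64_t random;

	if (!sev_status)
		return;				/* SEV not active, nothing to verify */

	random = rdrand64_retry();		/* spin until RDRAND succeeds */
	sev_check_data = random;		/* paging off: the store goes out encrypted */

	enable_paging();			/* mappings now use the guessed C-bit */

	/*
	 * Correct C-bit: the mapping is encrypted too, so the load decrypts the
	 * value and it matches. Wrong C-bit: the load sees ciphertext and differs.
	 */
	if (sev_check_data != random)
		halt_forever();

	restore_paging_state();			/* put %cr0 back the way it was */
}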
- */ -1: rdrand %eax - jnc 1b -2: rdrand %ebx - jnc 2b - - /* Store to memory and keep it in the registers */ - movl %eax, rva(sev_check_data)(%ebp) - movl %ebx, rva(sev_check_data+4)(%ebp) - - /* Enable paging to see if encryption is active */ - movl %cr0, %edx /* Backup %cr0 in %edx */ - movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */ - movl %ecx, %cr0 - - cmpl %eax, rva(sev_check_data)(%ebp) - jne 3f - cmpl %ebx, rva(sev_check_data+4)(%ebp) - jne 3f - - movl %edx, %cr0 /* Restore previous %cr0 */ - - jmp 4f - -3: /* Check failed - hlt the machine */ - hlt - jmp 3b - -4: - popl %edx - popl %ecx - popl %ebx - popl %eax -#endif - RET -SYM_FUNC_END(startup32_check_sev_cbit) - /* * Stack and heap for uncompression */ diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index e476bcbd9b42..454757fbdfe5 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -668,7 +668,7 @@ static bool process_mem_region(struct mem_vector *region, } } #endif - return 0; + return false; } #ifdef CONFIG_EFI diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index a73e4d783cae..32f7cc8a8625 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -12,16 +12,13 @@ #include <asm/processor-flags.h> #include <asm/msr.h> #include <asm/asm-offsets.h> +#include <asm/segment.h> +#include <asm/trapnr.h> .text .code32 SYM_FUNC_START(get_sev_encryption_bit) - xor %eax, %eax - -#ifdef CONFIG_AMD_MEM_ENCRYPT push %ebx - push %ecx - push %edx movl $0x80000000, %eax /* CPUID to check the highest leaf */ cpuid @@ -52,12 +49,7 @@ SYM_FUNC_START(get_sev_encryption_bit) xor %eax, %eax .Lsev_exit: - pop %edx - pop %ecx pop %ebx - -#endif /* CONFIG_AMD_MEM_ENCRYPT */ - RET SYM_FUNC_END(get_sev_encryption_bit) @@ -98,7 +90,7 @@ SYM_CODE_START_LOCAL(sev_es_req_cpuid) jmp 1b SYM_CODE_END(sev_es_req_cpuid) -SYM_CODE_START(startup32_vc_handler) +SYM_CODE_START_LOCAL(startup32_vc_handler) pushl %eax pushl %ebx pushl %ecx @@ -184,15 +176,149 @@ SYM_CODE_START(startup32_vc_handler) jmp .Lfail SYM_CODE_END(startup32_vc_handler) +/* + * Write an IDT entry into boot32_idt + * + * Parameters: + * + * %eax: Handler address + * %edx: Vector number + * %ecx: IDT address + */ +SYM_FUNC_START_LOCAL(startup32_set_idt_entry) + /* IDT entry address to %ecx */ + leal (%ecx, %edx, 8), %ecx + + /* Build IDT entry, lower 4 bytes */ + movl %eax, %edx + andl $0x0000ffff, %edx # Target code segment offset [15:0] + orl $(__KERNEL32_CS << 16), %edx # Target code segment selector + + /* Store lower 4 bytes to IDT */ + movl %edx, (%ecx) + + /* Build IDT entry, upper 4 bytes */ + movl %eax, %edx + andl $0xffff0000, %edx # Target code segment offset [31:16] + orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate + + /* Store upper 4 bytes to IDT */ + movl %edx, 4(%ecx) + + RET +SYM_FUNC_END(startup32_set_idt_entry) + +SYM_FUNC_START(startup32_load_idt) + push %ebp + push %ebx + + call 1f +1: pop %ebp + + leal (boot32_idt - 1b)(%ebp), %ebx + + /* #VC handler */ + leal (startup32_vc_handler - 1b)(%ebp), %eax + movl $X86_TRAP_VC, %edx + movl %ebx, %ecx + call startup32_set_idt_entry + + /* Load IDT */ + leal (boot32_idt_desc - 1b)(%ebp), %ecx + movl %ebx, 2(%ecx) + lidt (%ecx) + + pop %ebx + pop %ebp + RET +SYM_FUNC_END(startup32_load_idt) + +/* + * Check for the correct C-bit position when the startup_32 boot-path is used. 
+ * + * The check makes use of the fact that all memory is encrypted when paging is + * disabled. The function creates 64 bits of random data using the RDRAND + * instruction. RDRAND is mandatory for SEV guests, so always available. If the + * hypervisor violates that the kernel will crash right here. + * + * The 64 bits of random data are stored to a memory location and at the same + * time kept in the %eax and %ebx registers. Since encryption is always active + * when paging is off the random data will be stored encrypted in main memory. + * + * Then paging is enabled. When the C-bit position is correct all memory is + * still mapped encrypted and comparing the register values with memory will + * succeed. An incorrect C-bit position will map all memory unencrypted, so that + * the compare will use the encrypted random data and fail. + */ +SYM_FUNC_START(startup32_check_sev_cbit) + pushl %ebx + pushl %ebp + + call 0f +0: popl %ebp + + /* Check for non-zero sev_status */ + movl (sev_status - 0b)(%ebp), %eax + testl %eax, %eax + jz 4f + + /* + * Get two 32-bit random values - Don't bail out if RDRAND fails + * because it is better to prevent forward progress if no random value + * can be gathered. + */ +1: rdrand %eax + jnc 1b +2: rdrand %ebx + jnc 2b + + /* Store to memory and keep it in the registers */ + leal (sev_check_data - 0b)(%ebp), %ebp + movl %eax, 0(%ebp) + movl %ebx, 4(%ebp) + + /* Enable paging to see if encryption is active */ + movl %cr0, %edx /* Backup %cr0 in %edx */ + movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */ + movl %ecx, %cr0 + + cmpl %eax, 0(%ebp) + jne 3f + cmpl %ebx, 4(%ebp) + jne 3f + + movl %edx, %cr0 /* Restore previous %cr0 */ + + jmp 4f + +3: /* Check failed - hlt the machine */ + hlt + jmp 3b + +4: + popl %ebp + popl %ebx + RET +SYM_FUNC_END(startup32_check_sev_cbit) + .code64 #include "../../kernel/sev_verify_cbit.S" .data -#ifdef CONFIG_AMD_MEM_ENCRYPT .balign 8 SYM_DATA(sme_me_mask, .quad 0) SYM_DATA(sev_status, .quad 0) SYM_DATA(sev_check_data, .quad 0) -#endif + +SYM_DATA_START_LOCAL(boot32_idt) + .rept 32 + .quad 0 + .endr +SYM_DATA_END(boot32_idt) + +SYM_DATA_START_LOCAL(boot32_idt_desc) + .word . - boot32_idt - 1 + .long 0 +SYM_DATA_END(boot32_idt_desc) diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index a83d67ec627d..d75237ba7ce9 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -64,20 +64,11 @@ int has_eflag(unsigned long mask) return !!((f0^f1) & mask); } -/* Handle x86_32 PIC using ebx. */ -#if defined(__i386__) && defined(__PIC__) -# define EBX_REG "=r" -#else -# define EBX_REG "=b" -#endif - void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d) { - asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t" - "cpuid \n\t" - ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t" - : "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b) - : "a" (id), "c" (count) + asm volatile("cpuid" + : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d) + : "0" (id), "2" (count) ); } diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index f912d7770130..9338c68e7413 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -80,10 +80,11 @@ bs_die: ljmp $0xf000,$0xfff0 #ifdef CONFIG_EFI_STUB - .org 0x3c + .org 0x38 # # Offset to the PE header. 
# + .long LINUX_PE_MAGIC .long pe_header #endif /* CONFIG_EFI_STUB */ @@ -406,7 +407,7 @@ xloadflags: # define XLF1 0 #endif -#ifdef CONFIG_EFI_STUB +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL # ifdef CONFIG_EFI_MIXED # define XLF23 (XLF_EFI_HANDOVER_32|XLF_EFI_HANDOVER_64) # else diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 8a3fff9128bb..1c8541ae3b3a 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -350,7 +350,7 @@ static int _kstrtoul(const char *s, unsigned int base, unsigned long *res) } /** - * kstrtoul - convert a string to an unsigned long + * boot_kstrtoul - convert a string to an unsigned long * @s: The start of the string. The string must be null-terminated, and may also * include a single newline before its terminating null. The first character * may also be a plus sign, but not a minus sign. diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index a3725ad46c5a..bd247692b701 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -290,6 +290,7 @@ static void efi_stub_entry_update(void) { unsigned long addr = efi32_stub_entry; +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL #ifdef CONFIG_X86_64 /* Yes, this is really how we defined it :( */ addr = efi64_stub_entry - 0x200; @@ -299,6 +300,7 @@ static void efi_stub_entry_update(void) if (efi32_stub_entry != addr) die("32-bit and 64-bit EFI entry points do not match\n"); #endif +#endif put_unaligned_le32(addr, &buf[0x264]); } diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index b8998cf0508a..669d9e4f2901 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -5,6 +5,8 @@ #define pr_fmt(fmt) "tdx: " fmt #include <linux/cpufeature.h> +#include <linux/export.h> +#include <linux/io.h> #include <asm/coco.h> #include <asm/tdx.h> #include <asm/vmx.h> @@ -15,6 +17,7 @@ /* TDX module Call Leaf IDs */ #define TDX_GET_INFO 1 #define TDX_GET_VEINFO 3 +#define TDX_GET_REPORT 4 #define TDX_ACCEPT_PAGE 6 /* TDX hypercall Leaf IDs */ @@ -36,6 +39,12 @@ #define ATTR_SEPT_VE_DISABLE BIT(28) +/* TDX Module call error codes */ +#define TDCALL_RETURN_CODE(a) ((a) >> 32) +#define TDCALL_INVALID_OPERAND 0xc0000100 + +#define TDREPORT_SUBTYPE_0 0 + /* * Wrapper for standard use of __tdx_hypercall with no output aside from * return code. @@ -100,6 +109,37 @@ static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, panic("TDCALL %lld failed (Buggy TDX module!)\n", fn); } +/** + * tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT + * subtype 0) using TDG.MR.REPORT TDCALL. + * @reportdata: Address of the input buffer which contains user-defined + * REPORTDATA to be included into TDREPORT. + * @tdreport: Address of the output buffer to store TDREPORT. + * + * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module + * v1.0 specification for more information on TDG.MR.REPORT TDCALL. + * It is used in the TDX guest driver module to get the TDREPORT0. + * + * Return 0 on success, -EINVAL for invalid operands, or -EIO on + * other TDCALL failures. 
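The kernel-doc above is the whole contract; a minimal hypothetical caller (for instance in a TDX guest attestation driver) would look roughly like this, with the 64-byte REPORTDATA and 1024-byte TDREPORT sizes taken from the TDX module spec rather than from this patch, and assuming the declaration is exported via <asm/tdx.h>:

#include <linux/slab.h>
#include <asm/tdx.h>

static int get_tdreport0_example(void)
{
	u8 *reportdata, *tdreport;
	int ret = -ENOMEM;

	reportdata = kzalloc(64, GFP_KERNEL);	/* user-supplied REPORTDATA (assumed size) */
	tdreport = kzalloc(1024, GFP_KERNEL);	/* receives TDREPORT0 (assumed size) */
	if (!reportdata || !tdreport)
		goto out;

	ret = tdx_mcall_get_report0(reportdata, tdreport);
	/* -EINVAL: the TDX module rejected an operand; -EIO: any other failure */
out:
	kfree(tdreport);
	kfree(reportdata);
	return ret;
}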
+ */ +int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport) +{ + u64 ret; + + ret = __tdx_module_call(TDX_GET_REPORT, virt_to_phys(tdreport), + virt_to_phys(reportdata), TDREPORT_SUBTYPE_0, + 0, NULL); + if (ret) { + if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND) + return -EINVAL; + return -EIO; + } + + return 0; +} +EXPORT_SYMBOL_GPL(tdx_mcall_get_report0); + static void tdx_parse_tdinfo(u64 *cc_mask) { struct tdx_module_output out; @@ -346,8 +386,8 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) { unsigned long *reg, val, vaddr; char buffer[MAX_INSN_SIZE]; + enum insn_mmio_type mmio; struct insn insn = {}; - enum mmio_type mmio; int size, extend_size; u8 extend_val = 0; @@ -362,10 +402,10 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) return -EINVAL; mmio = insn_decode_mmio(&insn, &size); - if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED)) + if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED)) return -EINVAL; - if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { + if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) { reg = insn_get_modrm_reg_ptr(&insn, regs); if (!reg) return -EINVAL; @@ -386,23 +426,23 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) /* Handle writes first */ switch (mmio) { - case MMIO_WRITE: + case INSN_MMIO_WRITE: memcpy(&val, reg, size); if (!mmio_write(size, ve->gpa, val)) return -EIO; return insn.length; - case MMIO_WRITE_IMM: + case INSN_MMIO_WRITE_IMM: val = insn.immediate.value; if (!mmio_write(size, ve->gpa, val)) return -EIO; return insn.length; - case MMIO_READ: - case MMIO_READ_ZERO_EXTEND: - case MMIO_READ_SIGN_EXTEND: + case INSN_MMIO_READ: + case INSN_MMIO_READ_ZERO_EXTEND: + case INSN_MMIO_READ_SIGN_EXTEND: /* Reads are handled below */ break; - case MMIO_MOVS: - case MMIO_DECODE_FAILED: + case INSN_MMIO_MOVS: + case INSN_MMIO_DECODE_FAILED: /* * MMIO was accessed with an instruction that could not be * decoded or handled properly. It was likely not using io.h @@ -419,15 +459,15 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) return -EIO; switch (mmio) { - case MMIO_READ: + case INSN_MMIO_READ: /* Zero-extend for 32-bit operation */ extend_size = size == 4 ? 
sizeof(*reg) : 0; break; - case MMIO_READ_ZERO_EXTEND: + case INSN_MMIO_READ_ZERO_EXTEND: /* Zero extend based on operand size */ extend_size = insn.opnd_bytes; break; - case MMIO_READ_SIGN_EXTEND: + case INSN_MMIO_READ_SIGN_EXTEND: /* Sign extend based on operand size */ extend_size = insn.opnd_bytes; if (size == 1 && val & BIT(7)) diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 3b1d701a4f6c..3e7a329235bd 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -107,3 +107,6 @@ quiet_cmd_perlasm = PERLASM $@ cmd_perlasm = $(PERL) $< > $@ $(obj)/%.S: $(src)/%.pl FORCE $(call if_changed,perlasm) + +# Disable GCOV in odd or sensitive code +GCOV_PROFILE_curve25519-x86_64.o := n diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index b48ddebb4748..cdf3215ec272 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -7,6 +7,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/frame.h> #define STATE0 %xmm0 @@ -402,7 +403,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_ad) * void crypto_aegis128_aesni_enc(void *state, unsigned int length, * const void *src, void *dst); */ -SYM_FUNC_START(crypto_aegis128_aesni_enc) +SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) FRAME_BEGIN cmp $0x10, LEN @@ -499,7 +500,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc) * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length, * const void *src, void *dst); */ -SYM_FUNC_START(crypto_aegis128_aesni_enc_tail) +SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) FRAME_BEGIN /* load the state: */ @@ -556,7 +557,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc_tail) * void crypto_aegis128_aesni_dec(void *state, unsigned int length, * const void *src, void *dst); */ -SYM_FUNC_START(crypto_aegis128_aesni_dec) +SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) FRAME_BEGIN cmp $0x10, LEN @@ -653,7 +654,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_dec) * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length, * const void *src, void *dst); */ -SYM_FUNC_START(crypto_aegis128_aesni_dec_tail) +SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) FRAME_BEGIN /* load the state: */ diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S index c75fd7d015ed..03ae4cd1d976 100644 --- a/arch/x86/crypto/aria-aesni-avx-asm_64.S +++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S @@ -7,6 +7,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/frame.h> /* struct aria_ctx: */ @@ -913,7 +914,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way) RET; SYM_FUNC_END(__aria_aesni_avx_crypt_16way) -SYM_FUNC_START(aria_aesni_avx_encrypt_16way) +SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -938,7 +939,7 @@ SYM_FUNC_START(aria_aesni_avx_encrypt_16way) RET; SYM_FUNC_END(aria_aesni_avx_encrypt_16way) -SYM_FUNC_START(aria_aesni_avx_decrypt_16way) +SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -1039,7 +1040,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way) RET; SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way) -SYM_FUNC_START(aria_aesni_avx_ctr_crypt_16way) +SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way) /* input: * %rdi: ctx * %rsi: dst @@ -1208,7 +1209,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way) RET; SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way) -SYM_FUNC_START(aria_aesni_avx_gfni_encrypt_16way) 
+SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -1233,7 +1234,7 @@ SYM_FUNC_START(aria_aesni_avx_gfni_encrypt_16way) RET; SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way) -SYM_FUNC_START(aria_aesni_avx_gfni_decrypt_16way) +SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -1258,7 +1259,7 @@ SYM_FUNC_START(aria_aesni_avx_gfni_decrypt_16way) RET; SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way) -SYM_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way) +SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way) /* input: * %rdi: ctx * %rsi: dst diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index 2e1658ddbe1a..4a30618281ec 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -712,7 +712,6 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .text -.align 8 SYM_FUNC_START_LOCAL(__camellia_enc_blk16) /* input: * %rdi: ctx, CTX @@ -799,7 +798,6 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk16) jmp .Lenc_done; SYM_FUNC_END(__camellia_enc_blk16) -.align 8 SYM_FUNC_START_LOCAL(__camellia_dec_blk16) /* input: * %rdi: ctx, CTX diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 0e4e9abbf4de..deaf62aa73a6 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -221,7 +221,6 @@ * Size optimization... with inlined roundsm32 binary would be over 5 times * larger and would only marginally faster. */ -.align 8 SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, @@ -229,7 +228,6 @@ SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_c RET; SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) -.align 8 SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3, %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11, @@ -748,7 +746,6 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .text -.align 8 SYM_FUNC_START_LOCAL(__camellia_enc_blk32) /* input: * %rdi: ctx, CTX @@ -835,7 +832,6 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk32) jmp .Lenc_done; SYM_FUNC_END(__camellia_enc_blk32) -.align 8 SYM_FUNC_START_LOCAL(__camellia_dec_blk32) /* input: * %rdi: ctx, CTX diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index b258af420c92..0326a01503c3 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -208,7 +208,6 @@ .text -.align 16 SYM_FUNC_START_LOCAL(__cast5_enc_blk16) /* input: * %rdi: ctx @@ -282,7 +281,6 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16) RET; SYM_FUNC_END(__cast5_enc_blk16) -.align 16 SYM_FUNC_START_LOCAL(__cast5_dec_blk16) /* input: * %rdi: ctx diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S index 721474abfb71..5286db5b8165 100644 --- a/arch/x86/crypto/crct10dif-pcl-asm_64.S +++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S @@ -94,7 +94,6 @@ # # Assumes len >= 16. 
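The SYM_TYPED_FUNC_START() conversions running through these crypto files all address the same pattern: the assembly routines are invoked through function pointers from C glue code, and with CONFIG_CFI_CLANG an indirect call checks a type hash at the callee, which plain SYM_FUNC_START() does not emit for .S code. A schematic C view with purely hypothetical names:

#include <linux/linkage.h>
#include <linux/types.h>

/* Hypothetical asm routine, assumed to be defined with SYM_TYPED_FUNC_START(). */
asmlinkage void example_transform_asm(u32 *state, const u8 *data, int nblocks);

typedef void (*transform_fn_t)(u32 *state, const u8 *data, int nblocks);

static void do_transform(transform_fn_t fn, u32 *state, const u8 *data, int nblocks)
{
	/*
	 * Indirect call: under kCFI the caller verifies the hash stored in front
	 * of fn's entry point, so the asm target needs a typed start (and the
	 * file needs <linux/cfi_types.h>, as added throughout this series).
	 */
	fn(state, data, nblocks);
}

static void example_usage(u32 *state, const u8 *data, int nblocks)
{
	do_transform(example_transform_asm, state, data, nblocks);
}

The dropped .align directives are the other half of the pattern: function alignment now comes from CONFIG_FUNCTION_ALIGNMENT, so the per-file alignment overrides become redundant.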
# -.align 16 SYM_FUNC_START(crc_t10dif_pcl) movdqa .Lbswap_mask(%rip), BSWAP_MASK diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S index 6a0b15e7196a..ef73a3ab8726 100644 --- a/arch/x86/crypto/nh-avx2-x86_64.S +++ b/arch/x86/crypto/nh-avx2-x86_64.S @@ -8,6 +8,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #define PASS0_SUMS %ymm0 #define PASS1_SUMS %ymm1 @@ -65,11 +66,11 @@ /* * void nh_avx2(const u32 *key, const u8 *message, size_t message_len, - * u8 hash[NH_HASH_BYTES]) + * __le64 hash[NH_NUM_PASSES]) * * It's guaranteed that message_len % 16 == 0. */ -SYM_FUNC_START(nh_avx2) +SYM_TYPED_FUNC_START(nh_avx2) vmovdqu 0x00(KEY), K0 vmovdqu 0x10(KEY), K1 diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S index 34c567bbcb4f..75fb994b6d17 100644 --- a/arch/x86/crypto/nh-sse2-x86_64.S +++ b/arch/x86/crypto/nh-sse2-x86_64.S @@ -8,6 +8,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #define PASS0_SUMS %xmm0 #define PASS1_SUMS %xmm1 @@ -67,11 +68,11 @@ /* * void nh_sse2(const u32 *key, const u8 *message, size_t message_len, - * u8 hash[NH_HASH_BYTES]) + * __le64 hash[NH_NUM_PASSES]) * * It's guaranteed that message_len % 16 == 0. */ -SYM_FUNC_START(nh_sse2) +SYM_TYPED_FUNC_START(nh_sse2) movdqu 0x00(KEY), K0 movdqu 0x10(KEY), K1 diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c index 8ea5ab0f1ca7..46b036204ed9 100644 --- a/arch/x86/crypto/nhpoly1305-avx2-glue.c +++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c @@ -14,14 +14,7 @@ #include <asm/simd.h> asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len, - u8 hash[NH_HASH_BYTES]); - -/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */ -static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len, - __le64 hash[NH_NUM_PASSES]) -{ - nh_avx2(key, message, message_len, (u8 *)hash); -} + __le64 hash[NH_NUM_PASSES]); static int nhpoly1305_avx2_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) @@ -33,7 +26,7 @@ static int nhpoly1305_avx2_update(struct shash_desc *desc, unsigned int n = min_t(unsigned int, srclen, SZ_4K); kernel_fpu_begin(); - crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2); + crypto_nhpoly1305_update_helper(desc, src, n, nh_avx2); kernel_fpu_end(); src += n; srclen -= n; diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c index 2b353d42ed13..4a4970d75107 100644 --- a/arch/x86/crypto/nhpoly1305-sse2-glue.c +++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c @@ -14,14 +14,7 @@ #include <asm/simd.h> asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len, - u8 hash[NH_HASH_BYTES]); - -/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */ -static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len, - __le64 hash[NH_NUM_PASSES]) -{ - nh_sse2(key, message, message_len, (u8 *)hash); -} + __le64 hash[NH_NUM_PASSES]); static int nhpoly1305_sse2_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) @@ -33,7 +26,7 @@ static int nhpoly1305_sse2_update(struct shash_desc *desc, unsigned int n = min_t(unsigned int, srclen, SZ_4K); kernel_fpu_begin(); - crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2); + crypto_nhpoly1305_update_helper(desc, src, n, nh_sse2); kernel_fpu_end(); src += n; srclen -= n; diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl index 
2077ce7a5647..b9abcd79c1f4 100644 --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl @@ -108,7 +108,6 @@ if (!$kernel) { sub declare_function() { my ($name, $align, $nargs) = @_; if($kernel) { - $code .= ".align $align\n"; $code .= "SYM_FUNC_START($name)\n"; $code .= ".L$name:\n"; } else { diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 82f2313f512b..97e283621851 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -550,7 +550,6 @@ #define write_blocks(x0, x1, x2, x3, t0, t1, t2) \ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) -.align 8 SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx) /* input: * %rdi: ctx, CTX @@ -604,7 +603,6 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx) RET; SYM_FUNC_END(__serpent_enc_blk8_avx) -.align 8 SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx) /* input: * %rdi: ctx, CTX diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S index 8ea34c9b9316..6d60c50593a9 100644 --- a/arch/x86/crypto/serpent-avx2-asm_64.S +++ b/arch/x86/crypto/serpent-avx2-asm_64.S @@ -550,7 +550,6 @@ #define write_blocks(x0, x1, x2, x3, t0, t1, t2) \ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) -.align 8 SYM_FUNC_START_LOCAL(__serpent_enc_blk16) /* input: * %rdi: ctx, CTX @@ -604,7 +603,6 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk16) RET; SYM_FUNC_END(__serpent_enc_blk16) -.align 8 SYM_FUNC_START_LOCAL(__serpent_dec_blk16) /* input: * %rdi: ctx, CTX diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S index 2f94ec0e763b..cade913d4882 100644 --- a/arch/x86/crypto/sha1_ni_asm.S +++ b/arch/x86/crypto/sha1_ni_asm.S @@ -54,6 +54,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #define DIGEST_PTR %rdi /* 1st arg */ #define DATA_PTR %rsi /* 2nd arg */ @@ -92,8 +93,7 @@ * numBlocks: Number of blocks to process */ .text -.align 32 -SYM_FUNC_START(sha1_ni_transform) +SYM_TYPED_FUNC_START(sha1_ni_transform) push %rbp mov %rsp, %rbp sub $FRAME_SIZE, %rsp diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S index 263f916362e0..f54988c80eb4 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -25,6 +25,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #define CTX %rdi // arg1 #define BUF %rsi // arg2 @@ -67,7 +68,7 @@ * param: function's name */ .macro SHA1_VECTOR_ASM name - SYM_FUNC_START(\name) + SYM_TYPED_FUNC_START(\name) push %rbx push %r12 diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index 3baa1ec39097..5555b5d5215a 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -48,6 +48,7 @@ ######################################################################## #include <linux/linkage.h> +#include <linux/cfi_types.h> ## assume buffers not aligned #define VMOVDQ vmovdqu @@ -346,8 +347,7 @@ a = TMP_ ## arg 3 : Num blocks ######################################################################## .text -SYM_FUNC_START(sha256_transform_avx) -.align 32 +SYM_TYPED_FUNC_START(sha256_transform_avx) pushq %rbx pushq %r12 pushq %r13 diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 9bcdbc47b8b4..3eada9416852 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -49,6 +49,7 @@ ######################################################################## #include <linux/linkage.h> 
+#include <linux/cfi_types.h> ## assume buffers not aligned #define VMOVDQ vmovdqu @@ -523,8 +524,7 @@ STACK_SIZE = _CTX + _CTX_SIZE ## arg 3 : Num blocks ######################################################################## .text -SYM_FUNC_START(sha256_transform_rorx) -.align 32 +SYM_TYPED_FUNC_START(sha256_transform_rorx) pushq %rbx pushq %r12 pushq %r13 diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index c4a5db612c32..959288eecc68 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -47,6 +47,7 @@ ######################################################################## #include <linux/linkage.h> +#include <linux/cfi_types.h> ## assume buffers not aligned #define MOVDQ movdqu @@ -355,8 +356,7 @@ a = TMP_ ## arg 3 : Num blocks ######################################################################## .text -SYM_FUNC_START(sha256_transform_ssse3) -.align 32 +SYM_TYPED_FUNC_START(sha256_transform_ssse3) pushq %rbx pushq %r12 pushq %r13 diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S index 94d50dd27cb5..537b6dcd7ed8 100644 --- a/arch/x86/crypto/sha256_ni_asm.S +++ b/arch/x86/crypto/sha256_ni_asm.S @@ -54,6 +54,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #define DIGEST_PTR %rdi /* 1st arg */ #define DATA_PTR %rsi /* 2nd arg */ @@ -96,8 +97,7 @@ */ .text -.align 32 -SYM_FUNC_START(sha256_ni_transform) +SYM_TYPED_FUNC_START(sha256_ni_transform) shl $6, NUM_BLKS /* convert to bytes */ jz .Ldone_hash diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S index 1fefe6dd3a9e..b0984f19fdb4 100644 --- a/arch/x86/crypto/sha512-avx-asm.S +++ b/arch/x86/crypto/sha512-avx-asm.S @@ -48,6 +48,7 @@ ######################################################################## #include <linux/linkage.h> +#include <linux/cfi_types.h> .text @@ -273,7 +274,7 @@ frame_size = frame_WK + WK_SIZE # of SHA512 message blocks. # "blocks" is the message length in SHA512 blocks ######################################################################## -SYM_FUNC_START(sha512_transform_avx) +SYM_TYPED_FUNC_START(sha512_transform_avx) test msglen, msglen je nowork diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index 5cdaab7d6901..b1ca99055ef9 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -50,6 +50,7 @@ ######################################################################## #include <linux/linkage.h> +#include <linux/cfi_types.h> .text @@ -565,7 +566,7 @@ frame_size = frame_CTX + CTX_SIZE # of SHA512 message blocks. # "blocks" is the message length in SHA512 blocks ######################################################################## -SYM_FUNC_START(sha512_transform_rorx) +SYM_TYPED_FUNC_START(sha512_transform_rorx) # Save GPRs push %rbx push %r12 diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S index b84c22e06c5f..c06afb5270e5 100644 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ b/arch/x86/crypto/sha512-ssse3-asm.S @@ -48,6 +48,7 @@ ######################################################################## #include <linux/linkage.h> +#include <linux/cfi_types.h> .text @@ -274,7 +275,7 @@ frame_size = frame_WK + WK_SIZE # of SHA512 message blocks. # "blocks" is the message length in SHA512 blocks. 
######################################################################## -SYM_FUNC_START(sha512_transform_ssse3) +SYM_TYPED_FUNC_START(sha512_transform_ssse3) test msglen, msglen je nowork diff --git a/arch/x86/crypto/sm3-avx-asm_64.S b/arch/x86/crypto/sm3-avx-asm_64.S index b12b9efb5ec5..503bab450a91 100644 --- a/arch/x86/crypto/sm3-avx-asm_64.S +++ b/arch/x86/crypto/sm3-avx-asm_64.S @@ -12,6 +12,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/frame.h> /* Context structure */ @@ -327,8 +328,7 @@ * void sm3_transform_avx(struct sm3_state *state, * const u8 *data, int nblocks); */ -.align 16 -SYM_FUNC_START(sm3_transform_avx) +SYM_TYPED_FUNC_START(sm3_transform_avx) /* input: * %rdi: ctx, CTX * %rsi: data (64*nblks bytes) diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S index 4767ab61ff48..e2668d2fe6ce 100644 --- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S @@ -14,6 +14,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/frame.h> #define rRIP (%rip) @@ -139,13 +140,11 @@ .text -.align 16 /* * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst, * const u8 *src, int nblocks) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx_crypt4) /* input: * %rdi: round key array, CTX @@ -249,7 +248,6 @@ SYM_FUNC_START(sm4_aesni_avx_crypt4) RET; SYM_FUNC_END(sm4_aesni_avx_crypt4) -.align 8 SYM_FUNC_START_LOCAL(__sm4_crypt_blk8) /* input: * %rdi: round key array, CTX @@ -363,7 +361,6 @@ SYM_FUNC_END(__sm4_crypt_blk8) * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst, * const u8 *src, int nblocks) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx_crypt8) /* input: * %rdi: round key array, CTX @@ -419,8 +416,7 @@ SYM_FUNC_END(sm4_aesni_avx_crypt8) * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 -SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8) +SYM_TYPED_FUNC_START(sm4_aesni_avx_ctr_enc_blk8) /* input: * %rdi: round key array, CTX * %rsi: dst (8 blocks) @@ -494,8 +490,7 @@ SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8) * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 -SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8) +SYM_TYPED_FUNC_START(sm4_aesni_avx_cbc_dec_blk8) /* input: * %rdi: round key array, CTX * %rsi: dst (8 blocks) @@ -544,8 +539,7 @@ SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8) * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 -SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8) +SYM_TYPED_FUNC_START(sm4_aesni_avx_cfb_dec_blk8) /* input: * %rdi: round key array, CTX * %rsi: dst (8 blocks) diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S index 4732fe8bb65b..98ede9459287 100644 --- a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S @@ -14,6 +14,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/frame.h> #define rRIP (%rip) @@ -153,9 +154,6 @@ .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef .text -.align 16 - -.align 8 SYM_FUNC_START_LOCAL(__sm4_crypt_blk16) /* input: * %rdi: round key array, CTX @@ -281,8 +279,7 @@ SYM_FUNC_END(__sm4_crypt_blk16) * void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 -SYM_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16) +SYM_TYPED_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16) /* input: * %rdi: round key array, CTX * %rsi: dst (16 blocks) @@ -394,8 +391,7 @@ SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16) * 
void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 -SYM_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16) +SYM_TYPED_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16) /* input: * %rdi: round key array, CTX * %rsi: dst (16 blocks) @@ -448,8 +444,7 @@ SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16) * void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 -SYM_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16) +SYM_TYPED_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16) /* input: * %rdi: round key array, CTX * %rsi: dst (16 blocks) diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index 31f9b2ec3857..12fde271cd3f 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -228,7 +228,6 @@ vpxor x2, wkey, x2; \ vpxor x3, wkey, x3; -.align 8 SYM_FUNC_START_LOCAL(__twofish_enc_blk8) /* input: * %rdi: ctx, CTX @@ -270,7 +269,6 @@ SYM_FUNC_START_LOCAL(__twofish_enc_blk8) RET; SYM_FUNC_END(__twofish_enc_blk8) -.align 8 SYM_FUNC_START_LOCAL(__twofish_dec_blk8) /* input: * %rdi: ctx, CTX diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c index f9c4adc27404..0614beece279 100644 --- a/arch/x86/crypto/twofish_glue.c +++ b/arch/x86/crypto/twofish_glue.c @@ -38,8 +38,8 @@ * Third Edition. */ +#include <crypto/algapi.h> #include <crypto/twofish.h> -#include <linux/crypto.h> #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index e309e7156038..91397f58ac30 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1181,7 +1181,7 @@ SYM_CODE_START(asm_exc_nmi) * is using the thread stack right now, so it's safe for us to use it. */ movl %esp, %ebx - movl PER_CPU_VAR(cpu_current_top_of_stack), %esp + movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esp call exc_nmi movl %ebx, %esp @@ -1243,7 +1243,7 @@ SYM_CODE_START(rewind_stack_and_make_dead) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp - movl PER_CPU_VAR(cpu_current_top_of_stack), %esi + movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esi leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp call make_task_dead diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9953d966d124..15739a2c0983 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -92,7 +92,7 @@ SYM_CODE_START(entry_SYSCALL_64) /* tss.sp2 is scratch space. 
*/ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -252,7 +252,7 @@ SYM_FUNC_START(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset + movq %rbx, PER_CPU_VAR(fixed_percpu_data) + FIXED_stack_canary #endif /* @@ -284,9 +284,11 @@ SYM_FUNC_END(__switch_to_asm) * r12: kernel thread arg */ .pushsection .text, "ax" -SYM_CODE_START(ret_from_fork) + __FUNC_ALIGN +SYM_CODE_START_NOALIGN(ret_from_fork) UNWIND_HINT_EMPTY ANNOTATE_NOENDBR // copy_thread + CALL_DEPTH_ACCOUNT movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ @@ -326,11 +328,12 @@ SYM_CODE_END(ret_from_fork) #endif .endm -SYM_CODE_START_LOCAL(xen_error_entry) +SYM_CODE_START(xen_error_entry) + ANNOTATE_NOENDBR UNWIND_HINT_FUNC PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 - UNTRAIN_RET + UNTRAIN_RET_FROM_CALL RET SYM_CODE_END(xen_error_entry) @@ -600,13 +603,13 @@ SYM_CODE_END(\asmsym) * shared between 32 and 64 bit and emit the __irqentry_text_* markers * so the stacktrace boundary checks work. */ - .align 16 + __ALIGN .globl __irqentry_text_start __irqentry_text_start: #include <asm/idtentry.h> - .align 16 + __ALIGN .globl __irqentry_text_end __irqentry_text_end: ANNOTATE_NOENDBR @@ -828,7 +831,8 @@ EXPORT_SYMBOL(asm_load_gs_index) * * C calling convention: exc_xen_hypervisor_callback(struct *pt_regs) */ -SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback) + __FUNC_ALIGN +SYM_CODE_START_LOCAL_NOALIGN(exc_xen_hypervisor_callback) /* * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will @@ -856,7 +860,8 @@ SYM_CODE_END(exc_xen_hypervisor_callback) * We distinguish between categories by comparing each saved segment register * with its current contents: any discrepancy means we in category 1. */ -SYM_CODE_START(xen_failsafe_callback) + __FUNC_ALIGN +SYM_CODE_START_NOALIGN(xen_failsafe_callback) UNWIND_HINT_EMPTY ENDBR movl %ds, %ecx @@ -903,7 +908,8 @@ SYM_CODE_END(xen_failsafe_callback) * R14 - old CR3 * R15 - old SPEC_CTRL */ -SYM_CODE_START_LOCAL(paranoid_entry) +SYM_CODE_START(paranoid_entry) + ANNOTATE_NOENDBR UNWIND_HINT_FUNC PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 @@ -972,7 +978,7 @@ SYM_CODE_START_LOCAL(paranoid_entry) * CR3 above, keep the old value in a callee saved register. */ IBRS_ENTER save_reg=%r15 - UNTRAIN_RET + UNTRAIN_RET_FROM_CALL RET SYM_CODE_END(paranoid_entry) @@ -1038,7 +1044,8 @@ SYM_CODE_END(paranoid_exit) /* * Switch GS and CR3 if needed. */ -SYM_CODE_START_LOCAL(error_entry) +SYM_CODE_START(error_entry) + ANNOTATE_NOENDBR UNWIND_HINT_FUNC PUSH_AND_CLEAR_REGS save_ret=1 @@ -1056,14 +1063,11 @@ SYM_CODE_START_LOCAL(error_entry) /* We have user CR3. Change to kernel CR3. */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax IBRS_ENTER - UNTRAIN_RET + UNTRAIN_RET_FROM_CALL leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ -.Lerror_entry_from_usermode_after_swapgs: - /* Put us onto the real thread stack. 
*/ - call sync_regs - RET + jmp sync_regs /* * There are two places in the kernel that can potentially fault with @@ -1094,6 +1098,7 @@ SYM_CODE_START_LOCAL(error_entry) */ .Lerror_entry_done_lfence: FENCE_SWAPGS_KERNEL_ENTRY + CALL_DEPTH_ACCOUNT leaq 8(%rsp), %rax /* return pt_regs pointer */ ANNOTATE_UNRET_END RET @@ -1112,7 +1117,7 @@ SYM_CODE_START_LOCAL(error_entry) FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rax IBRS_ENTER - UNTRAIN_RET + UNTRAIN_RET_FROM_CALL /* * Pretend that the exception came from user mode: set up pt_regs @@ -1121,7 +1126,7 @@ SYM_CODE_START_LOCAL(error_entry) leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ call fixup_bad_iret mov %rax, %rdi - jmp .Lerror_entry_from_usermode_after_swapgs + jmp sync_regs SYM_CODE_END(error_entry) SYM_CODE_START_LOCAL(error_return) @@ -1206,7 +1211,7 @@ SYM_CODE_START(asm_exc_nmi) FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp UNWIND_HINT_IRET_REGS base=%rdx offset=8 pushq 5*8(%rdx) /* pt_regs->ss */ pushq 4*8(%rdx) /* pt_regs->rsp */ @@ -1516,12 +1521,13 @@ SYM_CODE_END(ignore_sysret) #endif .pushsection .text, "ax" -SYM_CODE_START(rewind_stack_and_make_dead) + __FUNC_ALIGN +SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp - movq PER_CPU_VAR(cpu_current_top_of_stack), %rax + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rax leaq -PTREGS_SIZE(%rax), %rsp UNWIND_HINT_REGS diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 4dd19819053a..70150298f8bd 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -58,10 +58,10 @@ SYM_CODE_START(entry_SYSENTER_compat) SWITCH_TO_KERNEL_CR3 scratch_reg=%rax popq %rax - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp /* Construct struct pt_regs on stack */ - pushq $__USER32_DS /* pt_regs->ss */ + pushq $__USER_DS /* pt_regs->ss */ pushq $0 /* pt_regs->sp = 0 (placeholder) */ /* @@ -128,7 +128,6 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) popfq jmp .Lsysenter_flags_fixed SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL) - ANNOTATE_NOENDBR // is_sysenter_singlestep SYM_CODE_END(entry_SYSENTER_compat) /* @@ -191,13 +190,13 @@ SYM_CODE_START(entry_SYSCALL_compat) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp /* Switch to the kernel stack */ - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR /* Construct struct pt_regs on stack */ - pushq $__USER32_DS /* pt_regs->ss */ + pushq $__USER_DS /* pt_regs->ss */ pushq %r8 /* pt_regs->sp */ pushq %r11 /* pt_regs->flags */ pushq $__USER32_CS /* pt_regs->cs */ @@ -332,7 +331,7 @@ SYM_CODE_START(entry_INT80_compat) ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV movq %rsp, %rax - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp pushq 5*8(%rax) /* regs->ss */ pushq 4*8(%rax) /* regs->rsp */ diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index f38b07d2768b..5e37f41e5f14 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -11,7 +11,7 @@ /* rdi: arg1 ... normal C conventions. rax is saved/restored. 
*/ .macro THUNK name, func -SYM_FUNC_START_NOALIGN(\name) +SYM_FUNC_START(\name) pushq %rbp movq %rsp, %rbp @@ -36,7 +36,7 @@ SYM_FUNC_END(\name) EXPORT_SYMBOL(preempt_schedule_thunk) EXPORT_SYMBOL(preempt_schedule_notrace_thunk) -SYM_CODE_START_LOCAL_NOALIGN(__thunk_restore) +SYM_CODE_START_LOCAL(__thunk_restore) popq %r11 popq %r10 popq %r9 diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 3e88b9df8c8f..838613ac15b8 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -33,11 +33,12 @@ vobjs32-y += vdso32/vclock_gettime.o vobjs-$(CONFIG_X86_SGX) += vsgx.o # files to link into kernel -obj-y += vma.o extable.o -KASAN_SANITIZE_vma.o := y -UBSAN_SANITIZE_vma.o := y -KCSAN_SANITIZE_vma.o := y -OBJECT_FILES_NON_STANDARD_vma.o := n +obj-y += vma.o extable.o +KASAN_SANITIZE_vma.o := y +UBSAN_SANITIZE_vma.o := y +KCSAN_SANITIZE_vma.o := y +OBJECT_FILES_NON_STANDARD_vma.o := n +OBJECT_FILES_NON_STANDARD_extable.o := n # vDSO images to build vdso_img-$(VDSO64-y) += 64 @@ -94,7 +95,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),) endif endif -$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) +$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) $(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO # @@ -157,6 +158,7 @@ KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_CFI),$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(PADDING_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic KBUILD_CFLAGS_32 += -fno-stack-protector KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S index 4bf48462fca7..e8c60ae7a7c8 100644 --- a/arch/x86/entry/vdso/vdso.lds.S +++ b/arch/x86/entry/vdso/vdso.lds.S @@ -27,7 +27,9 @@ VERSION { __vdso_time; clock_getres; __vdso_clock_getres; +#ifdef CONFIG_X86_SGX __vdso_sgx_enter_enclave; +#endif local: *; }; } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 311eae30e089..b8f3f9b9e53c 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -98,24 +98,6 @@ static int vdso_mremap(const struct vm_special_mapping *sm, } #ifdef CONFIG_TIME_NS -static struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - if (likely(vma->vm_mm == current->mm)) - return current->nsproxy->time_ns->vvar_page; - - /* - * VM_PFNMAP | VM_IO protect .fault() handler from being called - * through interfaces like /proc/$pid/mem or - * process_vm_{readv,writev}() as long as there's no .access() - * in special_mapping_vmops(). - * For more details check_vma_flags() and __access_remote_vm() - */ - - WARN(1, "vvar_page accessed remotely"); - - return NULL; -} - /* * The vvar page layout depends on whether a task belongs to the root or * non-root time namespace. 
Whenever a task changes its namespace, the VVAR @@ -140,11 +122,6 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) return 0; } -#else -static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - return NULL; -} #endif static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, @@ -210,11 +187,10 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, pgprot_decrypted(vma->vm_page_prot)); } } else if (sym_offset == image->sym_hvclock_page) { - struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page(); + pfn = hv_get_tsc_pfn(); - if (tsc_pg && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK)) - return vmf_insert_pfn(vma, vmf->address, - virt_to_phys(tsc_pg) >> PAGE_SHIFT); + if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK)) + return vmf_insert_pfn(vma, vmf->address, pfn); } else if (sym_offset == image->sym_timens_page) { struct page *timens_page = find_timens_vvar_page(vma); @@ -327,7 +303,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) end -= len; if (end > start) { - offset = prandom_u32_max(((end - start) >> PAGE_SHIFT) + 1); + offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1); addr = start + (offset << PAGE_SHIFT); } else { addr = start; diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index f1bff153d945..58461fa18b6f 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -384,7 +384,7 @@ static void amd_brs_poison_buffer(void) * On ctxswin, sched_in = true, called after the PMU has started * On ctxswout, sched_in = false, called before the PMU is stopped */ -void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 8b70237c33f7..4386b10682ce 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -861,8 +861,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) pmu_enabled = cpuc->enabled; cpuc->enabled = 0; - /* stop everything (includes BRS) */ - amd_pmu_disable_all(); + amd_brs_disable_all(); /* Drain BRS is in use (could be inactive) */ if (cpuc->lbr_users) @@ -873,7 +872,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) cpuc->enabled = pmu_enabled; if (pmu_enabled) - amd_pmu_enable_all(0); + amd_brs_enable_all(); return amd_pmu_adjust_nmi_window(handled); } @@ -1388,7 +1387,7 @@ static int __init amd_core_pmu_init(void) * numbered counter following it. 
*/ for (i = 0; i < x86_pmu.num_counters - 1; i += 2) - even_ctr_mask |= 1 << i; + even_ctr_mask |= BIT_ULL(i); pair_constraint = (struct event_constraint) __EVENT_CONSTRAINT(0, even_ctr_mask, 0, diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 4cb710efbdd9..da3f5ebac4e1 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -631,7 +631,7 @@ static const struct attribute_group *op_attr_update[] = { static struct perf_ibs perf_ibs_fetch = { .pmu = { - .task_ctx_nr = perf_invalid_context, + .task_ctx_nr = perf_hw_context, .event_init = perf_ibs_init, .add = perf_ibs_add, @@ -655,7 +655,7 @@ static struct perf_ibs perf_ibs_fetch = { static struct perf_ibs perf_ibs_op = { .pmu = { - .task_ctx_nr = perf_invalid_context, + .task_ctx_nr = perf_hw_context, .event_init = perf_ibs_init, .add = perf_ibs_add, diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index 38a75216c12c..eb31f850841a 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -352,7 +352,7 @@ void amd_pmu_lbr_add(struct perf_event *event) cpuc->br_sel = reg->reg; } - perf_sched_cb_inc(event->ctx->pmu); + perf_sched_cb_inc(event->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) amd_pmu_lbr_reset(); @@ -370,10 +370,10 @@ void amd_pmu_lbr_del(struct perf_event *event) cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); - perf_sched_cb_dec(event->ctx->pmu); + perf_sched_cb_dec(event->pmu); } -void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index d568afc705d2..83f15fe411b3 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -553,6 +553,7 @@ static void uncore_clean_online(void) hlist_for_each_entry_safe(uncore, n, &uncore_unused_list, node) { hlist_del(&uncore->node); + kfree(uncore->events); kfree(uncore); } } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index b30b8bbcd1e2..85a63a41c471 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -90,6 +90,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx); DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); +DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter); + /* * This one is magic, it will get called even when PMU init fails (because * there is no PMU), in which case it should simply return NULL. @@ -2031,6 +2033,7 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases); static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs); + static_call_update(x86_pmu_filter, x86_pmu.filter); } static void _x86_pmu_read(struct perf_event *event) @@ -2052,23 +2055,6 @@ void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, pr_info("... event mask: %016Lx\n", intel_ctrl); } -/* - * The generic code is not hybrid friendly. The hybrid_pmu->pmu - * of the first registered PMU is unconditionally assigned to - * each possible cpuctx->ctx.pmu. - * Update the correct hybrid PMU to the cpuctx->ctx.pmu. 
- */ -void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu) -{ - struct perf_cpu_context *cpuctx; - - if (!pmu->pmu_cpu_context) - return; - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - cpuctx->ctx.pmu = pmu; -} - static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -2175,13 +2161,9 @@ static int __init init_hw_perf_events(void) if (err) goto out2; } else { - u8 cpu_type = get_this_hybrid_cpu_type(); struct x86_hybrid_pmu *hybrid_pmu; int i, j; - if (!cpu_type && x86_pmu.get_hybrid_cpu_type) - cpu_type = x86_pmu.get_hybrid_cpu_type(); - for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { hybrid_pmu = &x86_pmu.hybrid_pmu[i]; @@ -2195,9 +2177,6 @@ static int __init init_hw_perf_events(void) (hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1); if (err) break; - - if (cpu_type == hybrid_pmu->cpu_type) - x86_pmu_update_cpu_context(&hybrid_pmu->pmu, raw_smp_processor_id()); } if (i < x86_pmu.num_hybrid_pmus) { @@ -2646,15 +2625,15 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { NULL, }; -static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) +static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { - static_call_cond(x86_pmu_sched_task)(ctx, sched_in); + static_call_cond(x86_pmu_sched_task)(pmu_ctx, sched_in); } -static void x86_pmu_swap_task_ctx(struct perf_event_context *prev, - struct perf_event_context *next) +static void x86_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc, + struct perf_event_pmu_context *next_epc) { - static_call_cond(x86_pmu_swap_task_ctx)(prev, next); + static_call_cond(x86_pmu_swap_task_ctx)(prev_epc, next_epc); } void perf_check_microcode(void) @@ -2689,12 +2668,13 @@ static int x86_pmu_aux_output_match(struct perf_event *event) return 0; } -static int x86_pmu_filter_match(struct perf_event *event) +static bool x86_pmu_filter(struct pmu *pmu, int cpu) { - if (x86_pmu.filter_match) - return x86_pmu.filter_match(event); + bool ret = false; - return 1; + static_call_cond(x86_pmu_filter)(pmu, cpu, &ret); + + return ret; } static struct pmu pmu = { @@ -2725,7 +2705,7 @@ static struct pmu pmu = { .aux_output_match = x86_pmu_aux_output_match, - .filter_match = x86_pmu_filter_match, + .filter = x86_pmu_filter, }; void arch_perf_update_userpage(struct perf_event *event, diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 1b92bf05fd65..dfd2c124cdf8 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4536,8 +4536,6 @@ end: cpumask_set_cpu(cpu, &pmu->supported_cpus); cpuc->pmu = &pmu->pmu; - x86_pmu_update_cpu_context(&pmu->pmu, cpu); - return true; } @@ -4671,17 +4669,17 @@ static void intel_pmu_cpu_dead(int cpu) cpumask_clear_cpu(cpu, &hybrid_pmu(cpuc->pmu)->supported_cpus); } -static void intel_pmu_sched_task(struct perf_event_context *ctx, +static void intel_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { - intel_pmu_pebs_sched_task(ctx, sched_in); - intel_pmu_lbr_sched_task(ctx, sched_in); + intel_pmu_pebs_sched_task(pmu_ctx, sched_in); + intel_pmu_lbr_sched_task(pmu_ctx, sched_in); } -static void intel_pmu_swap_task_ctx(struct perf_event_context *prev, - struct perf_event_context *next) +static void intel_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc, + struct perf_event_pmu_context *next_epc) { - intel_pmu_lbr_swap_task_ctx(prev, next); + intel_pmu_lbr_swap_task_ctx(prev_epc, next_epc); } static int intel_pmu_check_period(struct perf_event *event, u64 value) @@ 
-4705,12 +4703,11 @@ static int intel_pmu_aux_output_match(struct perf_event *event) return is_intel_pt_event(event); } -static int intel_pmu_filter_match(struct perf_event *event) +static void intel_pmu_filter(struct pmu *pmu, int cpu, bool *ret) { - struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); - unsigned int cpu = smp_processor_id(); + struct x86_hybrid_pmu *hpmu = hybrid_pmu(pmu); - return cpumask_test_cpu(cpu, &pmu->supported_cpus); + *ret = !cpumask_test_cpu(cpu, &hpmu->supported_cpus); } PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); @@ -6413,7 +6410,7 @@ __init int intel_pmu_init(void) static_call_update(intel_pmu_set_topdown_event_period, &adl_set_topdown_event_period); - x86_pmu.filter_match = intel_pmu_filter_match; + x86_pmu.filter = intel_pmu_filter; x86_pmu.get_event_constraints = adl_get_event_constraints; x86_pmu.hw_config = adl_hw_config; x86_pmu.limit_period = spr_limit_period; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 446d2833efa7..88e58b6ee73c 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1069,7 +1069,7 @@ static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc) return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs); } -void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in) +void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1177,7 +1177,7 @@ static void pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct perf_event *event, bool add) { - struct pmu *pmu = event->ctx->pmu; + struct pmu *pmu = event->pmu; /* * Make sure we get updated with the first PEBS * event. It will trigger also during removal, but diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 4dbde69c423b..1f21f576ca77 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -515,21 +515,21 @@ static void __intel_pmu_lbr_save(void *ctx) cpuc->last_log_id = ++task_context_opt(ctx)->log_id; } -void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, - struct perf_event_context *next) +void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc, + struct perf_event_pmu_context *next_epc) { void *prev_ctx_data, *next_ctx_data; - swap(prev->task_ctx_data, next->task_ctx_data); + swap(prev_epc->task_ctx_data, next_epc->task_ctx_data); /* - * Architecture specific synchronization makes sense in - * case both prev->task_ctx_data and next->task_ctx_data + * Architecture specific synchronization makes sense in case + * both prev_epc->task_ctx_data and next_epc->task_ctx_data * pointers are allocated. */ - prev_ctx_data = next->task_ctx_data; - next_ctx_data = prev->task_ctx_data; + prev_ctx_data = next_epc->task_ctx_data; + next_ctx_data = prev_epc->task_ctx_data; if (!prev_ctx_data || !next_ctx_data) return; @@ -538,7 +538,7 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, task_context_opt(next_ctx_data)->lbr_callstack_users); } -void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); void *task_ctx; @@ -551,7 +551,7 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) * the task was scheduled out, restore the stack. Otherwise flush * the LBR stack. */ - task_ctx = ctx ? ctx->task_ctx_data : NULL; + task_ctx = pmu_ctx ? 
pmu_ctx->task_ctx_data : NULL; if (task_ctx) { if (sched_in) __intel_pmu_lbr_restore(task_ctx); @@ -587,8 +587,8 @@ void intel_pmu_lbr_add(struct perf_event *event) cpuc->br_sel = event->hw.branch_reg.reg; - if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) - task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++; + if (branch_user_callstack(cpuc->br_sel) && event->pmu_ctx->task_ctx_data) + task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users++; /* * Request pmu::sched_task() callback, which will fire inside the @@ -611,7 +611,7 @@ void intel_pmu_lbr_add(struct perf_event *event) */ if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0) cpuc->lbr_pebs_users++; - perf_sched_cb_inc(event->ctx->pmu); + perf_sched_cb_inc(event->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) intel_pmu_lbr_reset(); } @@ -664,8 +664,8 @@ void intel_pmu_lbr_del(struct perf_event *event) return; if (branch_user_callstack(cpuc->br_sel) && - event->ctx->task_ctx_data) - task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--; + event->pmu_ctx->task_ctx_data) + task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users--; if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) cpuc->lbr_select = 0; @@ -675,7 +675,7 @@ void intel_pmu_lbr_del(struct perf_event *event) cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); WARN_ON_ONCE(cpuc->lbr_pebs_users < 0); - perf_sched_cb_dec(event->ctx->pmu); + perf_sched_cb_dec(event->pmu); } static inline bool vlbr_exclude_host(void) diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index 03bbcc2fa2ff..35936188db01 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -24,7 +24,7 @@ struct p4_event_bind { unsigned int escr_msr[2]; /* ESCR MSR for this event */ unsigned int escr_emask; /* valid ESCR EventMask bits */ unsigned int shared; /* event is shared across threads */ - char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */ + signed char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */ }; struct p4_pebs_bind { diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 82ef87e9a897..42a55794004a 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1263,6 +1263,15 @@ static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages) if (1 << order != nr_pages) goto out; + /* + * Some processors cannot always support single range for more than + * 4KB - refer errata TGL052, ADL037 and RPL017. Future processors might + * also be affected, so for now rather than trying to keep track of + * which ones, just disable it for all. + */ + if (nr_pages > 1) + goto out; + buf->single = true; buf->nr_pages = nr_pages; ret = 0; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 2adeaf4de4df..e278e2e7c051 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -2,6 +2,7 @@ #include <linux/slab.h> #include <linux/pci.h> #include <asm/apicdef.h> +#include <asm/intel-family.h> #include <linux/io-64-nonatomic-lo-hi.h> #include <linux/perf_event.h> @@ -88,12 +89,12 @@ struct intel_uncore_type { * to identify which platform component each PMON block of that type is * supposed to monitor. 
*/ - struct intel_uncore_topology *topology; + struct intel_uncore_topology **topology; /* * Optional callbacks for managing mapping of Uncore units to PMONs */ int (*get_topology)(struct intel_uncore_type *type); - int (*set_mapping)(struct intel_uncore_type *type); + void (*set_mapping)(struct intel_uncore_type *type); void (*cleanup_mapping)(struct intel_uncore_type *type); }; @@ -178,11 +179,26 @@ struct freerunning_counters { unsigned *box_offsets; }; -struct intel_uncore_topology { - u64 configuration; +struct uncore_iio_topology { + int pci_bus_no; int segment; }; +struct uncore_upi_topology { + int die_to; + int pmu_idx_to; + int enabled; +}; + +struct intel_uncore_topology { + int pmu_idx; + union { + void *untyped; + struct uncore_iio_topology *iio; + struct uncore_upi_topology *upi; + }; +}; + struct pci2phy_map { struct list_head list; int segment; diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 1ef4f7861e2e..1f4869227efb 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -1338,6 +1338,7 @@ static void __uncore_imc_init_box(struct intel_uncore_box *box, /* MCHBAR is disabled */ if (!(mch_bar & BIT(0))) { pr_warn("perf uncore: MCHBAR is disabled. Failed to map IMC free-running counters.\n"); + pci_dev_put(pdev); return; } mch_bar &= ~BIT(0); @@ -1352,6 +1353,8 @@ static void __uncore_imc_init_box(struct intel_uncore_box *box, box->io_addr = ioremap(addr, type->mmio_map_size); if (!box->io_addr) pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); + + pci_dev_put(pdev); } static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index ed869443efb2..44c2f879f708 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -445,6 +445,7 @@ #define ICX_UPI_PCI_PMON_CTR0 0x320 #define ICX_UPI_PCI_PMON_BOX_CTL 0x318 #define ICX_UPI_CTL_UMASK_EXT 0xffffff +#define ICX_UBOX_DID 0x3450 /* ICX M3UPI*/ #define ICX_M3UPI_PCI_PMON_CTL0 0xd8 @@ -457,6 +458,7 @@ /* SPR */ #define SPR_RAW_EVENT_MASK_EXT 0xffffff +#define SPR_UBOX_DID 0x3250 /* SPR CHA */ #define SPR_CHA_PMON_CTL_TID_EN (1 << 16) @@ -1372,6 +1374,28 @@ static struct pci_driver snbep_uncore_pci_driver = { #define NODE_ID_MASK 0x7 +/* Each three bits from 0 to 23 of GIDNIDMAP register correspond Node ID. */ +#define GIDNIDMAP(config, id) (((config) >> (3 * (id))) & 0x7) + +static int upi_nodeid_groupid(struct pci_dev *ubox_dev, int nodeid_loc, int idmap_loc, + int *nodeid, int *groupid) +{ + int ret; + + /* get the Node ID of the local register */ + ret = pci_read_config_dword(ubox_dev, nodeid_loc, nodeid); + if (ret) + goto err; + + *nodeid = *nodeid & NODE_ID_MASK; + /* get the Node ID mapping */ + ret = pci_read_config_dword(ubox_dev, idmap_loc, groupid); + if (ret) + goto err; +err: + return ret; +} + /* * build pci bus to socket mapping */ @@ -1397,13 +1421,8 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool * the topology. 
*/ if (nr_node_ids <= 8) { - /* get the Node ID of the local register */ - err = pci_read_config_dword(ubox_dev, nodeid_loc, &config); - if (err) - break; - nodeid = config & NODE_ID_MASK; - /* get the Node ID mapping */ - err = pci_read_config_dword(ubox_dev, idmap_loc, &config); + err = upi_nodeid_groupid(ubox_dev, nodeid_loc, idmap_loc, + &nodeid, &config); if (err) break; @@ -1421,7 +1440,7 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool * to a particular node. */ for (i = 0; i < 8; i++) { - if (nodeid == ((config >> (3 * i)) & 0x7)) { + if (nodeid == GIDNIDMAP(config, i)) { if (topology_max_die_per_package() > 1) die_id = i; else @@ -2891,6 +2910,7 @@ static bool hswep_has_limit_sbox(unsigned int device) return false; pci_read_config_dword(dev, HSWEP_PCU_CAPID4_OFFET, &capid4); + pci_dev_put(dev); if (!hswep_get_chop(capid4)) return true; @@ -3699,10 +3719,16 @@ static struct intel_uncore_ops skx_uncore_iio_ops = { .read_counter = uncore_msr_read_counter, }; -static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die) +static struct intel_uncore_topology *pmu_topology(struct intel_uncore_pmu *pmu, int die) { - return pmu->type->topology[die].configuration >> - (pmu->pmu_idx * BUS_NUM_STRIDE); + int idx; + + for (idx = 0; idx < pmu->type->num_boxes; idx++) { + if (pmu->type->topology[die][idx].pmu_idx == pmu->pmu_idx) + return &pmu->type->topology[die][idx]; + } + + return NULL; } static umode_t @@ -3710,8 +3736,9 @@ pmu_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die, int zero_bus_pmu) { struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj)); + struct intel_uncore_topology *pmut = pmu_topology(pmu, die); - return (!skx_iio_stack(pmu, die) && pmu->pmu_idx != zero_bus_pmu) ? 0 : attr->mode; + return (pmut && !pmut->iio->pci_bus_no && pmu->pmu_idx != zero_bus_pmu) ? 0 : attr->mode; } static umode_t @@ -3727,9 +3754,10 @@ static ssize_t skx_iio_mapping_show(struct device *dev, struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev); struct dev_ext_attribute *ea = to_dev_ext_attribute(attr); long die = (long)ea->var; + struct intel_uncore_topology *pmut = pmu_topology(pmu, die); - return sprintf(buf, "%04x:%02x\n", pmu->type->topology[die].segment, - skx_iio_stack(pmu, die)); + return sprintf(buf, "%04x:%02x\n", pmut ? pmut->iio->segment : 0, + pmut ? 
pmut->iio->pci_bus_no : 0); } static int skx_msr_cpu_bus_read(int cpu, u64 *topology) @@ -3764,18 +3792,79 @@ static int die_to_cpu(int die) return res; } -static int skx_iio_get_topology(struct intel_uncore_type *type) +enum { + IIO_TOPOLOGY_TYPE, + UPI_TOPOLOGY_TYPE, + TOPOLOGY_MAX +}; + +static const size_t topology_size[TOPOLOGY_MAX] = { + sizeof(*((struct intel_uncore_topology *)NULL)->iio), + sizeof(*((struct intel_uncore_topology *)NULL)->upi) +}; + +static int pmu_alloc_topology(struct intel_uncore_type *type, int topology_type) { - int die, ret = -EPERM; + int die, idx; + struct intel_uncore_topology **topology; + + if (!type->num_boxes) + return -EPERM; - type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology), - GFP_KERNEL); - if (!type->topology) - return -ENOMEM; + topology = kcalloc(uncore_max_dies(), sizeof(*topology), GFP_KERNEL); + if (!topology) + goto err; for (die = 0; die < uncore_max_dies(); die++) { - ret = skx_msr_cpu_bus_read(die_to_cpu(die), - &type->topology[die].configuration); + topology[die] = kcalloc(type->num_boxes, sizeof(**topology), GFP_KERNEL); + if (!topology[die]) + goto clear; + for (idx = 0; idx < type->num_boxes; idx++) { + topology[die][idx].untyped = kcalloc(type->num_boxes, + topology_size[topology_type], + GFP_KERNEL); + if (!topology[die][idx].untyped) + goto clear; + } + } + + type->topology = topology; + + return 0; +clear: + for (; die >= 0; die--) { + for (idx = 0; idx < type->num_boxes; idx++) + kfree(topology[die][idx].untyped); + kfree(topology[die]); + } + kfree(topology); +err: + return -ENOMEM; +} + +static void pmu_free_topology(struct intel_uncore_type *type) +{ + int die, idx; + + if (type->topology) { + for (die = 0; die < uncore_max_dies(); die++) { + for (idx = 0; idx < type->num_boxes; idx++) + kfree(type->topology[die][idx].untyped); + kfree(type->topology[die]); + } + kfree(type->topology); + type->topology = NULL; + } +} + +static int skx_pmu_get_topology(struct intel_uncore_type *type, + int (*topology_cb)(struct intel_uncore_type*, int, int, u64)) +{ + int die, ret = -EPERM; + u64 cpu_bus_msr; + + for (die = 0; die < uncore_max_dies(); die++) { + ret = skx_msr_cpu_bus_read(die_to_cpu(die), &cpu_bus_msr); if (ret) break; @@ -3783,15 +3872,33 @@ static int skx_iio_get_topology(struct intel_uncore_type *type) if (ret < 0) break; - type->topology[die].segment = ret; + ret = topology_cb(type, ret, die, cpu_bus_msr); + if (ret) + break; } - if (ret < 0) { - kfree(type->topology); - type->topology = NULL; + return ret; +} + +static int skx_iio_topology_cb(struct intel_uncore_type *type, int segment, + int die, u64 cpu_bus_msr) +{ + int idx; + struct intel_uncore_topology *t; + + for (idx = 0; idx < type->num_boxes; idx++) { + t = &type->topology[die][idx]; + t->pmu_idx = idx; + t->iio->segment = segment; + t->iio->pci_bus_no = (cpu_bus_msr >> (idx * BUS_NUM_STRIDE)) & 0xff; } - return ret; + return 0; +} + +static int skx_iio_get_topology(struct intel_uncore_type *type) +{ + return skx_pmu_get_topology(type, skx_iio_topology_cb); } static struct attribute_group skx_iio_mapping_group = { @@ -3803,8 +3910,25 @@ static const struct attribute_group *skx_iio_attr_update[] = { NULL, }; -static int -pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag) +static void pmu_clear_mapping_attr(const struct attribute_group **groups, + struct attribute_group *ag) +{ + int i; + + for (i = 0; groups[i]; i++) { + if (groups[i] == ag) { + for (i++; groups[i]; i++) + groups[i - 1] = groups[i]; + groups[i - 1] = 
NULL; + break; + } + } +} + +static void +pmu_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag, + ssize_t (*show)(struct device*, struct device_attribute*, char*), + int topology_type) { char buf[64]; int ret; @@ -3812,11 +3936,13 @@ pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag) struct attribute **attrs = NULL; struct dev_ext_attribute *eas = NULL; - ret = type->get_topology(type); + ret = pmu_alloc_topology(type, topology_type); if (ret < 0) goto clear_attr_update; - ret = -ENOMEM; + ret = type->get_topology(type); + if (ret < 0) + goto clear_topology; /* One more for NULL. */ attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL); @@ -3828,20 +3954,20 @@ pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag) goto clear_attrs; for (die = 0; die < uncore_max_dies(); die++) { - sprintf(buf, "die%ld", die); + snprintf(buf, sizeof(buf), "die%ld", die); sysfs_attr_init(&eas[die].attr.attr); eas[die].attr.attr.name = kstrdup(buf, GFP_KERNEL); if (!eas[die].attr.attr.name) goto err; eas[die].attr.attr.mode = 0444; - eas[die].attr.show = skx_iio_mapping_show; + eas[die].attr.show = show; eas[die].attr.store = NULL; eas[die].var = (void *)die; attrs[die] = &eas[die].attr.attr; } ag->attrs = attrs; - return 0; + return; err: for (; die >= 0; die--) kfree(eas[die].attr.attr.name); @@ -3849,14 +3975,13 @@ err: clear_attrs: kfree(attrs); clear_topology: - kfree(type->topology); + pmu_free_topology(type); clear_attr_update: - type->attr_update = NULL; - return ret; + pmu_clear_mapping_attr(type->attr_update, ag); } static void -pmu_iio_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *ag) +pmu_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *ag) { struct attribute **attr = ag->attrs; @@ -3868,17 +3993,23 @@ pmu_iio_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group * kfree(attr_to_ext_attr(*ag->attrs)); kfree(ag->attrs); ag->attrs = NULL; - kfree(type->topology); + pmu_free_topology(type); +} + +static void +pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag) +{ + pmu_set_mapping(type, ag, skx_iio_mapping_show, IIO_TOPOLOGY_TYPE); } -static int skx_iio_set_mapping(struct intel_uncore_type *type) +static void skx_iio_set_mapping(struct intel_uncore_type *type) { - return pmu_iio_set_mapping(type, &skx_iio_mapping_group); + pmu_iio_set_mapping(type, &skx_iio_mapping_group); } static void skx_iio_cleanup_mapping(struct intel_uncore_type *type) { - pmu_iio_cleanup_mapping(type, &skx_iio_mapping_group); + pmu_cleanup_mapping(type, &skx_iio_mapping_group); } static struct intel_uncore_type skx_uncore_iio = { @@ -4139,6 +4270,132 @@ static struct intel_uncore_ops skx_upi_uncore_pci_ops = { .read_counter = snbep_uncore_pci_read_counter, }; +static umode_t +skx_upi_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) +{ + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj)); + + return pmu->type->topology[die][pmu->pmu_idx].upi->enabled ? 
attr->mode : 0; +} + +static ssize_t skx_upi_mapping_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev); + struct dev_ext_attribute *ea = to_dev_ext_attribute(attr); + long die = (long)ea->var; + struct uncore_upi_topology *upi = pmu->type->topology[die][pmu->pmu_idx].upi; + + return sysfs_emit(buf, "upi_%d,die_%d\n", upi->pmu_idx_to, upi->die_to); +} + +#define SKX_UPI_REG_DID 0x2058 +#define SKX_UPI_REGS_ADDR_DEVICE_LINK0 0x0e +#define SKX_UPI_REGS_ADDR_FUNCTION 0x00 + +/* + * UPI Link Parameter 0 + * | Bit | Default | Description + * | 19:16 | 0h | base_nodeid - The NodeID of the sending socket. + * | 12:8 | 00h | sending_port - The processor die port number of the sending port. + */ +#define SKX_KTILP0_OFFSET 0x94 + +/* + * UPI Pcode Status. This register is used by PCode to store the link training status. + * | Bit | Default | Description + * | 4 | 0h | ll_status_valid — Bit indicates the valid training status + * logged from PCode to the BIOS. + */ +#define SKX_KTIPCSTS_OFFSET 0x120 + +static int upi_fill_topology(struct pci_dev *dev, struct intel_uncore_topology *tp, + int pmu_idx) +{ + int ret; + u32 upi_conf; + struct uncore_upi_topology *upi = tp->upi; + + tp->pmu_idx = pmu_idx; + ret = pci_read_config_dword(dev, SKX_KTIPCSTS_OFFSET, &upi_conf); + if (ret) { + ret = pcibios_err_to_errno(ret); + goto err; + } + upi->enabled = (upi_conf >> 4) & 1; + if (upi->enabled) { + ret = pci_read_config_dword(dev, SKX_KTILP0_OFFSET, + &upi_conf); + if (ret) { + ret = pcibios_err_to_errno(ret); + goto err; + } + upi->die_to = (upi_conf >> 16) & 0xf; + upi->pmu_idx_to = (upi_conf >> 8) & 0x1f; + } +err: + return ret; +} + +static int skx_upi_topology_cb(struct intel_uncore_type *type, int segment, + int die, u64 cpu_bus_msr) +{ + int idx, ret; + struct intel_uncore_topology *upi; + unsigned int devfn; + struct pci_dev *dev = NULL; + u8 bus = cpu_bus_msr >> (3 * BUS_NUM_STRIDE); + + for (idx = 0; idx < type->num_boxes; idx++) { + upi = &type->topology[die][idx]; + devfn = PCI_DEVFN(SKX_UPI_REGS_ADDR_DEVICE_LINK0 + idx, + SKX_UPI_REGS_ADDR_FUNCTION); + dev = pci_get_domain_bus_and_slot(segment, bus, devfn); + if (dev) { + ret = upi_fill_topology(dev, upi, idx); + if (ret) + break; + } + } + + pci_dev_put(dev); + return ret; +} + +static int skx_upi_get_topology(struct intel_uncore_type *type) +{ + /* CPX case is not supported */ + if (boot_cpu_data.x86_stepping == 11) + return -EPERM; + + return skx_pmu_get_topology(type, skx_upi_topology_cb); +} + +static struct attribute_group skx_upi_mapping_group = { + .is_visible = skx_upi_mapping_visible, +}; + +static const struct attribute_group *skx_upi_attr_update[] = { + &skx_upi_mapping_group, + NULL +}; + +static void +pmu_upi_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag) +{ + pmu_set_mapping(type, ag, skx_upi_mapping_show, UPI_TOPOLOGY_TYPE); +} + +static void skx_upi_set_mapping(struct intel_uncore_type *type) +{ + pmu_upi_set_mapping(type, &skx_upi_mapping_group); +} + +static void skx_upi_cleanup_mapping(struct intel_uncore_type *type) +{ + pmu_cleanup_mapping(type, &skx_upi_mapping_group); +} + static struct intel_uncore_type skx_uncore_upi = { .name = "upi", .num_counters = 4, @@ -4151,6 +4408,10 @@ static struct intel_uncore_type skx_uncore_upi = { .box_ctl = SKX_UPI_PCI_PMON_BOX_CTL, .ops = &skx_upi_uncore_pci_ops, .format_group = &skx_upi_uncore_format_group, + .attr_update = skx_upi_attr_update, + .get_topology = skx_upi_get_topology, + 
.set_mapping = skx_upi_set_mapping, + .cleanup_mapping = skx_upi_cleanup_mapping, }; static void skx_m2m_uncore_pci_init_box(struct intel_uncore_box *box) @@ -4461,11 +4722,6 @@ static int sad_cfg_iio_topology(struct intel_uncore_type *type, u8 *sad_pmon_map int die, stack_id, ret = -EPERM; struct pci_dev *dev = NULL; - type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology), - GFP_KERNEL); - if (!type->topology) - return -ENOMEM; - while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, SNR_ICX_MESH2IIO_MMAP_DID, dev))) { ret = pci_read_config_dword(dev, SNR_ICX_SAD_CONTROL_CFG, &sad_cfg); if (ret) { @@ -4483,14 +4739,12 @@ static int sad_cfg_iio_topology(struct intel_uncore_type *type, u8 *sad_pmon_map /* Convert stack id from SAD_CONTROL to PMON notation. */ stack_id = sad_pmon_mapping[stack_id]; - ((u8 *)&(type->topology[die].configuration))[stack_id] = dev->bus->number; - type->topology[die].segment = pci_domain_nr(dev->bus); + type->topology[die][stack_id].iio->segment = pci_domain_nr(dev->bus); + type->topology[die][stack_id].pmu_idx = stack_id; + type->topology[die][stack_id].iio->pci_bus_no = dev->bus->number; } - if (ret) { - kfree(type->topology); - type->topology = NULL; - } + pci_dev_put(dev); return ret; } @@ -4519,14 +4773,14 @@ static int snr_iio_get_topology(struct intel_uncore_type *type) return sad_cfg_iio_topology(type, snr_sad_pmon_mapping); } -static int snr_iio_set_mapping(struct intel_uncore_type *type) +static void snr_iio_set_mapping(struct intel_uncore_type *type) { - return pmu_iio_set_mapping(type, &snr_iio_mapping_group); + pmu_iio_set_mapping(type, &snr_iio_mapping_group); } static void snr_iio_cleanup_mapping(struct intel_uncore_type *type) { - pmu_iio_cleanup_mapping(type, &snr_iio_mapping_group); + pmu_cleanup_mapping(type, &snr_iio_mapping_group); } static struct event_constraint snr_uncore_iio_constraints[] = { @@ -4857,6 +5111,8 @@ static int snr_uncore_mmio_map(struct intel_uncore_box *box, addr += box_ctl; + pci_dev_put(pdev); + box->io_addr = ioremap(addr, type->mmio_map_size); if (!box->io_addr) { pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); @@ -5137,14 +5393,19 @@ static int icx_iio_get_topology(struct intel_uncore_type *type) return sad_cfg_iio_topology(type, icx_sad_pmon_mapping); } -static int icx_iio_set_mapping(struct intel_uncore_type *type) +static void icx_iio_set_mapping(struct intel_uncore_type *type) { - return pmu_iio_set_mapping(type, &icx_iio_mapping_group); + /* Detect ICX-D system. This case is not supported */ + if (boot_cpu_data.x86_model == INTEL_FAM6_ICELAKE_D) { + pmu_clear_mapping_attr(type->attr_update, &icx_iio_mapping_group); + return; + } + pmu_iio_set_mapping(type, &icx_iio_mapping_group); } static void icx_iio_cleanup_mapping(struct intel_uncore_type *type) { - pmu_iio_cleanup_mapping(type, &icx_iio_mapping_group); + pmu_cleanup_mapping(type, &icx_iio_mapping_group); } static struct intel_uncore_type icx_uncore_iio = { @@ -5337,6 +5598,76 @@ static const struct attribute_group icx_upi_uncore_format_group = { .attrs = icx_upi_uncore_formats_attr, }; +#define ICX_UPI_REGS_ADDR_DEVICE_LINK0 0x02 +#define ICX_UPI_REGS_ADDR_FUNCTION 0x01 + +static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, int dev_link0) +{ + struct pci_dev *ubox = NULL; + struct pci_dev *dev = NULL; + u32 nid, gid; + int i, idx, ret = -EPERM; + struct intel_uncore_topology *upi; + unsigned int devfn; + + /* GIDNIDMAP method supports machines which have less than 8 sockets. 
*/ + if (uncore_max_dies() > 8) + goto err; + + while ((ubox = pci_get_device(PCI_VENDOR_ID_INTEL, ubox_did, ubox))) { + ret = upi_nodeid_groupid(ubox, SKX_CPUNODEID, SKX_GIDNIDMAP, &nid, &gid); + if (ret) { + ret = pcibios_err_to_errno(ret); + break; + } + + for (i = 0; i < 8; i++) { + if (nid != GIDNIDMAP(gid, i)) + continue; + for (idx = 0; idx < type->num_boxes; idx++) { + upi = &type->topology[nid][idx]; + devfn = PCI_DEVFN(dev_link0 + idx, ICX_UPI_REGS_ADDR_FUNCTION); + dev = pci_get_domain_bus_and_slot(pci_domain_nr(ubox->bus), + ubox->bus->number, + devfn); + if (dev) { + ret = upi_fill_topology(dev, upi, idx); + if (ret) + goto err; + } + } + } + } +err: + pci_dev_put(ubox); + pci_dev_put(dev); + return ret; +} + +static int icx_upi_get_topology(struct intel_uncore_type *type) +{ + return discover_upi_topology(type, ICX_UBOX_DID, ICX_UPI_REGS_ADDR_DEVICE_LINK0); +} + +static struct attribute_group icx_upi_mapping_group = { + .is_visible = skx_upi_mapping_visible, +}; + +static const struct attribute_group *icx_upi_attr_update[] = { + &icx_upi_mapping_group, + NULL +}; + +static void icx_upi_set_mapping(struct intel_uncore_type *type) +{ + pmu_upi_set_mapping(type, &icx_upi_mapping_group); +} + +static void icx_upi_cleanup_mapping(struct intel_uncore_type *type) +{ + pmu_cleanup_mapping(type, &icx_upi_mapping_group); +} + static struct intel_uncore_type icx_uncore_upi = { .name = "upi", .num_counters = 4, @@ -5349,6 +5680,10 @@ static struct intel_uncore_type icx_uncore_upi = { .box_ctl = ICX_UPI_PCI_PMON_BOX_CTL, .ops = &skx_upi_uncore_pci_ops, .format_group = &icx_upi_uncore_format_group, + .attr_update = icx_upi_attr_update, + .get_topology = icx_upi_get_topology, + .set_mapping = icx_upi_set_mapping, + .cleanup_mapping = icx_upi_cleanup_mapping, }; static struct event_constraint icx_uncore_m3upi_constraints[] = { @@ -5780,9 +6115,43 @@ static struct intel_uncore_type spr_uncore_m2m = { .name = "m2m", }; +static struct attribute_group spr_upi_mapping_group = { + .is_visible = skx_upi_mapping_visible, +}; + +static const struct attribute_group *spr_upi_attr_update[] = { + &uncore_alias_group, + &spr_upi_mapping_group, + NULL +}; + +#define SPR_UPI_REGS_ADDR_DEVICE_LINK0 0x01 + +static void spr_upi_set_mapping(struct intel_uncore_type *type) +{ + pmu_upi_set_mapping(type, &spr_upi_mapping_group); +} + +static void spr_upi_cleanup_mapping(struct intel_uncore_type *type) +{ + pmu_cleanup_mapping(type, &spr_upi_mapping_group); +} + +static int spr_upi_get_topology(struct intel_uncore_type *type) +{ + return discover_upi_topology(type, SPR_UBOX_DID, SPR_UPI_REGS_ADDR_DEVICE_LINK0); +} + static struct intel_uncore_type spr_uncore_upi = { - SPR_UNCORE_PCI_COMMON_FORMAT(), + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .event_mask_ext = SPR_RAW_EVENT_MASK_EXT, + .format_group = &spr_uncore_raw_format_group, + .ops = &spr_uncore_pci_ops, .name = "upi", + .attr_update = spr_upi_attr_update, + .get_topology = spr_upi_get_topology, + .set_mapping = spr_upi_set_mapping, + .cleanup_mapping = spr_upi_cleanup_mapping, }; static struct intel_uncore_type spr_uncore_m3upi = { @@ -5986,6 +6355,12 @@ static void uncore_type_customized_copy(struct intel_uncore_type *to_type, to_type->format_group = from_type->format_group; if (from_type->attr_update) to_type->attr_update = from_type->attr_update; + if (from_type->set_mapping) + to_type->set_mapping = from_type->set_mapping; + if (from_type->get_topology) + to_type->get_topology = from_type->get_topology; + if (from_type->cleanup_mapping) + 
to_type->cleanup_mapping = from_type->cleanup_mapping; } static struct intel_uncore_type ** diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 332d2e6d8ae4..0e849f28a5c1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -811,7 +811,7 @@ struct x86_pmu { void (*cpu_dead)(int cpu); void (*check_microcode)(void); - void (*sched_task)(struct perf_event_context *ctx, + void (*sched_task)(struct perf_event_pmu_context *pmu_ctx, bool sched_in); /* @@ -894,12 +894,12 @@ struct x86_pmu { int num_topdown_events; /* - * perf task context (i.e. struct perf_event_context::task_ctx_data) + * perf task context (i.e. struct perf_event_pmu_context::task_ctx_data) * switch helper to bridge calls from perf/core to perf/x86. * See struct pmu::swap_task_ctx() usage for examples; */ - void (*swap_task_ctx)(struct perf_event_context *prev, - struct perf_event_context *next); + void (*swap_task_ctx)(struct perf_event_pmu_context *prev_epc, + struct perf_event_pmu_context *next_epc); /* * AMD bits @@ -925,7 +925,7 @@ struct x86_pmu { int (*aux_output_match) (struct perf_event *event); - int (*filter_match)(struct perf_event *event); + void (*filter)(struct pmu *pmu, int cpu, bool *ret); /* * Hybrid support * @@ -1180,8 +1180,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs); void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, u64 intel_ctrl); -void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu); - extern struct event_constraint emptyconstraint; extern struct event_constraint unconstrained; @@ -1306,7 +1304,7 @@ void amd_pmu_lbr_reset(void); void amd_pmu_lbr_read(void); void amd_pmu_lbr_add(struct perf_event *event); void amd_pmu_lbr_del(struct perf_event *event); -void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); +void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); void amd_pmu_lbr_enable_all(void); void amd_pmu_lbr_disable_all(void); int amd_pmu_lbr_hw_config(struct perf_event *event); @@ -1322,7 +1320,6 @@ void amd_brs_enable_all(void); void amd_brs_disable_all(void); void amd_brs_drain(void); void amd_brs_lopwr_init(void); -void amd_brs_disable_all(void); int amd_brs_hw_config(struct perf_event *event); void amd_brs_reset(void); @@ -1330,7 +1327,7 @@ static inline void amd_pmu_brs_add(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - perf_sched_cb_inc(event->ctx->pmu); + perf_sched_cb_inc(event->pmu); cpuc->lbr_users++; /* * No need to reset BRS because it is reset @@ -1345,10 +1342,10 @@ static inline void amd_pmu_brs_del(struct perf_event *event) cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); - perf_sched_cb_dec(event->ctx->pmu); + perf_sched_cb_dec(event->pmu); } -void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in); +void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); #else static inline int amd_brs_init(void) { @@ -1373,7 +1370,7 @@ static inline void amd_pmu_brs_del(struct perf_event *event) { } -static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +static inline void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { } @@ -1533,7 +1530,7 @@ void intel_pmu_pebs_enable_all(void); void intel_pmu_pebs_disable_all(void); -void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); +void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); void 
intel_pmu_auto_reload_read(struct perf_event *event); @@ -1541,10 +1538,10 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); void intel_ds_init(void); -void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, - struct perf_event_context *next); +void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc, + struct perf_event_pmu_context *next_epc); -void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); +void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); u64 lbr_from_signext_quirk_wr(u64 val); diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index a829492bca4c..52e6e7ed4f78 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -800,13 +800,18 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr), + X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &model_spr), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &model_skl), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 29774126e931..41ef036ebb7b 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -77,7 +77,7 @@ static int hyperv_init_ghcb(void) static int hv_cpu_init(unsigned int cpu) { union hv_vp_assist_msr_contents msr = { 0 }; - struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()]; + struct hv_vp_assist_page **hvp = &hv_vp_assist_page[cpu]; int ret; ret = hv_common_cpu_init(cpu); @@ -87,34 +87,32 @@ static int hv_cpu_init(unsigned int cpu) if (!hv_vp_assist_page) return 0; - if (!*hvp) { - if (hv_root_partition) { - /* - * For root partition we get the hypervisor provided VP assist - * page, instead of allocating a new page. - */ - rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); - *hvp = memremap(msr.pfn << - HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT, - PAGE_SIZE, MEMREMAP_WB); - } else { - /* - * The VP assist page is an "overlay" page (see Hyper-V TLFS's - * Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed - * out to make sure we always write the EOI MSR in - * hv_apic_eoi_write() *after* the EOI optimization is disabled - * in hv_cpu_die(), otherwise a CPU may not be stopped in the - * case of CPU offlining and the VM will hang. - */ + if (hv_root_partition) { + /* + * For root partition we get the hypervisor provided VP assist + * page, instead of allocating a new page. + */ + rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); + *hvp = memremap(msr.pfn << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT, + PAGE_SIZE, MEMREMAP_WB); + } else { + /* + * The VP assist page is an "overlay" page (see Hyper-V TLFS's + * Section 5.2.1 "GPA Overlay Pages"). 
Here it must be zeroed + * out to make sure we always write the EOI MSR in + * hv_apic_eoi_write() *after* the EOI optimization is disabled + * in hv_cpu_die(), otherwise a CPU may not be stopped in the + * case of CPU offlining and the VM will hang. + */ + if (!*hvp) *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); - if (*hvp) - msr.pfn = vmalloc_to_pfn(*hvp); - } - WARN_ON(!(*hvp)); - if (*hvp) { - msr.enable = 1; - wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); - } + if (*hvp) + msr.pfn = vmalloc_to_pfn(*hvp); + + } + if (!WARN_ON(!(*hvp))) { + msr.enable = 1; + wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); } return hyperv_init_ghcb(); @@ -444,7 +442,7 @@ void __init hyperv_init(void) if (hv_root_partition) { struct page *pg; - void *src, *dst; + void *src; /* * For the root partition, the hypervisor will set up its @@ -459,13 +457,13 @@ void __init hyperv_init(void) wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); pg = vmalloc_to_page(hv_hypercall_pg); - dst = kmap_local_page(pg); src = memremap(hypercall_msr.guest_physical_address << PAGE_SHIFT, PAGE_SIZE, MEMREMAP_WB); - BUG_ON(!(src && dst)); - memcpy(dst, src, HV_HYP_PAGE_SIZE); + BUG_ON(!src); + memcpy_to_page(pg, 0, src, HV_HYP_PAGE_SIZE); memunmap(src); - kunmap_local(dst); + + hv_remap_tsc_clocksource(); } else { hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); @@ -537,8 +535,7 @@ common_free: void hyperv_cleanup(void) { union hv_x64_msr_hypercall_contents hypercall_msr; - - unregister_syscore_ops(&hv_syscore_ops); + union hv_reference_tsc_msr tsc_msr; /* Reset our OS id */ wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); @@ -552,12 +549,14 @@ void hyperv_cleanup(void) hv_hypercall_pg = NULL; /* Reset the hypercall page */ - hypercall_msr.as_uint64 = 0; - wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hypercall_msr.as_uint64 = hv_get_register(HV_X64_MSR_HYPERCALL); + hypercall_msr.enable = 0; + hv_set_register(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); /* Reset the TSC page */ - hypercall_msr.as_uint64 = 0; - wrmsrl(HV_X64_MSR_REFERENCE_TSC, hypercall_msr.as_uint64); + tsc_msr.as_uint64 = hv_get_register(HV_X64_MSR_REFERENCE_TSC); + tsc_msr.enable = 0; + hv_set_register(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); } void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die) diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index e481056698de..333556a86b2a 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -3,7 +3,5 @@ # Makefile for the ia32 kernel emulation subsystem. 
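The memcpy_to_page() call in the hyperv_init() hunk above is not spelled out
in this diff; as a rough sketch (paraphrasing the generic highmem helper, so
details such as the bounds check may differ), it packages up the open-coded
kmap/memcpy/kunmap sequence that was removed, plus a cache flush that is a
no-op on x86:

	static inline void memcpy_to_page(struct page *page, size_t offset,
					  const char *from, size_t len)
	{
		char *to = kmap_local_page(page);

		memcpy(to + offset, from, len);
		flush_dcache_page(page);
		kunmap_local(to);
	}

which is why the local 'dst' variable and the explicit kmap/kunmap pair could
be dropped there.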
# -obj-$(CONFIG_IA32_EMULATION) := ia32_signal.o - audit-class-$(CONFIG_AUDIT) := audit.o obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 9542c582d546..7659217f4d49 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -78,8 +78,43 @@ extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); extern void apply_retpolines(s32 *start, s32 *end); extern void apply_returns(s32 *start, s32 *end); extern void apply_ibt_endbr(s32 *start, s32 *end); +extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, + s32 *start_cfi, s32 *end_cfi); struct module; +struct paravirt_patch_site; + +struct callthunk_sites { + s32 *call_start, *call_end; + struct paravirt_patch_site *pv_start, *pv_end; +}; + +#ifdef CONFIG_CALL_THUNKS +extern void callthunks_patch_builtin_calls(void); +extern void callthunks_patch_module_calls(struct callthunk_sites *sites, + struct module *mod); +extern void *callthunks_translate_call_dest(void *dest); +extern bool is_callthunk(void *addr); +extern int x86_call_depth_emit_accounting(u8 **pprog, void *func); +#else +static __always_inline void callthunks_patch_builtin_calls(void) {} +static __always_inline void +callthunks_patch_module_calls(struct callthunk_sites *sites, + struct module *mod) {} +static __always_inline void *callthunks_translate_call_dest(void *dest) +{ + return dest; +} +static __always_inline bool is_callthunk(void *addr) +{ + return false; +} +static __always_inline int x86_call_depth_emit_accounting(u8 **pprog, + void *func) +{ + return 0; +} +#endif #ifdef CONFIG_SMP extern void alternatives_smp_module_add(struct module *mod, char *name, @@ -347,6 +382,7 @@ static inline int alternatives_text_reserved(void *start, void *end) #define old_len 141b-140b #define new_len1 144f-143f #define new_len2 145f-144f +#define new_len3 146f-145f /* * gas compatible max based on the idea from: @@ -354,7 +390,8 @@ static inline int alternatives_text_reserved(void *start, void *end) * * The additional "-" is needed because gas uses a "true" value of -1. 
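 *
 * Unrolling alt_max_2(a, b) under gas semantics, where a true comparison
 * evaluates to -1 (only the arithmetic is shown; the macro itself follows
 * below):
 *
 *	a <  b: (a < b) = -1, so -(-(a < b)) = -(1) = -1 and the mask is all
 *		ones: a ^ ((a ^ b) & -1) = a ^ a ^ b = b, the larger value.
 *	a >= b: (a < b) = 0, the mask is 0 and a ^ 0 = a is returned.
 *
 * alt_max_3(a, b, c) simply nests two of these.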
*/ -#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) +#define alt_max_2(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) +#define alt_max_3(a, b, c) (alt_max_2(alt_max_2(a, b), c)) /* @@ -366,13 +403,36 @@ static inline int alternatives_text_reserved(void *start, void *end) 140: \oldinstr 141: - .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \ - (alt_max_short(new_len1, new_len2) - (old_len)),0x90 + .skip -((alt_max_2(new_len1, new_len2) - (old_len)) > 0) * \ + (alt_max_2(new_len1, new_len2) - (old_len)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f + altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr1 +144: + \newinstr2 +145: + .popsection +.endm + +.macro ALTERNATIVE_3 oldinstr, newinstr1, feature1, newinstr2, feature2, newinstr3, feature3 +140: + \oldinstr +141: + .skip -((alt_max_3(new_len1, new_len2, new_len3) - (old_len)) > 0) * \ + (alt_max_3(new_len1, new_len2, new_len3) - (old_len)),0x90 142: .pushsection .altinstructions,"a" altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f + altinstruction_entry 140b,145f,\feature3,142b-140b,146f-145f .popsection .pushsection .altinstr_replacement,"ax" @@ -381,6 +441,8 @@ static inline int alternatives_text_reserved(void *start, void *end) 144: \newinstr2 145: + \newinstr3 +146: .popsection .endm diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 3415321c8240..3216da7074ba 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -249,7 +249,6 @@ static inline u64 native_x2apic_icr_read(void) extern int x2apic_mode; extern int x2apic_phys; extern void __init x2apic_set_max_apicid(u32 apicid); -extern void __init check_x2apic(void); extern void x2apic_setup(void); static inline int x2apic_enabled(void) { @@ -258,13 +257,13 @@ static inline int x2apic_enabled(void) #define x2apic_supported() (boot_cpu_has(X86_FEATURE_X2APIC)) #else /* !CONFIG_X86_X2APIC */ -static inline void check_x2apic(void) { } static inline void x2apic_setup(void) { } static inline int x2apic_enabled(void) { return 0; } #define x2apic_mode (0) #define x2apic_supported() (0) #endif /* !CONFIG_X86_X2APIC */ +extern void __init check_x2apic(void); struct irq_data; diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h index 86b2e0dcc4bf..ce9685fc78d8 100644 --- a/arch/x86/include/asm/cacheinfo.h +++ b/arch/x86/include/asm/cacheinfo.h @@ -2,7 +2,20 @@ #ifndef _ASM_X86_CACHEINFO_H #define _ASM_X86_CACHEINFO_H +/* Kernel controls MTRR and/or PAT MSRs. */ +extern unsigned int memory_caching_control; +#define CACHE_MTRR 0x01 +#define CACHE_PAT 0x02 + void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu); void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu); +void cache_disable(void); +void cache_enable(void); +void set_cache_aps_delayed_init(bool val); +bool get_cache_aps_delayed_init(void); +void cache_bp_init(void); +void cache_bp_restore(void); +void cache_aps_init(void); + #endif /* _ASM_X86_CACHEINFO_H */ diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 215f5a65790f..6ba80ce9438d 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -7,34 +7,6 @@ * you need to test for the feature in boot_cpu_data. 
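 *
 * For example, a caller of the 64-bit variants on 32-bit kernels would be
 * expected to do something like (X86_FEATURE_CX8 is the CPUID flag for
 * cmpxchg8b; the check itself is only illustrative):
 *
 *	if (!boot_cpu_has(X86_FEATURE_CX8))
 *		return -ENODEV;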
*/ -/* - * CMPXCHG8B only writes to the target if we had the previous - * value in registers, otherwise it acts as a read and gives us the - * "new previous" value. That is why there is a loop. Preloading - * EDX:EAX is a performance optimization: in the common case it means - * we need only one locked operation. - * - * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very - * least an FPU save and/or %cr0.ts manipulation. - * - * cmpxchg8b must be used with the lock prefix here to allow the - * instruction to be executed atomically. We need to have the reader - * side to see the coherent 64bit value. - */ -static inline void set_64bit(volatile u64 *ptr, u64 value) -{ - u32 low = value; - u32 high = value >> 32; - u64 prev = *ptr; - - asm volatile("\n1:\t" - LOCK_PREFIX "cmpxchg8b %0\n\t" - "jnz 1b" - : "=m" (*ptr), "+A" (prev) - : "b" (low), "c" (high) - : "memory"); -} - #ifdef CONFIG_X86_CMPXCHG64 #define arch_cmpxchg64(ptr, o, n) \ ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \ diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 250187ac8248..0d3beb27b7fe 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -2,11 +2,6 @@ #ifndef _ASM_X86_CMPXCHG_64_H #define _ASM_X86_CMPXCHG_64_H -static inline void set_64bit(volatile u64 *ptr, u64 val) -{ - *ptr = val; -} - #define arch_cmpxchg64(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index b472ef76826a..78796b98a544 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -95,5 +95,7 @@ static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1, } extern u64 x86_read_arch_cap_msr(void); +int intel_find_matching_signature(void *mc, unsigned int csig, int cpf); +int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type); #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 75efc4c6f076..462fc34f1317 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -130,10 +130,6 @@ struct cpu_entry_area { }; #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) -#define CPU_ENTRY_AREA_ARRAY_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) - -/* Total size includes the readonly IDT mapping page as well: */ -#define CPU_ENTRY_AREA_TOTAL_SIZE (CPU_ENTRY_AREA_ARRAY_SIZE + PAGE_SIZE) DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 1419c4e04d45..61012476d66e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -304,6 +304,9 @@ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ #define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ #define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ +#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* "" SGX EDECCSSA user leaf function */ +#define X86_FEATURE_CALL_DEPTH (11*32+19) /* "" Call depth tracking for RSB stuffing */ +#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git 
a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index 70b2db18165e..9bee3e7bf973 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -1,13 +1,132 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * CPUID-related helpers/definitions - * - * Derived from arch/x86/kvm/cpuid.c */ #ifndef _ASM_X86_CPUID_H #define _ASM_X86_CPUID_H +#include <asm/string.h> + +struct cpuid_regs { + u32 eax, ebx, ecx, edx; +}; + +enum cpuid_regs_idx { + CPUID_EAX = 0, + CPUID_EBX, + CPUID_ECX, + CPUID_EDX, +}; + +#ifdef CONFIG_X86_32 +extern int have_cpuid_p(void); +#else +static inline int have_cpuid_p(void) +{ + return 1; +} +#endif +static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + asm volatile("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx) + : "memory"); +} + +#define native_cpuid_reg(reg) \ +static inline unsigned int native_cpuid_##reg(unsigned int op) \ +{ \ + unsigned int eax = op, ebx, ecx = 0, edx; \ + \ + native_cpuid(&eax, &ebx, &ecx, &edx); \ + \ + return reg; \ +} + +/* + * Native CPUID functions returning a single datum. + */ +native_cpuid_reg(eax) +native_cpuid_reg(ebx) +native_cpuid_reg(ecx) +native_cpuid_reg(edx) + +#ifdef CONFIG_PARAVIRT_XXL +#include <asm/paravirt.h> +#else +#define __cpuid native_cpuid +#endif + +/* + * Generic CPUID function + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx + * resulting in stale register contents being returned. + */ +static inline void cpuid(unsigned int op, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + __cpuid(eax, ebx, ecx, edx); +} + +/* Some CPUID calls want 'count' to be placed in ecx */ +static inline void cpuid_count(unsigned int op, int count, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); +} + +/* + * CPUID functions returning a single datum + */ +static inline unsigned int cpuid_eax(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return eax; +} + +static inline unsigned int cpuid_ebx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ebx; +} + +static inline unsigned int cpuid_ecx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ecx; +} + +static inline unsigned int cpuid_edx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return edx; +} + static __always_inline bool cpuid_function_is_indexed(u32 function) { switch (function) { @@ -31,4 +150,22 @@ static __always_inline bool cpuid_function_is_indexed(u32 function) return false; } +#define for_each_possible_hypervisor_cpuid_base(function) \ + for (function = 0x40000000; function < 0x40010000; function += 0x100) + +static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) +{ + uint32_t base, eax, signature[3]; + + for_each_possible_hypervisor_cpuid_base(base) { + cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); + + if (!memcmp(sig, signature, 12) && + (leaves == 0 || ((eax - base) >= leaves))) + return base; + } + + return 0; +} + #endif /* _ASM_X86_CPUID_H */ diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 3e204e6140b5..a1168e7b69e5 100644 --- 
a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -3,16 +3,42 @@ #define _ASM_X86_CURRENT_H #include <linux/compiler.h> -#include <asm/percpu.h> #ifndef __ASSEMBLY__ + +#include <linux/cache.h> +#include <asm/percpu.h> + struct task_struct; -DECLARE_PER_CPU(struct task_struct *, current_task); +struct pcpu_hot { + union { + struct { + struct task_struct *current_task; + int preempt_count; + int cpu_number; +#ifdef CONFIG_CALL_DEPTH_TRACKING + u64 call_depth; +#endif + unsigned long top_of_stack; + void *hardirq_stack_ptr; + u16 softirq_pending; +#ifdef CONFIG_X86_64 + bool hardirq_stack_inuse; +#else + void *softirq_stack_ptr; +#endif + }; + u8 pad[64]; + }; +}; +static_assert(sizeof(struct pcpu_hot) == 64); + +DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot); static __always_inline struct task_struct *get_current(void) { - return this_cpu_read_stable(current_task); + return this_cpu_read_stable(pcpu_hot.current_task); } #define current get_current() diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index cfdf307ddc01..b049d950612f 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -2,8 +2,8 @@ #ifndef _ASM_X86_DEBUGREG_H #define _ASM_X86_DEBUGREG_H - #include <linux/bug.h> +#include <linux/percpu.h> #include <uapi/asm/debugreg.h> DECLARE_PER_CPU(unsigned long, cpu_dr7); diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 33d2cd04d254..c44b56f7ffba 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -69,6 +69,12 @@ # define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) #endif +#ifdef CONFIG_CALL_DEPTH_TRACKING +# define DISABLE_CALL_DEPTH_TRACKING 0 +#else +# define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) +#endif + #ifdef CONFIG_INTEL_IOMMU_SVM # define DISABLE_ENQCMD 0 #else @@ -81,6 +87,12 @@ # define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31)) #endif +#ifdef CONFIG_XEN_PV +# define DISABLE_XENPV 0 +#else +# define DISABLE_XENPV (1 << (X86_FEATURE_XENPV & 31)) +#endif + #ifdef CONFIG_INTEL_TDX_GUEST # define DISABLE_TDX_GUEST 0 #else @@ -98,10 +110,11 @@ #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 (DISABLE_PTI) -#define DISABLED_MASK8 (DISABLE_TDX_GUEST) +#define DISABLED_MASK8 (DISABLE_XENPV|DISABLE_TDX_GUEST) #define DISABLED_MASK9 (DISABLE_SGX) #define DISABLED_MASK10 0 -#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET) +#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ + DISABLE_CALL_DEPTH_TRACKING) #define DISABLED_MASK12 0 #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 233ae6986d6f..a63154e049d7 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -178,7 +178,7 @@ struct efi_setup_data { extern u64 efi_setup; #ifdef CONFIG_EFI -extern efi_status_t __efi64_thunk(u32, ...); +extern u64 __efi64_thunk(u32, ...); #define efi64_thunk(...) ({ \ u64 __pad[3]; /* must have space for 3 args on the stack */ \ @@ -228,16 +228,15 @@ static inline bool efi_is_native(void) return efi_is_64bit(); } -#define efi_mixed_mode_cast(attr) \ - __builtin_choose_expr( \ - __builtin_types_compatible_p(u32, __typeof__(attr)), \ - (unsigned long)(attr), (attr)) - #define efi_table_attr(inst, attr) \ - (efi_is_native() \ - ? 
inst->attr \ - : (__typeof__(inst->attr)) \ - efi_mixed_mode_cast(inst->mixed_mode.attr)) + (efi_is_native() ? (inst)->attr \ + : efi_mixed_table_attr((inst), attr)) + +#define efi_mixed_table_attr(inst, attr) \ + (__typeof__(inst->attr)) \ + _Generic(inst->mixed_mode.attr, \ + u32: (unsigned long)(inst->mixed_mode.attr), \ + default: (inst->mixed_mode.attr)) /* * The following macros allow translating arguments if necessary from native to @@ -325,6 +324,17 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi64_argmap_set_memory_space_attributes(phys, size, flags) \ (__efi64_split(phys), __efi64_split(size), __efi64_split(flags)) +/* file protocol */ +#define __efi64_argmap_open(prot, newh, fname, mode, attr) \ + ((prot), efi64_zero_upper(newh), (fname), __efi64_split(mode), \ + __efi64_split(attr)) + +#define __efi64_argmap_set_position(pos) (__efi64_split(pos)) + +/* file system protocol */ +#define __efi64_argmap_open_volume(prot, file) \ + ((prot), efi64_zero_upper(file)) + /* * The macros below handle the plumbing for the argument mapping. To add a * mapping for a specific EFI method, simply define a macro @@ -344,31 +354,27 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi_eat(...) #define __efi_eval(...) __VA_ARGS__ -/* The three macros below handle dispatching via the thunk if needed */ - -#define efi_call_proto(inst, func, ...) \ - (efi_is_native() \ - ? inst->func(inst, ##__VA_ARGS__) \ - : __efi64_thunk_map(inst, func, inst, ##__VA_ARGS__)) +static inline efi_status_t __efi64_widen_efi_status(u64 status) +{ + /* use rotate to move the value of bit #31 into position #63 */ + return ror64(rol32(status, 1), 1); +} -#define efi_bs_call(func, ...) \ - (efi_is_native() \ - ? efi_system_table->boottime->func(__VA_ARGS__) \ - : __efi64_thunk_map(efi_table_attr(efi_system_table, \ - boottime), \ - func, __VA_ARGS__)) +/* The macro below handles dispatching via the thunk if needed */ -#define efi_rt_call(func, ...) \ - (efi_is_native() \ - ? efi_system_table->runtime->func(__VA_ARGS__) \ - : __efi64_thunk_map(efi_table_attr(efi_system_table, \ - runtime), \ - func, __VA_ARGS__)) +#define efi_fn_call(inst, func, ...) \ + (efi_is_native() ? (inst)->func(__VA_ARGS__) \ + : efi_mixed_call((inst), func, ##__VA_ARGS__)) -#define efi_dxe_call(func, ...) \ - (efi_is_native() \ - ? efi_dxe_table->func(__VA_ARGS__) \ - : __efi64_thunk_map(efi_dxe_table, func, __VA_ARGS__)) +#define efi_mixed_call(inst, func, ...) 
\ + _Generic(inst->func(__VA_ARGS__), \ + efi_status_t: \ + __efi64_widen_efi_status( \ + __efi64_thunk_map(inst, func, ##__VA_ARGS__)), \ + u64: ({ BUILD_BUG(); ULONG_MAX; }), \ + default: \ + (__typeof__(inst->func(__VA_ARGS__))) \ + __efi64_thunk_map(inst, func, ##__VA_ARGS__)) #else /* CONFIG_EFI_MIXED */ @@ -400,13 +406,52 @@ static inline void efi_reserve_boot_services(void) #ifdef CONFIG_EFI_FAKE_MEMMAP extern void __init efi_fake_memmap_early(void); +extern void __init efi_fake_memmap(void); #else static inline void efi_fake_memmap_early(void) { } + +static inline void efi_fake_memmap(void) +{ +} #endif +extern int __init efi_memmap_alloc(unsigned int num_entries, + struct efi_memory_map_data *data); +extern void __efi_memmap_free(u64 phys, unsigned long size, + unsigned long flags); +#define __efi_memmap_free __efi_memmap_free + +extern int __init efi_memmap_install(struct efi_memory_map_data *data); +extern int __init efi_memmap_split_count(efi_memory_desc_t *md, + struct range *range); +extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap, + void *buf, struct efi_mem_range *mem); + #define arch_ima_efi_boot_mode \ ({ extern struct boot_params boot_params; boot_params.secure_boot; }) +#ifdef CONFIG_EFI_RUNTIME_MAP +int efi_get_runtime_map_size(void); +int efi_get_runtime_map_desc_size(void); +int efi_runtime_map_copy(void *buf, size_t bufsz); +#else +static inline int efi_get_runtime_map_size(void) +{ + return 0; +} + +static inline int efi_get_runtime_map_desc_size(void) +{ + return 0; +} + +static inline int efi_runtime_map_copy(void *buf, size_t bufsz) +{ + return 0; +} + +#endif + #endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index cb0ff1055ab1..18fd06f7936a 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -152,10 +152,6 @@ do { \ (elf_check_arch_ia32(x) || \ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)) -#if __USER32_DS != __USER_DS -# error "The following code assumes __USER32_DS == __USER_DS" -#endif - static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) { @@ -226,7 +222,6 @@ do { \ /* I'm not sure if we can use '-' here */ #define ELF_PLATFORM ("x86_64") extern void set_personality_64bit(void); -extern unsigned int sysctl_vsyscall32; extern int force_personality32; #endif /* !CONFIG_X86_32 */ diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 674ed46d3ced..117903881fe4 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -24,8 +24,8 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) /* * For !SMAP hardware we patch out CLAC on entry. 
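	 *
	 * cpu_feature_enabled(), used just below, additionally folds to a
	 * compile-time 0 for features carried in asm/disabled-features.h,
	 * which is what lets the explicit IS_ENABLED(CONFIG_64BIT) test go
	 * away: with the new DISABLE_XENPV bit, the X86_FEATURE_XENPV check
	 * vanishes on !CONFIG_XEN_PV builds.  Roughly (an approximation of
	 * the helper, which lives in asm/cpufeature.h, not in this diff):
	 *
	 *	#define cpu_feature_enabled(bit)				\
	 *		(__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) \
	 *		 ? 0 : static_cpu_has(bit))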
*/ - if (boot_cpu_has(X86_FEATURE_SMAP) || - (IS_ENABLED(CONFIG_64BIT) && boot_cpu_has(X86_FEATURE_XENPV))) + if (cpu_feature_enabled(X86_FEATURE_SMAP) || + cpu_feature_enabled(X86_FEATURE_XENPV)) mask |= X86_EFLAGS_AC; WARN_ON_ONCE(flags & mask); diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index e1c9df9102a5..611fa41711af 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -13,16 +13,9 @@ #ifdef CONFIG_X86_64 # include <uapi/asm/sigcontext.h> # include <asm/user32.h> -struct ksignal; -int ia32_setup_rt_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs); #else # define user_i387_ia32_struct user_i387_struct # define user32_fxsr_struct user_fxsr_struct -# define ia32_setup_frame __setup_frame -# define ia32_setup_rt_frame __setup_rt_frame #endif extern void convert_from_fxsr(struct user_i387_ia32_struct *env, diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 908d99b127d3..5061ac98ffa1 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -34,19 +34,6 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) return addr; } -/* - * When a ftrace registered caller is tracing a function that is - * also set by a register_ftrace_direct() call, it needs to be - * differentiated in the ftrace_caller trampoline. To do this, we - * place the direct caller in the ORIG_AX part of pt_regs. This - * tells the ftrace_caller that there's a direct caller. - */ -static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) -{ - /* Emulate a call */ - regs->orig_ax = addr; -} - #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS struct ftrace_regs { struct pt_regs regs; @@ -61,9 +48,25 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs) return &fregs->regs; } -#define ftrace_instruction_pointer_set(fregs, _ip) \ +#define ftrace_regs_set_instruction_pointer(fregs, _ip) \ do { (fregs)->regs.ip = (_ip); } while (0) +#define ftrace_regs_get_instruction_pointer(fregs) \ + ((fregs)->regs.ip) + +#define ftrace_regs_get_argument(fregs, n) \ + regs_get_kernel_argument(&(fregs)->regs, n) +#define ftrace_regs_get_stack_pointer(fregs) \ + kernel_stack_pointer(&(fregs)->regs) +#define ftrace_regs_return_value(fregs) \ + regs_return_value(&(fregs)->regs) +#define ftrace_regs_set_return_value(fregs, ret) \ + regs_set_return_value(&(fregs)->regs, ret) +#define ftrace_override_function_with_return(fregs) \ + override_function_with_return(&(fregs)->regs) +#define ftrace_regs_query_register_offset(name) \ + regs_query_register_offset(name) + struct ftrace_ops; #define ftrace_graph_func ftrace_graph_func void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, @@ -72,6 +75,24 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, #define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR #endif +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* + * When a ftrace registered caller is tracing a function that is + * also set by a register_ftrace_direct() call, it needs to be + * differentiated in the ftrace_caller trampoline. To do this, we + * place the direct caller in the ORIG_AX part of pt_regs. This + * tells the ftrace_caller that there's a direct caller. 
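 *
 * With the fregs-based wrapper below, a hypothetical ftrace_ops callback
 * that wants to divert to a direct trampoline would look roughly like this
 * (my_tramp is a made-up trampoline, shown only to illustrate the call):
 *
 *	static void my_ops_func(unsigned long ip, unsigned long parent_ip,
 *				struct ftrace_ops *ops, struct ftrace_regs *fregs)
 *	{
 *		arch_ftrace_set_direct_caller(fregs, (unsigned long)my_tramp);
 *	}
 *
 * rather than poking at &fregs->regs directly.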
+ */ +static inline void +__arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) +{ + /* Emulate a call */ + regs->orig_ax = addr; +} +#define arch_ftrace_set_direct_caller(fregs, addr) \ + __arch_ftrace_set_direct_caller(&(fregs)->regs, addr) +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + #ifdef CONFIG_DYNAMIC_FTRACE struct dyn_arch_ftrace { diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 275e7fd20310..66837b8c67f1 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -3,9 +3,9 @@ #define _ASM_X86_HARDIRQ_H #include <linux/threads.h> +#include <asm/current.h> typedef struct { - u16 __softirq_pending; #if IS_ENABLED(CONFIG_KVM_INTEL) u8 kvm_cpu_l1tf_flush_l1d; #endif @@ -60,6 +60,7 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu); extern u64 arch_irq_stat(void); #define arch_irq_stat arch_irq_stat +#define local_softirq_pending_ref pcpu_hot.softirq_pending #if IS_ENABLED(CONFIG_KVM_INTEL) static inline void kvm_set_cpu_l1tf_flush_l1d(void) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 617332dd64ac..1e2fe398b66d 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -379,11 +379,20 @@ struct hv_nested_enlightenments_control { struct hv_vp_assist_page { __u32 apic_assist; __u32 reserved1; - __u64 vtl_control[3]; + __u32 vtl_entry_reason; + __u32 vtl_reserved; + __u64 vtl_ret_x64rax; + __u64 vtl_ret_x64rcx; struct hv_nested_enlightenments_control nested_control; __u8 enlighten_vmentry; __u8 reserved2[7]; __u64 current_nested_vmcs; + __u8 synthetic_time_unhalted_timer_expired; + __u8 reserved3[7]; + __u8 virtualization_fault_information[40]; + __u8 reserved4[8]; + __u8 intercept_message[256]; + __u8 vtl_ret_actions[256]; } __packed; struct hv_enlightened_vmcs { diff --git a/arch/x86/include/asm/hyperv_timer.h b/arch/x86/include/asm/hyperv_timer.h new file mode 100644 index 000000000000..388fa81b8f38 --- /dev/null +++ b/arch/x86/include/asm/hyperv_timer.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_HYPERV_TIMER_H +#define _ASM_X86_HYPERV_TIMER_H + +#include <asm/msr.h> + +#define hv_get_raw_timer() rdtsc_ordered() + +#endif diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index f07faa61c7f3..54368a43abf6 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -32,16 +32,16 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs, bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE], int buf_size); -enum mmio_type { - MMIO_DECODE_FAILED, - MMIO_WRITE, - MMIO_WRITE_IMM, - MMIO_READ, - MMIO_READ_ZERO_EXTEND, - MMIO_READ_SIGN_EXTEND, - MMIO_MOVS, +enum insn_mmio_type { + INSN_MMIO_DECODE_FAILED, + INSN_MMIO_WRITE, + INSN_MMIO_WRITE_IMM, + INSN_MMIO_READ, + INSN_MMIO_READ_ZERO_EXTEND, + INSN_MMIO_READ_SIGN_EXTEND, + INSN_MMIO_MOVS, }; -enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes); +enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes); #endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 7cc49432187f..7a2ed154a5e1 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -44,10 +44,6 @@ extern int irq_remapping_reenable(int); extern int irq_remap_enable_fault_handling(void); extern void panic_if_irq_remap(const char *msg); -/* Create PCI 
MSI/MSIx irqdomain, use @parent as the parent irqdomain. */ -extern struct irq_domain * -arch_create_remap_msi_irq_domain(struct irq_domain *par, const char *n, int id); - /* Get parent irqdomain for interrupt remapping irqdomain */ static inline struct irq_domain *arch_get_ir_parent_domain(void) { diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 147cb8fdda92..798183867d78 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -116,7 +116,7 @@ ASM_CALL_ARG2 #define call_on_irqstack(func, asm_call, argconstr...) \ - call_on_stack(__this_cpu_read(hardirq_stack_ptr), \ + call_on_stack(__this_cpu_read(pcpu_hot.hardirq_stack_ptr), \ func, asm_call, argconstr) /* Macros to assert type correctness for run_*_on_irqstack macros */ @@ -135,7 +135,7 @@ * User mode entry and interrupt on the irq stack do not \ * switch stacks. If from user mode the task stack is empty. \ */ \ - if (user_mode(regs) || __this_cpu_read(hardirq_stack_inuse)) { \ + if (user_mode(regs) || __this_cpu_read(pcpu_hot.hardirq_stack_inuse)) { \ irq_enter_rcu(); \ func(c_args); \ irq_exit_rcu(); \ @@ -146,9 +146,9 @@ * places. Invoke the stack switch macro with the call \ * sequence which matches the above direct invocation. \ */ \ - __this_cpu_write(hardirq_stack_inuse, true); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \ call_on_irqstack(func, asm_call, constr); \ - __this_cpu_write(hardirq_stack_inuse, false); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \ } \ } @@ -212,9 +212,9 @@ */ #define do_softirq_own_stack() \ { \ - __this_cpu_write(hardirq_stack_inuse, true); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \ call_on_irqstack(__do_softirq, ASM_CALL_ARG0); \ - __this_cpu_write(hardirq_stack_inuse, false); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \ } #endif diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index 125c23b7bad3..30c325c235c0 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -7,9 +7,7 @@ #ifdef CONFIG_X86_LOCAL_APIC enum { - /* Allocate contiguous CPU vectors */ - X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1, - X86_IRQ_ALLOC_LEGACY = 0x2, + X86_IRQ_ALLOC_LEGACY = 0x1, }; extern int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec); diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h index 13e70da38bed..de75306b932e 100644 --- a/arch/x86/include/asm/kasan.h +++ b/arch/x86/include/asm/kasan.h @@ -28,9 +28,12 @@ #ifdef CONFIG_KASAN void __init kasan_early_init(void); void __init kasan_init(void); +void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid); #else static inline void kasan_early_init(void) { } static inline void kasan_init(void) { } +static inline void kasan_populate_shadow_for_vaddr(void *va, size_t size, + int nid) { } #endif #endif diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 84f43caef9b7..8dc345cc6318 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -14,6 +14,7 @@ BUILD_BUG_ON(1) * to make a definition optional, but in this case the default will * be __static_call_return0. 
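 *
 * As a rough sketch of the plumbing (the DECLARE/DEFINE side lives in
 * kvm_host.h and x86.c, not here): each KVM_X86_OP(name) below backs a
 * static_call(kvm_x86_name)(...) site, so e.g.
 *
 *	KVM_X86_OP(hardware_enable)  ->  static_call(kvm_x86_hardware_enable)()
 *
 * and a hook declared with KVM_X86_OP_OPTIONAL_RET0 that the vendor module
 * leaves NULL resolves to __static_call_return0 instead.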
*/ +KVM_X86_OP(check_processor_compatibility) KVM_X86_OP(hardware_enable) KVM_X86_OP(hardware_disable) KVM_X86_OP(hardware_unsetup) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7ca854714ccd..4d2bc08794e4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1123,6 +1123,7 @@ struct msr_bitmap_range { /* Xen emulation context */ struct kvm_xen { + struct mutex xen_lock; u32 xen_version; bool long_mode; bool runstate_update_flag; @@ -1535,6 +1536,8 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) struct kvm_x86_ops { const char *name; + int (*check_processor_compatibility)(void); + int (*hardware_enable)(void); void (*hardware_disable)(void); void (*hardware_unsetup)(void); @@ -1748,9 +1751,6 @@ struct kvm_x86_nested_ops { }; struct kvm_x86_init_ops { - int (*cpu_has_kvm_support)(void); - int (*disabled_by_bios)(void); - int (*check_processor_compatibility)(void); int (*hardware_setup)(void); unsigned int (*handle_intel_pt_intr)(void); @@ -1777,6 +1777,9 @@ extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include <asm/kvm-x86-ops.h> +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops); +void kvm_x86_vendor_exit(void); + #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index f484d656d34e..dd9b8118f784 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -12,13 +12,26 @@ #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #endif /* CONFIG_X86_32 */ -#ifdef __ASSEMBLY__ - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16) -#define __ALIGN .p2align 4, 0x90 +#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x90; #define __ALIGN_STR __stringify(__ALIGN) + +#if defined(CONFIG_CALL_PADDING) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#define FUNCTION_PADDING .skip CONFIG_FUNCTION_ALIGNMENT, 0x90; +#else +#define FUNCTION_PADDING +#endif + +#if (CONFIG_FUNCTION_ALIGNMENT > 8) && !defined(__DISABLE_EXPORTS) && !defined(BULID_VDSO) +# define __FUNC_ALIGN __ALIGN; FUNCTION_PADDING +#else +# define __FUNC_ALIGN __ALIGN #endif +#define ASM_FUNC_ALIGN __stringify(__FUNC_ALIGN) +#define SYM_F_ALIGN __FUNC_ALIGN + +#ifdef __ASSEMBLY__ + #if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define RET jmp __x86_return_thunk #else /* CONFIG_RETPOLINE */ @@ -43,11 +56,45 @@ #endif /* __ASSEMBLY__ */ +/* + * Depending on -fpatchable-function-entry=N,N usage (CONFIG_CALL_PADDING) the + * CFI symbol layout changes. + * + * Without CALL_THUNKS: + * + * .align FUNCTION_ALIGNMENT + * __cfi_##name: + * .skip FUNCTION_PADDING, 0x90 + * .byte 0xb8 + * .long __kcfi_typeid_##name + * name: + * + * With CALL_THUNKS: + * + * .align FUNCTION_ALIGNMENT + * __cfi_##name: + * .byte 0xb8 + * .long __kcfi_typeid_##name + * .skip FUNCTION_PADDING, 0x90 + * name: + * + * In both cases the whole thing is FUNCTION_ALIGNMENT aligned and sized. 
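 *
 * As a concrete example, assuming 16-byte function alignment: the movl is
 * 5 bytes (0xb8 plus the 4-byte type hash), so the pad works out to
 * 16 - 5 = 11 bytes of 0x90, and __cfi_##name plus its padding fill exactly
 * one 16-byte slot before 'name' starts on the next alignment boundary.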
+ */ + +#ifdef CONFIG_CALL_PADDING +#define CFI_PRE_PADDING +#define CFI_POST_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90; +#else +#define CFI_PRE_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90; +#define CFI_POST_PADDING +#endif + #define __CFI_TYPE(name) \ SYM_START(__cfi_##name, SYM_L_LOCAL, SYM_A_NONE) \ - .fill 11, 1, 0x90 ASM_NL \ + CFI_PRE_PADDING \ .byte 0xb8 ASM_NL \ .long __kcfi_typeid_##name ASM_NL \ + CFI_POST_PADDING \ SYM_FUNC_END(__cfi_##name) /* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */ @@ -57,7 +104,7 @@ /* SYM_FUNC_START -- use for global functions */ #define SYM_FUNC_START(name) \ - SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \ + SYM_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \ ENDBR /* SYM_FUNC_START_NOALIGN -- use for global functions, w/o alignment */ @@ -67,7 +114,7 @@ /* SYM_FUNC_START_LOCAL -- use for local functions */ #define SYM_FUNC_START_LOCAL(name) \ - SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) \ + SYM_START(name, SYM_L_LOCAL, SYM_F_ALIGN) \ ENDBR /* SYM_FUNC_START_LOCAL_NOALIGN -- use for local functions, w/o alignment */ @@ -77,7 +124,7 @@ /* SYM_FUNC_START_WEAK -- use for weak functions */ #define SYM_FUNC_START_WEAK(name) \ - SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN) \ + SYM_START(name, SYM_L_WEAK, SYM_F_ALIGN) \ ENDBR /* SYM_FUNC_START_WEAK_NOALIGN -- use for weak functions, w/o alignment */ diff --git a/arch/x86/include/asm/memtype.h b/arch/x86/include/asm/memtype.h index 9ca760e430b9..113b2fa51849 100644 --- a/arch/x86/include/asm/memtype.h +++ b/arch/x86/include/asm/memtype.h @@ -6,9 +6,8 @@ #include <asm/pgtable_types.h> extern bool pat_enabled(void); -extern void pat_disable(const char *reason); -extern void pat_init(void); -extern void init_cache_modes(void); +extern void pat_bp_init(void); +extern void pat_cpu_init(void); extern int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 74ecc2bd6cd0..d5a58bde091c 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -33,8 +33,7 @@ enum ucode_state { }; struct microcode_ops { - enum ucode_state (*request_microcode_fw) (int cpu, struct device *, - bool refresh_fw); + enum ucode_state (*request_microcode_fw) (int cpu, struct device *); void (*microcode_fini_cpu) (int cpu); @@ -50,7 +49,6 @@ struct microcode_ops { struct ucode_cpu_info { struct cpu_signature cpu_sig; - int valid; void *mc; }; extern struct ucode_cpu_info ucode_cpu_info[]; diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 4c92cea7e4b5..f1fa979e05bf 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -14,7 +14,8 @@ struct microcode_header_intel { unsigned int pf; unsigned int datasize; unsigned int totalsize; - unsigned int reserved[3]; + unsigned int metasize; + unsigned int reserved[2]; }; struct microcode_intel { @@ -41,6 +42,8 @@ struct extended_sigtable { #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) #define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) #define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) +#define MC_HEADER_TYPE_MICROCODE 1 +#define MC_HEADER_TYPE_IFS 2 #define get_totalsize(mc) \ (((struct microcode_intel *)mc)->hdr.datasize ? 
\ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 61f0c206bff0..6d502f3efb0f 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -19,8 +19,6 @@ typedef int (*hyperv_fill_flush_list_func)( struct hv_guest_mapping_flush_list *flush, void *data); -#define hv_get_raw_timer() rdtsc_ordered() - void hyperv_vector_handler(struct pt_regs *regs); #if IS_ENABLED(CONFIG_HYPERV) diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h index d71c7e8b738d..935c6d470341 100644 --- a/arch/x86/include/asm/msi.h +++ b/arch/x86/include/asm/msi.h @@ -62,4 +62,10 @@ typedef struct x86_msi_addr_hi { struct msi_msg; u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid); +#define X86_VECTOR_MSI_FLAGS_SUPPORTED \ + (MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX | MSI_FLAG_PCI_MSIX_ALLOC_DYN) + +#define X86_VECTOR_MSI_FLAGS_REQUIRED \ + (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS) + #endif /* _ASM_X86_MSI_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 10ac52705892..37ff47552bcb 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -4,12 +4,7 @@ #include <linux/bits.h> -/* - * CPU model specific register (MSR) numbers. - * - * Do not add new entries to this file unless the definitions are shared - * between multiple compilation units. - */ +/* CPU model specific register (MSR) numbers. */ /* x86-64 specific MSRs */ #define MSR_EFER 0xc0000080 /* extended feature register */ @@ -535,6 +530,11 @@ #define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 + +#define MSR_AMD64_DE_CFG 0xc0011029 +#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1 +#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE BIT_ULL(MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT) + #define MSR_AMD64_BU_CFG2 0xc001102a #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 @@ -640,9 +640,6 @@ #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL #define FAM10H_MMIO_CONF_BASE_SHIFT 20 #define MSR_FAM10H_NODE_ID 0xc001100c -#define MSR_F10H_DECFG 0xc0011029 -#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 -#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT) /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a @@ -796,6 +793,7 @@ #define ENERGY_PERF_BIAS_PERFORMANCE 0 #define ENERGY_PERF_BIAS_BALANCE_PERFORMANCE 4 #define ENERGY_PERF_BIAS_NORMAL 6 +#define ENERGY_PERF_BIAS_NORMAL_POWERSAVE 7 #define ENERGY_PERF_BIAS_BALANCE_POWERSAVE 8 #define ENERGY_PERF_BIAS_POWERSAVE 15 @@ -1050,6 +1048,20 @@ #define VMX_BASIC_MEM_TYPE_WB 6LLU #define VMX_BASIC_INOUT 0x0040000000000000LLU +/* Resctrl MSRs: */ +/* - Intel: */ +#define MSR_IA32_L3_QOS_CFG 0xc81 +#define MSR_IA32_L2_QOS_CFG 0xc82 +#define MSR_IA32_QM_EVTSEL 0xc8d +#define MSR_IA32_QM_CTR 0xc8e +#define MSR_IA32_PQR_ASSOC 0xc8f +#define MSR_IA32_L3_CBM_BASE 0xc90 +#define MSR_IA32_L2_CBM_BASE 0xd10 +#define MSR_IA32_MBA_THRTL_BASE 0xd50 + +/* - AMD: */ +#define MSR_IA32_MBA_BW_BASE 0xc0000200 + /* MSR_IA32_VMX_MISC bits */ #define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index 76d726074c16..f0eeaf6e5f5f 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -25,13 +25,12 @@ #include <uapi/asm/mtrr.h> -void mtrr_bp_init(void); - /* * The following functions are for use by other drivers that cannot 
use * arch_phys_wc_add and arch_phys_wc_del. */ # ifdef CONFIG_MTRR +void mtrr_bp_init(void); extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform); extern void mtrr_save_fixed_ranges(void *); extern void mtrr_save_state(void); @@ -42,12 +41,12 @@ extern int mtrr_add_page(unsigned long base, unsigned long size, extern int mtrr_del(int reg, unsigned long base, unsigned long size); extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); -extern void mtrr_ap_init(void); -extern void set_mtrr_aps_delayed_init(void); -extern void mtrr_aps_init(void); extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); +void mtrr_disable(void); +void mtrr_enable(void); +void mtrr_generic_set_state(void); # else static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform) { @@ -83,10 +82,11 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } -#define mtrr_ap_init() do {} while (0) -#define set_mtrr_aps_delayed_init() do {} while (0) -#define mtrr_aps_init() do {} while (0) +#define mtrr_bp_init() do {} while (0) #define mtrr_bp_restore() do {} while (0) +#define mtrr_disable() do {} while (0) +#define mtrr_enable() do {} while (0) +#define mtrr_generic_set_state() do {} while (0) # endif #ifdef CONFIG_COMPAT diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index c936ce9f0c47..771b0a2b7a34 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -12,8 +12,104 @@ #include <asm/msr-index.h> #include <asm/unwind_hints.h> #include <asm/percpu.h> +#include <asm/current.h> -#define RETPOLINE_THUNK_SIZE 32 +/* + * Call depth tracking for Intel SKL CPUs to address the RSB underflow + * issue in software. + * + * The tracking does not use a counter. It uses uses arithmetic shift + * right on call entry and logical shift left on return. + * + * The depth tracking variable is initialized to 0x8000.... when the call + * depth is zero. The arithmetic shift right sign extends the MSB and + * saturates after the 12th call. The shift count is 5 for both directions + * so the tracking covers 12 nested calls. + * + * Call + * 0: 0x8000000000000000 0x0000000000000000 + * 1: 0xfc00000000000000 0xf000000000000000 + * ... + * 11: 0xfffffffffffffff8 0xfffffffffffffc00 + * 12: 0xffffffffffffffff 0xffffffffffffffe0 + * + * After a return buffer fill the depth is credited 12 calls before the + * next stuffing has to take place. + * + * There is a inaccuracy for situations like this: + * + * 10 calls + * 5 returns + * 3 calls + * 4 returns + * 3 calls + * .... + * + * The shift count might cause this to be off by one in either direction, + * but there is still a cushion vs. the RSB depth. The algorithm does not + * claim to be perfect and it can be speculated around by the CPU, but it + * is considered that it obfuscates the problem enough to make exploitation + * extremly difficult. 
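 *
 * In C terms the per-call update and the post-stuff credit amount to the
 * following (a sketch of the arithmetic only; the real updates are the
 * sar/mov sequences defined below):
 *
 *	u64 depth = 0x8000000000000000ULL;	// depth == 0
 *
 *	depth = (u64)((s64)depth >> 5);		// one call -> 0xfc00000000000000
 *						// ~12 calls -> sticks at ~0ULL
 *
 *	depth = ~0ULL;				// RSB stuffed/filled: credit
 *						// the full 12 calls again
 *
 * The arithmetic (sign-extending) shift is what makes the value saturate
 * instead of wrapping back to "shallow".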
+ */ +#define RET_DEPTH_SHIFT 5 +#define RSB_RET_STUFF_LOOPS 16 +#define RET_DEPTH_INIT 0x8000000000000000ULL +#define RET_DEPTH_INIT_FROM_CALL 0xfc00000000000000ULL +#define RET_DEPTH_CREDIT 0xffffffffffffffffULL + +#ifdef CONFIG_CALL_THUNKS_DEBUG +# define CALL_THUNKS_DEBUG_INC_CALLS \ + incq %gs:__x86_call_count; +# define CALL_THUNKS_DEBUG_INC_RETS \ + incq %gs:__x86_ret_count; +# define CALL_THUNKS_DEBUG_INC_STUFFS \ + incq %gs:__x86_stuffs_count; +# define CALL_THUNKS_DEBUG_INC_CTXSW \ + incq %gs:__x86_ctxsw_count; +#else +# define CALL_THUNKS_DEBUG_INC_CALLS +# define CALL_THUNKS_DEBUG_INC_RETS +# define CALL_THUNKS_DEBUG_INC_STUFFS +# define CALL_THUNKS_DEBUG_INC_CTXSW +#endif + +#if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) + +#include <asm/asm-offsets.h> + +#define CREDIT_CALL_DEPTH \ + movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define ASM_CREDIT_CALL_DEPTH \ + movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define RESET_CALL_DEPTH \ + mov $0x80, %rax; \ + shl $56, %rax; \ + movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define RESET_CALL_DEPTH_FROM_CALL \ + mov $0xfc, %rax; \ + shl $56, %rax; \ + movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ + CALL_THUNKS_DEBUG_INC_CALLS + +#define INCREMENT_CALL_DEPTH \ + sarq $5, %gs:pcpu_hot + X86_call_depth; \ + CALL_THUNKS_DEBUG_INC_CALLS + +#define ASM_INCREMENT_CALL_DEPTH \ + sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ + CALL_THUNKS_DEBUG_INC_CALLS + +#else +#define CREDIT_CALL_DEPTH +#define ASM_CREDIT_CALL_DEPTH +#define RESET_CALL_DEPTH +#define INCREMENT_CALL_DEPTH +#define ASM_INCREMENT_CALL_DEPTH +#define RESET_CALL_DEPTH_FROM_CALL +#endif /* * Fill the CPU return stack buffer. @@ -32,6 +128,7 @@ * from C via asm(".include <asm/nospec-branch.h>") but let's not go there. */ +#define RETPOLINE_THUNK_SIZE 32 #define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ /* @@ -60,7 +157,9 @@ dec reg; \ jnz 771b; \ /* barrier for jnz misprediction */ \ - lfence; + lfence; \ + ASM_CREDIT_CALL_DEPTH \ + CALL_THUNKS_DEBUG_INC_CTXSW #else /* * i386 doesn't unconditionally have LFENCE, as such it can't @@ -185,11 +284,32 @@ * where we have a stack but before any RET instruction. 
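 *
 * A detail on the RESET_CALL_DEPTH* sequences used by the UNTRAIN_RET
 * variants below: the constants are built with a mov+shl pair (presumably
 * to keep the patched-in code short compared with a 10-byte movabs), and
 * they evaluate to the init values defined earlier in this file:
 *
 *	0x80ULL << 56 == 0x8000000000000000	== RET_DEPTH_INIT
 *	0xfcULL << 56 == 0xfc00000000000000	== RET_DEPTH_INIT_FROM_CALL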
*/ .macro UNTRAIN_RET -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ + defined(CONFIG_CALL_DEPTH_TRACKING) ANNOTATE_UNRET_END - ALTERNATIVE_2 "", \ - CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_ENTRY_IBPB + ALTERNATIVE_3 "", \ + CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ + "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ + __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH +#endif +.endm + +.macro UNTRAIN_RET_FROM_CALL +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ + defined(CONFIG_CALL_DEPTH_TRACKING) + ANNOTATE_UNRET_END + ALTERNATIVE_3 "", \ + CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ + "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ + __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH +#endif +.endm + + +.macro CALL_DEPTH_ACCOUNT +#ifdef CONFIG_CALL_DEPTH_TRACKING + ALTERNATIVE "", \ + __stringify(ASM_INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH #endif .endm @@ -203,11 +323,45 @@ typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; extern retpoline_thunk_t __x86_indirect_thunk_array[]; +extern retpoline_thunk_t __x86_indirect_call_thunk_array[]; +extern retpoline_thunk_t __x86_indirect_jump_thunk_array[]; extern void __x86_return_thunk(void); extern void zen_untrain_ret(void); extern void entry_ibpb(void); +#ifdef CONFIG_CALL_THUNKS +extern void (*x86_return_thunk)(void); +#else +#define x86_return_thunk (&__x86_return_thunk) +#endif + +#ifdef CONFIG_CALL_DEPTH_TRACKING +extern void __x86_return_skl(void); + +static inline void x86_set_skl_return_thunk(void) +{ + x86_return_thunk = &__x86_return_skl; +} + +#define CALL_DEPTH_ACCOUNT \ + ALTERNATIVE("", \ + __stringify(INCREMENT_CALL_DEPTH), \ + X86_FEATURE_CALL_DEPTH) + +#ifdef CONFIG_CALL_THUNKS_DEBUG +DECLARE_PER_CPU(u64, __x86_call_count); +DECLARE_PER_CPU(u64, __x86_ret_count); +DECLARE_PER_CPU(u64, __x86_stuffs_count); +DECLARE_PER_CPU(u64, __x86_ctxsw_count); +#endif +#else +static inline void x86_set_skl_return_thunk(void) {} + +#define CALL_DEPTH_ACCOUNT "" + +#endif + #ifdef CONFIG_RETPOLINE #define GEN(reg) \ @@ -215,6 +369,16 @@ extern void entry_ibpb(void); #include <asm/GEN-for-each-reg.h> #undef GEN +#define GEN(reg) \ + extern retpoline_thunk_t __x86_indirect_call_thunk_ ## reg; +#include <asm/GEN-for-each-reg.h> +#undef GEN + +#define GEN(reg) \ + extern retpoline_thunk_t __x86_indirect_jump_thunk_ ## reg; +#include <asm/GEN-for-each-reg.h> +#undef GEN + #ifdef CONFIG_X86_64 /* @@ -321,7 +485,7 @@ static inline void indirect_branch_prediction_barrier(void) /* The Intel SPEC CTRL MSR base value cache */ extern u64 x86_spec_ctrl_base; DECLARE_PER_CPU(u64, x86_spec_ctrl_current); -extern void write_spec_ctrl_current(u64 val, bool force); +extern void update_spec_ctrl_cond(u64 val); extern u64 spec_ctrl_current(void); /* diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index a506a411474d..86bd4311daf8 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -11,20 +11,14 @@ #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) -#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) -#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) - -#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) -#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) - #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) -/* Cast *PAGE_MASK to a signed type so that it is sign-extended if +/* Cast P*D_MASK to a 
signed type so that it is sign-extended if virtual addresses are 32-bits but physical addresses are larger (ie, 32-bit PAE). */ #define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK) -#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_PAGE_MASK) & __PHYSICAL_MASK) -#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_PAGE_MASK) & __PHYSICAL_MASK) +#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_MASK) & __PHYSICAL_MASK) +#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_MASK) & __PHYSICAL_MASK) #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 2a0b8dd4ec33..73e9522db7c1 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -4,13 +4,13 @@ /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here. */ +#include <asm/paravirt_types.h> + #ifdef CONFIG_PARAVIRT #include <asm/pgtable_types.h> #include <asm/asm.h> #include <asm/nospec-branch.h> -#include <asm/paravirt_types.h> - #ifndef __ASSEMBLY__ #include <linux/bug.h> #include <linux/types.h> @@ -665,6 +665,7 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); asm(".pushsection " section ", \"ax\";" \ ".globl " PV_THUNK_NAME(func) ";" \ ".type " PV_THUNK_NAME(func) ", @function;" \ + ASM_FUNC_ALIGN \ PV_THUNK_NAME(func) ":" \ ASM_ENDBR \ FRAME_BEGIN \ @@ -730,6 +731,18 @@ static __always_inline unsigned long arch_local_irq_save(void) #undef PVOP_VCALL4 #undef PVOP_CALL4 +#define DEFINE_PARAVIRT_ASM(func, instr, sec) \ + asm (".pushsection " #sec ", \"ax\"\n" \ + ".global " #func "\n\t" \ + ".type " #func ", @function\n\t" \ + ASM_FUNC_ALIGN "\n" \ + #func ":\n\t" \ + ASM_ENDBR \ + instr "\n\t" \ + ASM_RET \ + ".size " #func ", . - " #func "\n\t" \ + ".popsection") + extern void default_banner(void); #else /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index f3d601574730..8c1da419260f 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -2,36 +2,23 @@ #ifndef _ASM_X86_PARAVIRT_TYPES_H #define _ASM_X86_PARAVIRT_TYPES_H -/* Bitmask of what can be clobbered: usually at least eax. */ -#define CLBR_EAX (1 << 0) -#define CLBR_ECX (1 << 1) -#define CLBR_EDX (1 << 2) -#define CLBR_EDI (1 << 3) - -#ifdef CONFIG_X86_32 -/* CLBR_ANY should match all regs platform has. For i386, that's just it */ -#define CLBR_ANY ((1 << 4) - 1) - -#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) -#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) -#else -#define CLBR_RAX CLBR_EAX -#define CLBR_RCX CLBR_ECX -#define CLBR_RDX CLBR_EDX -#define CLBR_RDI CLBR_EDI -#define CLBR_RSI (1 << 4) -#define CLBR_R8 (1 << 5) -#define CLBR_R9 (1 << 6) -#define CLBR_R10 (1 << 7) -#define CLBR_R11 (1 << 8) - -#define CLBR_ANY ((1 << 9) - 1) +#ifndef __ASSEMBLY__ +/* These all sit in the .parainstructions section to tell us what to patch. 
*/ +struct paravirt_patch_site { + u8 *instr; /* original instructions */ + u8 type; /* type of this instruction */ + u8 len; /* length of original instruction */ +}; -#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ - CLBR_RCX | CLBR_R8 | CLBR_R9) -#define CLBR_RET_REG (CLBR_RAX) +/* Lazy mode for batching updates / context switch */ +enum paravirt_lazy_mode { + PARAVIRT_LAZY_NONE, + PARAVIRT_LAZY_MMU, + PARAVIRT_LAZY_CPU, +}; +#endif -#endif /* X86_64 */ +#ifdef CONFIG_PARAVIRT #ifndef __ASSEMBLY__ @@ -279,27 +266,23 @@ extern struct paravirt_patch_template pv_ops; #define paravirt_type(op) \ [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ [paravirt_opptr] "m" (pv_ops.op) -#define paravirt_clobber(clobber) \ - [paravirt_clobber] "i" (clobber) - /* * Generate some code, and mark it as patchable by the * apply_paravirt() alternate instruction patcher. */ -#define _paravirt_alt(insn_string, type, clobber) \ +#define _paravirt_alt(insn_string, type) \ "771:\n\t" insn_string "\n" "772:\n" \ ".pushsection .parainstructions,\"a\"\n" \ _ASM_ALIGN "\n" \ _ASM_PTR " 771b\n" \ " .byte " type "\n" \ " .byte 772b-771b\n" \ - " .short " clobber "\n" \ _ASM_ALIGN "\n" \ ".popsection\n" /* Generate patchable code, with the default asm parameters. */ #define paravirt_alt(insn_string) \ - _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") + _paravirt_alt(insn_string, "%c[paravirt_typenum]") /* Simple instruction patching code. */ #define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" @@ -451,20 +434,19 @@ int paravirt_disable_iospace(void); }) -#define ____PVOP_CALL(ret, op, clbr, call_clbr, extra_clbr, ...) \ +#define ____PVOP_CALL(ret, op, call_clbr, extra_clbr, ...) \ ({ \ PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ asm volatile(paravirt_alt(PARAVIRT_CALL) \ : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ - paravirt_clobber(clbr), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ }) -#define ____PVOP_ALT_CALL(ret, op, alt, cond, clbr, call_clbr, \ +#define ____PVOP_ALT_CALL(ret, op, alt, cond, call_clbr, \ extra_clbr, ...) \ ({ \ PVOP_CALL_ARGS; \ @@ -473,45 +455,44 @@ int paravirt_disable_iospace(void); alt, cond) \ : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ - paravirt_clobber(clbr), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ }) #define __PVOP_CALL(rettype, op, ...) \ - ____PVOP_CALL(PVOP_RETVAL(rettype), op, CLBR_ANY, \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op, \ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, ##__VA_ARGS__) #define __PVOP_ALT_CALL(rettype, op, alt, cond, ...) \ - ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, CLBR_ANY,\ + ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, \ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, \ ##__VA_ARGS__) #define __PVOP_CALLEESAVE(rettype, op, ...) \ - ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, CLBR_RET_REG, \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, \ PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) #define __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, ...) \ ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op.func, alt, cond, \ - CLBR_RET_REG, PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) + PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) #define __PVOP_VCALL(op, ...) \ - (void)____PVOP_CALL(, op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ + (void)____PVOP_CALL(, op, PVOP_VCALL_CLOBBERS, \ VEXTRA_CLOBBERS, ##__VA_ARGS__) #define __PVOP_ALT_VCALL(op, alt, cond, ...) 
\ - (void)____PVOP_ALT_CALL(, op, alt, cond, CLBR_ANY, \ + (void)____PVOP_ALT_CALL(, op, alt, cond, \ PVOP_VCALL_CLOBBERS, VEXTRA_CLOBBERS, \ ##__VA_ARGS__) #define __PVOP_VCALLEESAVE(op, ...) \ - (void)____PVOP_CALL(, op.func, CLBR_RET_REG, \ + (void)____PVOP_CALL(, op.func, \ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__) #define __PVOP_ALT_VCALLEESAVE(op, alt, cond, ...) \ - (void)____PVOP_ALT_CALL(, op.func, alt, cond, CLBR_RET_REG, \ + (void)____PVOP_ALT_CALL(, op.func, alt, cond, \ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__) @@ -571,13 +552,6 @@ int paravirt_disable_iospace(void); __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -/* Lazy mode for batching updates / context switch */ -enum paravirt_lazy_mode { - PARAVIRT_LAZY_NONE, - PARAVIRT_LAZY_MMU, - PARAVIRT_LAZY_CPU, -}; - enum paravirt_lazy_mode paravirt_get_lazy_mode(void); void paravirt_start_context_switch(struct task_struct *prev); void paravirt_end_context_switch(struct task_struct *next); @@ -593,16 +567,9 @@ unsigned long paravirt_ret0(void); #define paravirt_nop ((void *)_paravirt_nop) -/* These all sit in the .parainstructions section to tell us what to patch. */ -struct paravirt_patch_site { - u8 *instr; /* original instructions */ - u8 type; /* type of this instruction */ - u8 len; /* length of original instruction */ -}; - extern struct paravirt_patch_site __parainstructions[], __parainstructions_end[]; #endif /* __ASSEMBLY__ */ - +#endif /* CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_TYPES_H */ diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 736793d65bcb..b40c462b4af3 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -21,7 +21,7 @@ struct pci_sysdata { #ifdef CONFIG_X86_64 void *iommu; /* IOMMU private data */ #endif -#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN +#ifdef CONFIG_PCI_MSI void *fwnode; /* IRQ domain for MSI assignment */ #endif #if IS_ENABLED(CONFIG_VMD) @@ -52,7 +52,7 @@ static inline int pci_proc_domain(struct pci_bus *bus) } #endif -#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN +#ifdef CONFIG_PCI_MSI static inline void *_pci_root_bus_fwnode(struct pci_bus *bus) { return to_pci_sysdata(bus)->fwnode; @@ -92,6 +92,7 @@ void pcibios_scan_root(int bus); struct irq_routing_table *pcibios_get_irq_routing_table(void); int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); +bool pci_dev_has_default_msi_parent_domain(struct pci_dev *dev); #define HAVE_PCI_MMAP #define arch_can_pci_mmap_wc() pat_enabled() diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 28421a887209..967b135fa2c0 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -2,8 +2,6 @@ #ifndef _ASM_X86_PGTABLE_3LEVEL_H #define _ASM_X86_PGTABLE_3LEVEL_H -#include <asm/atomic64_32.h> - /* * Intel Physical Address Extension (PAE) Mode - three-level page * tables on PPro+ CPUs. @@ -21,7 +19,15 @@ pr_err("%s:%d: bad pgd %p(%016Lx)\n", \ __FILE__, __LINE__, &(e), pgd_val(e)) -/* Rules for using set_pte: the pte being assigned *must* be +#define pxx_xchg64(_pxx, _ptr, _val) ({ \ + _pxx##val_t *_p = (_pxx##val_t *)_ptr; \ + _pxx##val_t _o = *_p; \ + do { } while (!try_cmpxchg64(_p, &_o, (_val))); \ + native_make_##_pxx(_o); \ +}) + +/* + * Rules for using set_pte: the pte being assigned *must* be * either not present or in a state where the hardware will * not attempt to update the pte. 
In places where this is * not possible, use pte_get_and_clear to obtain the old pte @@ -29,75 +35,19 @@ */ static inline void native_set_pte(pte_t *ptep, pte_t pte) { - ptep->pte_high = pte.pte_high; + WRITE_ONCE(ptep->pte_high, pte.pte_high); smp_wmb(); - ptep->pte_low = pte.pte_low; -} - -#define pmd_read_atomic pmd_read_atomic -/* - * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with - * a "*pmdp" dereference done by GCC. Problem is, in certain places - * where pte_offset_map_lock() is called, concurrent page faults are - * allowed, if the mmap_lock is hold for reading. An example is mincore - * vs page faults vs MADV_DONTNEED. On the page fault side - * pmd_populate() rightfully does a set_64bit(), but if we're reading the - * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen - * because GCC will not read the 64-bit value of the pmd atomically. - * - * To fix this all places running pte_offset_map_lock() while holding the - * mmap_lock in read mode, shall read the pmdp pointer using this - * function to know if the pmd is null or not, and in turn to know if - * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd - * operations. - * - * Without THP if the mmap_lock is held for reading, the pmd can only - * transition from null to not null while pmd_read_atomic() runs. So - * we can always return atomic pmd values with this function. - * - * With THP if the mmap_lock is held for reading, the pmd can become - * trans_huge or none or point to a pte (and in turn become "stable") - * at any time under pmd_read_atomic(). We could read it truly - * atomically here with an atomic64_read() for the THP enabled case (and - * it would be a whole lot simpler), but to avoid using cmpxchg8b we - * only return an atomic pmdval if the low part of the pmdval is later - * found to be stable (i.e. pointing to a pte). We are also returning a - * 'none' (zero) pmdval if the low part of the pmd is zero. - * - * In some cases the high and low part of the pmdval returned may not be - * consistent if THP is enabled (the low part may point to previously - * mapped hugepage, while the high part may point to a more recently - * mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only - * needs the low part of the pmd to be read atomically to decide if the - * pmd is unstable or not, with the only exception when the low part - * of the pmd is zero, in which case we return a 'none' pmd. - */ -static inline pmd_t pmd_read_atomic(pmd_t *pmdp) -{ - pmdval_t ret; - u32 *tmp = (u32 *)pmdp; - - ret = (pmdval_t) (*tmp); - if (ret) { - /* - * If the low part is null, we must not read the high part - * or we can end up with a partial pmd. 
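The pxx_xchg64() helper added earlier in this hunk builds an unconditional 64-bit exchange out of try_cmpxchg64(). A minimal user-space sketch of the same retry pattern, using the GCC/Clang atomic builtins instead of the kernel primitives:

#include <stdint.h>
#include <stdio.h>

/* Unconditional exchange built from compare-and-swap: keep retrying until the
 * CAS succeeds; 'old' is refreshed with the observed value on each failure,
 * just as try_cmpxchg64() updates its "old" argument. */
static uint64_t xchg64_via_cmpxchg(uint64_t *p, uint64_t new_val)
{
	uint64_t old = *p;

	while (!__atomic_compare_exchange_n(p, &old, new_val, 0,
					    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
		;
	return old;
}

int main(void)
{
	uint64_t pte = 0x8000000012345067ULL;	/* made-up PTE value */
	uint64_t prev = xchg64_via_cmpxchg(&pte, 0);

	printf("old=%#llx now=%#llx\n",
	       (unsigned long long)prev, (unsigned long long)pte);
	return 0;
}
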
- */ - smp_rmb(); - ret |= ((pmdval_t)*(tmp + 1)) << 32; - } - - return (pmd_t) { ret }; + WRITE_ONCE(ptep->pte_low, pte.pte_low); } static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) { - set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); + pxx_xchg64(pte, ptep, native_pte_val(pte)); } static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) { - set_64bit((unsigned long long *)(pmdp), native_pmd_val(pmd)); + pxx_xchg64(pmd, pmdp, native_pmd_val(pmd)); } static inline void native_set_pud(pud_t *pudp, pud_t pud) @@ -105,7 +55,7 @@ static inline void native_set_pud(pud_t *pudp, pud_t pud) #ifdef CONFIG_PAGE_TABLE_ISOLATION pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd); #endif - set_64bit((unsigned long long *)(pudp), native_pud_val(pud)); + pxx_xchg64(pud, pudp, native_pud_val(pud)); } /* @@ -116,17 +66,16 @@ static inline void native_set_pud(pud_t *pudp, pud_t pud) static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - ptep->pte_low = 0; + WRITE_ONCE(ptep->pte_low, 0); smp_wmb(); - ptep->pte_high = 0; + WRITE_ONCE(ptep->pte_high, 0); } -static inline void native_pmd_clear(pmd_t *pmd) +static inline void native_pmd_clear(pmd_t *pmdp) { - u32 *tmp = (u32 *)pmd; - *tmp = 0; + WRITE_ONCE(pmdp->pmd_low, 0); smp_wmb(); - *(tmp + 1) = 0; + WRITE_ONCE(pmdp->pmd_high, 0); } static inline void native_pud_clear(pud_t *pudp) @@ -149,41 +98,26 @@ static inline void pud_clear(pud_t *pudp) */ } + #ifdef CONFIG_SMP static inline pte_t native_ptep_get_and_clear(pte_t *ptep) { - pte_t res; - - res.pte = (pteval_t)arch_atomic64_xchg((atomic64_t *)ptep, 0); - - return res; + return pxx_xchg64(pte, ptep, 0ULL); } -#else -#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) -#endif -union split_pmd { - struct { - u32 pmd_low; - u32 pmd_high; - }; - pmd_t pmd; -}; - -#ifdef CONFIG_SMP static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) { - union split_pmd res, *orig = (union split_pmd *)pmdp; - - /* xchg acts as a barrier before setting of the high bits */ - res.pmd_low = xchg(&orig->pmd_low, 0); - res.pmd_high = orig->pmd_high; - orig->pmd_high = 0; + return pxx_xchg64(pmd, pmdp, 0ULL); +} - return res.pmd; +static inline pud_t native_pudp_get_and_clear(pud_t *pudp) +{ + return pxx_xchg64(pud, pudp, 0ULL); } #else +#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) +#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp) #endif #ifndef pmdp_establish @@ -199,53 +133,16 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, * anybody. 
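The WRITE_ONCE() conversions above keep the long-standing PAE ordering rule: the low word carries the present bit, so it is written last when installing a PTE and cleared first when tearing one down. A user-space sketch of that discipline; the fence is a stand-in for smp_wmb() and the names are illustrative:

#include <stdint.h>
#include <stdio.h>

struct split_pte {
	uint32_t pte_low;	/* contains the present bit */
	uint32_t pte_high;
};

#define write_once_u32(x, v)	__atomic_store_n(&(x), (v), __ATOMIC_RELAXED)
#define wmb_model()		__atomic_thread_fence(__ATOMIC_RELEASE)

static void model_set_pte(struct split_pte *p, uint64_t val)
{
	write_once_u32(p->pte_high, (uint32_t)(val >> 32));
	wmb_model();					/* high half visible first */
	write_once_u32(p->pte_low, (uint32_t)val);	/* present bit appears last */
}

static void model_pte_clear(struct split_pte *p)
{
	write_once_u32(p->pte_low, 0);			/* present bit vanishes first */
	wmb_model();
	write_once_u32(p->pte_high, 0);
}

int main(void)
{
	struct split_pte pte = { 0, 0 };

	model_set_pte(&pte, 0x100000067ULL);
	printf("low=%#x high=%#x\n", pte.pte_low, pte.pte_high);
	model_pte_clear(&pte);
	printf("low=%#x high=%#x\n", pte.pte_low, pte.pte_high);
	return 0;
}
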
*/ if (!(pmd_val(pmd) & _PAGE_PRESENT)) { - union split_pmd old, new, *ptr; - - ptr = (union split_pmd *)pmdp; - - new.pmd = pmd; - /* xchg acts as a barrier before setting of the high bits */ - old.pmd_low = xchg(&ptr->pmd_low, new.pmd_low); - old.pmd_high = ptr->pmd_high; - ptr->pmd_high = new.pmd_high; - return old.pmd; - } - - do { - old = *pmdp; - } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd); - - return old; -} -#endif - -#ifdef CONFIG_SMP -union split_pud { - struct { - u32 pud_low; - u32 pud_high; - }; - pud_t pud; -}; - -static inline pud_t native_pudp_get_and_clear(pud_t *pudp) -{ - union split_pud res, *orig = (union split_pud *)pudp; + old.pmd_low = xchg(&pmdp->pmd_low, pmd.pmd_low); + old.pmd_high = READ_ONCE(pmdp->pmd_high); + WRITE_ONCE(pmdp->pmd_high, pmd.pmd_high); -#ifdef CONFIG_PAGE_TABLE_ISOLATION - pti_set_user_pgtbl(&pudp->p4d.pgd, __pgd(0)); -#endif - - /* xchg acts as a barrier before setting of the high bits */ - res.pud_low = xchg(&orig->pud_low, 0); - res.pud_high = orig->pud_high; - orig->pud_high = 0; + return old; + } - return res.pud; + return pxx_xchg64(pmd, pmdp, pmd.pmd); } -#else -#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp) #endif /* Encode and de-code a swap entry */ diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 56baf43befb4..80911349519e 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -18,6 +18,13 @@ typedef union { }; pteval_t pte; } pte_t; + +typedef union { + struct { + unsigned long pmd_low, pmd_high; + }; + pmdval_t pmd; +} pmd_t; #endif /* !__ASSEMBLY__ */ #define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI)) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5059799bebe3..0564edd24ffb 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -139,6 +139,7 @@ static inline int pmd_dirty(pmd_t pmd) return pmd_flags(pmd) & _PAGE_DIRTY; } +#define pmd_young pmd_young static inline int pmd_young(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_ACCESSED; @@ -291,7 +292,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pte_uffd_wp(pte_t pte) { - return pte_flags(pte) & _PAGE_UFFD_WP; + bool wp = pte_flags(pte) & _PAGE_UFFD_WP; + +#ifdef CONFIG_DEBUG_VM + /* + * Having write bit for wr-protect-marked present ptes is fatal, + * because it means the uffd-wp bit will be ignored and write will + * just go through. + * + * Use any chance of pgtable walking to verify this (e.g., when + * page swapped out or being migrated for all purposes). It means + * something is already wrong. Tell the admin even before the + * process crashes. We also nail it with wrong pgtable setup. 
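The new 32-bit pmd_t union above lets native_pmd_clear() and pmdp_establish() address the two halves by name instead of casting through u32 pointers or the old split_pmd unions. A standalone model of the aliasing (uint32_t stands in for the 32-bit unsigned long; little-endian layout assumed):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t pmdval_t;

typedef union {
	struct {
		uint32_t pmd_low, pmd_high;
	};
	pmdval_t pmd;
} pmd_t;

int main(void)
{
	pmd_t v = { .pmd = 0x00000001fe5670e7ULL };

	/* Both views alias the same storage. */
	printf("pmd=%#llx low=%#x high=%#x\n",
	       (unsigned long long)v.pmd, v.pmd_low, v.pmd_high);
	return 0;
}
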
+ */ + WARN_ON_ONCE(wp && pte_write(pte)); +#endif + + return wp; } static inline pte_t pte_mkuffd_wp(pte_t pte) @@ -1438,6 +1455,14 @@ static inline bool arch_has_hw_pte_young(void) return true; } +#ifdef CONFIG_XEN_PV +#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young +static inline bool arch_has_hw_nonleaf_pmd_young(void) +{ + return !cpu_feature_enabled(X86_FEATURE_XENPV); +} +#endif + #ifdef CONFIG_PAGE_TABLE_CHECK static inline bool pte_user_accessible_page(pte_t pte) { diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 7c9c968a42ef..7d4ad8907297 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -48,15 +48,6 @@ do { \ #endif /* !__ASSEMBLY__ */ /* - * kern_addr_valid() is (1) for FLATMEM and (0) for SPARSEMEM - */ -#ifdef CONFIG_FLATMEM -#define kern_addr_valid(addr) (1) -#else -#define kern_addr_valid(kaddr) (0) -#endif - -/* * This is used to calculate the .brk reservation for initial pagetables. * Enough space is reserved to allocate pagetables sufficient to cover all * of LOWMEM_PAGES, which is an upper bound on the size of the direct map of diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e479491da8d5..7929327abe00 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -240,7 +240,6 @@ static inline void native_pgd_clear(pgd_t *pgd) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) #define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val }) -extern int kern_addr_valid(unsigned long addr); extern void cleanup_highmap(void); #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 04f36063ad54..38bf837e3554 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -19,6 +19,7 @@ typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; typedef struct { pteval_t pte; } pte_t; +typedef struct { pmdval_t pmd; } pmd_t; #ifdef CONFIG_X86_5LEVEL extern unsigned int __pgtable_l5_enabled; diff --git a/arch/x86/include/asm/pgtable_areas.h b/arch/x86/include/asm/pgtable_areas.h index d34cce1b995c..4f056fb88174 100644 --- a/arch/x86/include/asm/pgtable_areas.h +++ b/arch/x86/include/asm/pgtable_areas.h @@ -11,6 +11,12 @@ #define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT) -#define CPU_ENTRY_AREA_MAP_SIZE (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_ARRAY_SIZE - CPU_ENTRY_AREA_BASE) +#ifdef CONFIG_X86_32 +#define CPU_ENTRY_AREA_MAP_SIZE (CPU_ENTRY_AREA_PER_CPU + \ + (CPU_ENTRY_AREA_SIZE * NR_CPUS) - \ + CPU_ENTRY_AREA_BASE) +#else +#define CPU_ENTRY_AREA_MAP_SIZE P4D_SIZE +#endif #endif /* _ASM_X86_PGTABLE_AREAS_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index aa174fed3a71..447d4bee25c4 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -361,11 +361,9 @@ static inline pudval_t native_pud_val(pud_t pud) #endif #if CONFIG_PGTABLE_LEVELS > 2 -typedef struct { pmdval_t pmd; } pmd_t; - static inline pmd_t native_make_pmd(pmdval_t val) { - return (pmd_t) { val }; + return (pmd_t) { .pmd = val }; } static inline pmdval_t native_pmd_val(pmd_t pmd) diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 5f6daea1ee24..2d13f25b1bd8 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -4,11 +4,11 @@ #include <asm/rmwcc.h> #include 
<asm/percpu.h> +#include <asm/current.h> + #include <linux/thread_info.h> #include <linux/static_call_types.h> -DECLARE_PER_CPU(int, __preempt_count); - /* We use the MSB mostly because its available */ #define PREEMPT_NEED_RESCHED 0x80000000 @@ -24,7 +24,7 @@ DECLARE_PER_CPU(int, __preempt_count); */ static __always_inline int preempt_count(void) { - return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; + return raw_cpu_read_4(pcpu_hot.preempt_count) & ~PREEMPT_NEED_RESCHED; } static __always_inline void preempt_count_set(int pc) @@ -32,10 +32,10 @@ static __always_inline void preempt_count_set(int pc) int old, new; do { - old = raw_cpu_read_4(__preempt_count); + old = raw_cpu_read_4(pcpu_hot.preempt_count); new = (old & PREEMPT_NEED_RESCHED) | (pc & ~PREEMPT_NEED_RESCHED); - } while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old); + } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old); } /* @@ -44,7 +44,7 @@ static __always_inline void preempt_count_set(int pc) #define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ - per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \ + per_cpu(pcpu_hot.preempt_count, (cpu)) = PREEMPT_DISABLED; \ } while (0) /* @@ -58,17 +58,17 @@ static __always_inline void preempt_count_set(int pc) static __always_inline void set_preempt_need_resched(void) { - raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); + raw_cpu_and_4(pcpu_hot.preempt_count, ~PREEMPT_NEED_RESCHED); } static __always_inline void clear_preempt_need_resched(void) { - raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); + raw_cpu_or_4(pcpu_hot.preempt_count, PREEMPT_NEED_RESCHED); } static __always_inline bool test_preempt_need_resched(void) { - return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); + return !(raw_cpu_read_4(pcpu_hot.preempt_count) & PREEMPT_NEED_RESCHED); } /* @@ -77,12 +77,12 @@ static __always_inline bool test_preempt_need_resched(void) static __always_inline void __preempt_count_add(int val) { - raw_cpu_add_4(__preempt_count, val); + raw_cpu_add_4(pcpu_hot.preempt_count, val); } static __always_inline void __preempt_count_sub(int val) { - raw_cpu_add_4(__preempt_count, -val); + raw_cpu_add_4(pcpu_hot.preempt_count, -val); } /* @@ -92,7 +92,8 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { - return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); + return GEN_UNARY_RMWcc("decl", pcpu_hot.preempt_count, e, + __percpu_arg([var])); } /* @@ -100,7 +101,7 @@ static __always_inline bool __preempt_count_dec_and_test(void) */ static __always_inline bool should_resched(int preempt_offset) { - return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); + return unlikely(raw_cpu_read_4(pcpu_hot.preempt_count) == preempt_offset); } #ifdef CONFIG_PREEMPTION diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 02c2cbda4a74..a7f3d9100adb 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -35,7 +35,7 @@ */ #ifdef CONFIG_X86_64 /* Mask off the address space ID and SME encryption bits. 
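The accessors above preserve the inverted encoding of PREEMPT_NEED_RESCHED: the flag is cleared when a reschedule is wanted, so a single decrement-and-test-for-zero covers both "count dropped to zero" and "resched pending". A small user-space model of that encoding (illustrative only):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PREEMPT_NEED_RESCHED 0x80000000u

static uint32_t raw = PREEMPT_NEED_RESCHED;	/* count 0, no resched wanted */

static uint32_t preempt_count(void)
{
	return raw & ~PREEMPT_NEED_RESCHED;
}

static void set_preempt_need_resched(void)
{
	raw &= ~PREEMPT_NEED_RESCHED;		/* inverted: clear the bit */
}

static void preempt_disable_model(void)
{
	raw++;
}

/* Mirrors __preempt_count_dec_and_test(): true means "call the scheduler". */
static bool preempt_enable_would_resched(void)
{
	return --raw == 0;
}

int main(void)
{
	unsigned int count;
	bool resched;

	preempt_disable_model();
	set_preempt_need_resched();

	count = preempt_count();
	resched = preempt_enable_would_resched();
	printf("count=%u, reschedule on enable: %d\n", count, resched);
	return 0;
}
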
*/ -#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) +#define CR3_ADDR_MASK __sme_clr(PHYSICAL_PAGE_MASK) #define CR3_PCID_MASK 0xFFFull #define CR3_NOFLUSH BIT_ULL(63) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 67c9d73b31fa..4e35c66edeb7 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -16,6 +16,7 @@ struct vm86; #include <uapi/asm/sigcontext.h> #include <asm/current.h> #include <asm/cpufeatures.h> +#include <asm/cpuid.h> #include <asm/page.h> #include <asm/pgtable_types.h> #include <asm/percpu.h> @@ -146,17 +147,6 @@ struct cpuinfo_x86 { unsigned initialized : 1; } __randomize_layout; -struct cpuid_regs { - u32 eax, ebx, ecx, edx; -}; - -enum cpuid_regs_idx { - CPUID_EAX = 0, - CPUID_EBX, - CPUID_ECX, - CPUID_EDX, -}; - #define X86_VENDOR_INTEL 0 #define X86_VENDOR_CYRIX 1 #define X86_VENDOR_AMD 2 @@ -205,45 +195,6 @@ extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); -#ifdef CONFIG_X86_32 -extern int have_cpuid_p(void); -#else -static inline int have_cpuid_p(void) -{ - return 1; -} -#endif -static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - /* ecx is often an input as well as an output. */ - asm volatile("cpuid" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx) - : "memory"); -} - -#define native_cpuid_reg(reg) \ -static inline unsigned int native_cpuid_##reg(unsigned int op) \ -{ \ - unsigned int eax = op, ebx, ecx = 0, edx; \ - \ - native_cpuid(&eax, &ebx, &ecx, &edx); \ - \ - return reg; \ -} - -/* - * Native CPUID functions returning a single datum. - */ -native_cpuid_reg(eax) -native_cpuid_reg(ebx) -native_cpuid_reg(ecx) -native_cpuid_reg(edx) - /* * Friendlier CR3 helpers. */ @@ -426,8 +377,6 @@ struct irq_stack { char stack[IRQ_STACK_SIZE]; } __aligned(IRQ_STACK_SIZE); -DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); - #ifdef CONFIG_X86_64 struct fixed_percpu_data { /* @@ -450,8 +399,6 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu) return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } -DECLARE_PER_CPU(void *, hardirq_stack_ptr); -DECLARE_PER_CPU(bool, hardirq_stack_inuse); extern asmlinkage void ignore_sysret(void); /* Save actual FS/GS selectors and bases to current->thread */ @@ -460,8 +407,6 @@ void current_save_fsgs(void); #ifdef CONFIG_STACKPROTECTOR DECLARE_PER_CPU(unsigned long, __stack_chk_guard); #endif -DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); -DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); #endif /* !X86_64 */ struct perf_event; @@ -566,7 +511,7 @@ static __always_inline unsigned long current_top_of_stack(void) * and around vm86 mode and sp0 on x86_64 is special because of the * entry trampoline. 
*/ - return this_cpu_read_stable(cpu_current_top_of_stack); + return this_cpu_read_stable(pcpu_hot.top_of_stack); } static __always_inline bool on_thread_stack(void) @@ -578,7 +523,6 @@ static __always_inline bool on_thread_stack(void) #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else -#define __cpuid native_cpuid static inline void load_sp0(unsigned long sp0) { @@ -589,69 +533,6 @@ static inline void load_sp0(unsigned long sp0) unsigned long __get_wchan(struct task_struct *p); -/* - * Generic CPUID function - * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx - * resulting in stale register contents being returned. - */ -static inline void cpuid(unsigned int op, - unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - *eax = op; - *ecx = 0; - __cpuid(eax, ebx, ecx, edx); -} - -/* Some CPUID calls want 'count' to be placed in ecx */ -static inline void cpuid_count(unsigned int op, int count, - unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - *eax = op; - *ecx = count; - __cpuid(eax, ebx, ecx, edx); -} - -/* - * CPUID functions returning a single datum - */ -static inline unsigned int cpuid_eax(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - - return eax; -} - -static inline unsigned int cpuid_ebx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - - return ebx; -} - -static inline unsigned int cpuid_ecx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - - return ecx; -} - -static inline unsigned int cpuid_edx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - - return edx; -} - extern void select_idle_routine(const struct cpuinfo_x86 *c); extern void amd_e400_c1e_apic_setup(void); @@ -667,10 +548,9 @@ extern int sysenter_setup(void); /* Defined in head.S */ extern struct desc_ptr early_gdt_descr; -extern void switch_to_new_gdt(int); +extern void switch_gdt_and_percpu_base(int); extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); -extern void load_percpu_segment(int); extern void cpu_init(void); extern void cpu_init_secondary(void); extern void cpu_init_exception_handling(void); @@ -805,24 +685,6 @@ static inline u32 amd_get_nodes_per_socket(void) { return 0; } static inline u32 amd_get_highest_perf(void) { return 0; } #endif -#define for_each_possible_hypervisor_cpuid_base(function) \ - for (function = 0x40000000; function < 0x40010000; function += 0x100) - -static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) -{ - uint32_t base, eax, signature[3]; - - for_each_possible_hypervisor_cpuid_base(base) { - cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); - - if (!memcmp(sig, signature, 12) && - (leaves == 0 || ((eax - base) >= leaves))) - return base; - } - - return 0; -} - extern unsigned long arch_align_stack(unsigned long sp); void free_init_pages(const char *what, unsigned long begin, unsigned long end); extern void free_kernel_image_pages(const char *what, void *begin, void *end); diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 60ece592b220..42b17cf10b10 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -14,8 +14,6 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); #define __pv_queued_spin_unlock __pv_queued_spin_unlock 
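The cpuid() family removed above now comes from <asm/cpuid.h>, which processor.h includes further up in this series. For reference, the same wrapper works unchanged in user space; the %ecx zeroing matches the reason given in the deleted comment (some CPUs leave stale data in it):

#include <stdio.h>
#include <string.h>

static void cpuid(unsigned int op,
		  unsigned int *eax, unsigned int *ebx,
		  unsigned int *ecx, unsigned int *edx)
{
	*eax = op;
	*ecx = 0;
	asm volatile("cpuid"
		     : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
		     : "0" (*eax), "2" (*ecx)
		     : "memory");
}

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	char vendor[13];

	cpuid(0, &eax, &ebx, &ecx, &edx);
	memcpy(vendor + 0, &ebx, 4);	/* leaf 0 returns the vendor string */
	memcpy(vendor + 4, &edx, 4);	/* in EBX, EDX, ECX order */
	memcpy(vendor + 8, &ecx, 4);
	vendor[12] = '\0';
	printf("max leaf %u, vendor \"%s\"\n", eax, vendor);
	return 0;
}
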
-#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock" -#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath" /* * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock @@ -37,32 +35,27 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); * rsi = lockval (second argument) * rdx = internal variable (set to 0) */ -asm (".pushsection .spinlock.text;" - ".globl " PV_UNLOCK ";" - ".type " PV_UNLOCK ", @function;" - ".align 4,0x90;" - PV_UNLOCK ": " - ASM_ENDBR - FRAME_BEGIN - "push %rdx;" - "mov $0x1,%eax;" - "xor %edx,%edx;" - LOCK_PREFIX "cmpxchg %dl,(%rdi);" - "cmp $0x1,%al;" - "jne .slowpath;" - "pop %rdx;" +#define PV_UNLOCK_ASM \ + FRAME_BEGIN \ + "push %rdx\n\t" \ + "mov $0x1,%eax\n\t" \ + "xor %edx,%edx\n\t" \ + LOCK_PREFIX "cmpxchg %dl,(%rdi)\n\t" \ + "cmp $0x1,%al\n\t" \ + "jne .slowpath\n\t" \ + "pop %rdx\n\t" \ + FRAME_END \ + ASM_RET \ + ".slowpath:\n\t" \ + "push %rsi\n\t" \ + "movzbl %al,%esi\n\t" \ + "call __raw_callee_save___pv_queued_spin_unlock_slowpath\n\t" \ + "pop %rsi\n\t" \ + "pop %rdx\n\t" \ FRAME_END - ASM_RET - ".slowpath: " - "push %rsi;" - "movzbl %al,%esi;" - "call " PV_UNLOCK_SLOWPATH ";" - "pop %rsi;" - "pop %rdx;" - FRAME_END - ASM_RET - ".size " PV_UNLOCK ", .-" PV_UNLOCK ";" - ".popsection"); + +DEFINE_PARAVIRT_ASM(__raw_callee_save___pv_queued_spin_unlock, + PV_UNLOCK_ASM, .spinlock.text); #else /* CONFIG_64BIT */ diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index fd6f6e5b755a..a336feef0af1 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -91,6 +91,7 @@ static inline void set_real_mode_mem(phys_addr_t mem) void reserve_real_mode(void); void load_trampoline_pgtable(void); +void init_real_mode(void); #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index d24b04ebf950..52788f79786f 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -7,8 +7,6 @@ #include <linux/sched.h> #include <linux/jump_label.h> -#define IA32_PQR_ASSOC 0x0c8f - /** * struct resctrl_pqr_state - State cache for the PQR MSR * @cur_rmid: The cached Resource Monitoring ID @@ -16,8 +14,8 @@ * @default_rmid: The user assigned Resource Monitoring ID * @default_closid: The user assigned cached Class Of Service ID * - * The upper 32 bits of IA32_PQR_ASSOC contain closid and the - * lower 10 bits rmid. The update to IA32_PQR_ASSOC always + * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always * contains both parts, so we need to cache them. This also * stores the user configured per cpu CLOSID and RMID. 
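The PV_UNLOCK_ASM fast path above, now emitted through DEFINE_PARAVIRT_ASM(), is a single byte-sized cmpxchg: if the lock byte still holds the plain locked value (1, no slow-path flag), drop it to 0 and return; otherwise hand the observed value to the slow path. A C-level sketch with illustrative names:

#include <stdint.h>
#include <stdio.h>

#define _Q_LOCKED_VAL 1

static void pv_unlock_slowpath(uint8_t *lock, uint8_t lockval)
{
	/* Kick the waiter recorded in the hash table; elided in this sketch. */
	printf("slow path, observed lockval=%u\n", lockval);
	*lock = 0;
}

static void pv_queued_spin_unlock(uint8_t *lock)
{
	uint8_t expected = _Q_LOCKED_VAL;

	/* lock cmpxchg %dl,(%rdi): release only if no slow-path flag is set */
	if (!__atomic_compare_exchange_n(lock, &expected, 0, 0,
					 __ATOMIC_RELEASE, __ATOMIC_RELAXED))
		pv_unlock_slowpath(lock, expected);
}

int main(void)
{
	uint8_t lock = _Q_LOCKED_VAL;

	pv_queued_spin_unlock(&lock);
	printf("lock=%u\n", lock);
	return 0;
}
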
* @@ -77,7 +75,7 @@ static void __resctrl_sched_in(void) if (closid != state->cur_closid || rmid != state->cur_rmid) { state->cur_closid = closid; state->cur_rmid = rmid; - wrmsr(IA32_PQR_ASSOC, rmid, closid); + wrmsr(MSR_IA32_PQR_ASSOC, rmid, closid); } } diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 2e7890dd58a4..c390a672d560 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -135,6 +135,7 @@ #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) +#define __USER32_CS __USER_CS #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) /* segment for calling fn: */ @@ -210,7 +211,6 @@ #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) #define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) -#define __USER32_DS __USER_DS #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) #define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3) diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index b45c4d27fd46..a5e89641bd2d 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -6,6 +6,9 @@ #include <asm/page.h> #include <asm-generic/set_memory.h> +#define set_memory_rox set_memory_rox +int set_memory_rox(unsigned long addr, int numpages); + /* * The set_memory_* API can be used to change various attributes of a virtual * address range. The attributes include: diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index eae20fa52b93..6a0069761508 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -115,17 +115,36 @@ enum sgx_miscselect { * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to * sign cryptographic tokens that can be passed to * EINIT as an authorization to run an enclave. + * %SGX_ATTR_ASYNC_EXIT_NOTIFY: Allow enclaves to be notified after an + * asynchronous exit has occurred. 
*/ enum sgx_attribute { - SGX_ATTR_INIT = BIT(0), - SGX_ATTR_DEBUG = BIT(1), - SGX_ATTR_MODE64BIT = BIT(2), - SGX_ATTR_PROVISIONKEY = BIT(4), - SGX_ATTR_EINITTOKENKEY = BIT(5), - SGX_ATTR_KSS = BIT(7), + SGX_ATTR_INIT = BIT(0), + SGX_ATTR_DEBUG = BIT(1), + SGX_ATTR_MODE64BIT = BIT(2), + /* BIT(3) is reserved */ + SGX_ATTR_PROVISIONKEY = BIT(4), + SGX_ATTR_EINITTOKENKEY = BIT(5), + /* BIT(6) is for CET */ + SGX_ATTR_KSS = BIT(7), + /* BIT(8) is reserved */ + /* BIT(9) is reserved */ + SGX_ATTR_ASYNC_EXIT_NOTIFY = BIT(10), }; -#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8)) +#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | \ + BIT_ULL(6) | \ + BIT_ULL(8) | \ + BIT_ULL(9) | \ + GENMASK_ULL(63, 11)) + +#define SGX_ATTR_UNPRIV_MASK (SGX_ATTR_DEBUG | \ + SGX_ATTR_MODE64BIT | \ + SGX_ATTR_KSS | \ + SGX_ATTR_ASYNC_EXIT_NOTIFY) + +#define SGX_ATTR_PRIV_MASK (SGX_ATTR_PROVISIONKEY | \ + SGX_ATTR_EINITTOKENKEY) /** * struct sgx_secs - SGX Enclave Control Structure (SECS) diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 65e667279e0f..e770c4fc47f4 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -15,4 +15,13 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where); +void __user * +get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, + void __user **fpstate); + +int ia32_setup_frame(struct ksignal *ksig, struct pt_regs *regs); +int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); +int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); +int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); + #endif /* _ASM_X86_SIGHANDLING_H */ diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 2dfb5fea13af..4a4043ca6493 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -28,11 +28,6 @@ typedef struct { #define SA_IA32_ABI 0x02000000u #define SA_X32_ABI 0x01000000u -#ifndef CONFIG_COMPAT -#define compat_sigset_t compat_sigset_t -typedef sigset_t compat_sigset_t; -#endif - #endif /* __ASSEMBLY__ */ #include <uapi/asm/signal.h> #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index a73bced40e24..b4dbb20dab1a 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -3,10 +3,10 @@ #define _ASM_X86_SMP_H #ifndef __ASSEMBLY__ #include <linux/cpumask.h> -#include <asm/percpu.h> -#include <asm/thread_info.h> #include <asm/cpumask.h> +#include <asm/current.h> +#include <asm/thread_info.h> extern int smp_num_siblings; extern unsigned int num_processors; @@ -19,7 +19,6 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id); -DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid); @@ -150,11 +149,10 @@ __visible void smp_call_function_single_interrupt(struct pt_regs *r); /* * This function is needed by all SMP systems. It must _always_ be valid - * from the initial startup. We map APIC_BASE very early in page_setup(), - * so this is correct in the x86 case. + * from the initial startup. 
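A quick way to sanity-check the reworked attribute masks above is to confirm that SGX_ATTR_INIT, the unprivileged mask, the privileged mask and the reserved mask together cover all 64 attribute bits exactly once. The self-contained check below re-derives the masks numerically for illustration:

#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(n)		(1ULL << (n))
#define GENMASK_ULL(h, l)	((~0ULL >> (63 - (h))) & (~0ULL << (l)))

#define ATTR_INIT		BIT_ULL(0)
#define ATTR_UNPRIV_MASK	(BIT_ULL(1) | BIT_ULL(2) | BIT_ULL(7) | BIT_ULL(10))
#define ATTR_PRIV_MASK		(BIT_ULL(4) | BIT_ULL(5))
#define ATTR_RESERVED_MASK	(BIT_ULL(3) | BIT_ULL(6) | BIT_ULL(8) | \
				 BIT_ULL(9) | GENMASK_ULL(63, 11))

int main(void)
{
	uint64_t all = ATTR_INIT | ATTR_UNPRIV_MASK |
		       ATTR_PRIV_MASK | ATTR_RESERVED_MASK;
	int disjoint = !(ATTR_UNPRIV_MASK & ATTR_PRIV_MASK) &&
		       !(ATTR_UNPRIV_MASK & ATTR_RESERVED_MASK) &&
		       !(ATTR_PRIV_MASK & ATTR_RESERVED_MASK);

	printf("covers all 64 bits: %d, masks disjoint: %d\n",
	       all == ~0ULL, disjoint);
	return 0;
}
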
*/ -#define raw_smp_processor_id() this_cpu_read(cpu_number) -#define __smp_processor_id() __this_cpu_read(cpu_number) +#define raw_smp_processor_id() this_cpu_read(pcpu_hot.cpu_number) +#define __smp_processor_id() __this_cpu_read(pcpu_hot.cpu_number) #ifdef CONFIG_X86_32 extern int safe_smp_processor_id(void); diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 24a8d6c4fb18..00473a650f51 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -34,7 +34,6 @@ #include <asm/percpu.h> #include <asm/desc.h> -#include <linux/random.h> #include <linux/sched.h> /* @@ -50,22 +49,11 @@ */ static __always_inline void boot_init_stack_canary(void) { - u64 canary; - u64 tsc; + unsigned long canary = get_random_canary(); #ifdef CONFIG_X86_64 BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40); #endif - /* - * We both use the random pool and the current TSC as a source - * of randomness. The TSC only matters for very early init, - * there it already has some randomness on most systems. Later - * on during the bootup the random pool has true entropy too. - */ - get_random_bytes(&canary, sizeof(canary)); - tsc = rdtsc(); - canary += tsc + (tsc << 32UL); - canary &= CANARY_MASK; current->stack_canary = canary; #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index c08eb0fdd11f..5c91305d09d2 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -66,13 +66,10 @@ static inline void update_task_stack(struct task_struct *task) { /* sp0 always points to the entry trampoline stack, which is constant: */ #ifdef CONFIG_X86_32 - if (static_cpu_has(X86_FEATURE_XENPV)) - load_sp0(task->thread.sp0); - else - this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); + this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else /* Xen PV enters the kernel on the thread stack. 
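Several hunks in this section (preempt_count, cpu_number, top_of_stack, call_depth, the hardirq stack pointers) fold previously separate per-CPU variables into a single pcpu_hot structure so the hottest per-CPU scalars share one cache line. The field list and order sketched below are only a guess pieced together from the accessors visible in this diff, not the authoritative definition:

#include <stdbool.h>
#include <stdint.h>

struct task_struct;	/* opaque for this sketch */

struct pcpu_hot {
	union {
		struct {
			struct task_struct	*current_task;
			int			preempt_count;
			int			cpu_number;
			uint64_t		call_depth;
			unsigned long		top_of_stack;
			void			*hardirq_stack_ptr;
			uint16_t		softirq_pending;
			bool			hardirq_stack_inuse;
		};
		uint8_t	pad[64];	/* pin the size to one cache line */
	};
};

_Static_assert(sizeof(struct pcpu_hot) == 64,
	       "pcpu_hot is meant to occupy exactly one cache line");

int main(void)
{
	return 0;
}
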
*/ - if (static_cpu_has(X86_FEATURE_XENPV)) + if (cpu_feature_enabled(X86_FEATURE_XENPV)) load_sp0(task_top_of_stack(task)); #endif } diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 020c81a7c729..28d889c9aa16 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -67,6 +67,8 @@ void tdx_safe_halt(void); bool tdx_early_handle_ve(struct pt_regs *regs); +int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport); + #else static inline void tdx_early_init(void) { }; diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 1cc15528ce29..f4b87f08f5c5 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -45,6 +45,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void text_poke_sync(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); +extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok); extern void *text_poke_set(void *addr, int c, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index e9170457697e..c1c8c581759d 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -285,6 +285,8 @@ struct x86_hyper_runtime { * possible in x86_early_init_platform_quirks() by * only using the current x86_hardware_subarch * semantics. + * @realmode_reserve: reserve memory for realmode trampoline + * @realmode_init: initialize realmode trampoline * @hyper: x86 hypervisor specific runtime callbacks */ struct x86_platform_ops { @@ -301,6 +303,8 @@ struct x86_platform_ops { void (*apic_post_init)(void); struct x86_legacy_features legacy; void (*set_legacy_features)(void); + void (*realmode_reserve)(void); + void (*realmode_init)(void); struct x86_hyper_runtime hyper; struct x86_guest guest; }; diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f901658d9f7c..96d51bbc2bd4 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -44,7 +44,7 @@ obj-y += head_$(BITS).o obj-y += head$(BITS).o obj-y += ebda.o obj-y += platform-quirks.o -obj-y += process_$(BITS).o signal.o +obj-y += process_$(BITS).o signal.o signal_$(BITS).o obj-$(CONFIG_COMPAT) += signal_compat.o obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o @@ -54,7 +54,7 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_ia32.o -obj-$(CONFIG_IA32_EMULATION) += sys_ia32.o +obj-$(CONFIG_IA32_EMULATION) += sys_ia32.o signal_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o @@ -143,6 +143,8 @@ obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CALL_THUNKS) += callthunks.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 7945eae5b315..401808b47af3 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -52,17 +52,25 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, if (c->x86_vendor == X86_VENDOR_INTEL && (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f))) 
flags->bm_control = 0; - /* - * For all recent Centaur CPUs, the ucode will make sure that each - * core can keep cache coherence with each other while entering C3 - * type state. So, set bm_check to 1 to indicate that the kernel - * doesn't need to execute a cache flush operation (WBINVD) when - * entering C3 type state. - */ + if (c->x86_vendor == X86_VENDOR_CENTAUR) { if (c->x86 > 6 || (c->x86 == 6 && c->x86_model == 0x0f && - c->x86_stepping >= 0x0e)) + c->x86_stepping >= 0x0e)) { + /* + * For all recent Centaur CPUs, the ucode will make sure that each + * core can keep cache coherence with each other while entering C3 + * type state. So, set bm_check to 1 to indicate that the kernel + * doesn't need to execute a cache flush operation (WBINVD) when + * entering C3 type state. + */ flags->bm_check = 1; + /* + * For all recent Centaur platforms, ARB_DISABLE is a nop. + * Set bm_control to zero to indicate that ARB_DISABLE is + * not required while entering C3 type state. + */ + flags->bm_control = 0; + } } if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5cadcea035e0..7d8c3cbde368 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -116,6 +116,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern s32 __retpoline_sites[], __retpoline_sites_end[]; extern s32 __return_sites[], __return_sites_end[]; +extern s32 __cfi_sites[], __cfi_sites_end[]; extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; @@ -377,6 +378,56 @@ static int emit_indirect(int op, int reg, u8 *bytes) return i; } +static inline bool is_jcc32(struct insn *insn) +{ + /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ + return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; +} + +static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + u8 op = insn->opcode.bytes[0]; + int i = 0; + + /* + * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional + * tail-calls. Deal with them. + */ + if (is_jcc32(insn)) { + bytes[i++] = op; + op = insn->opcode.bytes[1]; + goto clang_jcc; + } + + if (insn->length == 6) + bytes[i++] = 0x2e; /* CS-prefix */ + + switch (op) { + case CALL_INSN_OPCODE: + __text_gen_insn(bytes+i, op, addr+i, + __x86_indirect_call_thunk_array[reg], + CALL_INSN_SIZE); + i += CALL_INSN_SIZE; + break; + + case JMP32_INSN_OPCODE: +clang_jcc: + __text_gen_insn(bytes+i, op, addr+i, + __x86_indirect_jump_thunk_array[reg], + JMP32_INSN_SIZE); + i += JMP32_INSN_SIZE; + break; + + default: + WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr); + return -1; + } + + WARN_ON_ONCE(i != insn->length); + + return i; +} + /* * Rewrite the compiler generated retpoline thunk calls. 
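emit_call_track_retpoline() above rewrites an indirect call/jmp site into a direct rel32 transfer to the per-register accounting thunk. The displacement arithmetic that __text_gen_insn() performs is simple enough to show standalone; the helper below is illustrative, not the kernel's:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CALL_INSN_OPCODE  0xE8
#define JMP32_INSN_OPCODE 0xE9
#define CALL_INSN_SIZE    5

/* Emit a 5-byte rel32 CALL/JMP whose displacement is measured from the end
 * of the instruction, as x86 relative transfers are. */
static void gen_rel32_insn(uint8_t *buf, uint8_t opcode,
			   unsigned long ip, unsigned long dest)
{
	int32_t rel = (int32_t)(dest - (ip + CALL_INSN_SIZE));

	buf[0] = opcode;
	memcpy(buf + 1, &rel, sizeof(rel));
}

int main(void)
{
	uint8_t buf[CALL_INSN_SIZE];

	/* Expect e8 fb 0f 00 00: call to a thunk 0x1000 bytes ahead of the site. */
	gen_rel32_insn(buf, CALL_INSN_OPCODE, 0x1000, 0x2000);
	printf("%02x %02x %02x %02x %02x\n",
	       buf[0], buf[1], buf[2], buf[3], buf[4]);
	return 0;
}
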
* @@ -409,8 +460,12 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) BUG_ON(reg == 4); if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && - !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { + if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) + return emit_call_track_retpoline(addr, insn, reg, bytes); + return -1; + } op = insn->opcode.bytes[0]; @@ -427,8 +482,7 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) * [ NOP ] * 1: */ - /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ - if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) { + if (is_jcc32(insn)) { cc = insn->opcode.bytes[1] & 0xf; cc ^= 1; /* invert condition */ @@ -518,6 +572,11 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) } #ifdef CONFIG_RETHUNK + +#ifdef CONFIG_CALL_THUNKS +void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk; +#endif + /* * Rewrite the compiler generated return thunk tail-calls. * @@ -533,14 +592,18 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) { int i = 0; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) - return -1; + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + if (x86_return_thunk == __x86_return_thunk) + return -1; - bytes[i++] = RET_INSN_OPCODE; + i = JMP32_INSN_SIZE; + __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); + } else { + bytes[i++] = RET_INSN_OPCODE; + } for (; i < insn->length;) bytes[i++] = INT3_INSN_OPCODE; - return i; } @@ -594,6 +657,28 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #ifdef CONFIG_X86_KERNEL_IBT +static void poison_endbr(void *addr, bool warn) +{ + u32 endbr, poison = gen_endbr_poison(); + + if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) + return; + + if (!is_endbr(endbr)) { + WARN_ON_ONCE(warn); + return; + } + + DPRINTK("ENDBR at: %pS (%px)", addr, addr); + + /* + * When we have IBT, the lack of ENDBR will trigger #CP + */ + DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); + DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); + text_poke_early(addr, &poison, 4); +} + /* * Generated by: objtool --ibt */ @@ -602,31 +687,391 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) s32 *s; for (s = start; s < end; s++) { - u32 endbr, poison = gen_endbr_poison(); void *addr = (void *)s + *s; - if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) + poison_endbr(addr, true); + if (IS_ENABLED(CONFIG_FINEIBT)) + poison_endbr(addr - 16, false); + } +} + +#else + +void __init_or_module apply_ibt_endbr(s32 *start, s32 *end) { } + +#endif /* CONFIG_X86_KERNEL_IBT */ + +#ifdef CONFIG_FINEIBT + +enum cfi_mode { + CFI_DEFAULT, + CFI_OFF, + CFI_KCFI, + CFI_FINEIBT, +}; + +static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT; +static bool cfi_rand __ro_after_init = true; +static u32 cfi_seed __ro_after_init; + +/* + * Re-hash the CFI hash with a boot-time seed while making sure the result is + * not a valid ENDBR instruction. 
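poison_endbr(), factored out above, overwrites the ENDBR64 at reported sites with a 4-byte poison so the address stops being a valid indirect-branch target. A user-space sketch of the check-and-overwrite step; the sketch assumes the poison is a plain 4-byte NOP, and the particular encoding chosen here is an assumption:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static const uint8_t endbr64[4] = { 0xf3, 0x0f, 0x1e, 0xfa };
static const uint8_t nop4[4]    = { 0x66, 0x0f, 0x1f, 0x00 };	/* assumed poison */

static int poison_endbr_model(uint8_t *addr)
{
	if (memcmp(addr, endbr64, 4) != 0)
		return 0;		/* not an ENDBR, nothing to do */
	memcpy(addr, nop4, 4);		/* text_poke_early() in the kernel */
	return 1;
}

int main(void)
{
	uint8_t text[4] = { 0xf3, 0x0f, 0x1e, 0xfa };

	printf("poisoned: %d, first byte now %#x\n",
	       poison_endbr_model(text), text[0]);
	return 0;
}
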
+ */ +static u32 cfi_rehash(u32 hash) +{ + hash ^= cfi_seed; + while (unlikely(is_endbr(hash) || is_endbr(-hash))) { + bool lsb = hash & 1; + hash >>= 1; + if (lsb) + hash ^= 0x80200003; + } + return hash; +} + +static __init int cfi_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + while (str) { + char *next = strchr(str, ','); + if (next) { + *next = 0; + next++; + } + + if (!strcmp(str, "auto")) { + cfi_mode = CFI_DEFAULT; + } else if (!strcmp(str, "off")) { + cfi_mode = CFI_OFF; + cfi_rand = false; + } else if (!strcmp(str, "kcfi")) { + cfi_mode = CFI_KCFI; + } else if (!strcmp(str, "fineibt")) { + cfi_mode = CFI_FINEIBT; + } else if (!strcmp(str, "norand")) { + cfi_rand = false; + } else { + pr_err("Ignoring unknown cfi option (%s).", str); + } + + str = next; + } + + return 0; +} +early_param("cfi", cfi_parse_cmdline); + +/* + * kCFI FineIBT + * + * __cfi_\func: __cfi_\func: + * movl $0x12345678,%eax // 5 endbr64 // 4 + * nop subl $0x12345678,%r10d // 7 + * nop jz 1f // 2 + * nop ud2 // 2 + * nop 1: nop // 1 + * nop + * nop + * nop + * nop + * nop + * nop + * nop + * + * + * caller: caller: + * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6 + * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4 + * je 1f // 2 nop4 // 4 + * ud2 // 2 + * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5 + * + */ + +asm( ".pushsection .rodata \n" + "fineibt_preamble_start: \n" + " endbr64 \n" + " subl $0x12345678, %r10d \n" + " je fineibt_preamble_end \n" + " ud2 \n" + " nop \n" + "fineibt_preamble_end: \n" + ".popsection\n" +); + +extern u8 fineibt_preamble_start[]; +extern u8 fineibt_preamble_end[]; + +#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) +#define fineibt_preamble_hash 7 + +asm( ".pushsection .rodata \n" + "fineibt_caller_start: \n" + " movl $0x12345678, %r10d \n" + " sub $16, %r11 \n" + ASM_NOP4 + "fineibt_caller_end: \n" + ".popsection \n" +); + +extern u8 fineibt_caller_start[]; +extern u8 fineibt_caller_end[]; + +#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start) +#define fineibt_caller_hash 2 + +#define fineibt_caller_jmp (fineibt_caller_size - 2) + +static u32 decode_preamble_hash(void *addr) +{ + u8 *p = addr; + + /* b8 78 56 34 12 mov $0x12345678,%eax */ + if (p[0] == 0xb8) + return *(u32 *)(addr + 1); + + return 0; /* invalid hash value */ +} + +static u32 decode_caller_hash(void *addr) +{ + u8 *p = addr; + + /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */ + if (p[0] == 0x41 && p[1] == 0xba) + return -*(u32 *)(addr + 2); + + /* e8 0c 78 56 34 12 jmp.d8 +12 */ + if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp) + return -*(u32 *)(addr + 2); + + return 0; /* invalid hash value */ +} + +/* .retpoline_sites */ +static int cfi_disable_callers(s32 *start, s32 *end) +{ + /* + * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate + * in tact for later usage. Also see decode_caller_hash() and + * cfi_rewrite_callers(). + */ + const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp }; + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (!hash) /* nocfi callers */ continue; - if (WARN_ON_ONCE(!is_endbr(endbr))) + text_poke_early(addr, jmp, 2); + } + + return 0; +} + +static int cfi_enable_callers(s32 *start, s32 *end) +{ + /* + * Re-enable kCFI, undo what cfi_disable_callers() did. 
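A user-space model of cfi_rehash() above makes its loop easy to poke at: even if the seeded hash (or its negation, which is what callers embed) happens to collide with the ENDBR encoding, the shift-and-xor step walks it to a safe value. The ENDBR32 folding inside the is_endbr() stand-in is an assumption of this sketch:

#include <stdint.h>
#include <stdio.h>

#define ENDBR64_U32	0xfa1e0ff3u	/* f3 0f 1e fa as a little-endian u32 */

static int is_endbr_model(uint32_t val)
{
	val &= ~0x01000000u;		/* assumed: fold ENDBR32 onto ENDBR64 */
	return val == ENDBR64_U32;
}

static uint32_t cfi_rehash_model(uint32_t hash, uint32_t seed)
{
	hash ^= seed;
	while (is_endbr_model(hash) || is_endbr_model(-hash)) {
		uint32_t lsb = hash & 1;

		hash >>= 1;
		if (lsb)
			hash ^= 0x80200003;
	}
	return hash;
}

int main(void)
{
	/* Pick the seed so the mixed hash collides with ENDBR64 on purpose. */
	uint32_t hash = 0x12345678;
	uint32_t seed = hash ^ ENDBR64_U32;

	printf("%#x -> %#x\n", hash, cfi_rehash_model(hash, seed));
	return 0;
}
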
+ */ + const u8 mov[] = { 0x41, 0xba }; + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (!hash) /* nocfi callers */ continue; - DPRINTK("ENDBR at: %pS (%px)", addr, addr); + text_poke_early(addr, mov, 2); + } - /* - * When we have IBT, the lack of ENDBR will trigger #CP - */ - DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); - DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); - text_poke_early(addr, &poison, 4); + return 0; +} + +/* .cfi_sites */ +static int cfi_rand_preamble(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + hash = decode_preamble_hash(addr); + if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", + addr, addr, 5, addr)) + return -EINVAL; + + hash = cfi_rehash(hash); + text_poke_early(addr + 1, &hash, 4); + } + + return 0; +} + +static int cfi_rewrite_preamble(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + hash = decode_preamble_hash(addr); + if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", + addr, addr, 5, addr)) + return -EINVAL; + + text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); + WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); + text_poke_early(addr + fineibt_preamble_hash, &hash, 4); + } + + return 0; +} + +/* .retpoline_sites */ +static int cfi_rand_callers(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (hash) { + hash = -cfi_rehash(hash); + text_poke_early(addr + 2, &hash, 4); + } + } + + return 0; +} + +static int cfi_rewrite_callers(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (hash) { + text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); + WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); + text_poke_early(addr + fineibt_caller_hash, &hash, 4); + } + /* rely on apply_retpolines() */ } + + return 0; +} + +static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi, bool builtin) +{ + int ret; + + if (WARN_ONCE(fineibt_preamble_size != 16, + "FineIBT preamble wrong size: %ld", fineibt_preamble_size)) + return; + + if (cfi_mode == CFI_DEFAULT) { + cfi_mode = CFI_KCFI; + if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) + cfi_mode = CFI_FINEIBT; + } + + /* + * Rewrite the callers to not use the __cfi_ stubs, such that we might + * rewrite them. This disables all CFI. If this succeeds but any of the + * later stages fails, we're without CFI. 
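Functionally, the caller and preamble templates being patched in here implement a very small handshake: the caller leaves the expected type hash in %r10d, and the callee's FineIBT preamble subtracts its own hash and traps on any mismatch. A C-level model of just that check; the names and the abort() stand-in for ud2 are illustrative:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void fineibt_preamble_model(uint32_t r10d, uint32_t callee_hash)
{
	/* subl $hash, %r10d ; je 1f ; ud2 */
	if (r10d - callee_hash != 0) {
		fprintf(stderr, "FineIBT: caller/callee hash mismatch\n");
		abort();
	}
}

int main(void)
{
	uint32_t hash = 0x12345678;	/* placeholder hash, as in the templates */

	fineibt_preamble_model(hash, hash);
	printf("matching hashes pass the preamble check\n");

	fineibt_preamble_model(hash + 1, hash);	/* mismatch: aborts */
	return 0;
}
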
+ */ + ret = cfi_disable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + if (cfi_rand) { + if (builtin) + cfi_seed = get_random_u32(); + + ret = cfi_rand_preamble(start_cfi, end_cfi); + if (ret) + goto err; + + ret = cfi_rand_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + } + + switch (cfi_mode) { + case CFI_OFF: + if (builtin) + pr_info("Disabling CFI\n"); + return; + + case CFI_KCFI: + ret = cfi_enable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + if (builtin) + pr_info("Using kCFI\n"); + return; + + case CFI_FINEIBT: + ret = cfi_rewrite_preamble(start_cfi, end_cfi); + if (ret) + goto err; + + ret = cfi_rewrite_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + if (builtin) + pr_info("Using FineIBT CFI\n"); + return; + + default: + break; + } + +err: + pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n"); } #else -void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { } +static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi, bool builtin) +{ +} + +#endif -#endif /* CONFIG_X86_KERNEL_IBT */ +void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi) +{ + return __apply_fineibt(start_retpoline, end_retpoline, + start_cfi, end_cfi, + /* .builtin = */ false); +} #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, @@ -934,6 +1379,9 @@ void __init alternative_instructions(void) */ apply_paravirt(__parainstructions, __parainstructions_end); + __apply_fineibt(__retpoline_sites, __retpoline_sites_end, + __cfi_sites, __cfi_sites_end, true); + /* * Rewrite the retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. @@ -947,6 +1395,12 @@ void __init alternative_instructions(void) */ apply_alternatives(__alt_instructions, __alt_instructions_end); + /* + * Now all calls are established. Apply the call thunks if + * required. + */ + callthunks_patch_builtin_calls(); + apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); #ifdef CONFIG_SMP @@ -1236,27 +1690,15 @@ void *text_poke_kgdb(void *addr, const void *opcode, size_t len) return __text_poke(text_poke_memcpy, addr, opcode, len); } -/** - * text_poke_copy - Copy instructions into (an unused part of) RX memory - * @addr: address to modify - * @opcode: source of the copy - * @len: length to copy, could be more than 2x PAGE_SIZE - * - * Not safe against concurrent execution; useful for JITs to dump - * new code blocks into unused regions of RX memory. Can be used in - * conjunction with synchronize_rcu_tasks() to wait for existing - * execution to quiesce after having made sure no existing functions - * pointers are live. 
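The comment being relocated here spells out the intended JIT usage of text_poke_copy(); roughly, the caller pattern it implies looks like the fragment below. This is a kernel-context sketch, not part of the patch: replace_jit_blob() and rx_slot are made-up names, and only text_poke_copy() and synchronize_rcu_tasks() are real interfaces.

/* Sketch only: assumes the caller owns rx_slot and has already dropped
 * every published function pointer into it. */
static void *replace_jit_blob(void *rx_slot, const void *blob, size_t len)
{
        /* Wait for anything still executing the old code to quiesce. */
        synchronize_rcu_tasks();

        /* The slot now counts as unused RX memory and may be rewritten;
         * text_poke_copy() takes text_mutex and refuses core kernel text. */
        return text_poke_copy(rx_slot, blob, len);
}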
- */ -void *text_poke_copy(void *addr, const void *opcode, size_t len) +void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, + bool core_ok) { unsigned long start = (unsigned long)addr; size_t patched = 0; - if (WARN_ON_ONCE(core_kernel_text(start))) + if (WARN_ON_ONCE(!core_ok && core_kernel_text(start))) return NULL; - mutex_lock(&text_mutex); while (patched < len) { unsigned long ptr = start + patched; size_t s; @@ -1266,6 +1708,25 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len) __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); patched += s; } + return addr; +} + +/** + * text_poke_copy - Copy instructions into (an unused part of) RX memory + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy, could be more than 2x PAGE_SIZE + * + * Not safe against concurrent execution; useful for JITs to dump + * new code blocks into unused regions of RX memory. Can be used in + * conjunction with synchronize_rcu_tasks() to wait for existing + * execution to quiesce after having made sure no existing functions + * pointers are live. + */ +void *text_poke_copy(void *addr, const void *opcode, size_t len) +{ + mutex_lock(&text_mutex); + addr = text_poke_copy_locked(addr, opcode, len, false); mutex_unlock(&text_mutex); return addr; } @@ -1608,7 +2069,7 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, default: BUG_ON(len != insn.length); - }; + } switch (tp->opcode) { @@ -1681,11 +2142,6 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi { struct text_poke_loc *tp; - if (unlikely(system_state == SYSTEM_BOOTING)) { - text_poke_early(addr, opcode, len); - return; - } - text_poke_flush(addr); tp = &tp_vec[tp_vec_nr++]; @@ -1707,11 +2163,6 @@ void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void * { struct text_poke_loc tp; - if (unlikely(system_state == SYSTEM_BOOTING)) { - text_poke_early(addr, opcode, len); - return; - } - text_poke_loc_init(&tp, addr, opcode, len, emulate); text_poke_bp_batch(&tp, 1); } diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 19a0207e529f..56a917df410d 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -504,7 +504,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) } a = aper + iommu_size; - iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; + iommu_size -= round_up(a, PMD_SIZE) - a; if (iommu_size < 64*1024*1024) { pr_warn("PCI-DMA: Warning: Small IOMMU %luMB." diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index c6876d3ea4b1..20d9a604da7c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1931,16 +1931,19 @@ void __init check_x2apic(void) } } #else /* CONFIG_X86_X2APIC */ -static int __init validate_x2apic(void) +void __init check_x2apic(void) { if (!apic_is_x2apic_enabled()) - return 0; + return; /* - * Checkme: Can we simply turn off x2apic here instead of panic? + * Checkme: Can we simply turn off x2APIC here instead of disabling the APIC? 
*/ - panic("BIOS has enabled x2apic but kernel doesn't support x2apic, please disable x2apic in BIOS.\n"); + pr_err("Kernel does not support x2APIC, please recompile with CONFIG_X86_X2APIC.\n"); + pr_err("Disabling APIC, expect reduced performance and functionality.\n"); + + disable_apic = 1; + setup_clear_cpu_cap(X86_FEATURE_APIC); } -early_initcall(validate_x2apic); static inline void try_to_enable_x2apic(int remap_mode) { } static inline void __x2apic_enable(void) { } diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 7517eb05bdc1..35d5b8fb18ef 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -142,70 +142,139 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) return ret; } -/* - * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, - * which implement the MSI or MSI-X Capability Structure. +/** + * pci_dev_has_default_msi_parent_domain - Check whether the device has the default + * MSI parent domain associated + * @dev: Pointer to the PCI device */ -static struct irq_chip pci_msi_controller = { - .name = "PCI-MSI", - .irq_unmask = pci_msi_unmask_irq, - .irq_mask = pci_msi_mask_irq, - .irq_ack = irq_chip_ack_parent, - .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_set_affinity = msi_set_affinity, - .flags = IRQCHIP_SKIP_SET_WAKE | - IRQCHIP_AFFINITY_PRE_STARTUP, -}; +bool pci_dev_has_default_msi_parent_domain(struct pci_dev *dev) +{ + struct irq_domain *domain = dev_get_msi_domain(&dev->dev); -int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, - msi_alloc_info_t *arg) + if (!domain) + domain = dev_get_msi_domain(&dev->bus->dev); + if (!domain) + return false; + + return domain == x86_vector_domain; +} + +/** + * x86_msi_prepare - Setup of msi_alloc_info_t for allocations + * @domain: The domain for which this setup happens + * @dev: The device for which interrupts are allocated + * @nvec: The number of vectors to allocate + * @alloc: The allocation info structure to initialize + * + * This function is to be used for all types of MSI domains above the x86 + * vector domain and any intermediates. It is always invoked from the + * top level interrupt domain. The domain specific allocation + * functionality is determined via the @domain's bus token which allows to + * map the X86 specific allocation type. 
+ */ +static int x86_msi_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *alloc) { - init_irq_alloc_info(arg, NULL); - if (to_pci_dev(dev)->msix_enabled) { - arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; - } else { - arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; - arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; + struct msi_domain_info *info = domain->host_data; + + init_irq_alloc_info(alloc, NULL); + + switch (info->bus_token) { + case DOMAIN_BUS_PCI_DEVICE_MSI: + alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; + return 0; + case DOMAIN_BUS_PCI_DEVICE_MSIX: + case DOMAIN_BUS_PCI_DEVICE_IMS: + alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; + return 0; + default: + return -EINVAL; } - - return 0; } -EXPORT_SYMBOL_GPL(pci_msi_prepare); -static struct msi_domain_ops pci_msi_domain_ops = { - .msi_prepare = pci_msi_prepare, -}; +/** + * x86_init_dev_msi_info - Domain info setup for MSI domains + * @dev: The device for which the domain should be created + * @domain: The (root) domain providing this callback + * @real_parent: The real parent domain of the to initialize domain + * @info: The domain info for the to initialize domain + * + * This function is to be used for all types of MSI domains above the x86 + * vector domain and any intermediates. The domain specific functionality + * is determined via the @real_parent. + */ +static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, struct msi_domain_info *info) +{ + const struct msi_parent_ops *pops = real_parent->msi_parent_ops; + + /* MSI parent domain specific settings */ + switch (real_parent->bus_token) { + case DOMAIN_BUS_ANY: + /* Only the vector domain can have the ANY token */ + if (WARN_ON_ONCE(domain != real_parent)) + return false; + info->chip->irq_set_affinity = msi_set_affinity; + /* See msi_set_affinity() for the gory details */ + info->flags |= MSI_FLAG_NOMASK_QUIRK; + break; + case DOMAIN_BUS_DMAR: + case DOMAIN_BUS_AMDVI: + break; + default: + WARN_ON_ONCE(1); + return false; + } + + /* Is the target supported? */ + switch(info->bus_token) { + case DOMAIN_BUS_PCI_DEVICE_MSI: + case DOMAIN_BUS_PCI_DEVICE_MSIX: + break; + case DOMAIN_BUS_PCI_DEVICE_IMS: + if (!(pops->supported_flags & MSI_FLAG_PCI_IMS)) + return false; + break; + default: + WARN_ON_ONCE(1); + return false; + } + + /* + * Mask out the domain specific MSI feature flags which are not + * supported by the real parent. + */ + info->flags &= pops->supported_flags; + /* Enforce the required flags */ + info->flags |= X86_VECTOR_MSI_FLAGS_REQUIRED; + + /* This is always invoked from the top level MSI domain! 
*/ + info->ops->msi_prepare = x86_msi_prepare; + + info->chip->irq_ack = irq_chip_ack_parent; + info->chip->irq_retrigger = irq_chip_retrigger_hierarchy; + info->chip->flags |= IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP; -static struct msi_domain_info pci_msi_domain_info = { - .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_PCI_MSIX, - .ops = &pci_msi_domain_ops, - .chip = &pci_msi_controller, - .handler = handle_edge_irq, - .handler_name = "edge", + info->handler = handle_edge_irq; + info->handler_name = "edge"; + + return true; +} + +static const struct msi_parent_ops x86_vector_msi_parent_ops = { + .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED, + .init_dev_msi_info = x86_init_dev_msi_info, }; struct irq_domain * __init native_create_pci_msi_domain(void) { - struct fwnode_handle *fn; - struct irq_domain *d; - if (disable_apic) return NULL; - fn = irq_domain_alloc_named_fwnode("PCI-MSI"); - if (!fn) - return NULL; - - d = pci_msi_create_irq_domain(fn, &pci_msi_domain_info, - x86_vector_domain); - if (!d) { - irq_domain_free_fwnode(fn); - pr_warn("Failed to initialize PCI-MSI irqdomain.\n"); - } else { - d->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; - } - return d; + x86_vector_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT; + x86_vector_domain->msi_parent_ops = &x86_vector_msi_parent_ops; + return x86_vector_domain; } void __init x86_create_pci_msi_domain(void) @@ -213,41 +282,19 @@ void __init x86_create_pci_msi_domain(void) x86_pci_msi_default_domain = x86_init.irqs.create_pci_msi_domain(); } -#ifdef CONFIG_IRQ_REMAP -static struct irq_chip pci_msi_ir_controller = { - .name = "IR-PCI-MSI", - .irq_unmask = pci_msi_unmask_irq, - .irq_mask = pci_msi_mask_irq, - .irq_ack = irq_chip_ack_parent, - .irq_retrigger = irq_chip_retrigger_hierarchy, - .flags = IRQCHIP_SKIP_SET_WAKE | - IRQCHIP_AFFINITY_PRE_STARTUP, -}; - -static struct msi_domain_info pci_msi_ir_domain_info = { - .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, - .ops = &pci_msi_domain_ops, - .chip = &pci_msi_ir_controller, - .handler = handle_edge_irq, - .handler_name = "edge", -}; - -struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent, - const char *name, int id) +/* Keep around for hyperV */ +int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, + msi_alloc_info_t *arg) { - struct fwnode_handle *fn; - struct irq_domain *d; + init_irq_alloc_info(arg, NULL); - fn = irq_domain_alloc_named_id_fwnode(name, id); - if (!fn) - return NULL; - d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent); - if (!d) - irq_domain_free_fwnode(fn); - return d; + if (to_pci_dev(dev)->msix_enabled) + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; + else + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; + return 0; } -#endif +EXPORT_SYMBOL_GPL(pci_msi_prepare); #ifdef CONFIG_DMAR_TABLE /* diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3e6f6b448f6a..c1efebd27e6c 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -539,10 +539,6 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, if (disable_apic) return -ENXIO; - /* Currently vector allocator can't guarantee contiguous allocations */ - if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1) - return -ENOSYS; - /* * Catch any attempt to touch the cascade interrupt on a PIC * equipped system. 
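The parent/child validation in x86_init_dev_msi_info() boils down to two token checks: which parent domains may sit below a per-device MSI domain, and which child domain types the parent supports (IMS only when explicitly advertised). A standalone model of just that decision logic, with simplified stand-ins for the DOMAIN_BUS_* tokens and MSI_FLAG_* bits used in the patch:

#include <stdbool.h>
#include <stdio.h>

enum bus_token { BUS_ANY, BUS_DMAR, BUS_AMDVI,
                 BUS_PCI_MSI, BUS_PCI_MSIX, BUS_PCI_IMS, BUS_OTHER };

#define FLAG_PCI_IMS 0x1        /* parent advertises IMS support */

/* First switch in x86_init_dev_msi_info(): accepted parent domains. */
static bool parent_ok(enum bus_token parent)
{
        switch (parent) {
        case BUS_ANY:           /* the x86 vector domain itself */
        case BUS_DMAR:          /* Intel interrupt remapping */
        case BUS_AMDVI:         /* AMD interrupt remapping */
                return true;
        default:
                return false;
        }
}

/* Second switch: supported child domain types. */
static bool child_ok(enum bus_token child, unsigned int parent_flags)
{
        switch (child) {
        case BUS_PCI_MSI:
        case BUS_PCI_MSIX:
                return true;
        case BUS_PCI_IMS:       /* only if the parent opted in */
                return parent_flags & FLAG_PCI_IMS;
        default:
                return false;
        }
}

int main(void)
{
        printf("DMAR parent, MSI-X child: %d\n",
               parent_ok(BUS_DMAR) && child_ok(BUS_PCI_MSIX, 0));
        printf("DMAR parent, IMS child:   %d\n",
               parent_ok(BUS_DMAR) && child_ok(BUS_PCI_IMS, 0));
        return 0;
}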
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 437308004ef2..82c783da16a8 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -107,4 +107,9 @@ static void __used common(void) OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); + OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack); +#ifdef CONFIG_CALL_DEPTH_TRACKING + OFFSET(X86_call_depth, pcpu_hot, call_depth); +#endif + } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 9b698215d261..bb65371ea9df 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -57,7 +57,7 @@ int main(void) BLANK(); #ifdef CONFIG_STACKPROTECTOR - DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary)); + OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary); BLANK(); #endif return 0; diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c new file mode 100644 index 000000000000..ffea98f9064b --- /dev/null +++ b/arch/x86/kernel/callthunks.c @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "callthunks: " fmt + +#include <linux/debugfs.h> +#include <linux/kallsyms.h> +#include <linux/memory.h> +#include <linux/moduleloader.h> +#include <linux/static_call.h> + +#include <asm/alternative.h> +#include <asm/asm-offsets.h> +#include <asm/cpu.h> +#include <asm/ftrace.h> +#include <asm/insn.h> +#include <asm/kexec.h> +#include <asm/nospec-branch.h> +#include <asm/paravirt.h> +#include <asm/sections.h> +#include <asm/switch_to.h> +#include <asm/sync_core.h> +#include <asm/text-patching.h> +#include <asm/xen/hypercall.h> + +static int __initdata_or_module debug_callthunks; + +#define prdbg(fmt, args...) 
\ +do { \ + if (debug_callthunks) \ + printk(KERN_DEBUG pr_fmt(fmt), ##args); \ +} while(0) + +static int __init debug_thunks(char *str) +{ + debug_callthunks = 1; + return 1; +} +__setup("debug-callthunks", debug_thunks); + +#ifdef CONFIG_CALL_THUNKS_DEBUG +DEFINE_PER_CPU(u64, __x86_call_count); +DEFINE_PER_CPU(u64, __x86_ret_count); +DEFINE_PER_CPU(u64, __x86_stuffs_count); +DEFINE_PER_CPU(u64, __x86_ctxsw_count); +EXPORT_SYMBOL_GPL(__x86_ctxsw_count); +EXPORT_SYMBOL_GPL(__x86_call_count); +#endif + +extern s32 __call_sites[], __call_sites_end[]; + +struct thunk_desc { + void *template; + unsigned int template_size; +}; + +struct core_text { + unsigned long base; + unsigned long end; + const char *name; +}; + +static bool thunks_initialized __ro_after_init; + +static const struct core_text builtin_coretext = { + .base = (unsigned long)_text, + .end = (unsigned long)_etext, + .name = "builtin", +}; + +asm ( + ".pushsection .rodata \n" + ".global skl_call_thunk_template \n" + "skl_call_thunk_template: \n" + __stringify(INCREMENT_CALL_DEPTH)" \n" + ".global skl_call_thunk_tail \n" + "skl_call_thunk_tail: \n" + ".popsection \n" +); + +extern u8 skl_call_thunk_template[]; +extern u8 skl_call_thunk_tail[]; + +#define SKL_TMPL_SIZE \ + ((unsigned int)(skl_call_thunk_tail - skl_call_thunk_template)) + +extern void error_entry(void); +extern void xen_error_entry(void); +extern void paranoid_entry(void); + +static inline bool within_coretext(const struct core_text *ct, void *addr) +{ + unsigned long p = (unsigned long)addr; + + return ct->base <= p && p < ct->end; +} + +static inline bool within_module_coretext(void *addr) +{ + bool ret = false; + +#ifdef CONFIG_MODULES + struct module *mod; + + preempt_disable(); + mod = __module_address((unsigned long)addr); + if (mod && within_module_core((unsigned long)addr, mod)) + ret = true; + preempt_enable(); +#endif + return ret; +} + +static bool is_coretext(const struct core_text *ct, void *addr) +{ + if (ct && within_coretext(ct, addr)) + return true; + if (within_coretext(&builtin_coretext, addr)) + return true; + return within_module_coretext(addr); +} + +static bool skip_addr(void *dest) +{ + if (dest == error_entry) + return true; + if (dest == paranoid_entry) + return true; + if (dest == xen_error_entry) + return true; + /* Does FILL_RSB... */ + if (dest == __switch_to_asm) + return true; + /* Accounts directly */ + if (dest == ret_from_fork) + return true; +#ifdef CONFIG_HOTPLUG_CPU + if (dest == start_cpu0) + return true; +#endif +#ifdef CONFIG_FUNCTION_TRACER + if (dest == __fentry__) + return true; +#endif +#ifdef CONFIG_KEXEC_CORE + if (dest >= (void *)relocate_kernel && + dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE) + return true; +#endif +#ifdef CONFIG_XEN + if (dest >= (void *)hypercall_page && + dest < (void*)hypercall_page + PAGE_SIZE) + return true; +#endif + return false; +} + +static __init_or_module void *call_get_dest(void *addr) +{ + struct insn insn; + void *dest; + int ret; + + ret = insn_decode_kernel(&insn, addr); + if (ret) + return ERR_PTR(ret); + + /* Patched out call? 
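call_get_dest() above relies on the in-kernel instruction decoder; for the direct-call case it cares about, the arithmetic reduces to "destination = address of the next instruction plus the signed rel32". A standalone model of that case only (e8 rel32; the kernel version additionally filters indirect or patched-out calls and the skip_addr() targets):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void *near_call_dest(uint8_t *addr)
{
        int32_t rel;

        if (addr[0] != 0xe8)    /* CALL_INSN_OPCODE; anything else is skipped */
                return NULL;

        memcpy(&rel, addr + 1, sizeof(rel));
        return addr + 5 + rel;  /* next instruction plus signed displacement */
}

int main(void)
{
        uint8_t code[16] = { 0xe8, 0x03, 0x00, 0x00, 0x00 };   /* call .+3 */

        printf("destination at offset %td\n",
               (uint8_t *)near_call_dest(code) - code);
        return 0;
}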
*/ + if (insn.opcode.bytes[0] != CALL_INSN_OPCODE) + return NULL; + + dest = addr + insn.length + insn.immediate.value; + if (skip_addr(dest)) + return NULL; + return dest; +} + +static const u8 nops[] = { + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, +}; + +static void *patch_dest(void *dest, bool direct) +{ + unsigned int tsize = SKL_TMPL_SIZE; + u8 *pad = dest - tsize; + + /* Already patched? */ + if (!bcmp(pad, skl_call_thunk_template, tsize)) + return pad; + + /* Ensure there are nops */ + if (bcmp(pad, nops, tsize)) { + pr_warn_once("Invalid padding area for %pS\n", dest); + return NULL; + } + + if (direct) + memcpy(pad, skl_call_thunk_template, tsize); + else + text_poke_copy_locked(pad, skl_call_thunk_template, tsize, true); + return pad; +} + +static __init_or_module void patch_call(void *addr, const struct core_text *ct) +{ + void *pad, *dest; + u8 bytes[8]; + + if (!within_coretext(ct, addr)) + return; + + dest = call_get_dest(addr); + if (!dest || WARN_ON_ONCE(IS_ERR(dest))) + return; + + if (!is_coretext(ct, dest)) + return; + + pad = patch_dest(dest, within_coretext(ct, dest)); + if (!pad) + return; + + prdbg("Patch call at: %pS %px to %pS %px -> %px \n", addr, addr, + dest, dest, pad); + __text_gen_insn(bytes, CALL_INSN_OPCODE, addr, pad, CALL_INSN_SIZE); + text_poke_early(addr, bytes, CALL_INSN_SIZE); +} + +static __init_or_module void +patch_call_sites(s32 *start, s32 *end, const struct core_text *ct) +{ + s32 *s; + + for (s = start; s < end; s++) + patch_call((void *)s + *s, ct); +} + +static __init_or_module void +patch_paravirt_call_sites(struct paravirt_patch_site *start, + struct paravirt_patch_site *end, + const struct core_text *ct) +{ + struct paravirt_patch_site *p; + + for (p = start; p < end; p++) + patch_call(p->instr, ct); +} + +static __init_or_module void +callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct) +{ + prdbg("Patching call sites %s\n", ct->name); + patch_call_sites(cs->call_start, cs->call_end, ct); + patch_paravirt_call_sites(cs->pv_start, cs->pv_end, ct); + prdbg("Patching call sites done%s\n", ct->name); +} + +void __init callthunks_patch_builtin_calls(void) +{ + struct callthunk_sites cs = { + .call_start = __call_sites, + .call_end = __call_sites_end, + .pv_start = __parainstructions, + .pv_end = __parainstructions_end + }; + + if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) + return; + + pr_info("Setting up call depth tracking\n"); + mutex_lock(&text_mutex); + callthunks_setup(&cs, &builtin_coretext); + static_call_force_reinit(); + thunks_initialized = true; + mutex_unlock(&text_mutex); +} + +void *callthunks_translate_call_dest(void *dest) +{ + void *target; + + lockdep_assert_held(&text_mutex); + + if (!thunks_initialized || skip_addr(dest)) + return dest; + + if (!is_coretext(NULL, dest)) + return dest; + + target = patch_dest(dest, false); + return target ? 
: dest; +} + +bool is_callthunk(void *addr) +{ + unsigned int tmpl_size = SKL_TMPL_SIZE; + void *tmpl = skl_call_thunk_template; + unsigned long dest; + + dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT); + if (!thunks_initialized || skip_addr((void *)dest)) + return false; + + return !bcmp((void *)(dest - tmpl_size), tmpl, tmpl_size); +} + +#ifdef CONFIG_BPF_JIT +int x86_call_depth_emit_accounting(u8 **pprog, void *func) +{ + unsigned int tmpl_size = SKL_TMPL_SIZE; + void *tmpl = skl_call_thunk_template; + + if (!thunks_initialized) + return 0; + + /* Is function call target a thunk? */ + if (func && is_callthunk(func)) + return 0; + + memcpy(*pprog, tmpl, tmpl_size); + *pprog += tmpl_size; + return tmpl_size; +} +#endif + +#ifdef CONFIG_MODULES +void noinline callthunks_patch_module_calls(struct callthunk_sites *cs, + struct module *mod) +{ + struct core_text ct = { + .base = (unsigned long)mod->core_layout.base, + .end = (unsigned long)mod->core_layout.base + mod->core_layout.size, + .name = mod->name, + }; + + if (!thunks_initialized) + return; + + mutex_lock(&text_mutex); + callthunks_setup(cs, &ct); + mutex_unlock(&text_mutex); +} +#endif /* CONFIG_MODULES */ + +#if defined(CONFIG_CALL_THUNKS_DEBUG) && defined(CONFIG_DEBUG_FS) +static int callthunks_debug_show(struct seq_file *m, void *p) +{ + unsigned long cpu = (unsigned long)m->private; + + seq_printf(m, "C: %16llu R: %16llu S: %16llu X: %16llu\n,", + per_cpu(__x86_call_count, cpu), + per_cpu(__x86_ret_count, cpu), + per_cpu(__x86_stuffs_count, cpu), + per_cpu(__x86_ctxsw_count, cpu)); + return 0; +} + +static int callthunks_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, callthunks_debug_show, inode->i_private); +} + +static const struct file_operations dfs_ops = { + .open = callthunks_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init callthunks_debugfs_init(void) +{ + struct dentry *dir; + unsigned long cpu; + + dir = debugfs_create_dir("callthunks", NULL); + for_each_possible_cpu(cpu) { + void *arg = (void *)cpu; + char name [10]; + + sprintf(name, "cpu%lu", cpu); + debugfs_create_file(name, 0644, dir, arg, &dfs_ops); + } + return 0; +} +__initcall(callthunks_debugfs_init); +#endif diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index f10a921ee756..d7e3ceaf75c1 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -17,9 +17,6 @@ KMSAN_SANITIZE_common.o := n # As above, instrumenting secondary CPU boot code causes boot hangs. KCSAN_SANITIZE_common.o := n -# Make sure load_percpu_segment has no stackprotector -CFLAGS_common.o := -fno-stack-protector - obj-y := cacheinfo.o scattered.o topology.o obj-y += common.o obj-y += rdrand.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 860b60273df3..f769d6d08b43 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -770,8 +770,6 @@ static void init_amd_gh(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); } -#define MSR_AMD64_DE_CFG 0xC0011029 - static void init_amd_ln(struct cpuinfo_x86 *c) { /* @@ -965,8 +963,8 @@ static void init_amd(struct cpuinfo_x86 *c) * msr_set_bit() uses the safe accessors, too, even if the MSR * is not present. 
*/ - msr_set_bit(MSR_F10H_DECFG, - MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + msr_set_bit(MSR_AMD64_DE_CFG, + MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT); /* A serializing LFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); @@ -985,7 +983,7 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */ - if (!cpu_has(c, X86_FEATURE_XENPV)) + if (!cpu_feature_enabled(X86_FEATURE_XENPV)) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); /* diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 3e3230cccaa7..bca0bd8f4846 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -60,11 +60,18 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); static DEFINE_MUTEX(spec_ctrl_mutex); +/* Update SPEC_CTRL MSR and its cached copy unconditionally */ +static void update_spec_ctrl(u64 val) +{ + this_cpu_write(x86_spec_ctrl_current, val); + wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + /* * Keep track of the SPEC_CTRL MSR value for the current task, which may differ * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update(). */ -void write_spec_ctrl_current(u64 val, bool force) +void update_spec_ctrl_cond(u64 val) { if (this_cpu_read(x86_spec_ctrl_current) == val) return; @@ -75,7 +82,7 @@ void write_spec_ctrl_current(u64 val, bool force) * When KERNEL_IBRS this MSR is written on return-to-user, unless * forced the update can be delayed until that time. */ - if (force || !cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) + if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) wrmsrl(MSR_IA32_SPEC_CTRL, val); } @@ -780,6 +787,7 @@ enum retbleed_mitigation { RETBLEED_MITIGATION_IBPB, RETBLEED_MITIGATION_IBRS, RETBLEED_MITIGATION_EIBRS, + RETBLEED_MITIGATION_STUFF, }; enum retbleed_mitigation_cmd { @@ -787,6 +795,7 @@ enum retbleed_mitigation_cmd { RETBLEED_CMD_AUTO, RETBLEED_CMD_UNRET, RETBLEED_CMD_IBPB, + RETBLEED_CMD_STUFF, }; static const char * const retbleed_strings[] = { @@ -795,6 +804,7 @@ static const char * const retbleed_strings[] = { [RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB", [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS", [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS", + [RETBLEED_MITIGATION_STUFF] = "Mitigation: Stuffing", }; static enum retbleed_mitigation retbleed_mitigation __ro_after_init = @@ -824,8 +834,12 @@ static int __init retbleed_parse_cmdline(char *str) retbleed_cmd = RETBLEED_CMD_UNRET; } else if (!strcmp(str, "ibpb")) { retbleed_cmd = RETBLEED_CMD_IBPB; + } else if (!strcmp(str, "stuff")) { + retbleed_cmd = RETBLEED_CMD_STUFF; } else if (!strcmp(str, "nosmt")) { retbleed_nosmt = true; + } else if (!strcmp(str, "force")) { + setup_force_cpu_bug(X86_BUG_RETBLEED); } else { pr_err("Ignoring unknown retbleed option (%s).", str); } @@ -872,6 +886,21 @@ static void __init retbleed_select_mitigation(void) } break; + case RETBLEED_CMD_STUFF: + if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING) && + spectre_v2_enabled == SPECTRE_V2_RETPOLINE) { + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; + + } else { + if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING)) + pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); + else + pr_err("WARNING: kernel not compiled with CALL_DEPTH_TRACKING.\n"); + + goto do_cmd_auto; + } + break; + do_cmd_auto: case RETBLEED_CMD_AUTO: default: @@ -909,6 +938,12 @@ do_cmd_auto: mitigate_smt = true; break; + case RETBLEED_MITIGATION_STUFF: + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + 
setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); + x86_set_skl_return_thunk(); + break; + default: break; } @@ -919,7 +954,7 @@ do_cmd_auto: /* * Let IBRS trump all on Intel without affecting the effects of the - * retbleed= cmdline option. + * retbleed= cmdline option except for call depth based stuffing */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { switch (spectre_v2_enabled) { @@ -932,7 +967,8 @@ do_cmd_auto: retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; break; default: - pr_err(RETBLEED_INTEL_MSG); + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + pr_err(RETBLEED_INTEL_MSG); } } @@ -1295,7 +1331,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return SPECTRE_V2_CMD_AUTO; } - if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) { + if (cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) { pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; @@ -1328,7 +1364,7 @@ static void __init spec_ctrl_disable_kernel_rrsba(void) if (ia32_cap & ARCH_CAP_RRSBA) { x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; - write_spec_ctrl_current(x86_spec_ctrl_base, true); + update_spec_ctrl(x86_spec_ctrl_base); } } @@ -1406,6 +1442,7 @@ static void __init spectre_v2_select_mitigation(void) if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) && boot_cpu_has_bug(X86_BUG_RETBLEED) && retbleed_cmd != RETBLEED_CMD_OFF && + retbleed_cmd != RETBLEED_CMD_STUFF && boot_cpu_has(X86_FEATURE_IBRS) && boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { mode = SPECTRE_V2_IBRS; @@ -1450,7 +1487,7 @@ static void __init spectre_v2_select_mitigation(void) if (spectre_v2_in_ibrs_mode(mode)) { x86_spec_ctrl_base |= SPEC_CTRL_IBRS; - write_spec_ctrl_current(x86_spec_ctrl_base, true); + update_spec_ctrl(x86_spec_ctrl_base); } switch (mode) { @@ -1564,7 +1601,7 @@ static void __init spectre_v2_select_mitigation(void) static void update_stibp_msr(void * __unused) { u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP); - write_spec_ctrl_current(val, true); + update_spec_ctrl(val); } /* Update x86_spec_ctrl_base in case SMT state changed. 
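The split between update_spec_ctrl() and update_spec_ctrl_cond() above is about when the MSR itself gets written: both refresh the cached per-CPU value, but the conditional variant can leave the hardware write to the return-to-user path when KERNEL_IBRS is in use. A standalone toy of that behaviour, with the MSR write stubbed out as a printf and the feature check reduced to a boolean:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t spec_ctrl_cached;       /* stands in for x86_spec_ctrl_current */
static bool kernel_ibrs = true;         /* stands in for X86_FEATURE_KERNEL_IBRS */

static void fake_wrmsrl(uint64_t val)
{
        printf("wrmsr SPEC_CTRL <- %#llx\n", (unsigned long long)val);
}

/* update_spec_ctrl(): unconditional, used for boot-time base changes */
static void update_spec_ctrl(uint64_t val)
{
        spec_ctrl_cached = val;
        fake_wrmsrl(val);
}

/* update_spec_ctrl_cond(): used on task switch; may defer the MSR write */
static void update_spec_ctrl_cond(uint64_t val)
{
        if (spec_ctrl_cached == val)
                return;                 /* nothing changed, no work at all */

        spec_ctrl_cached = val;
        if (!kernel_ibrs)
                fake_wrmsrl(val);       /* else written on return to user */
}

int main(void)
{
        update_spec_ctrl(0x1);          /* e.g. setting SPEC_CTRL_IBRS at boot */
        update_spec_ctrl_cond(0x3);     /* task switch: cache updated only */
        update_spec_ctrl_cond(0x3);     /* unchanged: early return */
        return 0;
}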
*/ @@ -1797,7 +1834,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) x86_amd_ssb_disable(); } else { x86_spec_ctrl_base |= SPEC_CTRL_SSBD; - write_spec_ctrl_current(x86_spec_ctrl_base, true); + update_spec_ctrl(x86_spec_ctrl_base); } } @@ -1944,6 +1981,8 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) if (ctrl == PR_SPEC_FORCE_DISABLE) task_set_spec_ib_force_disable(task); task_update_spec_tif(task); + if (task == current) + indirect_branch_prediction_barrier(); break; default: return -ERANGE; @@ -2048,7 +2087,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) void x86_spec_ctrl_setup_ap(void) { if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) - write_spec_ctrl_current(x86_spec_ctrl_base, true); + update_spec_ctrl(x86_spec_ctrl_base); if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) x86_amd_ssb_disable(); @@ -2199,74 +2238,74 @@ static const char * const l1tf_vmx_states[] = { static ssize_t l1tf_show_state(char *buf) { if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) - return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); + return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG); if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED || (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER && sched_smt_active())) { - return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG, - l1tf_vmx_states[l1tf_vmx_mitigation]); + return sysfs_emit(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation]); } - return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG, - l1tf_vmx_states[l1tf_vmx_mitigation], - sched_smt_active() ? "vulnerable" : "disabled"); + return sysfs_emit(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); } static ssize_t itlb_multihit_show_state(char *buf) { if (!boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || !boot_cpu_has(X86_FEATURE_VMX)) - return sprintf(buf, "KVM: Mitigation: VMX unsupported\n"); + return sysfs_emit(buf, "KVM: Mitigation: VMX unsupported\n"); else if (!(cr4_read_shadow() & X86_CR4_VMXE)) - return sprintf(buf, "KVM: Mitigation: VMX disabled\n"); + return sysfs_emit(buf, "KVM: Mitigation: VMX disabled\n"); else if (itlb_multihit_kvm_mitigation) - return sprintf(buf, "KVM: Mitigation: Split huge pages\n"); + return sysfs_emit(buf, "KVM: Mitigation: Split huge pages\n"); else - return sprintf(buf, "KVM: Vulnerable\n"); + return sysfs_emit(buf, "KVM: Vulnerable\n"); } #else static ssize_t l1tf_show_state(char *buf) { - return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); + return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG); } static ssize_t itlb_multihit_show_state(char *buf) { - return sprintf(buf, "Processor vulnerable\n"); + return sysfs_emit(buf, "Processor vulnerable\n"); } #endif static ssize_t mds_show_state(char *buf) { if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { - return sprintf(buf, "%s; SMT Host state unknown\n", - mds_strings[mds_mitigation]); + return sysfs_emit(buf, "%s; SMT Host state unknown\n", + mds_strings[mds_mitigation]); } if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) { - return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], - (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" : - sched_smt_active() ? "mitigated" : "disabled")); + return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], + (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" : + sched_smt_active() ? 
"mitigated" : "disabled")); } - return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], - sched_smt_active() ? "vulnerable" : "disabled"); + return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); } static ssize_t tsx_async_abort_show_state(char *buf) { if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || (taa_mitigation == TAA_MITIGATION_OFF)) - return sprintf(buf, "%s\n", taa_strings[taa_mitigation]); + return sysfs_emit(buf, "%s\n", taa_strings[taa_mitigation]); if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { - return sprintf(buf, "%s; SMT Host state unknown\n", - taa_strings[taa_mitigation]); + return sysfs_emit(buf, "%s; SMT Host state unknown\n", + taa_strings[taa_mitigation]); } - return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], - sched_smt_active() ? "vulnerable" : "disabled"); + return sysfs_emit(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); } static ssize_t mmio_stale_data_show_state(char *buf) @@ -2334,73 +2373,72 @@ static char *pbrsb_eibrs_state(void) static ssize_t spectre_v2_show_state(char *buf) { if (spectre_v2_enabled == SPECTRE_V2_LFENCE) - return sprintf(buf, "Vulnerable: LFENCE\n"); + return sysfs_emit(buf, "Vulnerable: LFENCE\n"); if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) - return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); if (sched_smt_active() && unprivileged_ebpf_enabled() && spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) - return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); + return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); - return sprintf(buf, "%s%s%s%s%s%s%s\n", - spectre_v2_strings[spectre_v2_enabled], - ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", - pbrsb_eibrs_state(), - spectre_v2_module_string()); + return sysfs_emit(buf, "%s%s%s%s%s%s%s\n", + spectre_v2_strings[spectre_v2_enabled], + ibpb_state(), + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + stibp_state(), + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + pbrsb_eibrs_state(), + spectre_v2_module_string()); } static ssize_t srbds_show_state(char *buf) { - return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); + return sysfs_emit(buf, "%s\n", srbds_strings[srbds_mitigation]); } static ssize_t retbleed_show_state(char *buf) { if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) - return sprintf(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n"); + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return sysfs_emit(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n"); - return sprintf(buf, "%s; SMT %s\n", - retbleed_strings[retbleed_mitigation], - !sched_smt_active() ? "disabled" : - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ? - "enabled with STIBP protection" : "vulnerable"); + return sysfs_emit(buf, "%s; SMT %s\n", retbleed_strings[retbleed_mitigation], + !sched_smt_active() ? 
"disabled" : + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ? + "enabled with STIBP protection" : "vulnerable"); } - return sprintf(buf, "%s\n", retbleed_strings[retbleed_mitigation]); + return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]); } static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { if (!boot_cpu_has_bug(bug)) - return sprintf(buf, "Not affected\n"); + return sysfs_emit(buf, "Not affected\n"); switch (bug) { case X86_BUG_CPU_MELTDOWN: if (boot_cpu_has(X86_FEATURE_PTI)) - return sprintf(buf, "Mitigation: PTI\n"); + return sysfs_emit(buf, "Mitigation: PTI\n"); if (hypervisor_is_type(X86_HYPER_XEN_PV)) - return sprintf(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n"); + return sysfs_emit(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n"); break; case X86_BUG_SPECTRE_V1: - return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); + return sysfs_emit(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: return spectre_v2_show_state(buf); case X86_BUG_SPEC_STORE_BYPASS: - return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); + return sysfs_emit(buf, "%s\n", ssb_strings[ssb_mode]); case X86_BUG_L1TF: if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) @@ -2430,7 +2468,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr break; } - return sprintf(buf, "Vulnerable\n"); + return sysfs_emit(buf, "Vulnerable\n"); } ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 66556833d7af..f4e5aa27eec6 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -11,15 +11,19 @@ #include <linux/slab.h> #include <linux/cacheinfo.h> #include <linux/cpu.h> +#include <linux/cpuhotplug.h> #include <linux/sched.h> #include <linux/capability.h> #include <linux/sysfs.h> #include <linux/pci.h> +#include <linux/stop_machine.h> #include <asm/cpufeature.h> #include <asm/cacheinfo.h> #include <asm/amd_nb.h> #include <asm/smp.h> +#include <asm/mtrr.h> +#include <asm/tlbflush.h> #include "cpu.h" @@ -35,6 +39,9 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); /* Shared L2 cache maps */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); +/* Kernel controls MTRR and/or PAT MSRs. */ +unsigned int memory_caching_control __ro_after_init; + struct _cache_table { unsigned char descriptor; char cache_type; @@ -1040,3 +1047,175 @@ int populate_cache_leaves(unsigned int cpu) return 0; } + +/* + * Disable and enable caches. Needed for changing MTRRs and the PAT MSR. + * + * Since we are disabling the cache don't allow any interrupts, + * they would run extremely slow and would only increase the pain. + * + * The caller must ensure that local interrupts are disabled and + * are reenabled after cache_enable() has been called. + */ +static unsigned long saved_cr4; +static DEFINE_RAW_SPINLOCK(cache_disable_lock); + +void cache_disable(void) __acquires(cache_disable_lock) +{ + unsigned long cr0; + + /* + * Note that this is not ideal + * since the cache is only flushed/disabled for this CPU while the + * MTRRs are changed, but changing this requires more invasive + * changes to the way the kernel boots + */ + + raw_spin_lock(&cache_disable_lock); + + /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. 
*/ + cr0 = read_cr0() | X86_CR0_CD; + write_cr0(cr0); + + /* + * Cache flushing is the most time-consuming step when programming + * the MTRRs. Fortunately, as per the Intel Software Development + * Manual, we can skip it if the processor supports cache self- + * snooping. + */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); + + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if (cpu_feature_enabled(X86_FEATURE_PGE)) { + saved_cr4 = __read_cr4(); + __write_cr4(saved_cr4 & ~X86_CR4_PGE); + } + + /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + flush_tlb_local(); + + if (cpu_feature_enabled(X86_FEATURE_MTRR)) + mtrr_disable(); + + /* Again, only flush caches if we have to. */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); +} + +void cache_enable(void) __releases(cache_disable_lock) +{ + /* Flush TLBs (no need to flush caches - they are disabled) */ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + flush_tlb_local(); + + if (cpu_feature_enabled(X86_FEATURE_MTRR)) + mtrr_enable(); + + /* Enable caches */ + write_cr0(read_cr0() & ~X86_CR0_CD); + + /* Restore value of CR4 */ + if (cpu_feature_enabled(X86_FEATURE_PGE)) + __write_cr4(saved_cr4); + + raw_spin_unlock(&cache_disable_lock); +} + +static void cache_cpu_init(void) +{ + unsigned long flags; + + local_irq_save(flags); + cache_disable(); + + if (memory_caching_control & CACHE_MTRR) + mtrr_generic_set_state(); + + if (memory_caching_control & CACHE_PAT) + pat_cpu_init(); + + cache_enable(); + local_irq_restore(flags); +} + +static bool cache_aps_delayed_init = true; + +void set_cache_aps_delayed_init(bool val) +{ + cache_aps_delayed_init = val; +} + +bool get_cache_aps_delayed_init(void) +{ + return cache_aps_delayed_init; +} + +static int cache_rendezvous_handler(void *unused) +{ + if (get_cache_aps_delayed_init() || !cpu_online(smp_processor_id())) + cache_cpu_init(); + + return 0; +} + +void __init cache_bp_init(void) +{ + mtrr_bp_init(); + pat_bp_init(); + + if (memory_caching_control) + cache_cpu_init(); +} + +void cache_bp_restore(void) +{ + if (memory_caching_control) + cache_cpu_init(); +} + +static int cache_ap_init(unsigned int cpu) +{ + if (!memory_caching_control || get_cache_aps_delayed_init()) + return 0; + + /* + * Ideally we should hold mtrr_mutex here to avoid MTRR entries + * changed, but this routine will be called in CPU boot time, + * holding the lock breaks it. + * + * This routine is called in two cases: + * + * 1. very early time of software resume, when there absolutely + * isn't MTRR entry changes; + * + * 2. CPU hotadd time. 
We let mtrr_add/del_page hold cpuhotplug + * lock to prevent MTRR entry changes + */ + stop_machine_from_inactive_cpu(cache_rendezvous_handler, NULL, + cpu_callout_mask); + + return 0; +} + +/* + * Delayed cache initialization for all AP's + */ +void cache_aps_init(void) +{ + if (!memory_caching_control || !get_cache_aps_delayed_init()) + return; + + stop_machine(cache_rendezvous_handler, NULL, cpu_online_mask); + set_cache_aps_delayed_init(false); +} + +static int __init cache_ap_register(void) +{ + cpuhp_setup_state_nocalls(CPUHP_AP_CACHECTRL_STARTING, + "x86/cachectrl:starting", + cache_ap_init, NULL); + return 0; +} +core_initcall(cache_ap_register); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3e508f239098..9cfca3d7d0e2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -22,9 +22,9 @@ #include <linux/io.h> #include <linux/syscore_ops.h> #include <linux/pgtable.h> +#include <linux/stackprotector.h> #include <asm/cmdline.h> -#include <asm/stackprotector.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/doublefault.h> @@ -52,6 +52,7 @@ #include <asm/cpu.h> #include <asm/mce.h> #include <asm/msr.h> +#include <asm/cacheinfo.h> #include <asm/memtype.h> #include <asm/microcode.h> #include <asm/microcode_intel.h> @@ -609,6 +610,7 @@ static __always_inline void setup_cet(struct cpuinfo_x86 *c) if (!ibt_selftest()) { pr_err("IBT selftest: Failed!\n"); + wrmsrl(MSR_IA32_S_CET, 0); setup_clear_cpu_cap(X86_FEATURE_IBT); return; } @@ -701,16 +703,6 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); __u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); -void load_percpu_segment(int cpu) -{ -#ifdef CONFIG_X86_32 - loadsegment(fs, __KERNEL_PERCPU); -#else - __loadsegment_simple(gs, 0); - wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); -#endif -} - #ifdef CONFIG_X86_32 /* The 32-bit entry code needs to find cpu_entry_area. */ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); @@ -738,16 +730,45 @@ void load_fixmap_gdt(int cpu) } EXPORT_SYMBOL_GPL(load_fixmap_gdt); -/* - * Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. +/** + * switch_gdt_and_percpu_base - Switch to direct GDT and runtime per CPU base + * @cpu: The CPU number for which this is invoked + * + * Invoked during early boot to switch from early GDT and early per CPU to + * the direct GDT and the runtime per CPU area. On 32-bit the percpu base + * switch is implicit by loading the direct GDT. On 64bit this requires + * to update GSBASE. */ -void switch_to_new_gdt(int cpu) +void __init switch_gdt_and_percpu_base(int cpu) { - /* Load the original GDT */ load_direct_gdt(cpu); - /* Reload the per-cpu base */ - load_percpu_segment(cpu); + +#ifdef CONFIG_X86_64 + /* + * No need to load %gs. It is already correct. + * + * Writing %gs on 64bit would zero GSBASE which would make any per + * CPU operation up to the point of the wrmsrl() fault. + * + * Set GSBASE to the new offset. Until the wrmsrl() happens the + * early mapping is still valid. That means the GSBASE update will + * lose any prior per CPU data which was not copied over in + * setup_per_cpu_areas(). + * + * This works even with stackprotector enabled because the + * per CPU stack canary is 0 in both per CPU areas. 
+ */ + wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); +#else + /* + * %fs is already set to __KERNEL_PERCPU, but after switching GDT + * it is required to load FS again so that the 'hidden' part is + * updated from the new GDT. Up to this point the early per CPU + * translation is active. Any content of the early per CPU data + * which was not copied over in setup_per_cpu_areas() is lost. + */ + loadsegment(fs, __KERNEL_PERCPU); +#endif } static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; @@ -1948,7 +1969,6 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_32 enable_sep_cpu(); #endif - mtrr_ap_init(); validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); update_srbds_msr(); @@ -1993,27 +2013,18 @@ static __init int setup_clearcpuid(char *arg) } __setup("clearcpuid=", setup_clearcpuid); +DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { + .current_task = &init_task, + .preempt_count = INIT_PREEMPT_COUNT, + .top_of_stack = TOP_OF_INIT_STACK, +}; +EXPORT_PER_CPU_SYMBOL(pcpu_hot); + #ifdef CONFIG_X86_64 DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __aligned(PAGE_SIZE) __visible; EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); -/* - * The following percpu variables are hot. Align current_task to - * cacheline size such that they fall in the same cacheline. - */ -DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = - &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - -DEFINE_PER_CPU(void *, hardirq_stack_ptr); -DEFINE_PER_CPU(bool, hardirq_stack_inuse); - -DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; -EXPORT_PER_CPU_SYMBOL(__preempt_count); - -DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; - static void wrmsrl_cstar(unsigned long val) { /* @@ -2064,20 +2075,6 @@ void syscall_init(void) #else /* CONFIG_X86_64 */ -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; -EXPORT_PER_CPU_SYMBOL(__preempt_count); - -/* - * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find - * the top of the kernel stack. Use an extra percpu variable to track the - * top of the kernel stack directly. 
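With current_task, preempt_count and top_of_stack folded into the single pcpu_hot structure, the hot accessors all become one GS-relative load from the same cache line. A fragment-level sketch of what such accessors look like (kernel context assumed; the sketch_* names are made up, and the real helpers may use the stable/const-aliased per-CPU read variants instead):

/* Sketch only, assumes <asm/percpu.h> and the pcpu_hot definition above. */
static __always_inline struct task_struct *sketch_get_current(void)
{
        return this_cpu_read(pcpu_hot.current_task);
}

static __always_inline unsigned long sketch_top_of_stack(void)
{
        return this_cpu_read(pcpu_hot.top_of_stack);
}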
- */ -DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = - (unsigned long)&init_thread_union + THREAD_SIZE; -EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); - #ifdef CONFIG_STACKPROTECTOR DEFINE_PER_CPU(unsigned long, __stack_chk_guard); EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); @@ -2248,12 +2245,6 @@ void cpu_init(void) boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE)) cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - switch_to_new_gdt(cpu); - if (IS_ENABLED(CONFIG_X86_64)) { loadsegment(fs, 0); memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index c881bcafba7d..d95221117129 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -75,6 +75,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, { X86_FEATURE_SGX1, X86_FEATURE_SGX }, { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, + { X86_FEATURE_SGX_EDECCSSA, X86_FEATURE_SGX1 }, { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 21fd425088fe..5a2962c492d3 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -326,8 +326,8 @@ static void init_hygon(struct cpuinfo_x86 *c) * msr_set_bit() uses the safe accessors, too, even if the MSR * is not present. */ - msr_set_bit(MSR_F10H_DECFG, - MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + msr_set_bit(MSR_AMD64_DE_CFG, + MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT); /* A serializing LFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); @@ -339,7 +339,7 @@ static void init_hygon(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARAT); /* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */ - if (!cpu_has(c, X86_FEATURE_XENPV)) + if (!cpu_feature_enabled(X86_FEATURE_XENPV)) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); check_null_seg_clears_base(c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 2d7ea5480ec3..291d4167fab8 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -210,12 +210,154 @@ int intel_cpu_collect_info(struct ucode_cpu_info *uci) csig.rev = intel_get_microcode_revision(); uci->cpu_sig = csig; - uci->valid = 1; return 0; } EXPORT_SYMBOL_GPL(intel_cpu_collect_info); +/* + * Returns 1 if update has been found, 0 otherwise. + */ +int intel_find_matching_signature(void *mc, unsigned int csig, int cpf) +{ + struct microcode_header_intel *mc_hdr = mc; + struct extended_sigtable *ext_hdr; + struct extended_signature *ext_sig; + int i; + + if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) + return 1; + + /* Look for ext. headers: */ + if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) + return 0; + + ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; + ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; + + for (i = 0; i < ext_hdr->count; i++) { + if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) + return 1; + ext_sig++; + } + return 0; +} +EXPORT_SYMBOL_GPL(intel_find_matching_signature); + +/** + * intel_microcode_sanity_check() - Sanity check microcode file. + * @mc: Pointer to the microcode file contents. + * @print_err: Display failure reason if true, silent if false. + * @hdr_type: Type of file, i.e. 
normal microcode file or In Field Scan file. + * Validate if the microcode header type matches with the type + * specified here. + * + * Validate certain header fields and verify if computed checksum matches + * with the one specified in the header. + * + * Return: 0 if the file passes all the checks, -EINVAL if any of the checks + * fail. + */ +int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type) +{ + unsigned long total_size, data_size, ext_table_size; + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + u32 sum, orig_sum, ext_sigcount = 0, i; + struct extended_signature *ext_sig; + + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); + + if (data_size + MC_HEADER_SIZE > total_size) { + if (print_err) + pr_err("Error: bad microcode data file size.\n"); + return -EINVAL; + } + + if (mc_header->ldrver != 1 || mc_header->hdrver != hdr_type) { + if (print_err) + pr_err("Error: invalid/unknown microcode update format. Header type %d\n", + mc_header->hdrver); + return -EINVAL; + } + + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + u32 ext_table_sum = 0; + u32 *ext_tablep; + + if (ext_table_size < EXT_HEADER_SIZE || + ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + if (print_err) + pr_err("Error: truncated extended signature table.\n"); + return -EINVAL; + } + + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + if (print_err) + pr_err("Error: extended signature table size mismatch.\n"); + return -EFAULT; + } + + ext_sigcount = ext_header->count; + + /* + * Check extended table checksum: the sum of all dwords that + * comprise a valid table must be 0. + */ + ext_tablep = (u32 *)ext_header; + + i = ext_table_size / sizeof(u32); + while (i--) + ext_table_sum += ext_tablep[i]; + + if (ext_table_sum) { + if (print_err) + pr_warn("Bad extended signature table checksum, aborting.\n"); + return -EINVAL; + } + } + + /* + * Calculate the checksum of update data and header. The checksum of + * valid update data and header including the extended signature table + * must be 0. + */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / sizeof(u32); + while (i--) + orig_sum += ((u32 *)mc)[i]; + + if (orig_sum) { + if (print_err) + pr_err("Bad microcode data checksum, aborting.\n"); + return -EINVAL; + } + + if (!ext_table_size) + return 0; + + /* + * Check extended signature checksum: 0 => valid. 
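The two whole-region checksums in intel_microcode_sanity_check() (header plus data, and the extended signature table) rely on the same convention: the region must sum to zero when added up as 32-bit words, so the producer stores the negated sum of everything else in the checksum field. A standalone illustration of that rule on a toy blob:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t sum_words(const uint32_t *p, size_t words)
{
        uint32_t sum = 0;

        while (words--)
                sum += p[words];
        return sum;
}

int main(void)
{
        uint32_t blob[8] = { 1, 2, 3, 4, 5, 6, 7, 0 };

        /* Producer: pick the last word so the 32-bit sum wraps to zero. */
        blob[7] = 0u - sum_words(blob, 7);

        /* Consumer check, as done above for header+data and the ext table. */
        printf("checksum %s\n", sum_words(blob, 8) == 0 ? "ok" : "bad");
        return 0;
}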
+ */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + + sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + if (print_err) + pr_err("Bad extended signature checksum, aborting.\n"); + return -EINVAL; + } + } + return 0; +} +EXPORT_SYMBOL_GPL(intel_microcode_sanity_check); + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -1034,8 +1176,32 @@ static const struct { static struct ratelimit_state bld_ratelimit; +static unsigned int sysctl_sld_mitigate = 1; static DEFINE_SEMAPHORE(buslock_sem); +#ifdef CONFIG_PROC_SYSCTL +static struct ctl_table sld_sysctls[] = { + { + .procname = "split_lock_mitigate", + .data = &sysctl_sld_mitigate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init sld_mitigate_sysctl_init(void) +{ + register_sysctl_init("kernel", sld_sysctls); + return 0; +} + +late_initcall(sld_mitigate_sysctl_init); +#endif + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt), ratelimit; @@ -1146,12 +1312,20 @@ static void split_lock_init(void) split_lock_verify_msr(sld_state != sld_off); } -static void __split_lock_reenable(struct work_struct *work) +static void __split_lock_reenable_unlock(struct work_struct *work) { sld_update_msr(true); up(&buslock_sem); } +static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock); + +static void __split_lock_reenable(struct work_struct *work) +{ + sld_update_msr(true); +} +static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); + /* * If a CPU goes offline with pending delayed work to re-enable split lock * detection then the delayed work will be executed on some other CPU. That @@ -1169,10 +1343,9 @@ static int splitlock_cpu_offline(unsigned int cpu) return 0; } -static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable); - static void split_lock_warn(unsigned long ip) { + struct delayed_work *work; int cpu; if (!current->reported_split_lock) @@ -1180,14 +1353,26 @@ static void split_lock_warn(unsigned long ip) current->comm, current->pid, ip); current->reported_split_lock = 1; - /* misery factor #1, sleep 10ms before trying to execute split lock */ - if (msleep_interruptible(10) > 0) - return; - /* Misery factor #2, only allow one buslocked disabled core at a time */ - if (down_interruptible(&buslock_sem) == -EINTR) - return; + if (sysctl_sld_mitigate) { + /* + * misery factor #1: + * sleep 10ms before trying to execute split lock. + */ + if (msleep_interruptible(10) > 0) + return; + /* + * Misery factor #2: + * only allow one buslocked disabled core at a time. 
+ */ + if (down_interruptible(&buslock_sem) == -EINTR) + return; + work = &sl_reenable_unlock; + } else { + work = &sl_reenable; + } + cpu = get_cpu(); - schedule_delayed_work_on(cpu, &split_lock_reenable, 2); + schedule_delayed_work_on(cpu, work, 2); /* Disable split lock detection on this CPU to make progress */ sld_update_msr(false); diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index fbaf12e43f41..3b8476158236 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -204,7 +204,12 @@ static int intel_epb_offline(unsigned int cpu) } static const struct x86_cpu_id intel_epb_normal[] = { - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 7), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), {} }; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 1c87501e0fa3..10fb5b5c9efa 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -788,6 +788,24 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) return status & MCI_STATUS_DEFERRED; } +static bool _log_error_deferred(unsigned int bank, u32 misc) +{ + if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), + mca_msr_reg(bank, MCA_ADDR), misc)) + return false; + + /* + * Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers. + * Return true here to avoid accessing these registers. + */ + if (!mce_flags.smca) + return true; + + /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */ + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + return true; +} + /* * We have three scenarios for checking for Deferred errors: * @@ -799,19 +817,8 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) */ static void log_error_deferred(unsigned int bank) { - bool defrd; - - defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), - mca_msr_reg(bank, MCA_ADDR), 0); - - if (!mce_flags.smca) - return; - - /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. 
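Both the deferred-error interrupt path and the thresholding path now funnel through the shared helper, which logs from the legacy MCA_{STATUS,ADDR} pair and only touches the SMCA-only MCA_DESTAT register on parts that actually have SMCA, so the same error is not reported twice. Compressed into an in-file sketch (identifiers are the patch's own):

/* Condensed view of the shared deferred-error helper. */
static bool log_deferred_sketch(unsigned int bank, u32 misc)
{
	if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
			     mca_msr_reg(bank, MCA_ADDR), misc))
		return false;		/* nothing was logged */

	if (mce_flags.smca)		/* only SMCA has MCA_DESTAT/MCA_DEADDR */
		wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);

	return true;
}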
*/ - if (defrd) { - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + if (_log_error_deferred(bank, 0)) return; - } /* * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check @@ -832,7 +839,7 @@ static void amd_deferred_error_interrupt(void) static void log_error_thresholding(unsigned int bank, u64 misc) { - _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), mca_msr_reg(bank, MCA_ADDR), misc); + _log_error_deferred(bank, misc); } static void log_and_reset_block(struct threshold_block *block) diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 00483d1c27e4..c4477162c07d 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -203,6 +203,11 @@ static struct severity { BITSET(MCI_STATUS_OVER|MCI_STATUS_UC) ), MCESEV( + PANIC, "Uncorrected in kernel", + BITSET(MCI_STATUS_UC), + KERNEL + ), + MCESEV( UC, "Uncorrected", BITSET(MCI_STATUS_UC) ), @@ -391,9 +396,6 @@ static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char *msg = s->msg; s->covered = 1; - if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) - return MCE_PANIC_SEVERITY; - return s->sev; } } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 3a35dec3ec55..56471f750762 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -901,8 +901,7 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) * * These might be larger than 2K. */ -static enum ucode_state request_microcode_amd(int cpu, struct device *device, - bool refresh_fw) +static enum ucode_state request_microcode_amd(int cpu, struct device *device) { char fw_name[36] = "amd-ucode/microcode_amd.bin"; struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -911,7 +910,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, const struct firmware *fw; /* reload ucode container only on the boot cpu */ - if (!refresh_fw || !bsp) + if (!bsp) return UCODE_OK; if (c->x86 >= 0x15) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 6a41cee242f6..712aafff96e0 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -319,60 +319,6 @@ void reload_early_microcode(void) } } -static void collect_cpu_info_local(void *arg) -{ - struct cpu_info_ctx *ctx = arg; - - ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(), - ctx->cpu_sig); -} - -static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig) -{ - struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 }; - int ret; - - ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1); - if (!ret) - ret = ctx.err; - - return ret; -} - -static int collect_cpu_info(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - int ret; - - memset(uci, 0, sizeof(*uci)); - - ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig); - if (!ret) - uci->valid = 1; - - return ret; -} - -static void apply_microcode_local(void *arg) -{ - enum ucode_state *err = arg; - - *err = microcode_ops->apply_microcode(smp_processor_id()); -} - -static int apply_microcode_on_target(int cpu) -{ - enum ucode_state err; - int ret; - - ret = smp_call_function_single(cpu, apply_microcode_local, &err, 1); - if (!ret) { - if (err == UCODE_ERROR) - ret = 1; - } - return ret; -} - /* fake device for request_firmware */ static struct platform_device *microcode_pdev; @@ -458,7 +404,7 @@ static int __reload_late(void *info) * below. 
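The severity.c hunk above relies on the table being scanned top-down with the first matching rule winning: a kernel-context "Uncorrected in kernel" PANIC rule placed above the generic "Uncorrected" rule gives the same result as the removed post-match escalation. A standalone model of that first-match grading (constants and rule set are illustrative, not the kernel's table):

#include <stdio.h>
#include <stddef.h>
#include <stdbool.h>

enum sev { SEV_SOME, SEV_UC, SEV_PANIC };

struct rule {
	enum sev sev;
	unsigned long long required_bits;	/* status bits that must be set */
	bool kernel_only;			/* rule only applies in kernel context */
};

#define MCI_STATUS_UC	(1ULL << 61)

static const struct rule rules[] = {
	{ SEV_PANIC, MCI_STATUS_UC, true  },	/* "Uncorrected in kernel" */
	{ SEV_UC,    MCI_STATUS_UC, false },	/* "Uncorrected" */
	{ SEV_SOME,  0,             false },	/* catch-all */
};

static enum sev grade(unsigned long long status, bool in_kernel)
{
	for (size_t i = 0; i < sizeof(rules) / sizeof(rules[0]); i++) {
		if ((status & rules[i].required_bits) != rules[i].required_bits)
			continue;
		if (rules[i].kernel_only && !in_kernel)
			continue;
		return rules[i].sev;	/* first match wins */
	}
	return SEV_SOME;
}

int main(void)
{
	printf("UC in kernel -> %d, UC in user -> %d\n",
	       grade(MCI_STATUS_UC, true), grade(MCI_STATUS_UC, false));
	return 0;
}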
*/ if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu) - apply_microcode_local(&err); + err = microcode_ops->apply_microcode(cpu); else goto wait_for_siblings; @@ -480,7 +426,7 @@ wait_for_siblings: * revision. */ if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu) - apply_microcode_local(&err); + err = microcode_ops->apply_microcode(cpu); return ret; } @@ -531,7 +477,7 @@ static ssize_t reload_store(struct device *dev, if (ret) goto put; - tmp_ret = microcode_ops->request_microcode_fw(bsp, µcode_pdev->dev, true); + tmp_ret = microcode_ops->request_microcode_fw(bsp, µcode_pdev->dev); if (tmp_ret != UCODE_NEW) goto put; @@ -589,91 +535,17 @@ static void microcode_fini_cpu(int cpu) microcode_ops->microcode_fini_cpu(cpu); } -static enum ucode_state microcode_resume_cpu(int cpu) +static enum ucode_state microcode_init_cpu(int cpu) { - if (apply_microcode_on_target(cpu)) - return UCODE_ERROR; - - pr_debug("CPU%d updated upon resume\n", cpu); - - return UCODE_OK; -} - -static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) -{ - enum ucode_state ustate; struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - if (uci->valid) - return UCODE_OK; - - if (collect_cpu_info(cpu)) - return UCODE_ERROR; - - /* --dimm. Trigger a delayed update? */ - if (system_state != SYSTEM_RUNNING) - return UCODE_NFOUND; - - ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev, refresh_fw); - if (ustate == UCODE_NEW) { - pr_debug("CPU%d updated upon init\n", cpu); - apply_microcode_on_target(cpu); - } - - return ustate; -} - -static enum ucode_state microcode_update_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - /* Refresh CPU microcode revision after resume. */ - collect_cpu_info(cpu); - - if (uci->valid) - return microcode_resume_cpu(cpu); - - return microcode_init_cpu(cpu, false); -} - -static int mc_device_add(struct device *dev, struct subsys_interface *sif) -{ - int err, cpu = dev->id; - - if (!cpu_online(cpu)) - return 0; - - pr_debug("CPU%d added\n", cpu); - - err = sysfs_create_group(&dev->kobj, &mc_attr_group); - if (err) - return err; + memset(uci, 0, sizeof(*uci)); - if (microcode_init_cpu(cpu, true) == UCODE_ERROR) - return -EINVAL; + microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig); - return err; + return microcode_ops->apply_microcode(cpu); } -static void mc_device_remove(struct device *dev, struct subsys_interface *sif) -{ - int cpu = dev->id; - - if (!cpu_online(cpu)) - return; - - pr_debug("CPU%d removed\n", cpu); - microcode_fini_cpu(cpu); - sysfs_remove_group(&dev->kobj, &mc_attr_group); -} - -static struct subsys_interface mc_cpu_interface = { - .name = "microcode", - .subsys = &cpu_subsys, - .add_dev = mc_device_add, - .remove_dev = mc_device_remove, -}; - /** * microcode_bsp_resume - Update boot CPU microcode during resume. 
*/ @@ -682,21 +554,23 @@ void microcode_bsp_resume(void) int cpu = smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - if (uci->valid && uci->mc) + if (uci->mc) microcode_ops->apply_microcode(cpu); - else if (!uci->mc) + else reload_early_microcode(); } static struct syscore_ops mc_syscore_ops = { - .resume = microcode_bsp_resume, + .resume = microcode_bsp_resume, }; static int mc_cpu_starting(unsigned int cpu) { - microcode_update_cpu(cpu); - pr_debug("CPU%d added\n", cpu); - return 0; + enum ucode_state err = microcode_ops->apply_microcode(cpu); + + pr_debug("%s: CPU%d, err: %d\n", __func__, cpu, err); + + return err == UCODE_ERROR; } static int mc_cpu_online(unsigned int cpu) @@ -713,13 +587,30 @@ static int mc_cpu_down_prep(unsigned int cpu) struct device *dev; dev = get_cpu_device(cpu); + + microcode_fini_cpu(cpu); + /* Suspend is in progress, only remove the interface */ sysfs_remove_group(&dev->kobj, &mc_attr_group); - pr_debug("CPU%d removed\n", cpu); + pr_debug("%s: CPU%d\n", __func__, cpu); return 0; } +static void setup_online_cpu(struct work_struct *work) +{ + int cpu = smp_processor_id(); + enum ucode_state err; + + err = microcode_init_cpu(cpu); + if (err == UCODE_ERROR) { + pr_err("Error applying microcode on CPU%d\n", cpu); + return; + } + + mc_cpu_online(cpu); +} + static struct attribute *cpu_root_microcode_attrs[] = { #ifdef CONFIG_MICROCODE_LATE_LOADING &dev_attr_reload.attr, @@ -750,28 +641,19 @@ static int __init microcode_init(void) if (!microcode_ops) return -ENODEV; - microcode_pdev = platform_device_register_simple("microcode", -1, - NULL, 0); + microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); if (IS_ERR(microcode_pdev)) return PTR_ERR(microcode_pdev); - cpus_read_lock(); - mutex_lock(µcode_mutex); - error = subsys_interface_register(&mc_cpu_interface); - mutex_unlock(µcode_mutex); - cpus_read_unlock(); - - if (error) - goto out_pdev; - - error = sysfs_create_group(&cpu_subsys.dev_root->kobj, - &cpu_root_microcode_group); - + error = sysfs_create_group(&cpu_subsys.dev_root->kobj, &cpu_root_microcode_group); if (error) { pr_err("Error creating microcode group!\n"); - goto out_driver; + goto out_pdev; } + /* Do per-CPU setup */ + schedule_on_each_cpu(setup_online_cpu); + register_syscore_ops(&mc_syscore_ops); cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting", mc_cpu_starting, NULL); @@ -782,15 +664,6 @@ static int __init microcode_init(void) return 0; - out_driver: - cpus_read_lock(); - mutex_lock(µcode_mutex); - - subsys_interface_unregister(&mc_cpu_interface); - - mutex_unlock(µcode_mutex); - cpus_read_unlock(); - out_pdev: platform_device_unregister(microcode_pdev); return error; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 1fcbd671f1df..cac2bdb57f0b 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -48,34 +48,6 @@ static int llc_size_per_core; /* * Returns 1 if update has been found, 0 otherwise. */ -static int find_matching_signature(void *mc, unsigned int csig, int cpf) -{ - struct microcode_header_intel *mc_hdr = mc; - struct extended_sigtable *ext_hdr; - struct extended_signature *ext_sig; - int i; - - if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) - return 1; - - /* Look for ext. 
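With the subsys interface gone, the core loader initializes every currently online CPU once via schedule_on_each_cpu() and leaves later CPU arrivals/departures to the cpuhp callbacks. A minimal in-tree style sketch of that pattern with generic, illustrative names (not buildable standalone):

#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/cpuhotplug.h>

/* Runs once on every online CPU, with smp_processor_id() == the target CPU. */
static void per_cpu_setup(struct work_struct *work)
{
	/* per-CPU initialization goes here */
}

static int my_cpu_online(unsigned int cpu)
{
	return 0;	/* bring-up work for CPUs hotplugged later */
}

static int __init my_driver_init(void)
{
	int ret = schedule_on_each_cpu(per_cpu_setup);	/* queues and waits */

	if (ret)
		return ret;

	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/sketch:online",
					my_cpu_online, NULL);
	return ret < 0 ? ret : 0;
}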
headers: */ - if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) - return 0; - - ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; - ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; - - for (i = 0; i < ext_hdr->count; i++) { - if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) - return 1; - ext_sig++; - } - return 0; -} - -/* - * Returns 1 if update has been found, 0 otherwise. - */ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) { struct microcode_header_intel *mc_hdr = mc; @@ -83,7 +55,7 @@ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev if (mc_hdr->rev <= new_rev) return 0; - return find_matching_signature(mc, csig, cpf); + return intel_find_matching_signature(mc, csig, cpf); } static struct ucode_patch *memdup_patch(void *data, unsigned int size) @@ -117,7 +89,7 @@ static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigne sig = mc_saved_hdr->sig; pf = mc_saved_hdr->pf; - if (find_matching_signature(data, sig, pf)) { + if (intel_find_matching_signature(data, sig, pf)) { prev_found = true; if (mc_hdr->rev <= mc_saved_hdr->rev) @@ -149,7 +121,7 @@ static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigne if (!p) return; - if (!find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf)) + if (!intel_find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf)) return; /* @@ -163,104 +135,6 @@ static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigne intel_ucode_patch = p->data; } -static int microcode_sanity_check(void *mc, int print_err) -{ - unsigned long total_size, data_size, ext_table_size; - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header = NULL; - u32 sum, orig_sum, ext_sigcount = 0, i; - struct extended_signature *ext_sig; - - total_size = get_totalsize(mc_header); - data_size = get_datasize(mc_header); - - if (data_size + MC_HEADER_SIZE > total_size) { - if (print_err) - pr_err("Error: bad microcode data file size.\n"); - return -EINVAL; - } - - if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { - if (print_err) - pr_err("Error: invalid/unknown microcode update format.\n"); - return -EINVAL; - } - - ext_table_size = total_size - (MC_HEADER_SIZE + data_size); - if (ext_table_size) { - u32 ext_table_sum = 0; - u32 *ext_tablep; - - if ((ext_table_size < EXT_HEADER_SIZE) - || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - if (print_err) - pr_err("Error: truncated extended signature table.\n"); - return -EINVAL; - } - - ext_header = mc + MC_HEADER_SIZE + data_size; - if (ext_table_size != exttable_size(ext_header)) { - if (print_err) - pr_err("Error: extended signature table size mismatch.\n"); - return -EFAULT; - } - - ext_sigcount = ext_header->count; - - /* - * Check extended table checksum: the sum of all dwords that - * comprise a valid table must be 0. - */ - ext_tablep = (u32 *)ext_header; - - i = ext_table_size / sizeof(u32); - while (i--) - ext_table_sum += ext_tablep[i]; - - if (ext_table_sum) { - if (print_err) - pr_warn("Bad extended signature table checksum, aborting.\n"); - return -EINVAL; - } - } - - /* - * Calculate the checksum of update data and header. The checksum of - * valid update data and header including the extended signature table - * must be 0. 
- */ - orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / sizeof(u32); - while (i--) - orig_sum += ((u32 *)mc)[i]; - - if (orig_sum) { - if (print_err) - pr_err("Bad microcode data checksum, aborting.\n"); - return -EINVAL; - } - - if (!ext_table_size) - return 0; - - /* - * Check extended signature checksum: 0 => valid. - */ - for (i = 0; i < ext_sigcount; i++) { - ext_sig = (void *)ext_header + EXT_HEADER_SIZE + - EXT_SIGNATURE_SIZE * i; - - sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - - (ext_sig->sig + ext_sig->pf + ext_sig->cksum); - if (sum) { - if (print_err) - pr_err("Bad extended signature checksum, aborting.\n"); - return -EINVAL; - } - } - return 0; -} - /* * Get microcode matching with BSP's model. Only CPUs with the same model as * BSP can stay in the platform. @@ -281,13 +155,13 @@ scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save) mc_size = get_totalsize(mc_header); if (!mc_size || mc_size > size || - microcode_sanity_check(data, 0) < 0) + intel_microcode_sanity_check(data, false, MC_HEADER_TYPE_MICROCODE) < 0) break; size -= mc_size; - if (!find_matching_signature(data, uci->cpu_sig.sig, - uci->cpu_sig.pf)) { + if (!intel_find_matching_signature(data, uci->cpu_sig.sig, + uci->cpu_sig.pf)) { data += mc_size; continue; } @@ -621,7 +495,6 @@ void load_ucode_intel_ap(void) else iup = &intel_ucode_patch; -reget: if (!*iup) { patch = __load_ucode_intel(&uci); if (!patch) @@ -632,12 +505,7 @@ reget: uci.mc = *iup; - if (apply_microcode_early(&uci, true)) { - /* Mixed-silicon system? Try to refetch the proper patch: */ - *iup = NULL; - - goto reget; - } + apply_microcode_early(&uci, true); } static struct microcode_intel *find_patch(struct ucode_cpu_info *uci) @@ -652,9 +520,9 @@ static struct microcode_intel *find_patch(struct ucode_cpu_info *uci) if (phdr->rev <= uci->cpu_sig.rev) continue; - if (!find_matching_signature(phdr, - uci->cpu_sig.sig, - uci->cpu_sig.pf)) + if (!intel_find_matching_signature(phdr, + uci->cpu_sig.sig, + uci->cpu_sig.pf)) continue; return iter->data; @@ -680,7 +548,6 @@ void reload_ucode_intel(void) static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { - static struct cpu_signature prev; struct cpuinfo_x86 *c = &cpu_data(cpu_num); unsigned int val[2]; @@ -696,13 +563,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) csig->rev = c->microcode; - /* No extra locking on prev, races are harmless. 
*/ - if (csig->sig != prev.sig || csig->pf != prev.pf || csig->rev != prev.rev) { - pr_info("sig=0x%x, pf=0x%x, revision=0x%x\n", - csig->sig, csig->pf, csig->rev); - prev = *csig; - } - return 0; } @@ -820,7 +680,7 @@ static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter) memcpy(mc, &mc_header, sizeof(mc_header)); data = mc + sizeof(mc_header); if (!copy_from_iter_full(data, data_size, iter) || - microcode_sanity_check(mc, 1) < 0) { + intel_microcode_sanity_check(mc, true, MC_HEADER_TYPE_MICROCODE) < 0) { break; } @@ -885,8 +745,7 @@ static bool is_blacklisted(unsigned int cpu) return false; } -static enum ucode_state request_microcode_fw(int cpu, struct device *device, - bool refresh_fw) +static enum ucode_state request_microcode_fw(int cpu, struct device *device) { struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; @@ -908,7 +767,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, kvec.iov_base = (void *)firmware->data; kvec.iov_len = firmware->size; - iov_iter_kvec(&iter, WRITE, &kvec, 1, firmware->size); + iov_iter_kvec(&iter, ITER_SOURCE, &kvec, 1, firmware->size); ret = generic_load_microcode(cpu, &iter); release_firmware(firmware); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index e402923800d7..bfbdc072017d 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -475,6 +475,12 @@ static bool __init ms_hyperv_x2apic_available(void) * (logically) generates MSIs directly to the system APIC irq domain. * There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the * pci-hyperv host bridge. + * + * Note: for a Hyper-V root partition, this will always return false. + * The hypervisor doesn't expose these HYPERV_CPUID_VIRT_STACK_* cpuids by + * default, they are implemented as intercepts by the Windows Hyper-V stack. + * Even a nested root partition (L2 root) will not get them because the + * nested (L1) hypervisor filters them out. 
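The WRITE -> ITER_SOURCE change above reflects the renamed iov_iter direction constants: they now name the role of the described memory (ITER_SOURCE: data is read out of these buffers; ITER_DEST: data is written into them), which is why the firmware blob is a source here while buffers being filled elsewhere in this series use ITER_DEST. A small in-tree style sketch of wrapping an existing blob as a source:

#include <linux/uio.h>

/*
 * Describe an existing buffer as the source side of an iov_iter, as the
 * microcode loader does with the firmware image. The kvec must outlive
 * the iterator; the caller owns both here.
 */
static void wrap_blob_as_source(struct iov_iter *iter, struct kvec *kv,
				const void *blob, size_t size)
{
	kv->iov_base = (void *)blob;
	kv->iov_len  = size;
	iov_iter_kvec(iter, ITER_SOURCE, kv, 1, size);
}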
*/ static bool __init ms_hyperv_msi_ext_dest_id(void) { diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index a65a0272096d..eff6ac62c0ff 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -109,7 +109,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) return 0; } -static const struct mtrr_ops amd_mtrr_ops = { +const struct mtrr_ops amd_mtrr_ops = { .vendor = X86_VENDOR_AMD, .set = amd_set_mtrr, .get = amd_get_mtrr, @@ -117,9 +117,3 @@ static const struct mtrr_ops amd_mtrr_ops = { .validate_add_page = amd_validate_add_page, .have_wrcomb = positive_have_wrcomb, }; - -int __init amd_init_mtrr(void) -{ - set_mtrr_ops(&amd_mtrr_ops); - return 0; -} diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index f27177816569..b8a74eddde83 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -111,7 +111,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t return 0; } -static const struct mtrr_ops centaur_mtrr_ops = { +const struct mtrr_ops centaur_mtrr_ops = { .vendor = X86_VENDOR_CENTAUR, .set = centaur_set_mcr, .get = centaur_get_mcr, @@ -119,9 +119,3 @@ static const struct mtrr_ops centaur_mtrr_ops = { .validate_add_page = centaur_validate_add_page, .have_wrcomb = positive_have_wrcomb, }; - -int __init centaur_init_mtrr(void) -{ - set_mtrr_ops(¢aur_mtrr_ops); - return 0; -} diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index ca670919b561..173b9e01e623 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -234,51 +234,11 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, post_set(); } -typedef struct { - unsigned long base; - unsigned long size; - mtrr_type type; -} arr_state_t; - -static arr_state_t arr_state[8] = { - {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, - {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL} -}; - -static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 }; - -static void cyrix_set_all(void) -{ - int i; - - prepare_set(); - - /* the CCRs are not contiguous */ - for (i = 0; i < 4; i++) - setCx86(CX86_CCR0 + i, ccr_state[i]); - for (; i < 7; i++) - setCx86(CX86_CCR4 + i, ccr_state[i]); - - for (i = 0; i < 8; i++) { - cyrix_set_arr(i, arr_state[i].base, - arr_state[i].size, arr_state[i].type); - } - - post_set(); -} - -static const struct mtrr_ops cyrix_mtrr_ops = { +const struct mtrr_ops cyrix_mtrr_ops = { .vendor = X86_VENDOR_CYRIX, - .set_all = cyrix_set_all, .set = cyrix_set_arr, .get = cyrix_get_arr, .get_free_region = cyrix_get_free_region, .validate_add_page = generic_validate_add_page, .have_wrcomb = positive_have_wrcomb, }; - -int __init cyrix_init_mtrr(void) -{ - set_mtrr_ops(&cyrix_mtrr_ops); - return 0; -} diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 558108296f3c..ee09d359e08f 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <asm/processor-flags.h> +#include <asm/cacheinfo.h> #include <asm/cpufeature.h> #include <asm/tlbflush.h> #include <asm/mtrr.h> @@ -396,9 +397,6 @@ print_fixed(unsigned base, unsigned step, const mtrr_type *types) } } -static void prepare_set(void); -static void post_set(void); - static void __init print_mtrr_state(void) { unsigned int i; @@ -444,20 +442,6 @@ static void __init 
print_mtrr_state(void) pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } -/* PAT setup for BP. We need to go through sync steps here */ -void __init mtrr_bp_pat_init(void) -{ - unsigned long flags; - - local_irq_save(flags); - prepare_set(); - - pat_init(); - - post_set(); - local_irq_restore(flags); -} - /* Grab all of the MTRR state for this CPU into *state */ bool __init get_mtrr_state(void) { @@ -684,7 +668,10 @@ static u32 deftype_lo, deftype_hi; /** * set_mtrr_state - Set the MTRR state for this CPU. * - * NOTE: The CPU must already be in a safe state for MTRR changes. + * NOTE: The CPU must already be in a safe state for MTRR changes, including + * measures that only a single CPU can be active in set_mtrr_state() in + * order to not be subject to races for usage of deftype_lo. This is + * accomplished by taking cache_disable_lock. * RETURNS: 0 if no changes made, else a mask indicating what was changed. */ static unsigned long set_mtrr_state(void) @@ -715,106 +702,34 @@ static unsigned long set_mtrr_state(void) return change_mask; } - -static unsigned long cr4; -static DEFINE_RAW_SPINLOCK(set_atomicity_lock); - -/* - * Since we are disabling the cache don't allow any interrupts, - * they would run extremely slow and would only increase the pain. - * - * The caller must ensure that local interrupts are disabled and - * are reenabled after post_set() has been called. - */ -static void prepare_set(void) __acquires(set_atomicity_lock) +void mtrr_disable(void) { - unsigned long cr0; - - /* - * Note that this is not ideal - * since the cache is only flushed/disabled for this CPU while the - * MTRRs are changed, but changing this requires more invasive - * changes to the way the kernel boots - */ - - raw_spin_lock(&set_atomicity_lock); - - /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ - cr0 = read_cr0() | X86_CR0_CD; - write_cr0(cr0); - - /* - * Cache flushing is the most time-consuming step when programming - * the MTRRs. Fortunately, as per the Intel Software Development - * Manual, we can skip it if the processor supports cache self- - * snooping. - */ - if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) - wbinvd(); - - /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if (boot_cpu_has(X86_FEATURE_PGE)) { - cr4 = __read_cr4(); - __write_cr4(cr4 & ~X86_CR4_PGE); - } - - /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - flush_tlb_local(); - /* Save MTRR state */ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); - - /* Again, only flush caches if we have to. 
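After this split, the old prepare_set()/post_set() bracket is provided by the generic cache_disable()/cache_enable() helpers (serialized by cache_disable_lock, per the note above set_mtrr_state()), while mtrr_disable()/mtrr_enable() are reduced to saving, clearing and restoring MSR_MTRRdefType. The resulting ordering around a variable-range update, as generic_set_mtrr() now does it, in an in-file sketch reusing the patch's identifiers:

/* Illustrative only; generic_set_mtrr() below is the real implementation. */
static void program_one_mtrr_sketch(unsigned int reg, u64 base, u64 mask)
{
	unsigned long flags;

	local_irq_save(flags);
	cache_disable();	/* CD=1, flush caches/TLBs, take cache_disable_lock */

	mtrr_wrmsr(MTRRphysBase_MSR(reg), (u32)base, (u32)(base >> 32));
	mtrr_wrmsr(MTRRphysMask_MSR(reg), (u32)mask, (u32)(mask >> 32));

	cache_enable();		/* re-enable caches, drop the lock */
	local_irq_restore(flags);
}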
*/ - if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) - wbinvd(); } -static void post_set(void) __releases(set_atomicity_lock) +void mtrr_enable(void) { - /* Flush TLBs (no need to flush caches - they are disabled) */ - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - flush_tlb_local(); - /* Intel (P6) standard MTRRs */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); - - /* Enable caches */ - write_cr0(read_cr0() & ~X86_CR0_CD); - - /* Restore value of CR4 */ - if (boot_cpu_has(X86_FEATURE_PGE)) - __write_cr4(cr4); - raw_spin_unlock(&set_atomicity_lock); } -static void generic_set_all(void) +void mtrr_generic_set_state(void) { unsigned long mask, count; - unsigned long flags; - - local_irq_save(flags); - prepare_set(); /* Actually set the state */ mask = set_mtrr_state(); - /* also set PAT */ - pat_init(); - - post_set(); - local_irq_restore(flags); - /* Use the atomic bitops to update the global mask */ for (count = 0; count < sizeof(mask) * 8; ++count) { if (mask & 0x01) set_bit(count, &smp_changes_mask); mask >>= 1; } - } /** @@ -836,7 +751,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, vr = &mtrr_state.var_ranges[reg]; local_irq_save(flags); - prepare_set(); + cache_disable(); if (size == 0) { /* @@ -855,7 +770,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi); } - post_set(); + cache_enable(); local_irq_restore(flags); } @@ -914,8 +829,6 @@ int positive_have_wrcomb(void) * Generic structure... */ const struct mtrr_ops generic_mtrr_ops = { - .use_intel_if = 1, - .set_all = generic_set_all, .get = generic_get_mtrr, .get_free_region = generic_get_free_region, .set = generic_set_mtrr, diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 2746cac9d8a9..783f3210d582 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -46,6 +46,7 @@ #include <linux/syscore_ops.h> #include <linux/rcupdate.h> +#include <asm/cacheinfo.h> #include <asm/cpufeature.h> #include <asm/e820/api.h> #include <asm/mtrr.h> @@ -58,32 +59,18 @@ #define MTRR_TO_PHYS_WC_OFFSET 1000 u32 num_var_ranges; -static bool __mtrr_enabled; - static bool mtrr_enabled(void) { - return __mtrr_enabled; + return !!mtrr_if; } unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; -static bool mtrr_aps_delayed_init; - -static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init; const struct mtrr_ops *mtrr_if; -static void set_mtrr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type); - -void __init set_mtrr_ops(const struct mtrr_ops *ops) -{ - if (ops->vendor && ops->vendor < X86_VENDOR_NUM) - mtrr_ops[ops->vendor] = ops; -} - /* Returns non-zero if we have the write-combining memory type */ static int have_wrcomb(void) { @@ -119,11 +106,11 @@ static int have_wrcomb(void) } /* This function returns the number of variable MTRRs */ -static void __init set_num_var_ranges(void) +static void __init set_num_var_ranges(bool use_generic) { unsigned long config = 0, dummy; - if (use_intel()) + if (use_generic) rdmsr(MSR_MTRRcap, config, dummy); else if (is_cpu(AMD) || is_cpu(HYGON)) config = 2; @@ -160,25 +147,8 @@ static int mtrr_rendezvous_handler(void *info) { struct set_mtrr_data *data = info; - /* - * We use this same function to initialize the mtrrs during boot, - * resume, runtime cpu online and on an explicit request to set a - * specific MTRR. 
- * - * During boot or suspend, the state of the boot cpu's mtrrs has been - * saved, and we want to replicate that across all the cpus that come - * online (either at the end of boot or resume or during a runtime cpu - * online). If we're doing that, @reg is set to something special and on - * all the cpu's we do mtrr_if->set_all() (On the logical cpu that - * started the boot/resume sequence, this might be a duplicate - * set_all()). - */ - if (data->smp_reg != ~0U) { - mtrr_if->set(data->smp_reg, data->smp_base, - data->smp_size, data->smp_type); - } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) { - mtrr_if->set_all(); - } + mtrr_if->set(data->smp_reg, data->smp_base, + data->smp_size, data->smp_type); return 0; } @@ -248,19 +218,6 @@ static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base, stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask); } -static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) -{ - struct set_mtrr_data data = { .smp_reg = reg, - .smp_base = base, - .smp_size = size, - .smp_type = type - }; - - stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data, - cpu_callout_mask); -} - /** * mtrr_add_page - Add a memory type region * @base: Physical base address of region in pages (in units of 4 kB!) @@ -617,20 +574,6 @@ int arch_phys_wc_index(int handle) } EXPORT_SYMBOL_GPL(arch_phys_wc_index); -/* - * HACK ALERT! - * These should be called implicitly, but we can't yet until all the initcall - * stuff is done... - */ -static void __init init_ifs(void) -{ -#ifndef CONFIG_X86_64 - amd_init_mtrr(); - cyrix_init_mtrr(); - centaur_init_mtrr(); -#endif -} - /* The suspend/resume methods are only for CPU without MTRR. CPU using generic * MTRR driver doesn't require this */ @@ -686,10 +629,9 @@ int __initdata changed_by_mtrr_cleanup; */ void __init mtrr_bp_init(void) { + const char *why = "(not available)"; u32 phys_addr; - init_ifs(); - phys_addr = 32; if (boot_cpu_has(X86_FEATURE_MTRR)) { @@ -730,21 +672,21 @@ void __init mtrr_bp_init(void) case X86_VENDOR_AMD: if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) { /* Pre-Athlon (K6) AMD CPU MTRRs */ - mtrr_if = mtrr_ops[X86_VENDOR_AMD]; + mtrr_if = &amd_mtrr_ops; size_or_mask = SIZE_OR_MASK_BITS(32); size_and_mask = 0; } break; case X86_VENDOR_CENTAUR: if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) { - mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR]; + mtrr_if = ¢aur_mtrr_ops; size_or_mask = SIZE_OR_MASK_BITS(32); size_and_mask = 0; } break; case X86_VENDOR_CYRIX: if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) { - mtrr_if = mtrr_ops[X86_VENDOR_CYRIX]; + mtrr_if = &cyrix_mtrr_ops; size_or_mask = SIZE_OR_MASK_BITS(32); size_and_mask = 0; } @@ -754,58 +696,23 @@ void __init mtrr_bp_init(void) } } - if (mtrr_if) { - __mtrr_enabled = true; - set_num_var_ranges(); + if (mtrr_enabled()) { + set_num_var_ranges(mtrr_if == &generic_mtrr_ops); init_table(); - if (use_intel()) { + if (mtrr_if == &generic_mtrr_ops) { /* BIOS may override */ - __mtrr_enabled = get_mtrr_state(); - - if (mtrr_enabled()) - mtrr_bp_pat_init(); - - if (mtrr_cleanup(phys_addr)) { - changed_by_mtrr_cleanup = 1; - mtrr_if->set_all(); + if (get_mtrr_state()) { + memory_caching_control |= CACHE_MTRR; + changed_by_mtrr_cleanup = mtrr_cleanup(phys_addr); + } else { + mtrr_if = NULL; + why = "by BIOS"; } } } - if (!mtrr_enabled()) { - pr_info("Disabled\n"); - - /* - * PAT initialization relies on MTRR's rendezvous handler. 
- * Skip PAT init until the handler can initialize both - * features independently. - */ - pat_disable("MTRRs disabled, skipping PAT initialization too."); - } -} - -void mtrr_ap_init(void) -{ if (!mtrr_enabled()) - return; - - if (!use_intel() || mtrr_aps_delayed_init) - return; - - /* - * Ideally we should hold mtrr_mutex here to avoid mtrr entries - * changed, but this routine will be called in cpu boot time, - * holding the lock breaks it. - * - * This routine is called in two cases: - * - * 1. very early time of software resume, when there absolutely - * isn't mtrr entry changes; - * - * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug - * lock to prevent mtrr entry changes - */ - set_mtrr_from_inactive_cpu(~0U, 0, 0, 0); + pr_info("MTRRs disabled %s\n", why); } /** @@ -823,50 +730,12 @@ void mtrr_save_state(void) smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); } -void set_mtrr_aps_delayed_init(void) -{ - if (!mtrr_enabled()) - return; - if (!use_intel()) - return; - - mtrr_aps_delayed_init = true; -} - -/* - * Delayed MTRR initialization for all AP's - */ -void mtrr_aps_init(void) -{ - if (!use_intel() || !mtrr_enabled()) - return; - - /* - * Check if someone has requested the delay of AP MTRR initialization, - * by doing set_mtrr_aps_delayed_init(), prior to this point. If not, - * then we are done. - */ - if (!mtrr_aps_delayed_init) - return; - - set_mtrr(~0U, 0, 0, 0); - mtrr_aps_delayed_init = false; -} - -void mtrr_bp_restore(void) -{ - if (!use_intel() || !mtrr_enabled()) - return; - - mtrr_if->set_all(); -} - static int __init mtrr_init_finialize(void) { if (!mtrr_enabled()) return 0; - if (use_intel()) { + if (memory_caching_control & CACHE_MTRR) { if (!changed_by_mtrr_cleanup) mtrr_state_warn(); return 0; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 2ac99e561181..02eb5871492d 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -14,11 +14,8 @@ extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; struct mtrr_ops { u32 vendor; - u32 use_intel_if; void (*set)(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); - void (*set_all)(void); - void (*get)(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type *type); int (*get_free_region)(unsigned long base, unsigned long size, @@ -53,15 +50,11 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); bool get_mtrr_state(void); -void mtrr_bp_pat_init(void); - -extern void __init set_mtrr_ops(const struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; extern const struct mtrr_ops *mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) -#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) extern unsigned int num_var_ranges; extern u64 mtrr_tom2; @@ -71,10 +64,10 @@ void mtrr_state_warn(void); const char *mtrr_attrib_to_str(int x); void mtrr_wrmsr(unsigned, unsigned, unsigned); -/* CPU specific mtrr init functions */ -int amd_init_mtrr(void); -int cyrix_init_mtrr(void); -int centaur_init_mtrr(void); +/* CPU specific mtrr_ops vectors. 
*/ +extern const struct mtrr_ops amd_mtrr_ops; +extern const struct mtrr_ops cyrix_mtrr_ops; +extern const struct mtrr_ops centaur_mtrr_ops; extern int changed_by_mtrr_cleanup; extern int mtrr_cleanup(unsigned address_bits); diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 3266ea36667c..c98e52ff5f20 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -575,7 +575,7 @@ static void clear_closid_rmid(int cpu) state->default_rmid = 0; state->cur_closid = 0; state->cur_rmid = 0; - wrmsr(IA32_PQR_ASSOC, 0, 0); + wrmsr(MSR_IA32_PQR_ASSOC, 0, 0); } static int resctrl_online_cpu(unsigned int cpu) @@ -828,7 +828,6 @@ static __init void rdt_init_res_defs_intel(void) if (r->rid == RDT_RESOURCE_L3 || r->rid == RDT_RESOURCE_L2) { r->cache.arch_has_sparse_bitmaps = false; - r->cache.arch_has_empty_bitmaps = false; r->cache.arch_has_per_cpu_cfg = false; r->cache.min_cbm_bits = 1; } else if (r->rid == RDT_RESOURCE_MBA) { @@ -849,7 +848,6 @@ static __init void rdt_init_res_defs_amd(void) if (r->rid == RDT_RESOURCE_L3 || r->rid == RDT_RESOURCE_L2) { r->cache.arch_has_sparse_bitmaps = true; - r->cache.arch_has_empty_bitmaps = true; r->cache.arch_has_per_cpu_cfg = true; r->cache.min_cbm_bits = 0; } else if (r->rid == RDT_RESOURCE_MBA) { diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 1dafbdc5ac31..1df0e3262bca 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -105,8 +105,7 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) return false; } - if ((!r->cache.arch_has_empty_bitmaps && val == 0) || - val > r->default_ctrl) { + if ((r->cache.min_cbm_bits > 0 && val == 0) || val > r->default_ctrl) { rdt_last_cmd_puts("Mask out of range\n"); return false; } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5f7128686cfd..5ebd28e6aa0c 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -8,16 +8,6 @@ #include <linux/fs_context.h> #include <linux/jump_label.h> -#define MSR_IA32_L3_QOS_CFG 0xc81 -#define MSR_IA32_L2_QOS_CFG 0xc82 -#define MSR_IA32_L3_CBM_BASE 0xc90 -#define MSR_IA32_L2_CBM_BASE 0xd10 -#define MSR_IA32_MBA_THRTL_BASE 0xd50 -#define MSR_IA32_MBA_BW_BASE 0xc0000200 - -#define MSR_IA32_QM_CTR 0x0c8e -#define MSR_IA32_QM_EVTSEL 0x0c8d - #define L3_QOS_CDP_ENABLE 0x01ULL #define L2_QOS_CDP_ENABLE 0x01ULL diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index d961ae3ed96e..524f8ff3e69c 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -477,7 +477,7 @@ static int pseudo_lock_fn(void *_rdtgrp) * pseudo-locked followed by reading of kernel memory to load it * into the cache. */ - __wrmsr(IA32_PQR_ASSOC, rmid_p, rdtgrp->closid); + __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, rdtgrp->closid); /* * Cache was flushed earlier. Now access kernel memory to read it * into cache region associated with just activated plr->closid. @@ -513,7 +513,7 @@ static int pseudo_lock_fn(void *_rdtgrp) * Critical section end: restore closid with capacity bitmask that * does not overlap with pseudo-locked region. 
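The cbm_validate() hunk above shows why arch_has_empty_bitmaps could be dropped: a resource that tolerates an empty capacity bitmask (AMD) already advertises min_cbm_bits == 0, so the zero-mask rejection can key off that instead of a separate flag. A standalone model of the two checks visible in the hunk:

#include <stdbool.h>
#include <stdint.h>

/*
 * An all-zero capacity bitmask is only rejected when the resource demands
 * at least one bit (min_cbm_bits > 0, the Intel case), and the value must
 * not exceed the resource's full mask (default_ctrl).
 */
static bool cbm_in_range(uint32_t val, uint32_t default_ctrl,
			 unsigned int min_cbm_bits)
{
	if (min_cbm_bits > 0 && val == 0)
		return false;	/* empty mask not allowed for this resource */
	if (val > default_ctrl)
		return false;	/* bits set outside the cache mask */
	return true;
}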
*/ - __wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p); + __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p); /* Re-enable the hardware prefetcher(s) */ wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr); @@ -1560,9 +1560,9 @@ static const struct file_operations pseudo_lock_dev_fops = { .mmap = pseudo_lock_dev_mmap, }; -static char *pseudo_lock_devnode(struct device *dev, umode_t *mode) +static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) { - struct rdtgroup *rdtgrp; + const struct rdtgroup *rdtgrp; rdtgrp = dev_get_drvdata(dev); if (mode) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index fc01f81f6e2a..f53944fb8f7f 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -40,6 +40,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, + { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 1ec20807de1e..2a0e90fe2abc 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -160,8 +160,8 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, return ret; pginfo.addr = encl_page->desc & PAGE_MASK; - pginfo.contents = (unsigned long)kmap_atomic(b.contents); - pcmd_page = kmap_atomic(b.pcmd); + pginfo.contents = (unsigned long)kmap_local_page(b.contents); + pcmd_page = kmap_local_page(b.pcmd); pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset; if (secs_page) @@ -187,8 +187,8 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, */ pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE); - kunmap_atomic(pcmd_page); - kunmap_atomic((void *)(unsigned long)pginfo.contents); + kunmap_local(pcmd_page); + kunmap_local((void *)(unsigned long)pginfo.contents); get_page(b.pcmd); sgx_encl_put_backing(&b); @@ -197,10 +197,10 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) { sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off)); - pcmd_page = kmap_atomic(b.pcmd); + pcmd_page = kmap_local_page(b.pcmd); if (memchr_inv(pcmd_page, 0, PAGE_SIZE)) pr_warn("PCMD page not empty after truncate.\n"); - kunmap_atomic(pcmd_page); + kunmap_local(pcmd_page); } put_page(b.pcmd); @@ -268,7 +268,7 @@ static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl, unsigned long addr, unsigned long vm_flags) { - unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS; struct sgx_encl_page *entry; entry = xa_load(&encl->page_array, PFN_DOWN(addr)); @@ -502,7 +502,7 @@ static void sgx_vma_open(struct vm_area_struct *vma) int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, unsigned long end, unsigned long vm_flags) { - unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS; struct sgx_encl_page *page; unsigned long count = 0; int ret = 0; @@ -680,11 +680,15 @@ const struct vm_operations_struct sgx_vm_ops = { void sgx_encl_release(struct kref *ref) { struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount); + unsigned long 
max_page_index = PFN_DOWN(encl->base + encl->size - 1); struct sgx_va_page *va_page; struct sgx_encl_page *entry; - unsigned long index; + unsigned long count = 0; + + XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base)); - xa_for_each(&encl->page_array, index, entry) { + xas_lock(&xas); + xas_for_each(&xas, entry, max_page_index) { if (entry->epc_page) { /* * The page and its radix tree entry cannot be freed @@ -699,9 +703,20 @@ void sgx_encl_release(struct kref *ref) } kfree(entry); - /* Invoke scheduler to prevent soft lockups. */ - cond_resched(); + /* + * Invoke scheduler on every XA_CHECK_SCHED iteration + * to prevent soft lockups. + */ + if (!(++count % XA_CHECK_SCHED)) { + xas_pause(&xas); + xas_unlock(&xas); + + cond_resched(); + + xas_lock(&xas); + } } + xas_unlock(&xas); xa_destroy(&encl->page_array); diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index ebe79d60619f..21ca0a831b70 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -111,7 +111,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) encl->base = secs->base; encl->size = secs->size; encl->attributes = secs->attributes; - encl->attributes_mask = SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | SGX_ATTR_KSS; + encl->attributes_mask = SGX_ATTR_UNPRIV_MASK; /* Set only after completion, as encl->lock has not been taken. */ set_bit(SGX_ENCL_CREATED, &encl->flags); @@ -221,11 +221,11 @@ static int __sgx_encl_add_page(struct sgx_encl *encl, pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); pginfo.addr = encl_page->desc & PAGE_MASK; pginfo.metadata = (unsigned long)secinfo; - pginfo.contents = (unsigned long)kmap_atomic(src_page); + pginfo.contents = (unsigned long)kmap_local_page(src_page); ret = __eadd(&pginfo, sgx_get_epc_virt_addr(epc_page)); - kunmap_atomic((void *)pginfo.contents); + kunmap_local((void *)pginfo.contents); put_page(src_page); return ret ? 
-EIO : 0; @@ -356,6 +356,9 @@ static int sgx_validate_offset_length(struct sgx_encl *encl, if (!length || !IS_ALIGNED(length, PAGE_SIZE)) return -EINVAL; + if (offset + length < offset) + return -EINVAL; + if (offset + length - PAGE_SIZE >= encl->size) return -EINVAL; diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 0aad028f04d4..e5a37b6e9aa5 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -165,17 +165,17 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, pginfo.addr = 0; pginfo.secs = 0; - pginfo.contents = (unsigned long)kmap_atomic(backing->contents); - pginfo.metadata = (unsigned long)kmap_atomic(backing->pcmd) + + pginfo.contents = (unsigned long)kmap_local_page(backing->contents); + pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) + backing->pcmd_offset; ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot); set_page_dirty(backing->pcmd); set_page_dirty(backing->contents); - kunmap_atomic((void *)(unsigned long)(pginfo.metadata - + kunmap_local((void *)(unsigned long)(pginfo.metadata - backing->pcmd_offset)); - kunmap_atomic((void *)(unsigned long)pginfo.contents); + kunmap_local((void *)(unsigned long)pginfo.contents); return ret; } diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index ec7bbac3a9f2..8009c8346d8f 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -58,24 +58,6 @@ static void tsx_enable(void) wrmsrl(MSR_IA32_TSX_CTRL, tsx); } -static bool tsx_ctrl_is_supported(void) -{ - u64 ia32_cap = x86_read_arch_cap_msr(); - - /* - * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this - * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. - * - * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a - * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES - * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get - * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, - * tsx= cmdline requests will do nothing on CPUs without - * MSR_IA32_TSX_CTRL support. - */ - return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); -} - static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) { if (boot_cpu_has_bug(X86_BUG_TAA)) @@ -135,7 +117,7 @@ static void tsx_clear_cpuid(void) rdmsrl(MSR_TSX_FORCE_ABORT, msr); msr |= MSR_TFA_TSX_CPUID_CLEAR; wrmsrl(MSR_TSX_FORCE_ABORT, msr); - } else if (tsx_ctrl_is_supported()) { + } else if (cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL)) { rdmsrl(MSR_IA32_TSX_CTRL, msr); msr |= TSX_CTRL_CPUID_CLEAR; wrmsrl(MSR_IA32_TSX_CTRL, msr); @@ -158,7 +140,8 @@ static void tsx_dev_mode_disable(void) u64 mcu_opt_ctrl; /* Check if RTM_ALLOW exists */ - if (!boot_cpu_has_bug(X86_BUG_TAA) || !tsx_ctrl_is_supported() || + if (!boot_cpu_has_bug(X86_BUG_TAA) || + !cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL) || !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL)) return; @@ -191,7 +174,20 @@ void __init tsx_init(void) return; } - if (!tsx_ctrl_is_supported()) { + /* + * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this + * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. + * + * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a + * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES + * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get + * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, + * tsx= cmdline requests will do nothing on CPUs without + * MSR_IA32_TSX_CTRL support. 
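The new "offset + length < offset" test in sgx_validate_offset_length() is the standard unsigned-overflow idiom: a wrapped sum compares smaller than either operand, so ranges whose end wraps past the top of the address space are rejected before the size comparison that follows could be fooled. A standalone sketch:

#include <stdbool.h>
#include <stdint.h>

/* Unsigned addition wraps, so a wrapped end offset is smaller than the start. */
static bool range_overflows(uint64_t offset, uint64_t length)
{
	return offset + length < offset;
}

/* e.g. range_overflows(0xfffffffffffff000ULL, 0x2000ULL) == true */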
+ */ + if (x86_read_arch_cap_msr() & ARCH_CAP_TSX_CTRL_MSR) { + setup_force_cpu_cap(X86_FEATURE_MSR_TSX_CTRL); + } else { tsx_ctrl_state = TSX_CTRL_NOT_SUPPORTED; return; } diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 6f7b8cc1bc9f..621ba9c0f17a 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -139,7 +139,7 @@ static int cpuid_device_destroy(unsigned int cpu) return 0; } -static char *cpuid_devnode(struct device *dev, umode_t *mode) +static char *cpuid_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); } diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 9730c88530fc..305514431f26 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -401,10 +401,8 @@ int crash_load_segments(struct kimage *image) kbuf.buf_align = ELF_CORE_HEADER_ALIGN; kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; ret = kexec_add_buffer(&kbuf); - if (ret) { - vfree((void *)image->elf_headers); + if (ret) return ret; - } image->elf_load_addr = kbuf.mem; pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", image->elf_load_addr, kbuf.bufsz, kbuf.memsz); diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index e75bc2f217ff..32d710f7eb84 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -57,7 +57,7 @@ ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) struct kvec kvec = { .iov_base = buf, .iov_len = count }; struct iov_iter iter; - iov_iter_kvec(&iter, READ, &kvec, 1, count); + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)); diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 5cd51f25f446..28da5dd83fc0 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -31,11 +31,6 @@ char __initdata cmd_line[COMMAND_LINE_SIZE]; int __initdata of_ioapic; -void __init early_init_dt_add_memory_arch(u64 base, u64 size) -{ - BUG(); -} - void __init add_dtb(u64 data) { initial_dtb = data + offsetof(struct setup_data, data); @@ -167,7 +162,14 @@ static void __init dtb_lapic_setup(void) return; } smp_found_config = 1; - pic_mode = 1; + if (of_property_read_bool(dn, "intel,virtual-wire-mode")) { + pr_info("Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } else { + pr_info("IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } + register_lapic_address(lapic_addr); } @@ -248,7 +250,7 @@ static void __init dtb_add_ioapic(struct device_node *dn) ret = of_address_to_resource(dn, 0, &r); if (ret) { - printk(KERN_ERR "Can't obtain address from device node %pOF.\n", dn); + pr_err("Can't obtain address from device node %pOF.\n", dn); return; } mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg); @@ -265,7 +267,7 @@ static void __init dtb_ioapic_setup(void) of_ioapic = 1; return; } - printk(KERN_ERR "Error: No information about IO-APIC in OF.\n"); + pr_err("Error: No information about IO-APIC in OF.\n"); } #else static void __init dtb_ioapic_setup(void) {} diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 722fd712e1cf..b4905d5173fd 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -37,7 +37,7 @@ const char *stack_type_name(enum stack_type type) static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr); + unsigned long *begin = 
(unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* @@ -62,7 +62,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr); + unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.softirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6c5defd6569a..f05339fee778 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -134,7 +134,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); + unsigned long *end = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); unsigned long *begin; /* diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 9417d5aa7305..16f9814c9be0 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -94,17 +94,7 @@ static inline unsigned long espfix_base_addr(unsigned int cpu) static void init_espfix_random(void) { - unsigned long rand; - - /* - * This is run before the entropy pools are initialized, - * but this is hopefully better than nothing. - */ - if (!arch_get_random_longs(&rand, 1)) { - /* The constant is an arbitrary large prime */ - rand = rdtsc(); - rand *= 0xc345c6b72fd16123UL; - } + unsigned long rand = get_random_long(); slot_random = rand % ESPFIX_STACKS_PER_PAGE; page_random = (rand / ESPFIX_STACKS_PER_PAGE) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 3b28c5b25e12..9baa89a8877d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -391,8 +391,6 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, { struct fpstate *kstate = gfpu->fpstate; const union fpregs_state *ustate = buf; - struct pkru_state *xpkru; - int ret; if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) @@ -406,16 +404,15 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, if (ustate->xsave.header.xfeatures & ~xcr0) return -EINVAL; - ret = copy_uabi_from_kernel_to_xstate(kstate, ustate); - if (ret) - return ret; + /* + * Nullify @vpkru to preserve its current value if PKRU's bit isn't set + * in the header. KVM's odd ABI is to leave PKRU untouched in this + * case (all other components are eventually re-initialized). + */ + if (!(ustate->xsave.header.xfeatures & XFEATURE_MASK_PKRU)) + vpkru = NULL; - /* Retrieve PKRU if not in init state */ - if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) { - xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU); - *vpkru = xpkru->pkru; - } - return 0; + return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru); } EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); #endif /* CONFIG_KVM */ @@ -605,9 +602,9 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) if (test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); save_fpregs_to_fpstate(dst_fpu); + fpregs_unlock(); if (!(clone_flags & CLONE_THREAD)) fpu_inherit_perms(dst_fpu); - fpregs_unlock(); /* * Children never inherit PASID state. 
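The KVM-facing restore path above now delegates all PKRU handling to copy_uabi_from_kernel_to_xstate(): passing a NULL @vpkru when the PKRU feature bit is clear preserves KVM's historical "leave PKRU untouched" ABI, while ptrace and sigreturn callers always pass real storage and get the XRSTOR-style reset to 0. A standalone model of that convention (names are illustrative, not kernel API):

#include <stdint.h>
#include <stddef.h>

#define XFEATURE_MASK_PKRU	(1ULL << 9)	/* PKRU component bit in XSTATE_BV */

/*
 * @pkru == NULL means "leave the current value alone" (the KVM case when the
 * component bit is clear); otherwise the buffer value is taken when the bit
 * is set, and 0 is written when it is clear, emulating XRSTOR's init
 * behaviour for ptrace and signal restore.
 */
static void update_pkru_from_uabi(uint64_t xfeatures, uint32_t buf_pkru,
				  uint32_t *pkru)
{
	if (!pkru)
		return;

	if (xfeatures & XFEATURE_MASK_PKRU)
		*pkru = buf_pkru;
	else
		*pkru = 0;
}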
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 8946f89761cc..851eb13edc01 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -133,9 +133,6 @@ static void __init fpu__init_system_generic(void) fpu__init_system_mxcsr(); } -/* Get alignment of the TYPE. */ -#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) - /* * Enforce that 'MEMBER' is the last field of 'TYPE'. * @@ -143,8 +140,8 @@ static void __init fpu__init_system_generic(void) * because that's how C aligns structs. */ #define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ - BUILD_BUG_ON(sizeof(TYPE) != ALIGN(offsetofend(TYPE, MEMBER), \ - TYPE_ALIGN(TYPE))) + BUILD_BUG_ON(sizeof(TYPE) != \ + ALIGN(offsetofend(TYPE, MEMBER), _Alignof(TYPE))) /* * We append the 'struct fpu' to the task_struct: diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 75ffaef8c299..6d056b68f4ed 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -167,7 +167,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } fpu_force_restore(fpu); - ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf); + ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf, &target->thread.pkru); out: vfree(tmpbuf); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 91d4b6de58ab..558076dbde5b 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -396,7 +396,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, fpregs = &fpu->fpstate->regs; if (use_xsave() && !fx_only) { - if (copy_sigframe_from_user_to_xstate(fpu->fpstate, buf_fx)) + if (copy_sigframe_from_user_to_xstate(tsk, buf_fx)) return false; } else { if (__copy_from_user(&fpregs->fxsave, buf_fx, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 59e543b95a3c..714166cc25f2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -440,8 +440,8 @@ static void __init __xstate_dump_leaves(void) } } -#define XSTATE_WARN_ON(x) do { \ - if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) { \ +#define XSTATE_WARN_ON(x, fmt, ...) do { \ + if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \ __xstate_dump_leaves(); \ } \ } while (0) @@ -554,8 +554,7 @@ static bool __init check_xstate_against_struct(int nr) (nr >= XFEATURE_MAX) || (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) { - WARN_ONCE(1, "no structure for xstate: %d\n", nr); - XSTATE_WARN_ON(1); + XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); return false; } return true; @@ -598,12 +597,13 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) * XSAVES. 
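The TYPE_ALIGN() removal in fpu/init.c above trades the classic "char followed by the member" offsetof trick for C11's _Alignof, which reports the same value directly. A quick standalone check of the equivalence (sample struct is arbitrary):

#include <stdio.h>
#include <stddef.h>

/* The macro removed from the kernel, reproduced here for comparison. */
#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)

struct sample {
	char c;
	double d;
};

int main(void)
{
	printf("struct-hack alignment: %zu, _Alignof: %zu\n",
	       TYPE_ALIGN(struct sample), _Alignof(struct sample));
	return 0;
}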
*/ if (!xsaves && xfeature_is_supervisor(i)) { - XSTATE_WARN_ON(1); + XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i); return false; } } size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); - XSTATE_WARN_ON(size != kernel_size); + XSTATE_WARN_ON(size != kernel_size, + "size %u != kernel_size %u\n", size, kernel_size); return size == kernel_size; } @@ -1200,8 +1200,36 @@ static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, } +/** + * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate + * @fpstate: The fpstate buffer to copy to + * @kbuf: The UABI format buffer, if it comes from the kernel + * @ubuf: The UABI format buffer, if it comes from userspace + * @pkru: The location to write the PKRU value to + * + * Converts from the UABI format into the kernel internal hardware + * dependent format. + * + * This function ultimately has three different callers with distinct PKRU + * behavior. + * 1. When called from sigreturn the PKRU register will be restored from + * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to + * @fpstate is sufficient to cover this case, but the caller will also + * pass a pointer to the thread_struct's pkru field in @pkru and updating + * it is harmless. + * 2. When called from ptrace the PKRU register will be restored from the + * thread_struct's pkru field. A pointer to that is passed in @pkru. + * The kernel will restore it manually, so the XRSTOR behavior that resets + * the PKRU register to the hardware init value (0) if the corresponding + * xfeatures bit is not set is emulated here. + * 3. When called from KVM the PKRU register will be restored from the vcpu's + * pkru field. A pointer to that is passed in @pkru. KVM hasn't used + * XRSTOR and hasn't had the PKRU resetting behavior described above. To + * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures + * bit is not set. + */ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, - const void __user *ubuf) + const void __user *ubuf, u32 *pkru) { struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int offset, size; @@ -1250,6 +1278,20 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, } } + if (hdr.xfeatures & XFEATURE_MASK_PKRU) { + struct pkru_state *xpkru; + + xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU); + *pkru = xpkru->pkru; + } else { + /* + * KVM may pass NULL here to indicate that it does not need + * PKRU updated. + */ + if (pkru) + *pkru = 0; + } + /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': @@ -1268,9 +1310,9 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] * format and copy to the target thread. Used by ptrace and KVM. */ -int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf) +int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru) { - return copy_uabi_to_xstate(fpstate, kbuf, NULL); + return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru); } /* @@ -1278,10 +1320,10 @@ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf) * XSAVE[S] format and copy to the target thread. This is called from the * sigreturn() and rt_sigreturn() system calls. 
*/ -int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, +int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf) { - return copy_uabi_to_xstate(fpstate, NULL, ubuf); + return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru); } static bool validate_independent_components(u64 mask) diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 5ad47031383b..a4ecb04d8d64 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -46,8 +46,8 @@ extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u32 pkru_val, enum xstate_copy_mode copy_mode); extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode mode); -extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf); -extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf); +extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru); +extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf); extern void fpu__init_cpu_xstate(void); diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index bd165004776d..5e7ead52cfdb 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -24,10 +24,10 @@ #include <linux/module.h> #include <linux/memory.h> #include <linux/vmalloc.h> +#include <linux/set_memory.h> #include <trace/syscall.h> -#include <asm/set_memory.h> #include <asm/kprobes.h> #include <asm/ftrace.h> #include <asm/nops.h> @@ -69,6 +69,10 @@ static const char *ftrace_nop_replace(void) static const char *ftrace_call_replace(unsigned long ip, unsigned long addr) { + /* + * No need to translate into a callthunk. The trampoline does + * the depth accounting itself. + */ return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr); } @@ -217,7 +221,9 @@ void ftrace_replace_code(int enable) ret = ftrace_verify_code(rec->ip, old); if (ret) { + ftrace_expected = old; ftrace_bug(ret, rec); + ftrace_expected = NULL; return; } } @@ -317,7 +323,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned long size; unsigned long *ptr; void *trampoline; - void *ip; + void *ip, *dest; /* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */ unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; @@ -359,7 +365,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = trampoline + size; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) - __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, &__x86_return_thunk, JMP32_INSN_SIZE); + __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else memcpy(ip, retq, sizeof(retq)); @@ -404,20 +410,20 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* put in the call to the function */ mutex_lock(&text_mutex); call_offset -= start_offset; + /* + * No need to translate into a callthunk. The trampoline does + * the depth accounting before the call already. 
+ */ + dest = ftrace_ops_get_func(ops); memcpy(trampoline + call_offset, - text_gen_insn(CALL_INSN_OPCODE, - trampoline + call_offset, - ftrace_ops_get_func(ops)), CALL_INSN_SIZE); + text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), + CALL_INSN_SIZE); mutex_unlock(&text_mutex); /* ALLOC_TRAMP flags lets us know we created it */ ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; - set_vm_flush_reset_perms(trampoline); - - if (likely(system_state != SYSTEM_BOOTING)) - set_memory_ro((unsigned long)trampoline, npages); - set_memory_x((unsigned long)trampoline, npages); + set_memory_rox((unsigned long)trampoline, npages); return (unsigned long)trampoline; fail: tramp_free(trampoline); diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 2a4be92fd144..1265ad519249 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -3,8 +3,9 @@ * Copyright (C) 2014 Steven Rostedt, Red Hat Inc */ -#include <linux/linkage.h> #include <linux/cfi_types.h> +#include <linux/linkage.h> +#include <asm/asm-offsets.h> #include <asm/ptrace.h> #include <asm/ftrace.h> #include <asm/export.h> @@ -131,16 +132,19 @@ .endm SYM_TYPED_FUNC_START(ftrace_stub) + CALL_DEPTH_ACCOUNT RET SYM_FUNC_END(ftrace_stub) SYM_TYPED_FUNC_START(ftrace_stub_graph) + CALL_DEPTH_ACCOUNT RET SYM_FUNC_END(ftrace_stub_graph) #ifdef CONFIG_DYNAMIC_FTRACE SYM_FUNC_START(__fentry__) + CALL_DEPTH_ACCOUNT RET SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) @@ -149,6 +153,8 @@ SYM_FUNC_START(ftrace_caller) /* save_mcount_regs fills in first two parameters */ save_mcount_regs + CALL_DEPTH_ACCOUNT + /* Stack - skipping return address of ftrace_caller */ leaq MCOUNT_REG_SIZE+8(%rsp), %rcx movq %rcx, RSP(%rsp) @@ -164,6 +170,9 @@ SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) /* Only ops with REGS flag set should have CS register set */ movq $0, CS(%rsp) + /* Account for the function call below */ + CALL_DEPTH_ACCOUNT + SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) ANNOTATE_NOENDBR call ftrace_stub @@ -193,6 +202,8 @@ SYM_FUNC_START(ftrace_regs_caller) save_mcount_regs 8 /* save_mcount_regs fills in first two parameters */ + CALL_DEPTH_ACCOUNT + SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL) ANNOTATE_NOENDBR /* Load the ftrace_ops into the 3rd parameter */ @@ -223,6 +234,9 @@ SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL) /* regs go into 4th parameter */ leaq (%rsp), %rcx + /* Account for the function call below */ + CALL_DEPTH_ACCOUNT + SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) ANNOTATE_NOENDBR call ftrace_stub @@ -275,7 +289,20 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) /* Restore flags */ popfq UNWIND_HINT_FUNC - RET + + /* + * The above left an extra return value on the stack; effectively + * doing a tail-call without using a register. This PUSH;RET + * pattern unbalances the RSB, inject a pointless CALL to rebalance. + */ + ANNOTATE_INTRA_FUNCTION_CALL + CALL .Ldo_rebalance + int3 +.Ldo_rebalance: + add $8, %rsp + ALTERNATIVE __stringify(RET), \ + __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \ + X86_FEATURE_CALL_DEPTH SYM_FUNC_END(ftrace_regs_caller) STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) @@ -284,6 +311,8 @@ STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) #else /* ! 
CONFIG_DYNAMIC_FTRACE */ SYM_FUNC_START(__fentry__) + CALL_DEPTH_ACCOUNT + cmpq $ftrace_stub, ftrace_trace_function jnz trace RET @@ -337,6 +366,8 @@ SYM_CODE_START(return_to_handler) int3 .Ldo_rop: mov %rdi, (%rsp) - RET + ALTERNATIVE __stringify(RET), \ + __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \ + X86_FEATURE_CALL_DEPTH SYM_CODE_END(return_to_handler) #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 6a3cfaf6b72a..387e4b12e823 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -203,7 +203,7 @@ unsigned long __head __startup_64(unsigned long physaddr, load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map); /* Is the address not 2M aligned? */ - if (load_delta & ~PMD_PAGE_MASK) + if (load_delta & ~PMD_MASK) for (;;); /* Include the SME encryption mask in the fixup value */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 9b7acc9c7874..67c8ed99144b 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -261,16 +261,6 @@ SYM_FUNC_START(startup_32_smp) addl $__PAGE_OFFSET, %esp /* - * start system 32-bit setup. We need to re-do some of the things done - * in 16-bit mode for the "real" operations. - */ - movl setup_once_ref,%eax - andl %eax,%eax - jz 1f # Did we do this already? - call *%eax -1: - -/* * Check if it is 486 */ movb $4,X86 # at least 486 @@ -331,18 +321,7 @@ SYM_FUNC_END(startup_32_smp) #include "verify_cpu.S" -/* - * setup_once - * - * The setup work we only want to run on the BSP. - * - * Warning: %esi is live across this function. - */ __INIT -setup_once: - andl $0,setup_once_ref /* Once is enough, thanks */ - RET - SYM_FUNC_START(early_idt_handler_array) # 36(%esp) %eflags # 32(%esp) %cs @@ -458,7 +437,6 @@ SYM_DATA(early_recursion_flag, .long 0) __REFDATA .align 4 SYM_DATA(initial_code, .long i386_start_kernel) -SYM_DATA(setup_once_ref, .long setup_once) #ifdef CONFIG_PAGE_TABLE_ISOLATION #define PGD_ALIGN (2 * PAGE_SIZE) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d860d437631b..222efd4a09bc 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -370,6 +370,7 @@ SYM_CODE_END(secondary_startup_64) * start_secondary() via .Ljump_to_C_code. 
*/ SYM_CODE_START(start_cpu0) + ANNOTATE_NOENDBR UNWIND_HINT_EMPTY movq initial_stack(%rip), %rsp jmp .Ljump_to_C_code diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 668a4a6533d9..bbb0f737aab1 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -266,7 +266,7 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end) /* CPU entry erea is always used for CPU entry */ if (within_area(addr, end, CPU_ENTRY_AREA_BASE, - CPU_ENTRY_AREA_TOTAL_SIZE)) + CPU_ENTRY_AREA_MAP_SIZE)) return true; /* diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 15aefa3f3e18..3aa5304200c5 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -407,7 +407,7 @@ struct legacy_pic null_legacy_pic = { .make_irq = legacy_pic_uint_noop, }; -struct legacy_pic default_legacy_pic = { +static struct legacy_pic default_legacy_pic = { .nr_legacy_irqs = NR_IRQS_LEGACY, .chip = &i8259A_chip, .mask = mask_8259A_irq, diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 01833ebf5e8e..dc1049c01f9b 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -52,9 +52,6 @@ static inline int check_stack_overflow(void) { return 0; } static inline void print_stack_overflow(void) { } #endif -DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); -DEFINE_PER_CPU(struct irq_stack *, softirq_stack_ptr); - static void call_on_stack(void *func, void *stack) { asm volatile("xchgl %%ebx,%%esp \n" @@ -77,7 +74,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) u32 *isp, *prev_esp, arg1; curstk = (struct irq_stack *) current_stack(); - irqstk = __this_cpu_read(hardirq_stack_ptr); + irqstk = __this_cpu_read(pcpu_hot.hardirq_stack_ptr); /* * this is where we switch to the IRQ stack. 
However, if we are @@ -115,7 +112,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) int node = cpu_to_node(cpu); struct page *ph, *ps; - if (per_cpu(hardirq_stack_ptr, cpu)) + if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) return 0; ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); @@ -127,8 +124,8 @@ int irq_init_percpu_irqstack(unsigned int cpu) return -ENOMEM; } - per_cpu(hardirq_stack_ptr, cpu) = page_address(ph); - per_cpu(softirq_stack_ptr, cpu) = page_address(ps); + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = page_address(ph); + per_cpu(pcpu_hot.softirq_stack_ptr, cpu) = page_address(ps); return 0; } @@ -138,7 +135,7 @@ void do_softirq_own_stack(void) struct irq_stack *irqstk; u32 *isp, *prev_esp; - irqstk = __this_cpu_read(softirq_stack_ptr); + irqstk = __this_cpu_read(pcpu_hot.softirq_stack_ptr); /* build the stack frame on the softirq stack */ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 1c0fb96b9e39..fe0c859873d1 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -50,7 +50,7 @@ static int map_irq_stack(unsigned int cpu) return -ENOMEM; /* Store actual TOS to avoid adjustment in the hotpath */ - per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #else @@ -63,14 +63,14 @@ static int map_irq_stack(unsigned int cpu) void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); /* Store actual TOS to avoid adjustment in the hotpath */ - per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #endif int irq_init_percpu_irqstack(unsigned int cpu) { - if (per_cpu(hardirq_stack_ptr, cpu)) + if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) return 0; return map_irq_stack(cpu); } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index eb8bc82846b9..b36f3c367cb2 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -37,12 +37,14 @@ #include <linux/extable.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> +#include <linux/kgdb.h> #include <linux/ftrace.h> #include <linux/kasan.h> #include <linux/moduleloader.h> #include <linux/objtool.h> #include <linux/vmalloc.h> #include <linux/pgtable.h> +#include <linux/set_memory.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> @@ -51,7 +53,6 @@ #include <asm/alternative.h> #include <asm/insn.h> #include <asm/debugreg.h> -#include <asm/set_memory.h> #include <asm/ibt.h> #include "common.h" @@ -281,12 +282,15 @@ static int can_probe(unsigned long paddr) if (ret < 0) return 0; +#ifdef CONFIG_KGDB /* - * Another debugging subsystem might insert this breakpoint. - * In that case, we can't recover it. + * If there is a dynamically installed kgdb sw breakpoint, + * this function should not be probed. */ - if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) + if (insn.opcode.bytes[0] == INT3_INSN_OPCODE && + kgdb_has_hit_break(addr)) return 0; +#endif addr += insn.length; } @@ -414,18 +418,11 @@ void *alloc_insn_page(void) if (!page) return NULL; - set_vm_flush_reset_perms(page); - /* - * First make the page read-only, and only then make it executable to - * prevent it from being W+X in between. - */ - set_memory_ro((unsigned long)page, 1); - /* * TODO: Once additional kernel code protection mechanisms are set, ensure * that the page was not maliciously altered and it is still zeroed. 
*/ - set_memory_x((unsigned long)page, 1); + set_memory_rox((unsigned long)page, 1); return page; } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index e6b8c5362b94..e57e07b0edb6 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -15,6 +15,7 @@ #include <linux/extable.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> +#include <linux/kgdb.h> #include <linux/ftrace.h> #include <linux/objtool.h> #include <linux/pgtable.h> @@ -279,19 +280,6 @@ static int insn_is_indirect_jump(struct insn *insn) return ret; } -static bool is_padding_int3(unsigned long addr, unsigned long eaddr) -{ - unsigned char ops; - - for (; addr < eaddr; addr++) { - if (get_kernel_nofault(ops, (void *)addr) < 0 || - ops != INT3_INSN_OPCODE) - return false; - } - - return true; -} - /* Decode whole function to ensure any instructions don't jump into target */ static int can_optimize(unsigned long paddr) { @@ -334,15 +322,15 @@ static int can_optimize(unsigned long paddr) ret = insn_decode_kernel(&insn, (void *)recovered_insn); if (ret < 0) return 0; - +#ifdef CONFIG_KGDB /* - * In the case of detecting unknown breakpoint, this could be - * a padding INT3 between functions. Let's check that all the - * rest of the bytes are also INT3. + * If there is a dynamically installed kgdb sw breakpoint, + * this function should not be probed. */ - if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) - return is_padding_int3(addr, paddr - offset + size) ? 1 : 0; - + if (insn.opcode.bytes[0] == INT3_INSN_OPCODE && + kgdb_has_hit_break(addr)) + return 0; +#endif /* Recover address */ insn.kaddr = (void *)addr; insn.next_byte = (void *)(addr + insn.length); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index cf886f86038a..1cceac5984da 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -798,19 +798,13 @@ extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and * restoring to/from the stack. 
*/ -asm( -".pushsection .text;" -".global __raw_callee_save___kvm_vcpu_is_preempted;" -".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" -"__raw_callee_save___kvm_vcpu_is_preempted:" -ASM_ENDBR -"movq __per_cpu_offset(,%rdi,8), %rax;" -"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" -"setne %al;" -ASM_RET -".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;" -".popsection"); +#define PV_VCPU_PREEMPTED_ASM \ + "movq __per_cpu_offset(,%rdi,8), %rax\n\t" \ + "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \ + "setne %al\n\t" +DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted, + PV_VCPU_PREEMPTED_ASM, .text); #endif static void __init kvm_guest_init(void) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index c032edcd3d95..705fb2a41d7d 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -53,7 +53,7 @@ static unsigned long int get_module_load_offset(void) */ if (module_load_offset == 0) module_load_offset = - (prandom_u32_max(1024) + 1) * PAGE_SIZE; + get_random_u32_inclusive(1, 1024) * PAGE_SIZE; mutex_unlock(&module_kaslr_mutex); } return module_load_offset; @@ -74,10 +74,11 @@ void *module_alloc(unsigned long size) return NULL; p = __vmalloc_node_range(size, MODULE_ALIGN, - MODULES_VADDR + get_module_load_offset(), - MODULES_END, gfp_mask, - PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, - __builtin_return_address(0)); + MODULES_VADDR + get_module_load_offset(), + MODULES_END, gfp_mask, PAGE_KERNEL, + VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK, + NUMA_NO_NODE, __builtin_return_address(0)); + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; @@ -251,14 +252,13 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + const Elf_Shdr *s, *alt = NULL, *locks = NULL, *para = NULL, *orc = NULL, *orc_ip = NULL, - *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL; + *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, + *calls = NULL, *cfi = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - if (!strcmp(".text", secstrings + s->sh_name)) - text = s; if (!strcmp(".altinstructions", secstrings + s->sh_name)) alt = s; if (!strcmp(".smp_locks", secstrings + s->sh_name)) @@ -273,6 +273,10 @@ int module_finalize(const Elf_Ehdr *hdr, retpolines = s; if (!strcmp(".return_sites", secstrings + s->sh_name)) returns = s; + if (!strcmp(".call_sites", secstrings + s->sh_name)) + calls = s; + if (!strcmp(".cfi_sites", secstrings + s->sh_name)) + cfi = s; if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name)) ibt_endbr = s; } @@ -285,6 +289,22 @@ int module_finalize(const Elf_Ehdr *hdr, void *pseg = (void *)para->sh_addr; apply_paravirt(pseg, pseg + para->sh_size); } + if (retpolines || cfi) { + void *rseg = NULL, *cseg = NULL; + unsigned int rsize = 0, csize = 0; + + if (retpolines) { + rseg = (void *)retpolines->sh_addr; + rsize = retpolines->sh_size; + } + + if (cfi) { + cseg = (void *)cfi->sh_addr; + csize = cfi->sh_size; + } + + apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize); + } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; apply_retpolines(rseg, rseg + retpolines->sh_size); @@ -298,16 +318,32 @@ int module_finalize(const Elf_Ehdr *hdr, void *aseg = (void *)alt->sh_addr; apply_alternatives(aseg, aseg + alt->sh_size); } + if 
(calls || para) { + struct callthunk_sites cs = {}; + + if (calls) { + cs.call_start = (void *)calls->sh_addr; + cs.call_end = (void *)calls->sh_addr + calls->sh_size; + } + + if (para) { + cs.pv_start = (void *)para->sh_addr; + cs.pv_end = (void *)para->sh_addr + para->sh_size; + } + + callthunks_patch_module_calls(&cs, me); + } if (ibt_endbr) { void *iseg = (void *)ibt_endbr->sh_addr; apply_ibt_endbr(iseg, iseg + ibt_endbr->sh_size); } - if (locks && text) { + if (locks) { void *lseg = (void *)locks->sh_addr; - void *tseg = (void *)text->sh_addr; + void *text = me->core_layout.base; + void *text_end = text + me->core_layout.text_size; alternatives_smp_module_add(me, me->name, lseg, lseg + locks->sh_size, - tseg, tseg + text->sh_size); + text, text_end); } if (orc && orc_ip) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index ed8ac6bcbafb..708751311786 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -250,7 +250,7 @@ static int msr_device_destroy(unsigned int cpu) return 0; } -static char *msr_devnode(struct device *dev, umode_t *mode) +static char *msr_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 7ca2d46c08cc..327757afb027 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -37,27 +37,10 @@ * nop stub, which must not clobber anything *including the stack* to * avoid confusing the entry prologues. */ -extern void _paravirt_nop(void); -asm (".pushsection .entry.text, \"ax\"\n" - ".global _paravirt_nop\n" - "_paravirt_nop:\n\t" - ASM_ENDBR - ASM_RET - ".size _paravirt_nop, . - _paravirt_nop\n\t" - ".type _paravirt_nop, @function\n\t" - ".popsection"); +DEFINE_PARAVIRT_ASM(_paravirt_nop, "", .entry.text); /* stub always returning 0. */ -asm (".pushsection .entry.text, \"ax\"\n" - ".global paravirt_ret0\n" - "paravirt_ret0:\n\t" - ASM_ENDBR - "xor %" _ASM_AX ", %" _ASM_AX ";\n\t" - ASM_RET - ".size paravirt_ret0, . - paravirt_ret0\n\t" - ".type paravirt_ret0, @function\n\t" - ".popsection"); - +DEFINE_PARAVIRT_ASM(paravirt_ret0, "xor %eax,%eax", .entry.text); void __init default_banner(void) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c21b7347a26d..40d156a31676 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -600,7 +600,7 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, } if (updmsr) - write_spec_ctrl_current(msr, false); + update_spec_ctrl_cond(msr); } static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) @@ -965,7 +965,7 @@ early_param("idle", idle_setup); unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= prandom_u32_max(8192); + sp -= get_random_u32_below(8192); return sp & ~0xf; } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 2f314b170c9f..470c128759ea 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -191,13 +191,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) arch_end_context_switch(next_p); /* - * Reload esp0 and cpu_current_top_of_stack. This changes + * Reload esp0 and pcpu_hot.top_of_stack. This changes * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. 
*/ update_task_stack(next_p); refresh_sysenter_cs(next); - this_cpu_write(cpu_current_top_of_stack, + this_cpu_write(pcpu_hot.top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); @@ -207,7 +207,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) loadsegment(gs, next->gs); - this_cpu_write(current_task, next_p); + raw_cpu_write(pcpu_hot.current_task, next_p); switch_fpu_finish(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6b3418bff326..4e34b3b68ebd 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -165,7 +165,7 @@ static noinstr unsigned long __rdgsbase_inactive(void) lockdep_assert_irqs_disabled(); - if (!static_cpu_has(X86_FEATURE_XENPV)) { + if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); gsbase = rdgsbase(); native_swapgs(); @@ -190,7 +190,7 @@ static noinstr void __wrgsbase_inactive(unsigned long gsbase) { lockdep_assert_irqs_disabled(); - if (!static_cpu_has(X86_FEATURE_XENPV)) { + if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); wrgsbase(gsbase); native_swapgs(); @@ -563,7 +563,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && - this_cpu_read(hardirq_stack_inuse)); + this_cpu_read(pcpu_hot.hardirq_stack_inuse)); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_prepare(prev_fpu, cpu); @@ -617,8 +617,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. */ - this_cpu_write(current_task, next_p); - this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); + raw_cpu_write(pcpu_hot.current_task, next_p); + raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p)); switch_fpu_finish(); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 37c12fb92906..dfaa270a7cc9 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -44,16 +44,35 @@ #include "tls.h" -enum x86_regset { - REGSET_GENERAL, - REGSET_FP, - REGSET_XFP, - REGSET_IOPERM64 = REGSET_XFP, - REGSET_XSTATE, - REGSET_TLS, - REGSET_IOPERM32, +enum x86_regset_32 { + REGSET32_GENERAL, + REGSET32_FP, + REGSET32_XFP, + REGSET32_XSTATE, + REGSET32_TLS, + REGSET32_IOPERM, }; +enum x86_regset_64 { + REGSET64_GENERAL, + REGSET64_FP, + REGSET64_IOPERM, + REGSET64_XSTATE, +}; + +#define REGSET_GENERAL \ +({ \ + BUILD_BUG_ON((int)REGSET32_GENERAL != (int)REGSET64_GENERAL); \ + REGSET32_GENERAL; \ +}) + +#define REGSET_FP \ +({ \ + BUILD_BUG_ON((int)REGSET32_FP != (int)REGSET64_FP); \ + REGSET32_FP; \ +}) + + struct pt_regs_offset { const char *name; int offset; @@ -788,13 +807,13 @@ long arch_ptrace(struct task_struct *child, long request, #ifdef CONFIG_X86_32 case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ return copy_regset_to_user(child, &user_x86_32_view, - REGSET_XFP, + REGSET32_XFP, 0, sizeof(struct user_fxsr_struct), datap) ? -EIO : 0; case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ return copy_regset_from_user(child, &user_x86_32_view, - REGSET_XFP, + REGSET32_XFP, 0, sizeof(struct user_fxsr_struct), datap) ? -EIO : 0; #endif @@ -1086,13 +1105,13 @@ static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request, case PTRACE_GETFPXREGS: /* Get the child extended FPU state. 
*/ return copy_regset_to_user(child, &user_x86_32_view, - REGSET_XFP, 0, + REGSET32_XFP, 0, sizeof(struct user32_fxsr_struct), datap); case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ return copy_regset_from_user(child, &user_x86_32_view, - REGSET_XFP, 0, + REGSET32_XFP, 0, sizeof(struct user32_fxsr_struct), datap); @@ -1215,29 +1234,38 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, #ifdef CONFIG_X86_64 static struct user_regset x86_64_regsets[] __ro_after_init = { - [REGSET_GENERAL] = { - .core_note_type = NT_PRSTATUS, - .n = sizeof(struct user_regs_struct) / sizeof(long), - .size = sizeof(long), .align = sizeof(long), - .regset_get = genregs_get, .set = genregs_set + [REGSET64_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct) / sizeof(long), + .size = sizeof(long), + .align = sizeof(long), + .regset_get = genregs_get, + .set = genregs_set }, - [REGSET_FP] = { - .core_note_type = NT_PRFPREG, - .n = sizeof(struct fxregs_state) / sizeof(long), - .size = sizeof(long), .align = sizeof(long), - .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set + [REGSET64_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct fxregs_state) / sizeof(long), + .size = sizeof(long), + .align = sizeof(long), + .active = regset_xregset_fpregs_active, + .regset_get = xfpregs_get, + .set = xfpregs_set }, - [REGSET_XSTATE] = { - .core_note_type = NT_X86_XSTATE, - .size = sizeof(u64), .align = sizeof(u64), - .active = xstateregs_active, .regset_get = xstateregs_get, - .set = xstateregs_set + [REGSET64_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), + .align = sizeof(u64), + .active = xstateregs_active, + .regset_get = xstateregs_get, + .set = xstateregs_set }, - [REGSET_IOPERM64] = { - .core_note_type = NT_386_IOPERM, - .n = IO_BITMAP_LONGS, - .size = sizeof(long), .align = sizeof(long), - .active = ioperm_active, .regset_get = ioperm_get + [REGSET64_IOPERM] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_LONGS, + .size = sizeof(long), + .align = sizeof(long), + .active = ioperm_active, + .regset_get = ioperm_get }, }; @@ -1256,43 +1284,57 @@ static const struct user_regset_view user_x86_64_view = { #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static struct user_regset x86_32_regsets[] __ro_after_init = { - [REGSET_GENERAL] = { - .core_note_type = NT_PRSTATUS, - .n = sizeof(struct user_regs_struct32) / sizeof(u32), - .size = sizeof(u32), .align = sizeof(u32), - .regset_get = genregs32_get, .set = genregs32_set + [REGSET32_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct32) / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .regset_get = genregs32_get, + .set = genregs32_set }, - [REGSET_FP] = { - .core_note_type = NT_PRFPREG, - .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), - .size = sizeof(u32), .align = sizeof(u32), - .active = regset_fpregs_active, .regset_get = fpregs_get, .set = fpregs_set + [REGSET32_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .active = regset_fpregs_active, + .regset_get = fpregs_get, + .set = fpregs_set }, - [REGSET_XFP] = { - .core_note_type = NT_PRXFPREG, - .n = sizeof(struct fxregs_state) / sizeof(u32), - .size = sizeof(u32), .align = sizeof(u32), - .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set + [REGSET32_XFP] = { + 
.core_note_type = NT_PRXFPREG, + .n = sizeof(struct fxregs_state) / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .active = regset_xregset_fpregs_active, + .regset_get = xfpregs_get, + .set = xfpregs_set }, - [REGSET_XSTATE] = { - .core_note_type = NT_X86_XSTATE, - .size = sizeof(u64), .align = sizeof(u64), - .active = xstateregs_active, .regset_get = xstateregs_get, - .set = xstateregs_set + [REGSET32_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), + .align = sizeof(u64), + .active = xstateregs_active, + .regset_get = xstateregs_get, + .set = xstateregs_set }, - [REGSET_TLS] = { - .core_note_type = NT_386_TLS, - .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, - .size = sizeof(struct user_desc), - .align = sizeof(struct user_desc), - .active = regset_tls_active, - .regset_get = regset_tls_get, .set = regset_tls_set + [REGSET32_TLS] = { + .core_note_type = NT_386_TLS, + .n = GDT_ENTRY_TLS_ENTRIES, + .bias = GDT_ENTRY_TLS_MIN, + .size = sizeof(struct user_desc), + .align = sizeof(struct user_desc), + .active = regset_tls_active, + .regset_get = regset_tls_get, + .set = regset_tls_set }, - [REGSET_IOPERM32] = { - .core_note_type = NT_386_IOPERM, - .n = IO_BITMAP_BYTES / sizeof(u32), - .size = sizeof(u32), .align = sizeof(u32), - .active = ioperm_active, .regset_get = ioperm_get + [REGSET32_IOPERM] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_BYTES / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .active = ioperm_active, + .regset_get = ioperm_get }, }; @@ -1311,10 +1353,10 @@ u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) { #ifdef CONFIG_X86_64 - x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64); + x86_64_regsets[REGSET64_XSTATE].n = size / sizeof(u64); #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION - x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64); + x86_32_regsets[REGSET32_XSTATE].n = size / sizeof(u64); #endif xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; } diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 4809c0dc4eb0..4a73351f87f8 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -41,6 +41,7 @@ .text .align PAGE_SIZE .code64 +SYM_CODE_START_NOALIGN(relocate_range) SYM_CODE_START_NOALIGN(relocate_kernel) UNWIND_HINT_EMPTY ANNOTATE_NOENDBR @@ -312,5 +313,5 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) int3 SYM_CODE_END(swap_pages) - .globl kexec_control_code_size -.set kexec_control_code_size, . - relocate_kernel + .skip KEXEC_CONTROL_CODE_MAX_SIZE - (. - relocate_kernel), 0xcc +SYM_CODE_END(relocate_range); diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index bba1abd05bfe..79bc8a97a083 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -42,8 +42,16 @@ static void remove_e820_regions(struct resource *avail) resource_clip(avail, e820_start, e820_end); if (orig.start != avail->start || orig.end != avail->end) { - pr_info("clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n", - &orig, avail, e820_start, e820_end); + pr_info("resource: avoiding allocation from e820 entry [mem %#010Lx-%#010Lx]\n", + e820_start, e820_end); + if (avail->end > avail->start) + /* + * Use %pa instead of %pR because "avail" + * is typically IORESOURCE_UNSET, so %pR + * shows the size instead of addresses. 
+ */ + pr_info("resource: remaining [mem %pa-%pa] available\n", + &avail->start, &avail->end); orig = *avail; } } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 216fee7144ee..88188549647c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -31,9 +31,11 @@ #include <xen/xen.h> #include <asm/apic.h> +#include <asm/efi.h> #include <asm/numa.h> #include <asm/bios_ebda.h> #include <asm/bugs.h> +#include <asm/cacheinfo.h> #include <asm/cpu.h> #include <asm/efi.h> #include <asm/gart.h> @@ -1074,24 +1076,13 @@ void __init setup_arch(char **cmdline_p) max_pfn = e820__end_of_ram_pfn(); /* update e820 for memory not covered by WB MTRRs */ - if (IS_ENABLED(CONFIG_MTRR)) - mtrr_bp_init(); - else - pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel."); - + cache_bp_init(); if (mtrr_trim_uncached_memory(max_pfn)) max_pfn = e820__end_of_ram_pfn(); max_possible_pfn = max_pfn; /* - * This call is required when the CPU does not support PAT. If - * mtrr_bp_init() invoked it already via pat_init() the call has no - * effect. - */ - init_cache_modes(); - - /* * Define random base addresses for memory sections after max_pfn is * defined and before each memory section base is used. */ @@ -1175,7 +1166,7 @@ void __init setup_arch(char **cmdline_p) * Moreover, on machines with SandyBridge graphics or in setups that use * crashkernel the entire 1M is reserved anyway. */ - reserve_real_mode(); + x86_platform.realmode_reserve(); init_mem_mapping(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 49325caa7307..c242dc47e9cb 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -11,6 +11,7 @@ #include <linux/smp.h> #include <linux/topology.h> #include <linux/pfn.h> +#include <linux/stackprotector.h> #include <asm/sections.h> #include <asm/processor.h> #include <asm/desc.h> @@ -21,10 +22,6 @@ #include <asm/proto.h> #include <asm/cpumask.h> #include <asm/cpu.h> -#include <asm/stackprotector.h> - -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); -EXPORT_PER_CPU_SYMBOL(cpu_number); #ifdef CONFIG_X86_64 #define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load) @@ -172,7 +169,7 @@ void __init setup_per_cpu_areas(void) for_each_possible_cpu(cpu) { per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); - per_cpu(cpu_number, cpu) = cpu; + per_cpu(pcpu_hot.cpu_number, cpu) = cpu; setup_percpu_segment(cpu); /* * Copy data used in early init routines from the @@ -211,7 +208,7 @@ void __init setup_per_cpu_areas(void) * area. Reload any changed state for the boot CPU. 
*/ if (!cpu) - switch_to_new_gdt(cpu); + switch_gdt_and_percpu_base(cpu); } /* indicate the early static arrays will soon be gone */ diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index a428c62330d3..679026a640ef 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -1536,32 +1536,32 @@ static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { struct insn *insn = &ctxt->insn; + enum insn_mmio_type mmio; unsigned int bytes = 0; - enum mmio_type mmio; enum es_result ret; u8 sign_byte; long *reg_data; mmio = insn_decode_mmio(insn, &bytes); - if (mmio == MMIO_DECODE_FAILED) + if (mmio == INSN_MMIO_DECODE_FAILED) return ES_DECODE_FAILED; - if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { + if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) { reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs); if (!reg_data) return ES_DECODE_FAILED; } switch (mmio) { - case MMIO_WRITE: + case INSN_MMIO_WRITE: memcpy(ghcb->shared_buffer, reg_data, bytes); ret = vc_do_mmio(ghcb, ctxt, bytes, false); break; - case MMIO_WRITE_IMM: + case INSN_MMIO_WRITE_IMM: memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); ret = vc_do_mmio(ghcb, ctxt, bytes, false); break; - case MMIO_READ: + case INSN_MMIO_READ: ret = vc_do_mmio(ghcb, ctxt, bytes, true); if (ret) break; @@ -1572,7 +1572,7 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) memcpy(reg_data, ghcb->shared_buffer, bytes); break; - case MMIO_READ_ZERO_EXTEND: + case INSN_MMIO_READ_ZERO_EXTEND: ret = vc_do_mmio(ghcb, ctxt, bytes, true); if (ret) break; @@ -1581,7 +1581,7 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) memset(reg_data, 0, insn->opnd_bytes); memcpy(reg_data, ghcb->shared_buffer, bytes); break; - case MMIO_READ_SIGN_EXTEND: + case INSN_MMIO_READ_SIGN_EXTEND: ret = vc_do_mmio(ghcb, ctxt, bytes, true); if (ret) break; @@ -1600,7 +1600,7 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) memset(reg_data, sign_byte, insn->opnd_bytes); memcpy(reg_data, ghcb->shared_buffer, bytes); break; - case MMIO_MOVS: + case INSN_MMIO_MOVS: ret = vc_handle_mmio_movs(ctxt, bytes); break; default: diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 9c7265b524c7..1504eb8d25aa 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -37,180 +37,27 @@ #include <asm/sighandling.h> #include <asm/vm86.h> -#ifdef CONFIG_X86_64 -#include <linux/compat.h> -#include <asm/proto.h> -#include <asm/ia32_unistd.h> -#include <asm/fpu/xstate.h> -#endif /* CONFIG_X86_64 */ - #include <asm/syscall.h> #include <asm/sigframe.h> #include <asm/signal.h> -#ifdef CONFIG_X86_64 -/* - * If regs->ss will cause an IRET fault, change it. Otherwise leave it - * alone. Using this generally makes no sense unless - * user_64bit_mode(regs) would return true. - */ -static void force_valid_ss(struct pt_regs *regs) +static inline int is_ia32_compat_frame(struct ksignal *ksig) { - u32 ar; - asm volatile ("lar %[old_ss], %[ar]\n\t" - "jz 1f\n\t" /* If invalid: */ - "xorl %[ar], %[ar]\n\t" /* set ar = 0 */ - "1:" - : [ar] "=r" (ar) - : [old_ss] "rm" ((u16)regs->ss)); - - /* - * For a valid 64-bit user context, we need DPL 3, type - * read-write data or read-write exp-down data, and S and P - * set. We can't use VERW because VERW doesn't check the - * P bit. 
- */ - ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK; - if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) && - ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) - regs->ss = __USER_DS; + return IS_ENABLED(CONFIG_IA32_EMULATION) && + ksig->ka.sa.sa_flags & SA_IA32_ABI; } -# define CONTEXT_COPY_SIZE offsetof(struct sigcontext, reserved1) -#else -# define CONTEXT_COPY_SIZE sizeof(struct sigcontext) -#endif -static bool restore_sigcontext(struct pt_regs *regs, - struct sigcontext __user *usc, - unsigned long uc_flags) +static inline int is_ia32_frame(struct ksignal *ksig) { - struct sigcontext sc; - - /* Always make any pending restarted system calls return -EINTR */ - current->restart_block.fn = do_no_restart_syscall; - - if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE)) - return false; - -#ifdef CONFIG_X86_32 - loadsegment(gs, sc.gs); - regs->fs = sc.fs; - regs->es = sc.es; - regs->ds = sc.ds; -#endif /* CONFIG_X86_32 */ - - regs->bx = sc.bx; - regs->cx = sc.cx; - regs->dx = sc.dx; - regs->si = sc.si; - regs->di = sc.di; - regs->bp = sc.bp; - regs->ax = sc.ax; - regs->sp = sc.sp; - regs->ip = sc.ip; - -#ifdef CONFIG_X86_64 - regs->r8 = sc.r8; - regs->r9 = sc.r9; - regs->r10 = sc.r10; - regs->r11 = sc.r11; - regs->r12 = sc.r12; - regs->r13 = sc.r13; - regs->r14 = sc.r14; - regs->r15 = sc.r15; -#endif /* CONFIG_X86_64 */ - - /* Get CS/SS and force CPL3 */ - regs->cs = sc.cs | 0x03; - regs->ss = sc.ss | 0x03; - - regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); - /* disable syscall checks */ - regs->orig_ax = -1; - -#ifdef CONFIG_X86_64 - /* - * Fix up SS if needed for the benefit of old DOSEMU and - * CRIU. - */ - if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) - force_valid_ss(regs); -#endif - - return fpu__restore_sig((void __user *)sc.fpstate, - IS_ENABLED(CONFIG_X86_32)); + return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig); } -static __always_inline int -__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, - struct pt_regs *regs, unsigned long mask) +static inline int is_x32_frame(struct ksignal *ksig) { -#ifdef CONFIG_X86_32 - unsigned int gs; - savesegment(gs, gs); - - unsafe_put_user(gs, (unsigned int __user *)&sc->gs, Efault); - unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); - unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); - unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); -#endif /* CONFIG_X86_32 */ - - unsafe_put_user(regs->di, &sc->di, Efault); - unsafe_put_user(regs->si, &sc->si, Efault); - unsafe_put_user(regs->bp, &sc->bp, Efault); - unsafe_put_user(regs->sp, &sc->sp, Efault); - unsafe_put_user(regs->bx, &sc->bx, Efault); - unsafe_put_user(regs->dx, &sc->dx, Efault); - unsafe_put_user(regs->cx, &sc->cx, Efault); - unsafe_put_user(regs->ax, &sc->ax, Efault); -#ifdef CONFIG_X86_64 - unsafe_put_user(regs->r8, &sc->r8, Efault); - unsafe_put_user(regs->r9, &sc->r9, Efault); - unsafe_put_user(regs->r10, &sc->r10, Efault); - unsafe_put_user(regs->r11, &sc->r11, Efault); - unsafe_put_user(regs->r12, &sc->r12, Efault); - unsafe_put_user(regs->r13, &sc->r13, Efault); - unsafe_put_user(regs->r14, &sc->r14, Efault); - unsafe_put_user(regs->r15, &sc->r15, Efault); -#endif /* CONFIG_X86_64 */ - - unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); - unsafe_put_user(current->thread.error_code, &sc->err, Efault); - unsafe_put_user(regs->ip, &sc->ip, Efault); -#ifdef CONFIG_X86_32 - unsafe_put_user(regs->cs, (unsigned int __user 
*)&sc->cs, Efault); - unsafe_put_user(regs->flags, &sc->flags, Efault); - unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault); - unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault); -#else /* !CONFIG_X86_32 */ - unsafe_put_user(regs->flags, &sc->flags, Efault); - unsafe_put_user(regs->cs, &sc->cs, Efault); - unsafe_put_user(0, &sc->gs, Efault); - unsafe_put_user(0, &sc->fs, Efault); - unsafe_put_user(regs->ss, &sc->ss, Efault); -#endif /* CONFIG_X86_32 */ - - unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault); - - /* non-iBCS2 extensions.. */ - unsafe_put_user(mask, &sc->oldmask, Efault); - unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); - return 0; -Efault: - return -EFAULT; + return IS_ENABLED(CONFIG_X86_X32_ABI) && + ksig->ka.sa.sa_flags & SA_X32_ABI; } -#define unsafe_put_sigcontext(sc, fp, regs, set, label) \ -do { \ - if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \ - goto label; \ -} while(0); - -#define unsafe_put_sigmask(set, frame, label) \ - unsafe_put_user(*(__u64 *)(set), \ - (__u64 __user *)&(frame)->uc.uc_sigmask, \ - label) - /* * Set up a signal frame. */ @@ -223,24 +70,12 @@ do { \ /* * Determine which stack to use.. */ -static unsigned long align_sigframe(unsigned long sp) -{ -#ifdef CONFIG_X86_32 - /* - * Align the stack pointer according to the i386 ABI, - * i.e. so that on function entry ((sp + 4) & 15) == 0. - */ - sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4; -#else /* !CONFIG_X86_32 */ - sp = round_down(sp, FRAME_ALIGNMENT) - 8; -#endif - return sp; -} - -static void __user * -get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, +void __user * +get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { + struct k_sigaction *ka = &ksig->ka; + int ia32_frame = is_ia32_frame(ksig); /* Default to using normal stack */ bool nested_altstack = on_sig_stack(regs->sp); bool entering_altstack = false; @@ -249,7 +84,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, unsigned long buf_fx = 0; /* redzone */ - if (IS_ENABLED(CONFIG_X86_64)) + if (!ia32_frame) sp -= 128; /* This is the X/Open sanctioned signal stack switching. */ @@ -263,7 +98,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, sp = current->sas_ss_sp + current->sas_ss_size; entering_altstack = true; } - } else if (IS_ENABLED(CONFIG_X86_32) && + } else if (ia32_frame && !nested_altstack && regs->ss != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && @@ -273,11 +108,19 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, entering_altstack = true; } - sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), - &buf_fx, &math_size); + sp = fpu__alloc_mathframe(sp, ia32_frame, &buf_fx, &math_size); *fpstate = (void __user *)sp; - sp = align_sigframe(sp - frame_size); + sp -= frame_size; + + if (ia32_frame) + /* + * Align the stack pointer according to the i386 ABI, + * i.e. so that on function entry ((sp + 4) & 15) == 0. + */ + sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4; + else + sp = round_down(sp, FRAME_ALIGNMENT) - 8; /* * If we are on the alternate signal stack and would overflow it, don't. 
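The alignment arithmetic folded into get_sigframe() in the hunk above is easy to check in isolation: the ia32 branch makes ((sp + 4) & 15) == 0 at handler entry, and the 64-bit branch leaves (sp + 8) a multiple of 16. Below is a small standalone check of just that arithmetic, assuming FRAME_ALIGNMENT is 16 (consistent with the "(sp + 4) & 15" comment in the hunk); it is an illustration, not kernel code.

/* Verify the two sigframe stack-pointer alignment invariants. */
#include <assert.h>
#include <stdio.h>

#define FRAME_ALIGNMENT 16UL	/* assumed value, implied by the comment above */

int main(void)
{
	for (unsigned long sp = 0x1000; sp < 0x1020; sp++) {
		/* ia32 frame: (sp + 4) is 16-byte aligned after adjustment */
		unsigned long sp32 = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
		assert((sp32 + 4) % 16 == 0);

		/* 64-bit frame: round_down(sp, 16) - 8 leaves (sp + 8) 16-byte aligned */
		unsigned long sp64 = (sp & ~(FRAME_ALIGNMENT - 1)) - 8;
		assert((sp64 + 8) % 16 == 0);
	}
	printf("alignment invariants hold\n");
	return 0;
}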
@@ -300,391 +143,6 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, return (void __user *)sp; } -#ifdef CONFIG_X86_32 -static const struct { - u16 poplmovl; - u32 val; - u16 int80; -} __attribute__((packed)) retcode = { - 0xb858, /* popl %eax; movl $..., %eax */ - __NR_sigreturn, - 0x80cd, /* int $0x80 */ -}; - -static const struct { - u8 movl; - u32 val; - u16 int80; - u8 pad; -} __attribute__((packed)) rt_retcode = { - 0xb8, /* movl $..., %eax */ - __NR_rt_sigreturn, - 0x80cd, /* int $0x80 */ - 0 -}; - -static int -__setup_frame(int sig, struct ksignal *ksig, sigset_t *set, - struct pt_regs *regs) -{ - struct sigframe __user *frame; - void __user *restorer; - void __user *fp = NULL; - - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - - if (!user_access_begin(frame, sizeof(*frame))) - return -EFAULT; - - unsafe_put_user(sig, &frame->sig, Efault); - unsafe_put_sigcontext(&frame->sc, fp, regs, set, Efault); - unsafe_put_user(set->sig[1], &frame->extramask[0], Efault); - if (current->mm->context.vdso) - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_sigreturn; - else - restorer = &frame->retcode; - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; - - /* Set up to return from userspace. */ - unsafe_put_user(restorer, &frame->pretcode, Efault); - - /* - * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. - */ - unsafe_put_user(*((u64 *)&retcode), (u64 *)frame->retcode, Efault); - user_access_end(); - - /* Set up registers for signal handler */ - regs->sp = (unsigned long)frame; - regs->ip = (unsigned long)ksig->ka.sa.sa_handler; - regs->ax = (unsigned long)sig; - regs->dx = 0; - regs->cx = 0; - - regs->ds = __USER_DS; - regs->es = __USER_DS; - regs->ss = __USER_DS; - regs->cs = __USER_CS; - - return 0; - -Efault: - user_access_end(); - return -EFAULT; -} - -static int __setup_rt_frame(int sig, struct ksignal *ksig, - sigset_t *set, struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - void __user *restorer; - void __user *fp = NULL; - - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - - if (!user_access_begin(frame, sizeof(*frame))) - return -EFAULT; - - unsafe_put_user(sig, &frame->sig, Efault); - unsafe_put_user(&frame->info, &frame->pinfo, Efault); - unsafe_put_user(&frame->uc, &frame->puc, Efault); - - /* Create the ucontext. */ - if (static_cpu_has(X86_FEATURE_XSAVE)) - unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault); - else - unsafe_put_user(0, &frame->uc.uc_flags, Efault); - unsafe_put_user(0, &frame->uc.uc_link, Efault); - unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); - - /* Set up to return from userspace. */ - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_rt_sigreturn; - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; - unsafe_put_user(restorer, &frame->pretcode, Efault); - - /* - * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. 
- */ - unsafe_put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode, Efault); - unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); - unsafe_put_sigmask(set, frame, Efault); - user_access_end(); - - if (copy_siginfo_to_user(&frame->info, &ksig->info)) - return -EFAULT; - - /* Set up registers for signal handler */ - regs->sp = (unsigned long)frame; - regs->ip = (unsigned long)ksig->ka.sa.sa_handler; - regs->ax = (unsigned long)sig; - regs->dx = (unsigned long)&frame->info; - regs->cx = (unsigned long)&frame->uc; - - regs->ds = __USER_DS; - regs->es = __USER_DS; - regs->ss = __USER_DS; - regs->cs = __USER_CS; - - return 0; -Efault: - user_access_end(); - return -EFAULT; -} -#else /* !CONFIG_X86_32 */ -static unsigned long frame_uc_flags(struct pt_regs *regs) -{ - unsigned long flags; - - if (boot_cpu_has(X86_FEATURE_XSAVE)) - flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; - else - flags = UC_SIGCONTEXT_SS; - - if (likely(user_64bit_mode(regs))) - flags |= UC_STRICT_RESTORE_SS; - - return flags; -} - -static int __setup_rt_frame(int sig, struct ksignal *ksig, - sigset_t *set, struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - void __user *fp = NULL; - unsigned long uc_flags; - - /* x86-64 should always use SA_RESTORER. */ - if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) - return -EFAULT; - - frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); - uc_flags = frame_uc_flags(regs); - - if (!user_access_begin(frame, sizeof(*frame))) - return -EFAULT; - - /* Create the ucontext. */ - unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); - unsafe_put_user(0, &frame->uc.uc_link, Efault); - unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); - unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); - unsafe_put_sigmask(set, frame, Efault); - user_access_end(); - - if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (copy_siginfo_to_user(&frame->info, &ksig->info)) - return -EFAULT; - } - - /* Set up registers for signal handler */ - regs->di = sig; - /* In case the signal handler was declared without prototypes */ - regs->ax = 0; - - /* This also works for non SA_SIGINFO handlers because they expect the - next argument after the signal number on the stack. */ - regs->si = (unsigned long)&frame->info; - regs->dx = (unsigned long)&frame->uc; - regs->ip = (unsigned long) ksig->ka.sa.sa_handler; - - regs->sp = (unsigned long)frame; - - /* - * Set up the CS and SS registers to run signal handlers in - * 64-bit mode, even if the handler happens to be interrupting - * 32-bit or 16-bit code. - * - * SS is subtle. In 64-bit mode, we don't need any particular - * SS descriptor, but we do need SS to be valid. It's possible - * that the old SS is entirely bogus -- this can happen if the - * signal we're trying to deliver is #GP or #SS caused by a bad - * SS value. We also have a compatibility issue here: DOSEMU - * relies on the contents of the SS register indicating the - * SS value at the time of the signal, even though that code in - * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU - * avoids relying on sigreturn to restore SS; instead it uses - * a trampoline.) So we do our best: if the old SS was valid, - * we keep it. Otherwise we replace it. 
- */ - regs->cs = __USER_CS; - - if (unlikely(regs->ss != __USER_DS)) - force_valid_ss(regs); - - return 0; - -Efault: - user_access_end(); - return -EFAULT; -} -#endif /* CONFIG_X86_32 */ - -#ifdef CONFIG_X86_X32_ABI -static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to, - const struct kernel_siginfo *from) -{ - struct compat_siginfo new; - - copy_siginfo_to_external32(&new, from); - if (from->si_signo == SIGCHLD) { - new._sifields._sigchld_x32._utime = from->si_utime; - new._sifields._sigchld_x32._stime = from->si_stime; - } - if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) - return -EFAULT; - return 0; -} - -int copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from) -{ - if (in_x32_syscall()) - return x32_copy_siginfo_to_user(to, from); - return __copy_siginfo_to_user32(to, from); -} -#endif /* CONFIG_X86_X32_ABI */ - -static int x32_setup_rt_frame(struct ksignal *ksig, - compat_sigset_t *set, - struct pt_regs *regs) -{ -#ifdef CONFIG_X86_X32_ABI - struct rt_sigframe_x32 __user *frame; - unsigned long uc_flags; - void __user *restorer; - void __user *fp = NULL; - - if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) - return -EFAULT; - - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - - uc_flags = frame_uc_flags(regs); - - if (!user_access_begin(frame, sizeof(*frame))) - return -EFAULT; - - /* Create the ucontext. */ - unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); - unsafe_put_user(0, &frame->uc.uc_link, Efault); - unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); - unsafe_put_user(0, &frame->uc.uc__pad0, Efault); - restorer = ksig->ka.sa.sa_restorer; - unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); - unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); - unsafe_put_sigmask(set, frame, Efault); - user_access_end(); - - if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (x32_copy_siginfo_to_user(&frame->info, &ksig->info)) - return -EFAULT; - } - - /* Set up registers for signal handler */ - regs->sp = (unsigned long) frame; - regs->ip = (unsigned long) ksig->ka.sa.sa_handler; - - /* We use the x32 calling convention here... */ - regs->di = ksig->sig; - regs->si = (unsigned long) &frame->info; - regs->dx = (unsigned long) &frame->uc; - - loadsegment(ds, __USER_DS); - loadsegment(es, __USER_DS); - - regs->cs = __USER_CS; - regs->ss = __USER_DS; -#endif /* CONFIG_X86_X32_ABI */ - - return 0; -#ifdef CONFIG_X86_X32_ABI -Efault: - user_access_end(); - return -EFAULT; -#endif -} - -/* - * Do a signal return; undo the signal stack. - */ -#ifdef CONFIG_X86_32 -SYSCALL_DEFINE0(sigreturn) -{ - struct pt_regs *regs = current_pt_regs(); - struct sigframe __user *frame; - sigset_t set; - - frame = (struct sigframe __user *)(regs->sp - 8); - - if (!access_ok(frame, sizeof(*frame))) - goto badframe; - if (__get_user(set.sig[0], &frame->sc.oldmask) || - __get_user(set.sig[1], &frame->extramask[0])) - goto badframe; - - set_current_blocked(&set); - - /* - * x86_32 has no uc_flags bits relevant to restore_sigcontext. - * Save a few cycles by skipping the __get_user. 
- */ - if (!restore_sigcontext(regs, &frame->sc, 0)) - goto badframe; - return regs->ax; - -badframe: - signal_fault(regs, frame, "sigreturn"); - - return 0; -} -#endif /* CONFIG_X86_32 */ - -SYSCALL_DEFINE0(rt_sigreturn) -{ - struct pt_regs *regs = current_pt_regs(); - struct rt_sigframe __user *frame; - sigset_t set; - unsigned long uc_flags; - - frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); - if (!access_ok(frame, sizeof(*frame))) - goto badframe; - if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) - goto badframe; - if (__get_user(uc_flags, &frame->uc.uc_flags)) - goto badframe; - - set_current_blocked(&set); - - if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) - goto badframe; - - if (restore_altstack(&frame->uc.uc_stack)) - goto badframe; - - return regs->ax; - -badframe: - signal_fault(regs, frame, "rt_sigreturn"); - return 0; -} - /* * There are four different struct types for signal frame: sigframe_ia32, * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case @@ -743,43 +201,22 @@ unsigned long get_sigframe_size(void) return max_frame_size; } -static inline int is_ia32_compat_frame(struct ksignal *ksig) -{ - return IS_ENABLED(CONFIG_IA32_EMULATION) && - ksig->ka.sa.sa_flags & SA_IA32_ABI; -} - -static inline int is_ia32_frame(struct ksignal *ksig) -{ - return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig); -} - -static inline int is_x32_frame(struct ksignal *ksig) -{ - return IS_ENABLED(CONFIG_X86_X32_ABI) && - ksig->ka.sa.sa_flags & SA_X32_ABI; -} - static int setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { - int usig = ksig->sig; - sigset_t *set = sigmask_to_save(); - compat_sigset_t *cset = (compat_sigset_t *) set; - /* Perform fixup for the pre-signal frame. 
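get_sigframe_size() above reports the worst-case signal frame, and on x86 that same value is what the kernel exposes to userspace through the AT_MINSIGSTKSZ auxiliary vector entry. A rough userspace sketch (glibc names; the AT_MINSIGSTKSZ fallback define is only there in case older libc headers lack it) of sizing an alternate signal stack from it:

#include <signal.h>
#include <stdlib.h>
#include <sys/auxv.h>

#ifndef AT_MINSIGSTKSZ
#define AT_MINSIGSTKSZ 51       /* kernel uapi value, for old libc headers */
#endif

static int setup_altstack(void)
{
        /* Reads as 0 on kernels that do not provide the auxv entry. */
        unsigned long min = getauxval(AT_MINSIGSTKSZ);
        size_t sz = min > (unsigned long)SIGSTKSZ ? min : (size_t)SIGSTKSZ;
        stack_t ss;

        ss.ss_sp = malloc(sz);
        if (!ss.ss_sp)
                return -1;
        ss.ss_size = sz;
        ss.ss_flags = 0;
        return sigaltstack(&ss, NULL);
}

int main(void)
{
        return setup_altstack() ? 1 : 0;
}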
*/ rseq_signal_deliver(ksig, regs); /* Set up the stack frame */ if (is_ia32_frame(ksig)) { if (ksig->ka.sa.sa_flags & SA_SIGINFO) - return ia32_setup_rt_frame(usig, ksig, cset, regs); + return ia32_setup_rt_frame(ksig, regs); else - return ia32_setup_frame(usig, ksig, cset, regs); + return ia32_setup_frame(ksig, regs); } else if (is_x32_frame(ksig)) { - return x32_setup_rt_frame(ksig, cset, regs); + return x32_setup_rt_frame(ksig, regs); } else { - return __setup_rt_frame(ksig->sig, ksig, set, regs); + return x64_setup_rt_frame(ksig, regs); } } @@ -969,36 +406,3 @@ bool sigaltstack_size_valid(size_t ss_size) return true; } #endif /* CONFIG_DYNAMIC_SIGFRAME */ - -#ifdef CONFIG_X86_X32_ABI -COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) -{ - struct pt_regs *regs = current_pt_regs(); - struct rt_sigframe_x32 __user *frame; - sigset_t set; - unsigned long uc_flags; - - frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); - - if (!access_ok(frame, sizeof(*frame))) - goto badframe; - if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) - goto badframe; - if (__get_user(uc_flags, &frame->uc.uc_flags)) - goto badframe; - - set_current_blocked(&set); - - if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) - goto badframe; - - if (compat_restore_altstack(&frame->uc.uc_stack)) - goto badframe; - - return regs->ax; - -badframe: - signal_fault(regs, frame, "x32 rt_sigreturn"); - return 0; -} -#endif diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/kernel/signal_32.c index c9c3859322fa..2553136cf39b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/kernel/signal_32.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/arch/x86_64/ia32/ia32_signal.c - * * Copyright (C) 1991, 1992 Linus Torvalds * * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson @@ -26,7 +24,6 @@ #include <linux/uaccess.h> #include <asm/fpu/signal.h> #include <asm/ptrace.h> -#include <asm/ia32_unistd.h> #include <asm/user32.h> #include <uapi/asm/sigcontext.h> #include <asm/proto.h> @@ -35,6 +32,9 @@ #include <asm/sighandling.h> #include <asm/smap.h> +#ifdef CONFIG_IA32_EMULATION +#include <asm/ia32_unistd.h> + static inline void reload_segments(struct sigcontext_32 *sc) { unsigned int cur; @@ -53,6 +53,21 @@ static inline void reload_segments(struct sigcontext_32 *sc) loadsegment(es, sc->es | 0x03); } +#define sigset32_t compat_sigset_t +#define restore_altstack32 compat_restore_altstack +#define unsafe_save_altstack32 unsafe_compat_save_altstack + +#else + +#define sigset32_t sigset_t +#define __NR_ia32_sigreturn __NR_sigreturn +#define __NR_ia32_rt_sigreturn __NR_rt_sigreturn +#define restore_altstack32 restore_altstack +#define unsafe_save_altstack32 unsafe_save_altstack +#define __copy_siginfo_to_user32 copy_siginfo_to_user + +#endif + /* * Do a signal return; undo the signal stack. */ @@ -86,6 +101,7 @@ static bool ia32_restore_sigcontext(struct pt_regs *regs, /* disable syscall checks */ regs->orig_ax = -1; +#ifdef CONFIG_IA32_EMULATION /* * Reload fs and gs if they have changed in the signal * handler. This does not handle long fs/gs base changes in @@ -93,10 +109,17 @@ static bool ia32_restore_sigcontext(struct pt_regs *regs, * normal case. 
*/ reload_segments(&sc); +#else + loadsegment(gs, sc.gs); + regs->fs = sc.fs; + regs->es = sc.es; + regs->ds = sc.ds; +#endif + return fpu__restore_sig(compat_ptr(sc.fpstate), 1); } -COMPAT_SYSCALL_DEFINE0(sigreturn) +SYSCALL32_DEFINE0(sigreturn) { struct pt_regs *regs = current_pt_regs(); struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); @@ -119,7 +142,7 @@ badframe: return 0; } -COMPAT_SYSCALL_DEFINE0(rt_sigreturn) +SYSCALL32_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_ia32 __user *frame; @@ -129,7 +152,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) + if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; set_current_blocked(&set); @@ -137,7 +160,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) if (!ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; - if (compat_restore_altstack(&frame->uc.uc_stack)) + if (restore_altstack32(&frame->uc.uc_stack)) goto badframe; return regs->ax; @@ -159,9 +182,15 @@ __unsafe_setup_sigcontext32(struct sigcontext_32 __user *sc, struct pt_regs *regs, unsigned int mask) { unsafe_put_user(get_user_seg(gs), (unsigned int __user *)&sc->gs, Efault); +#ifdef CONFIG_IA32_EMULATION unsafe_put_user(get_user_seg(fs), (unsigned int __user *)&sc->fs, Efault); unsafe_put_user(get_user_seg(ds), (unsigned int __user *)&sc->ds, Efault); unsafe_put_user(get_user_seg(es), (unsigned int __user *)&sc->es, Efault); +#else + unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); + unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); + unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); +#endif unsafe_put_user(regs->di, &sc->di, Efault); unsafe_put_user(regs->si, &sc->si, Efault); @@ -196,43 +225,9 @@ do { \ goto label; \ } while(0) -/* - * Determine which stack to use.. - */ -static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, - size_t frame_size, - void __user **fpstate) -{ - unsigned long sp, fx_aligned, math_size; - - /* Default to using normal stack */ - sp = regs->sp; - - /* This is the X/Open sanctioned signal stack switching. */ - if (ksig->ka.sa.sa_flags & SA_ONSTACK) - sp = sigsp(sp, ksig); - /* This is the legacy signal stack switching. */ - else if (regs->ss != __USER32_DS && - !(ksig->ka.sa.sa_flags & SA_RESTORER) && - ksig->ka.sa.sa_restorer) - sp = (unsigned long) ksig->ka.sa.sa_restorer; - - sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); - *fpstate = (struct _fpstate_32 __user *) sp; - if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, - math_size)) - return (void __user *) -1L; - - sp -= frame_size; - /* Align the stack pointer according to the i386 ABI, - * i.e. so that on function entry ((sp + 4) & 15) == 0. 
*/ - sp = ((sp + 4) & -16ul) - 4; - return (void __user *) sp; -} - -int ia32_setup_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs) +int ia32_setup_frame(struct ksignal *ksig, struct pt_regs *regs) { + sigset32_t *set = (sigset32_t *) sigmask_to_save(); struct sigframe_ia32 __user *frame; void __user *restorer; void __user *fp = NULL; @@ -264,7 +259,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; - unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_user(ksig->sig, &frame->sig, Efault); unsafe_put_sigcontext32(&frame->sc, fp, regs, set, Efault); unsafe_put_user(set->sig[1], &frame->extramask[0], Efault); unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault); @@ -280,15 +275,20 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, regs->ip = (unsigned long) ksig->ka.sa.sa_handler; /* Make -mregparm=3 work */ - regs->ax = sig; + regs->ax = ksig->sig; regs->dx = 0; regs->cx = 0; - loadsegment(ds, __USER32_DS); - loadsegment(es, __USER32_DS); +#ifdef CONFIG_IA32_EMULATION + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#else + regs->ds = __USER_DS; + regs->es = __USER_DS; +#endif regs->cs = __USER32_CS; - regs->ss = __USER32_DS; + regs->ss = __USER_DS; return 0; Efault: @@ -296,9 +296,9 @@ Efault: return -EFAULT; } -int ia32_setup_rt_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs) +int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { + sigset32_t *set = (sigset32_t *) sigmask_to_save(); struct rt_sigframe_ia32 __user *frame; void __user *restorer; void __user *fp = NULL; @@ -321,7 +321,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; - unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_user(ksig->sig, &frame->sig, Efault); unsafe_put_user(ptr_to_compat(&frame->info), &frame->pinfo, Efault); unsafe_put_user(ptr_to_compat(&frame->uc), &frame->puc, Efault); @@ -331,7 +331,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, else unsafe_put_user(0, &frame->uc.uc_flags, Efault); unsafe_put_user(0, &frame->uc.uc_link, Efault); - unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + unsafe_save_altstack32(&frame->uc.uc_stack, regs->sp, Efault); if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; @@ -357,15 +357,20 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, regs->ip = (unsigned long) ksig->ka.sa.sa_handler; /* Make -mregparm=3 work */ - regs->ax = sig; + regs->ax = ksig->sig; regs->dx = (unsigned long) &frame->info; regs->cx = (unsigned long) &frame->uc; - loadsegment(ds, __USER32_DS); - loadsegment(es, __USER32_DS); +#ifdef CONFIG_IA32_EMULATION + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#else + regs->ds = __USER_DS; + regs->es = __USER_DS; +#endif regs->cs = __USER32_CS; - regs->ss = __USER32_DS; + regs->ss = __USER_DS; return 0; Efault: diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c new file mode 100644 index 000000000000..ff9c55064223 --- /dev/null +++ b/arch/x86/kernel/signal_64.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/unistd.h> +#include <linux/uaccess.h> +#include <linux/syscalls.h> + 
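ia32_setup_frame()/ia32_setup_rt_frame() above hand the handler its arguments per the 32-bit conventions (signal number in %eax, siginfo/ucontext pointers in %edx/%ecx for the -mregparm=3 case), while the x86-64 path added below uses %rdi/%rsi/%rdx; either way, userspace sees the usual three-argument SA_SIGINFO prototype. A minimal sketch, assuming glibc's x86-64 <ucontext.h> layout (REG_RIP requires _GNU_SOURCE):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <ucontext.h>

static void handler(int sig, siginfo_t *info, void *uc_void)
{
        ucontext_t *uc = uc_void;

        /* REG_RIP indexes gregs[] in glibc's x86-64 ucontext; printf()
         * is kept only for brevity and is not async-signal-safe. */
        printf("sig %d from pid %d, interrupted at %#llx\n", sig,
               (int)info->si_pid,
               (unsigned long long)uc->uc_mcontext.gregs[REG_RIP]);
}

int main(void)
{
        struct sigaction sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction = handler;
        sa.sa_flags = SA_SIGINFO;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGUSR1, &sa, NULL);
        raise(SIGUSR1);
        return 0;
}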
+#include <asm/ucontext.h> +#include <asm/fpu/signal.h> +#include <asm/sighandling.h> + +#include <asm/syscall.h> +#include <asm/sigframe.h> +#include <asm/signal.h> + +/* + * If regs->ss will cause an IRET fault, change it. Otherwise leave it + * alone. Using this generally makes no sense unless + * user_64bit_mode(regs) would return true. + */ +static void force_valid_ss(struct pt_regs *regs) +{ + u32 ar; + asm volatile ("lar %[old_ss], %[ar]\n\t" + "jz 1f\n\t" /* If invalid: */ + "xorl %[ar], %[ar]\n\t" /* set ar = 0 */ + "1:" + : [ar] "=r" (ar) + : [old_ss] "rm" ((u16)regs->ss)); + + /* + * For a valid 64-bit user context, we need DPL 3, type + * read-write data or read-write exp-down data, and S and P + * set. We can't use VERW because VERW doesn't check the + * P bit. + */ + ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK; + if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) && + ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) + regs->ss = __USER_DS; +} + +static bool restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *usc, + unsigned long uc_flags) +{ + struct sigcontext sc; + + /* Always make any pending restarted system calls return -EINTR */ + current->restart_block.fn = do_no_restart_syscall; + + if (copy_from_user(&sc, usc, offsetof(struct sigcontext, reserved1))) + return false; + + regs->bx = sc.bx; + regs->cx = sc.cx; + regs->dx = sc.dx; + regs->si = sc.si; + regs->di = sc.di; + regs->bp = sc.bp; + regs->ax = sc.ax; + regs->sp = sc.sp; + regs->ip = sc.ip; + regs->r8 = sc.r8; + regs->r9 = sc.r9; + regs->r10 = sc.r10; + regs->r11 = sc.r11; + regs->r12 = sc.r12; + regs->r13 = sc.r13; + regs->r14 = sc.r14; + regs->r15 = sc.r15; + + /* Get CS/SS and force CPL3 */ + regs->cs = sc.cs | 0x03; + regs->ss = sc.ss | 0x03; + + regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); + /* disable syscall checks */ + regs->orig_ax = -1; + + /* + * Fix up SS if needed for the benefit of old DOSEMU and + * CRIU. 
+ */ + if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) + force_valid_ss(regs); + + return fpu__restore_sig((void __user *)sc.fpstate, 0); +} + +static __always_inline int +__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, + struct pt_regs *regs, unsigned long mask) +{ + unsafe_put_user(regs->di, &sc->di, Efault); + unsafe_put_user(regs->si, &sc->si, Efault); + unsafe_put_user(regs->bp, &sc->bp, Efault); + unsafe_put_user(regs->sp, &sc->sp, Efault); + unsafe_put_user(regs->bx, &sc->bx, Efault); + unsafe_put_user(regs->dx, &sc->dx, Efault); + unsafe_put_user(regs->cx, &sc->cx, Efault); + unsafe_put_user(regs->ax, &sc->ax, Efault); + unsafe_put_user(regs->r8, &sc->r8, Efault); + unsafe_put_user(regs->r9, &sc->r9, Efault); + unsafe_put_user(regs->r10, &sc->r10, Efault); + unsafe_put_user(regs->r11, &sc->r11, Efault); + unsafe_put_user(regs->r12, &sc->r12, Efault); + unsafe_put_user(regs->r13, &sc->r13, Efault); + unsafe_put_user(regs->r14, &sc->r14, Efault); + unsafe_put_user(regs->r15, &sc->r15, Efault); + + unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); + unsafe_put_user(current->thread.error_code, &sc->err, Efault); + unsafe_put_user(regs->ip, &sc->ip, Efault); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->cs, &sc->cs, Efault); + unsafe_put_user(0, &sc->gs, Efault); + unsafe_put_user(0, &sc->fs, Efault); + unsafe_put_user(regs->ss, &sc->ss, Efault); + + unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault); + + /* non-iBCS2 extensions.. */ + unsafe_put_user(mask, &sc->oldmask, Efault); + unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); + return 0; +Efault: + return -EFAULT; +} + +#define unsafe_put_sigcontext(sc, fp, regs, set, label) \ +do { \ + if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \ + goto label; \ +} while(0); + +#define unsafe_put_sigmask(set, frame, label) \ + unsafe_put_user(*(__u64 *)(set), \ + (__u64 __user *)&(frame)->uc.uc_sigmask, \ + label) + +static unsigned long frame_uc_flags(struct pt_regs *regs) +{ + unsigned long flags; + + if (boot_cpu_has(X86_FEATURE_XSAVE)) + flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; + else + flags = UC_SIGCONTEXT_SS; + + if (likely(user_64bit_mode(regs))) + flags |= UC_STRICT_RESTORE_SS; + + return flags; +} + +int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) +{ + sigset_t *set = sigmask_to_save(); + struct rt_sigframe __user *frame; + void __user *fp = NULL; + unsigned long uc_flags; + + /* x86-64 should always use SA_RESTORER. */ + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) + return -EFAULT; + + frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp); + uc_flags = frame_uc_flags(regs); + + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; + + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + + /* Set up to return from userspace. If provided, use a stub + already in userspace. 
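__unsafe_setup_sigcontext() above is the usual batched user-access idiom: one user_access_begin()/user_access_end() pair around a run of unsafe_put_user() stores that all bail out through a single fault label. A stripped-down sketch of the same pattern; struct demo_pair and put_demo_pair() are made up purely for illustration:

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/uaccess.h>

struct demo_pair {
        u32 a;
        u32 b;
};

static int put_demo_pair(struct demo_pair __user *p, u32 a, u32 b)
{
        if (!user_access_begin(p, sizeof(*p)))
                return -EFAULT;

        /* No per-store error checks: a faulting store jumps to Efault,
         * which still has to close the user-access window. */
        unsafe_put_user(a, &p->a, Efault);
        unsafe_put_user(b, &p->b, Efault);

        user_access_end();
        return 0;

Efault:
        user_access_end();
        return -EFAULT;
}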
*/ + unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); + + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user(&frame->info, &ksig->info)) + return -EFAULT; + } + + /* Set up registers for signal handler */ + regs->di = ksig->sig; + /* In case the signal handler was declared without prototypes */ + regs->ax = 0; + + /* This also works for non SA_SIGINFO handlers because they expect the + next argument after the signal number on the stack. */ + regs->si = (unsigned long)&frame->info; + regs->dx = (unsigned long)&frame->uc; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + regs->sp = (unsigned long)frame; + + /* + * Set up the CS and SS registers to run signal handlers in + * 64-bit mode, even if the handler happens to be interrupting + * 32-bit or 16-bit code. + * + * SS is subtle. In 64-bit mode, we don't need any particular + * SS descriptor, but we do need SS to be valid. It's possible + * that the old SS is entirely bogus -- this can happen if the + * signal we're trying to deliver is #GP or #SS caused by a bad + * SS value. We also have a compatibility issue here: DOSEMU + * relies on the contents of the SS register indicating the + * SS value at the time of the signal, even though that code in + * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU + * avoids relying on sigreturn to restore SS; instead it uses + * a trampoline.) So we do our best: if the old SS was valid, + * we keep it. Otherwise we replace it. + */ + regs->cs = __USER_CS; + + if (unlikely(regs->ss != __USER_DS)) + force_valid_ss(regs); + + return 0; + +Efault: + user_access_end(); + return -EFAULT; +} + +/* + * Do a signal return; undo the signal stack. 
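Because rt_sigreturn() below reloads register state from the very ucontext that x64_setup_rt_frame() above handed to the handler, a handler may legitimately edit that context before returning. A hedged userspace fragment (glibc x86-64 names; the two-byte instruction length is an assumption for the example, real code would decode the instruction at REG_RIP instead):

#define _GNU_SOURCE
#include <signal.h>
#include <ucontext.h>

/* Hypothetical handler: skip over the instruction that raised the
 * signal, assuming it is two bytes long. */
static void skip_faulting_insn(int sig, siginfo_t *info, void *uc_void)
{
        ucontext_t *uc = uc_void;

        (void)sig;
        (void)info;
        uc->uc_mcontext.gregs[REG_RIP] += 2;
}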
+ */ +SYSCALL_DEFINE0(rt_sigreturn) +{ + struct pt_regs *regs = current_pt_regs(); + struct rt_sigframe __user *frame; + sigset_t set; + unsigned long uc_flags; + + frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); + if (!access_ok(frame, sizeof(*frame))) + goto badframe; + if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) + goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; + + set_current_blocked(&set); + + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + goto badframe; + + if (restore_altstack(&frame->uc.uc_stack)) + goto badframe; + + return regs->ax; + +badframe: + signal_fault(regs, frame, "rt_sigreturn"); + return 0; +} + +#ifdef CONFIG_X86_X32_ABI +static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct compat_siginfo new; + + copy_siginfo_to_external32(&new, from); + if (from->si_signo == SIGCHLD) { + new._sifields._sigchld_x32._utime = from->si_utime; + new._sifields._sigchld_x32._stime = from->si_stime; + } + if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) + return -EFAULT; + return 0; +} + +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + if (in_x32_syscall()) + return x32_copy_siginfo_to_user(to, from); + return __copy_siginfo_to_user32(to, from); +} + +int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) +{ + compat_sigset_t *set = (compat_sigset_t *) sigmask_to_save(); + struct rt_sigframe_x32 __user *frame; + unsigned long uc_flags; + void __user *restorer; + void __user *fp = NULL; + + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) + return -EFAULT; + + frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); + + uc_flags = frame_uc_flags(regs); + + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; + + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + unsafe_put_user(0, &frame->uc.uc__pad0, Efault); + restorer = ksig->ka.sa.sa_restorer; + unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); + + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (x32_copy_siginfo_to_user(&frame->info, &ksig->info)) + return -EFAULT; + } + + /* Set up registers for signal handler */ + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + /* We use the x32 calling convention here... 
*/ + regs->di = ksig->sig; + regs->si = (unsigned long) &frame->info; + regs->dx = (unsigned long) &frame->uc; + + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); + + regs->cs = __USER_CS; + regs->ss = __USER_DS; + + return 0; + +Efault: + user_access_end(); + return -EFAULT; +} + +COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) +{ + struct pt_regs *regs = current_pt_regs(); + struct rt_sigframe_x32 __user *frame; + sigset_t set; + unsigned long uc_flags; + + frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); + + if (!access_ok(frame, sizeof(*frame))) + goto badframe; + if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) + goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; + + set_current_blocked(&set); + + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + goto badframe; + + if (compat_restore_altstack(&frame->uc.uc_stack)) + goto badframe; + + return regs->ax; + +badframe: + signal_fault(regs, frame, "x32 rt_sigreturn"); + return 0; +} +#endif /* CONFIG_X86_X32_ABI */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3f3ea0287f69..55cad72715d9 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -56,8 +56,10 @@ #include <linux/numa.h> #include <linux/pgtable.h> #include <linux/overflow.h> +#include <linux/stackprotector.h> #include <asm/acpi.h> +#include <asm/cacheinfo.h> #include <asm/desc.h> #include <asm/nmi.h> #include <asm/irq.h> @@ -1046,7 +1048,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) /* Just in case we booted with a single CPU. */ alternatives_enable_smp(); - per_cpu(current_task, cpu) = idle; + per_cpu(pcpu_hot.current_task, cpu) = idle; cpu_init_stack_canary(cpu, idle); /* Initialize the interrupt stack(s) */ @@ -1056,7 +1058,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ - per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); + per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle); #else initial_gs = per_cpu_offset(cpu); #endif @@ -1428,8 +1430,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) uv_system_init(); - set_mtrr_aps_delayed_init(); - smp_quirk_init_udelay(); speculative_store_bypass_ht_init(); @@ -1439,12 +1439,12 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) void arch_thaw_secondary_cpus_begin(void) { - set_mtrr_aps_delayed_init(); + set_cache_aps_delayed_init(true); } void arch_thaw_secondary_cpus_end(void) { - mtrr_aps_init(); + cache_aps_init(); } /* @@ -1453,7 +1453,11 @@ void arch_thaw_secondary_cpus_end(void) void __init native_smp_prepare_boot_cpu(void) { int me = smp_processor_id(); - switch_to_new_gdt(me); + + /* SMP handles this from setup_per_cpu_areas() */ + if (!IS_ENABLED(CONFIG_SMP)) + switch_gdt_and_percpu_base(me); + /* already set me in cpu_online_mask in boot_cpu_init() */ cpumask_set_cpu(me, cpu_callout_mask); cpu_set_state_online(me); @@ -1487,7 +1491,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) nmi_selftest(); impress_friends(); - mtrr_aps_init(); + cache_aps_init(); } static int __initdata setup_possible_cpus = -1; diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index aaaba85d6d7f..2ebc338980bc 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -34,6 +34,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, switch (type) { case CALL: + func = 
callthunks_translate_call_dest(func); code = text_gen_insn(CALL_INSN_OPCODE, insn, func); if (func == &__static_call_return0) { emulate = code; @@ -52,7 +53,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, case RET: if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) - code = text_gen_insn(JMP32_INSN_OPCODE, insn, &__x86_return_thunk); + code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk); else code = &retinsn; break; diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 8617d1ed9d31..1b83377274b8 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -106,7 +106,7 @@ int arch_register_cpu(int num) * Xen PV guests don't support CPU0 hotplug at all. */ if (c->x86_vendor != X86_VENDOR_INTEL || - boot_cpu_has(X86_FEATURE_XENPV)) + cpu_feature_enabled(X86_FEATURE_XENPV)) cpu0_hotpluggable = 0; /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 178015a820f0..d317dc3d06a3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -15,6 +15,7 @@ #include <linux/context_tracking.h> #include <linux/interrupt.h> #include <linux/kallsyms.h> +#include <linux/kmsan.h> #include <linux/spinlock.h> #include <linux/kprobes.h> #include <linux/uaccess.h> @@ -67,13 +68,13 @@ #ifdef CONFIG_X86_64 #include <asm/x86_init.h> -#include <asm/proto.h> #else #include <asm/processor-flags.h> #include <asm/setup.h> -#include <asm/proto.h> #endif +#include <asm/proto.h> + DECLARE_BITMAP(system_vectors, NR_VECTORS); static inline void cond_local_irq_enable(struct pt_regs *regs) @@ -301,6 +302,12 @@ static noinstr bool handle_bug(struct pt_regs *regs) { bool handled = false; + /* + * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() + * is a rare case that uses @regs without passing them to + * irqentry_enter(). 
+ */ + kmsan_unpoison_entry_regs(regs); if (!is_valid_bugaddr(regs->ip)) return handled; @@ -851,7 +858,7 @@ DEFINE_IDTENTRY_RAW(exc_int3) */ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) { - struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; + struct pt_regs *regs = (struct pt_regs *)this_cpu_read(pcpu_hot.top_of_stack) - 1; if (regs != eregs) *regs = *eregs; return regs; @@ -869,7 +876,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r * trust it and switch to the current kernel stack */ if (ip_within_syscall_gap(regs)) { - sp = this_cpu_read(cpu_current_top_of_stack); + sp = this_cpu_read(pcpu_hot.top_of_stack); goto sync; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cafacb2e58cc..a78e73da4a74 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -51,7 +51,7 @@ int tsc_clocksource_reliable; static u32 art_to_tsc_numerator; static u32 art_to_tsc_denominator; static u64 art_to_tsc_offset; -struct clocksource *art_related_clocksource; +static struct clocksource *art_related_clocksource; struct cyc2ns { struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index c059820dfaea..cdf6c6060170 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -136,6 +136,21 @@ static struct orc_entry null_orc_entry = { .type = UNWIND_HINT_TYPE_CALL }; +#ifdef CONFIG_CALL_THUNKS +static struct orc_entry *orc_callthunk_find(unsigned long ip) +{ + if (!is_callthunk((void *)ip)) + return NULL; + + return &null_orc_entry; +} +#else +static struct orc_entry *orc_callthunk_find(unsigned long ip) +{ + return NULL; +} +#endif + /* Fake frame pointer entry -- used as a fallback for generated code */ static struct orc_entry orc_fp_entry = { .type = UNWIND_HINT_TYPE_CALL, @@ -189,7 +204,11 @@ static struct orc_entry *orc_find(unsigned long ip) if (orc) return orc; - return orc_ftrace_find(ip); + orc = orc_ftrace_find(ip); + if (orc) + return orc; + + return orc_callthunk_find(ip); } #ifdef CONFIG_MODULES diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index b63cf8f7745e..6c07f6daaa22 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -722,8 +722,9 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) switch (opc1) { case 0xeb: /* jmp 8 */ case 0xe9: /* jmp 32 */ - case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ break; + case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ + goto setup; case 0xe8: /* call relative */ branch_clear_offset(auprobe, insn); @@ -753,6 +754,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) return -ENOTSUPP; } +setup: auprobe->branch.opc1 = opc1; auprobe->branch.ilen = insn->length; auprobe->branch.offs = insn->immediate.value; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 15f29053cec4..2e0ee14229bf 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -132,18 +132,19 @@ SECTIONS CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT - ALIGN_ENTRY_TEXT_BEGIN - ENTRY_TEXT - ALIGN_ENTRY_TEXT_END SOFTIRQENTRY_TEXT - STATIC_CALL_TEXT - *(.gnu.warning) - #ifdef CONFIG_RETPOLINE __indirect_thunk_start = .; *(.text.__x86.*) __indirect_thunk_end = .; #endif + STATIC_CALL_TEXT + + ALIGN_ENTRY_TEXT_BEGIN + ENTRY_TEXT + ALIGN_ENTRY_TEXT_END + *(.gnu.warning) + } :text =0xcccc /* End of text section, which should 
occupy whole number of pages */ @@ -290,6 +291,13 @@ SECTIONS *(.return_sites) __return_sites_end = .; } + + . = ALIGN(8); + .call_sites : AT(ADDR(.call_sites) - LOAD_OFFSET) { + __call_sites = .; + *(.call_sites) + __call_sites_end = .; + } #endif #ifdef CONFIG_X86_KERNEL_IBT @@ -301,6 +309,15 @@ SECTIONS } #endif +#ifdef CONFIG_FINEIBT + . = ALIGN(8); + .cfi_sites : AT(ADDR(.cfi_sites) - LOAD_OFFSET) { + __cfi_sites = .; + *(.cfi_sites) + __cfi_sites_end = .; + } +#endif + /* * struct alt_inst entries. From the header (alternative.h): * "Alternative instructions for different CPU types or capabilities" @@ -493,11 +510,3 @@ INIT_PER_CPU(irq_stack_backing_store); #endif #endif /* CONFIG_X86_64 */ - -#ifdef CONFIG_KEXEC_CORE -#include <asm/kexec.h> - -. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big"); -#endif - diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 57353519bc11..ef80d361b463 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -25,6 +25,7 @@ #include <asm/iommu.h> #include <asm/mach_traps.h> #include <asm/irqdomain.h> +#include <asm/realmode.h> void x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -145,6 +146,8 @@ struct x86_platform_ops x86_platform __ro_after_init = { .get_nmi_reason = default_get_nmi_reason, .save_sched_clock_state = tsc_save_sched_clock_state, .restore_sched_clock_state = tsc_restore_sched_clock_state, + .realmode_reserve = reserve_real_mode, + .realmode_init = init_real_mode, .hyper.pin_vcpu = x86_op_int_noop, .guest = { diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index fbeaa9ddef59..8e578311ca9d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -49,6 +49,7 @@ config KVM select SRCU select INTERVAL_TREE select HAVE_KVM_PM_NOTIFIER if PM + select KVM_GENERIC_HARDWARE_ENABLING help Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c3f084185daf..2a9f1e200dbc 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -8,6 +8,7 @@ * Copyright 2011 Red Hat, Inc. and/or its affiliates. * Copyright IBM Corporation, 2008 */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/export.h> @@ -676,7 +677,7 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX, - SF(SGX1) | SF(SGX2) + SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA) ); kvm_cpu_cap_mask(CPUID_8000_0001_ECX, @@ -774,16 +775,22 @@ struct kvm_cpuid_array { int nent; }; +static struct kvm_cpuid_entry2 *get_next_cpuid(struct kvm_cpuid_array *array) +{ + if (array->nent >= array->maxnent) + return NULL; + + return &array->entries[array->nent++]; +} + static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, u32 function, u32 index) { - struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid_entry2 *entry = get_next_cpuid(array); - if (array->nent >= array->maxnent) + if (!entry) return NULL; - entry = &array->entries[array->nent++]; - memset(entry, 0, sizeof(*entry)); entry->function = function; entry->index = index; @@ -960,22 +967,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = edx.full; break; } - /* - * Per Intel's SDM, the 0x1f is a superset of 0xb, - * thus they can be handled by common code. 
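The cpuid.c change in this hunk stops KVM_GET_SUPPORTED_CPUID from reporting host topology for leaves 0xB/0x1F (EAX/EBX/ECX of subleaf 0 are zeroed; 0x8000001E gets the same treatment further down). One way for a guest or VMM to notice is the zero level type in ECX[15:8] of subleaf 0; a small probe sketch using GCC's <cpuid.h> builtins:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

        if (!__get_cpuid_count(0x0b, 0, &eax, &ebx, &ecx, &edx)) {
                puts("CPUID leaf 0xB not available");
                return 1;
        }

        /* Level type 0 in ECX[15:8] means "invalid": nothing enumerated. */
        if (((ecx >> 8) & 0xff) == 0)
                puts("no x2APIC topology enumerated");
        else
                printf("level 0 type %u, %u logical CPUs at this level\n",
                       (ecx >> 8) & 0xff, ebx & 0xffff);
        return 0;
}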
- */ case 0x1f: case 0xb: /* - * Populate entries until the level type (ECX[15:8]) of the - * previous entry is zero. Note, CPUID EAX.{0x1f,0xb}.0 is - * the starting entry, filled by the primary do_host_cpuid(). + * No topology; a valid topology is indicated by the presence + * of subleaf 1. */ - for (i = 1; entry->ecx & 0xff00; ++i) { - entry = do_host_cpuid(array, function, i); - if (!entry) - goto out; - } + entry->eax = entry->ebx = entry->ecx = 0; break; case 0xd: { u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm(); @@ -1062,9 +1060,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * userspace. ATTRIBUTES.XFRM is not adjusted as userspace is * expected to derive it from supported XCR0. */ - entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | - SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY | - SGX_ATTR_KSS; + entry->eax &= SGX_ATTR_PRIV_MASK | SGX_ATTR_UNPRIV_MASK; entry->ebx &= 0; break; /* Intel PT */ @@ -1208,6 +1204,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ebx = entry->ecx = entry->edx = 0; break; case 0x8000001e: + /* Do not return host topology information. */ + entry->eax = entry->ebx = entry->ecx = 0; + entry->edx = 0; /* reserved */ break; case 0x8000001F: if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) { diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index c1390357126a..ee8c4c3496ed 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -4,6 +4,8 @@ * * Copyright 2016 Red Hat, Inc. and/or its affiliates. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kvm_host.h> #include <linux/debugfs.h> #include "lapic.h" diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5cc3efa0e21c..c3443045cd93 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -17,6 +17,7 @@ * * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "kvm_cache_regs.h" diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 80d082ecd5c5..71aff0edc0ed 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -17,6 +17,7 @@ * Ben-Ami Yassour <benami@il.ibm.com> * Andrey Smetanin <asmetanin@virtuozzo.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "x86.h" #include "lapic.h" diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index e0a7a0e7a73c..cd57a517d04a 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -30,7 +30,7 @@ * Based on QEMU and Xen. */ -#define pr_fmt(fmt) "pit: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/slab.h> @@ -351,7 +351,7 @@ static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period) if (ps->period < min_period) { pr_info_ratelimited( - "kvm: requested %lld ns " + "requested %lld ns " "i8254 timer period limited to %lld ns\n", ps->period, min_period); ps->period = min_period; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index e1bb6218bb96..4756bcb5724f 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -26,6 +26,8 @@ * Yaozu (Eddie) Dong <Eddie.dong@intel.com> * Port from Qemu. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mm.h> #include <linux/slab.h> #include <linux/bitops.h> @@ -35,7 +37,7 @@ #include "trace.h" #define pr_pic_unimpl(fmt, ...) 
\ - pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__) + pr_err_ratelimited("pic: " fmt, ## __VA_ARGS__) static void pic_irq_request(struct kvm *kvm, int level); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 765943d7cfa5..042dee556125 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -26,6 +26,7 @@ * Yaozu (Eddie) Dong <eddie.dong@intel.com> * Based on Xen 3.1 code. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/kvm.h> diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index a70952eca905..b2c397dd2bc6 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -7,6 +7,7 @@ * Authors: * Yaozu (Eddie) Dong <Eddie.dong@intel.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/export.h> #include <linux/kvm_host.h> diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 3742d9adacfc..16d076a1b91a 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -8,6 +8,7 @@ * * Copyright 2010 Red Hat, Inc. and/or its affiliates. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/slab.h> @@ -56,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, if (irq->dest_mode == APIC_DEST_PHYSICAL && irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { - printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); + pr_info("apic: phys broadcast and lowest prio\n"); irq->delivery_mode = APIC_DM_FIXED; } @@ -199,7 +200,7 @@ int kvm_request_irq_source_id(struct kvm *kvm) irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG); if (irq_source_id >= BITS_PER_LONG) { - printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n"); + pr_warn("exhausted allocatable IRQ sources!\n"); irq_source_id = -EFAULT; goto unlock; } @@ -221,7 +222,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) mutex_lock(&kvm->irq_lock); if (irq_source_id < 0 || irq_source_id >= BITS_PER_LONG) { - printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); + pr_err("IRQ source ID out of range!\n"); goto unlock; } clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c index ee4f696a0782..482d6639ef88 100644 --- a/arch/x86/kvm/kvm_onhyperv.c +++ b/arch/x86/kvm/kvm_onhyperv.c @@ -2,6 +2,7 @@ /* * KVM L1 hypervisor optimizations on Hyper-V. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <asm/mshyperv.h> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 669ea125b7e2..7cf4eebc9bcc 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -15,6 +15,7 @@ * * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation. 
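The pr_fmt() definitions being added across these KVM files all follow one pattern: define the prefix before any include so the default in <linux/printk.h> does not take effect, then drop the hand-rolled "kvm: " prefixes from individual call sites. A minimal sketch of the effect, assuming an object linked into kvm.ko (so KBUILD_MODNAME expands to "kvm"):

/* Must precede the printk include; printk.h only defines pr_fmt()
 * when it is not already defined. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/printk.h>

static void pr_fmt_demo(void)
{
        /* Logs "kvm: Nested Virtualization enabled" without the call
         * site having to spell out the prefix itself. */
        pr_info("Nested Virtualization enabled\n");
}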
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/kvm.h> @@ -1064,8 +1065,7 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm) { if (!kvm->arch.disabled_lapic_found) { kvm->arch.disabled_lapic_found = true; - printk(KERN_INFO - "Disabled LAPIC found during irq injection\n"); + pr_info("Disabled LAPIC found during irq injection\n"); } } @@ -1683,7 +1683,7 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic) if (apic->lapic_timer.period < min_period) { pr_info_ratelimited( - "kvm: vcpu %i: requested %lld ns " + "vcpu %i: requested %lld ns " "lapic timer period limited to %lld ns\n", apic->vcpu->vcpu_id, apic->lapic_timer.period, min_period); @@ -1968,7 +1968,7 @@ static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg) deadline = apic->lapic_timer.period; else if (unlikely(deadline > apic->lapic_timer.period)) { pr_info_ratelimited( - "kvm: vcpu %i: requested lapic timer restore with " + "vcpu %i: requested lapic timer restore with " "starting count register %#x=%u (%lld ns) > initial count (%lld ns). " "Using initial count to start timer.\n", apic->vcpu->vcpu_id, diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 15d389370f88..aeb240b339f5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -14,6 +14,7 @@ * Yaniv Kamay <yaniv@qumranet.com> * Avi Kivity <avi@qumranet.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "irq.h" #include "ioapic.h" @@ -2492,6 +2493,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, { bool list_unstable, zapped_root = false; + lockdep_assert_held_write(&kvm->mmu_lock); trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); @@ -3455,8 +3457,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } if (++retry_count > 4) { - printk_once(KERN_WARNING - "kvm: Fast #PF retrying more than 4 times.\n"); + pr_warn_once("Fast #PF retrying more than 4 times.\n"); break; } @@ -6166,7 +6167,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) write_lock(&kvm->mmu_lock); - kvm_mmu_invalidate_begin(kvm, gfn_start, gfn_end); + kvm_mmu_invalidate_begin(kvm, 0, -1ul); flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); @@ -6180,7 +6181,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end - gfn_start); - kvm_mmu_invalidate_end(kvm, gfn_start, gfn_end); + kvm_mmu_invalidate_end(kvm, 0, -1ul); write_unlock(&kvm->mmu_lock); } @@ -6646,7 +6647,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) * zap all shadow pages. */ if (unlikely(gen == 0)) { - kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); + kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_zap_all_fast(kvm); } } diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 2e09d1b6249f..0a2ac438d647 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -10,6 +10,7 @@ * Author: * Xiao Guangrong <guangrong.xiao@linux.intel.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/rculist.h> diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index c0fd7e049b4e..fce6f047399f 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -7,7 +7,7 @@ * Copyright (C) 2006 Qumranet, Inc. 
* Copyright 2020 Red Hat, Inc. and/or its affiliates. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "mmu.h" @@ -352,7 +352,7 @@ u64 mark_spte_for_access_track(u64 spte) WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT), - "kvm: Access Tracking saved bit locations are not zero\n"); + "Access Tracking saved bit locations are not zero\n"); spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 6f54dc9409c9..0d8deefee66c 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -435,11 +435,11 @@ static inline void check_spte_writable_invariants(u64 spte) { if (spte & shadow_mmu_writable_mask) WARN_ONCE(!(spte & shadow_host_writable_mask), - "kvm: MMU-writable SPTE is not Host-writable: %llx", + KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx", spte); else WARN_ONCE(is_writable_pte(spte), - "kvm: Writable SPTE is not MMU-writable: %llx", spte); + KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte); } static inline bool is_mmu_writable_spte(u64 spte) diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index 39b48e7d7d1a..e26e744df1d1 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "mmu_internal.h" #include "tdp_iter.h" diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index cc1fb9a65620..bba33aea0fb0 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "mmu.h" #include "mmu_internal.h" diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index a8502e02f479..9fac1ec03463 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -13,6 +13,7 @@ * Paolo Bonzini <pbonzini@redhat.com> * Xiao Guangrong <guangrong.xiao@linux.intel.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <asm/mtrr.h> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index eb594620dd75..d939d3b84e6f 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -9,6 +9,7 @@ * Gleb Natapov <gleb@redhat.com> * Wei Huang <wei@redhat.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/kvm_host.h> diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 25b9b51abb20..4945456fd646 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -37,6 +37,7 @@ enum kvm_only_cpuid_leafs { /* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). 
*/ #define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0) #define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1) +#define KVM_X86_FEATURE_SGX_EDECCSSA KVM_X86_FEATURE(CPUID_12_EAX, 11) /* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */ #define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4) @@ -102,6 +103,8 @@ static __always_inline u32 __feature_translate(int x86_feature) return KVM_X86_FEATURE_SGX1; else if (x86_feature == X86_FEATURE_SGX2) return KVM_X86_FEATURE_SGX2; + else if (x86_feature == X86_FEATURE_SGX_EDECCSSA) + return KVM_X86_FEATURE_SGX_EDECCSSA; else if (x86_feature == X86_FEATURE_CONSTANT_TSC) return KVM_X86_FEATURE_CONSTANT_TSC; diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index a9c1c2af8d94..cc43638d48a3 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "x86.h" diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 14677bc31b83..b3928150a37c 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -12,7 +12,7 @@ * Avi Kivity <avi@qumranet.com> */ -#define pr_fmt(fmt) "SVM: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_types.h> #include <linux/hashtable.h> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 34ac03969f28..700df66d23c7 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -12,7 +12,7 @@ * Avi Kivity <avi@qumranet.com> */ -#define pr_fmt(fmt) "SVM: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_types.h> #include <linux/kvm_host.h> @@ -138,15 +138,13 @@ void recalc_intercepts(struct vcpu_svm *svm) c->intercepts[i] = h->intercepts[i]; if (g->int_ctl & V_INTR_MASKING_MASK) { - /* We only want the cr8 intercept bits of L1 */ - vmcb_clr_intercept(c, INTERCEPT_CR8_READ); - vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE); - /* - * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not - * affect any interrupt we may want to inject; therefore, - * interrupt window vmexits are irrelevant to L0. + * Once running L2 with HF_VINTR_MASK, EFLAGS.IF and CR8 + * does not affect any interrupt we may want to inject; + * therefore, writes to CR8 are irrelevant to L0, as are + * interrupt window vmexits. */ + vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE); vmcb_clr_intercept(c, INTERCEPT_VINTR); } diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 0e313fbae055..1ff068f23841 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -9,6 +9,8 @@ * * Implementation is based on pmu_intel.c file */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 86d6897f4806..273cba809328 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -6,6 +6,7 @@ * * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_types.h> #include <linux/kvm_host.h> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f2453df77727..d13cf53e7390 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1,4 +1,4 @@ -#define pr_fmt(fmt) "SVM: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> @@ -519,21 +519,37 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu) vcpu->arch.osvw.status |= 1; } -static int has_svm(void) +static bool kvm_is_svm_supported(void) { + int cpu = raw_smp_processor_id(); const char *msg; + u64 vm_cr; if (!cpu_has_svm(&msg)) { - printk(KERN_INFO "has_svm: %s\n", msg); - return 0; + pr_err("SVM not supported by CPU %d, %s\n", cpu, msg); + return false; } if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { pr_info("KVM is unsupported when running as an SEV guest\n"); - return 0; + return false; } - return 1; + rdmsrl(MSR_VM_CR, vm_cr); + if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) { + pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu); + return false; + } + + return true; +} + +static int svm_check_processor_compat(void) +{ + if (!kvm_is_svm_supported()) + return -EIO; + + return 0; } void __svm_write_tsc_multiplier(u64 multiplier) @@ -572,10 +588,6 @@ static int svm_hardware_enable(void) if (efer & EFER_SVME) return -EBUSY; - if (!has_svm()) { - pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me); - return -EINVAL; - } sd = per_cpu_ptr(&svm_data, me); sd->asid_generation = 1; sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; @@ -2076,7 +2088,7 @@ static void svm_handle_mce(struct kvm_vcpu *vcpu) * Erratum 383 triggered. Guest state is corrupt so kill the * guest. */ - pr_err("KVM: Guest triggered AMD Erratum 383\n"); + pr_err("Guest triggered AMD Erratum 383\n"); kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); @@ -2705,9 +2717,9 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) msr->data = 0; switch (msr->index) { - case MSR_F10H_DECFG: - if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) - msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; + case MSR_AMD64_DE_CFG: + if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) + msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; break; default: return KVM_MSR_RET_INVALID; @@ -2806,7 +2818,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x1E; } break; - case MSR_F10H_DECFG: + case MSR_AMD64_DE_CFG: msr_info->data = svm->msr_decfg; break; default: @@ -3035,7 +3047,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_IGNNE: vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; - case MSR_F10H_DECFG: { + case MSR_AMD64_DE_CFG: { struct kvm_msr_entry msr_entry; msr_entry.index = msr->index; @@ -4076,17 +4088,6 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, vmcb_mark_dirty(svm->vmcb, VMCB_CR); } -static int is_disabled(void) -{ - u64 vm_cr; - - rdmsrl(MSR_VM_CR, vm_cr); - if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) - return 1; - - return 0; -} - static void svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) { @@ -4098,11 +4099,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xd9; } -static int __init svm_check_processor_compat(void) -{ - return 0; -} - /* * The kvm parameter can be NULL (module initialization, or invocation before * VM creation). Be sure to check the kvm parameter before using it. 
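kvm_is_svm_supported() above pairs the CPUID capability check (cpu_has_svm()) with the MSR_VM_CR "disabled by BIOS" check. The CPUID half can be mirrored from userspace; the hedged sketch below uses the SVM bit in CPUID Fn8000_0001 ECX[2] per the AMD APM, while the MSR half needs ring 0 (or /dev/cpu/*/msr) and is not shown:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

        if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) {
                puts("extended CPUID leaf 0x80000001 not available");
                return 1;
        }

        /* ECX bit 2 is the SVM capability flag. */
        printf("SVM %ssupported by this CPU\n",
               (ecx & (1u << 2)) ? "" : "not ");
        return 0;
}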
@@ -4629,7 +4625,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, smap = cr4 & X86_CR4_SMAP; is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { - pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n"); + pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n"); /* * If the fault occurred in userspace, arbitrarily inject #GP @@ -4701,7 +4697,9 @@ static int svm_vm_init(struct kvm *kvm) } static struct kvm_x86_ops svm_x86_ops __initdata = { - .name = "kvm_amd", + .name = KBUILD_MODNAME, + + .check_processor_compatibility = svm_check_processor_compat, .hardware_unsetup = svm_hardware_unsetup, .hardware_enable = svm_hardware_enable, @@ -4978,7 +4976,7 @@ static __init int svm_hardware_setup(void) } if (nested) { - printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); + pr_info("Nested Virtualization enabled\n"); kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); } @@ -4996,7 +4994,7 @@ static __init int svm_hardware_setup(void) /* Force VM NPT level equal to the host's paging level */ kvm_configure_mmu(npt_enabled, get_npt_level(), get_npt_level(), PG_LEVEL_1G); - pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); + pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); /* Setup shadow_me_value and shadow_me_mask */ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); @@ -5088,10 +5086,7 @@ err: static struct kvm_x86_init_ops svm_init_ops __initdata = { - .cpu_has_kvm_support = has_svm, - .disabled_by_bios = is_disabled, .hardware_setup = svm_hardware_setup, - .check_processor_compatibility = svm_check_processor_compat, .runtime_ops = &svm_x86_ops, .pmu_ops = &amd_pmu_ops, @@ -5099,15 +5094,37 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { static int __init svm_init(void) { + int r; + __unused_size_checks(); - return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm), - __alignof__(struct vcpu_svm), THIS_MODULE); + if (!kvm_is_svm_supported()) + return -EOPNOTSUPP; + + r = kvm_x86_vendor_init(&svm_init_ops); + if (r) + return r; + + /* + * Common KVM initialization _must_ come last, after this, /dev/kvm is + * exposed to userspace! + */ + r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm), + THIS_MODULE); + if (r) + goto err_kvm_init; + + return 0; + +err_kvm_init: + kvm_x86_vendor_exit(); + return r; } static void __exit svm_exit(void) { kvm_exit(); + kvm_x86_vendor_exit(); } module_init(svm_init) diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c index 26a89d0da93e..7af8422d3382 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.c +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -2,6 +2,7 @@ /* * KVM L1 hypervisor optimizations on Hyper-V for SVM. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index 45faf84476ce..6981c1e9a809 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -34,7 +34,7 @@ static inline void svm_hv_hardware_setup(void) { if (npt_enabled && ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) { - pr_info("kvm: Hyper-V enlightened NPT TLB flush enabled\n"); + pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n"); svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb; svm_x86_ops.tlb_remote_flush_with_range = hv_remote_flush_tlb_with_range; @@ -43,7 +43,7 @@ static inline void svm_hv_hardware_setup(void) if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) { int cpu; - pr_info("kvm: Hyper-V Direct TLB Flush enabled\n"); + pr_info(KBUILD_MODNAME ": Hyper-V Direct TLB Flush enabled\n"); for_each_online_cpu(cpu) { struct hv_vp_assist_page *vp_ap = hv_get_vp_assist_page(cpu); diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 34367dc203f2..8e8295e774f0 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/linkage.h> #include <asm/asm.h> +#include <asm/asm-offsets.h> #include <asm/bitsperlong.h> #include <asm/kvm_vcpu_regs.h> #include <asm/nospec-branch.h> diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index cd2ac9536c99..45162c1bcd8f 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -66,13 +66,13 @@ struct vmcs_config { u64 misc; struct nested_vmx_msrs nested; }; -extern struct vmcs_config vmcs_config; +extern struct vmcs_config vmcs_config __ro_after_init; struct vmx_capability { u32 ept; u32 vpid; }; -extern struct vmx_capability vmx_capability; +extern struct vmx_capability vmx_capability __ro_after_init; static inline bool cpu_has_vmx_basic_inout(void) { diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index f773450fba6e..22daca752797 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 - -#define pr_fmt(fmt) "kvm/hyper-v: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/smp.h> @@ -527,7 +526,7 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12) } \ while (0) -__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) +void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) { evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL); evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL); diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 883102d567c3..ab08a9b9ab7d 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -179,9 +179,7 @@ static __always_inline int get_evmcs_offset(unsigned long field, { int offset = evmcs_field_offset(field, clean_field); - WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n", - field); - + WARN_ONCE(offset < 0, "accessing unsupported EVMCS field %lx\n", field); return offset; } @@ -273,7 +271,7 @@ static inline void evmcs_load(u64 phys_addr) vp_ap->enlighten_vmentry = 1; } -__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); +void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); #else /* !IS_ENABLED(CONFIG_HYPERV) */ static __always_inline void evmcs_write64(unsigned long field, u64 value) {} static inline void 
evmcs_write32(unsigned long field, u32 value) {} diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d93c715cda6a..557b9c468734 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/objtool.h> #include <linux/percpu.h> @@ -203,7 +204,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) { /* TODO: not to reset guest simply here. */ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); + pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator); } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index e5cec07ca8d9..efce9ad70e4e 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -8,6 +8,8 @@ * Avi Kivity <avi@redhat.com> * Gleb Natapov <gleb@redhat.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> @@ -762,8 +764,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) return; warn: - pr_warn_ratelimited("kvm: vcpu-%d: fail to passthrough LBR.\n", - vcpu->vcpu_id); + pr_warn_ratelimited("vcpu-%d: fail to passthrough LBR.\n", vcpu->vcpu_id); } static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 1b56c5e5c9fb..94c38bea60e7 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kvm_host.h> #include <asm/irq_remapping.h> diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index b12da2a6dec9..aa53c98034bf 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2021 Intel Corporation. 
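The recurring "#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt" hunks in these files are what lets the plain pr_info()/pr_warn() calls keep a module prefix after the literal "kvm: " strings are dropped. A hedged userspace mock of that macro mechanism follows; the kernel's printk.h behaves the same way in spirit, and KBUILD_MODNAME is normally supplied by Kbuild rather than defined by hand.

/* Userspace mock of the pr_fmt() convention used throughout this series. */
#include <stdio.h>

#define KBUILD_MODNAME "kvm_demo"               /* stand-in; Kbuild defines the real one */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
        pr_info("Nested Virtualization enabled\n");   /* prints "kvm_demo: Nested ..." */
        return 0;
}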
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <asm/sgx.h> @@ -164,7 +165,7 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, if (!vcpu->kvm->arch.sgx_provisioning_allowed && (attributes & SGX_ATTR_PROVISIONKEY)) { if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY) - pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n"); + pr_warn_once("SGX PROVISIONKEY advertised but not allowed\n"); kvm_inject_gp(vcpu, 0); return 1; } @@ -381,7 +382,7 @@ int handle_encls(struct kvm_vcpu *vcpu) return handle_encls_ecreate(vcpu); if (leaf == EINIT) return handle_encls_einit(vcpu); - WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf); + WARN_ONCE(1, "unexpected exit on ENCLS[%u]", leaf); vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS; return 0; diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index 2251b60920f8..106a72c923ca 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "vmcs12.h" diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ad2ac66ef32e..c788aa382611 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -12,6 +12,7 @@ * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/highmem.h> #include <linux/hrtimer.h> @@ -444,36 +445,36 @@ void vmread_error(unsigned long field, bool fault) if (fault) kvm_spurious_fault(); else - vmx_insn_failed("kvm: vmread failed: field=%lx\n", field); + vmx_insn_failed("vmread failed: field=%lx\n", field); } noinline void vmwrite_error(unsigned long field, unsigned long value) { - vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n", + vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n", field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n", + vmx_insn_failed("vmclear failed: %p/%llx err=%u\n", vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n", + vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n", vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) { - vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", + vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", ext, vpid, gva); } noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa) { - vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", + vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", ext, eptp, gpa); } @@ -488,8 +489,8 @@ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); -struct vmcs_config vmcs_config; -struct vmx_capability vmx_capability; +struct vmcs_config vmcs_config __ro_after_init; +struct vmx_capability vmx_capability __ro_after_init; #define VMX_SEGMENT_FIELD(seg) \ [VCPU_SREG_##seg] = { \ @@ -523,6 +524,8 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) static unsigned long host_idt_base; #if IS_ENABLED(CONFIG_HYPERV) +static struct kvm_x86_ops vmx_x86_ops __initdata; + static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, 
bool, 0444); @@ -551,6 +554,71 @@ static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) return 0; } +static __init void hv_init_evmcs(void) +{ + int cpu; + + if (!enlightened_vmcs) + return; + + /* + * Enlightened VMCS usage should be recommended and the host needs + * to support eVMCS v1 or above. + */ + if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && + (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= + KVM_EVMCS_VERSION) { + + /* Check that we have assist pages on all online CPUs */ + for_each_online_cpu(cpu) { + if (!hv_get_vp_assist_page(cpu)) { + enlightened_vmcs = false; + break; + } + } + + if (enlightened_vmcs) { + pr_info("Using Hyper-V Enlightened VMCS\n"); + static_branch_enable(&enable_evmcs); + } + + if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) + vmx_x86_ops.enable_l2_tlb_flush + = hv_enable_l2_tlb_flush; + + } else { + enlightened_vmcs = false; + } +} + +static void hv_reset_evmcs(void) +{ + struct hv_vp_assist_page *vp_ap; + + if (!static_branch_unlikely(&enable_evmcs)) + return; + + /* + * KVM should enable eVMCS if and only if all CPUs have a VP assist + * page, and should reject CPU onlining if eVMCS is enabled the CPU + * doesn't have a VP assist page allocated. + */ + vp_ap = hv_get_vp_assist_page(smp_processor_id()); + if (WARN_ON_ONCE(!vp_ap)) + return; + + /* + * Reset everything to support using non-enlightened VMCS access later + * (e.g. when we reload the module with enlightened_vmcs=0) + */ + vp_ap->nested_control.features.directhypercall = 0; + vp_ap->current_nested_vmcs = 0; + vp_ap->enlighten_vmentry = 0; +} + +#else /* IS_ENABLED(CONFIG_HYPERV) */ +static void hv_init_evmcs(void) {} +static void hv_reset_evmcs(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ /* @@ -1613,8 +1681,8 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) if (!instr_len) goto rip_updated; - WARN(exit_reason.enclave_mode, - "KVM: skipping instruction after SGX enclave VM-Exit"); + WARN_ONCE(exit_reason.enclave_mode, + "skipping instruction after SGX enclave VM-Exit"); orig_rip = kvm_rip_read(vcpu); rip = orig_rip + instr_len; @@ -2448,88 +2516,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static __init int cpu_has_kvm_support(void) -{ - return cpu_has_vmx(); -} - -static __init int vmx_disabled_by_bios(void) -{ - return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || - !boot_cpu_has(X86_FEATURE_VMX); -} - -static int kvm_cpu_vmxon(u64 vmxon_pointer) -{ - u64 msr; - - cr4_set_bits(X86_CR4_VMXE); - - asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" - _ASM_EXTABLE(1b, %l[fault]) - : : [vmxon_pointer] "m"(vmxon_pointer) - : : fault); - return 0; - -fault: - WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", - rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); - cr4_clear_bits(X86_CR4_VMXE); - - return -EFAULT; -} - -static int vmx_hardware_enable(void) -{ - int cpu = raw_smp_processor_id(); - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - int r; - - if (cr4_read_shadow() & X86_CR4_VMXE) - return -EBUSY; - - /* - * This can happen if we hot-added a CPU but failed to allocate - * VP assist page for it. 
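Hedged sketch of the policy hv_init_evmcs() applies above: the feature is switched on only if every online CPU has the prerequisite (a VP assist page in the real code). cpu_has_assist_page() and NR_CPUS below are local stand-ins, not kernel APIs.

/* Enable a feature only when every CPU satisfies the prerequisite. */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

static bool cpu_has_assist_page(int cpu) { return cpu != 2; }   /* fake data */

static bool can_enable_evmcs(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (!cpu_has_assist_page(cpu))
                        return false;           /* one missing page disables it */
        }
        return true;
}

int main(void)
{
        printf("enlightened VMCS: %s\n", can_enable_evmcs() ? "on" : "off");
        return 0;
}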
- */ - if (static_branch_unlikely(&enable_evmcs) && - !hv_get_vp_assist_page(cpu)) - return -EFAULT; - - intel_pt_handle_vmx(1); - - r = kvm_cpu_vmxon(phys_addr); - if (r) { - intel_pt_handle_vmx(0); - return r; - } - - if (enable_ept) - ept_sync_global(); - - return 0; -} - -static void vmclear_local_loaded_vmcss(void) -{ - int cpu = raw_smp_processor_id(); - struct loaded_vmcs *v, *n; - - list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) - __loaded_vmcs_clear(v); -} - -static void vmx_hardware_disable(void) -{ - vmclear_local_loaded_vmcss(); - - if (cpu_vmxoff()) - kvm_spurious_fault(); - - intel_pt_handle_vmx(0); -} - /* * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID * directly instead of going through cpu_has(), to ensure KVM is trapping @@ -2565,8 +2551,7 @@ static bool cpu_has_perf_global_ctrl_bug(void) return false; } -static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, - u32 msr, u32 *result) +static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { u32 vmx_msr_low, vmx_msr_high; u32 ctl = ctl_min | ctl_opt; @@ -2584,7 +2569,7 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } -static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) { u64 allowed; @@ -2593,8 +2578,8 @@ static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) return ctl_opt & allowed; } -static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, - struct vmx_capability *vmx_cap) +static int setup_vmcs_config(struct vmcs_config *vmcs_conf, + struct vmx_capability *vmx_cap) { u32 vmx_msr_low, vmx_msr_high; u32 _pin_based_exec_control = 0; @@ -2760,6 +2745,119 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, return 0; } +static bool kvm_is_vmx_supported(void) +{ + int cpu = raw_smp_processor_id(); + + if (!cpu_has_vmx()) { + pr_err("VMX not supported by CPU %d\n", cpu); + return false; + } + + if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || + !this_cpu_has(X86_FEATURE_VMX)) { + pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); + return false; + } + + return true; +} + +static int vmx_check_processor_compat(void) +{ + int cpu = raw_smp_processor_id(); + struct vmcs_config vmcs_conf; + struct vmx_capability vmx_cap; + + if (!kvm_is_vmx_supported()) + return -EIO; + + if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { + pr_err("Failed to setup VMCS config on CPU %d\n", cpu); + return -EIO; + } + if (nested) + nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); + if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { + pr_err("Inconsistent VMCS config on CPU %d\n", cpu); + return -EIO; + } + return 0; +} + +static int kvm_cpu_vmxon(u64 vmxon_pointer) +{ + u64 msr; + + cr4_set_bits(X86_CR4_VMXE); + + asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" + _ASM_EXTABLE(1b, %l[fault]) + : : [vmxon_pointer] "m"(vmxon_pointer) + : : fault); + return 0; + +fault: + WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", + rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); + cr4_clear_bits(X86_CR4_VMXE); + + return -EFAULT; +} + +static int vmx_hardware_enable(void) +{ + int cpu = raw_smp_processor_id(); + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + int r; + + if (cr4_read_shadow() & X86_CR4_VMXE) + return -EBUSY; + + /* + * This can happen if we hot-added a CPU but failed to allocate + * VP assist page for it. 
+ */ + if (static_branch_unlikely(&enable_evmcs) && + !hv_get_vp_assist_page(cpu)) + return -EFAULT; + + intel_pt_handle_vmx(1); + + r = kvm_cpu_vmxon(phys_addr); + if (r) { + intel_pt_handle_vmx(0); + return r; + } + + if (enable_ept) + ept_sync_global(); + + return 0; +} + +static void vmclear_local_loaded_vmcss(void) +{ + int cpu = raw_smp_processor_id(); + struct loaded_vmcs *v, *n; + + list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + __loaded_vmcs_clear(v); +} + +static void vmx_hardware_disable(void) +{ + vmclear_local_loaded_vmcss(); + + if (cpu_vmxoff()) + kvm_spurious_fault(); + + hv_reset_evmcs(); + + intel_pt_handle_vmx(0); +} + struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) { int node = cpu_to_node(cpu); @@ -2955,9 +3053,8 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save) var.type = 0x3; var.avl = 0; if (save->base & 0xf) - printk_once(KERN_WARNING "kvm: segment base is not " - "paragraph aligned when entering " - "protected mode (seg=%d)", seg); + pr_warn_once("segment base is not paragraph aligned " + "when entering protected mode (seg=%d)", seg); } vmcs_write16(sf->selector, var.selector); @@ -2987,8 +3084,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) * vcpu. Warn the user that an update is overdue. */ if (!kvm_vmx->tss_addr) - printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " - "called before entering vcpu\n"); + pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n"); vmx_segment_cache_clear(vmx); @@ -6823,7 +6919,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) gate_desc *desc = (gate_desc *)host_idt_base + vector; if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, - "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) + "unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc)); @@ -7421,29 +7517,6 @@ static int vmx_vm_init(struct kvm *kvm) return 0; } -static int __init vmx_check_processor_compat(void) -{ - struct vmcs_config vmcs_conf; - struct vmx_capability vmx_cap; - - if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || - !this_cpu_has(X86_FEATURE_VMX)) { - pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id()); - return -EIO; - } - - if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) - return -EIO; - if (nested) - nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); - if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { - printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", - smp_processor_id()); - return -EIO; - } - return 0; -} - static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u8 cache; @@ -8042,7 +8115,9 @@ static void vmx_vm_destroy(struct kvm *kvm) } static struct kvm_x86_ops vmx_x86_ops __initdata = { - .name = "kvm_intel", + .name = KBUILD_MODNAME, + + .check_processor_compatibility = vmx_check_processor_compat, .hardware_unsetup = vmx_hardware_unsetup, @@ -8262,7 +8337,7 @@ static __init int hardware_setup(void) return -EIO; if (cpu_has_perf_global_ctrl_bug()) - pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " + pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " "does not work properly. 
Using workaround\n"); if (boot_cpu_has(X86_FEATURE_NX)) @@ -8270,7 +8345,7 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_MPX)) { rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); - WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); + WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); } if (!cpu_has_vmx_mpx()) @@ -8289,7 +8364,7 @@ static __init int hardware_setup(void) /* NX support is required for shadow paging. */ if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { - pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n"); + pr_err_ratelimited("NX (Execute Disable) not supported\n"); return -EOPNOTSUPP; } @@ -8441,9 +8516,6 @@ static __init int hardware_setup(void) } static struct kvm_x86_init_ops vmx_init_ops __initdata = { - .cpu_has_kvm_support = cpu_has_kvm_support, - .disabled_by_bios = vmx_disabled_by_bios, - .check_processor_compatibility = vmx_check_processor_compat, .hardware_setup = hardware_setup, .handle_intel_pt_intr = NULL, @@ -8461,41 +8533,23 @@ static void vmx_cleanup_l1d_flush(void) l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; } -static void vmx_exit(void) +static void __vmx_exit(void) { + allow_smaller_maxphyaddr = false; + #ifdef CONFIG_KEXEC_CORE RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); synchronize_rcu(); #endif + vmx_cleanup_l1d_flush(); +} +static void vmx_exit(void) +{ kvm_exit(); + kvm_x86_vendor_exit(); -#if IS_ENABLED(CONFIG_HYPERV) - if (static_branch_unlikely(&enable_evmcs)) { - int cpu; - struct hv_vp_assist_page *vp_ap; - /* - * Reset everything to support using non-enlightened VMCS - * access later (e.g. when we reload the module with - * enlightened_vmcs=0) - */ - for_each_online_cpu(cpu) { - vp_ap = hv_get_vp_assist_page(cpu); - - if (!vp_ap) - continue; - - vp_ap->nested_control.features.directhypercall = 0; - vp_ap->current_nested_vmcs = 0; - vp_ap->enlighten_vmentry = 0; - } - - static_branch_disable(&enable_evmcs); - } -#endif - vmx_cleanup_l1d_flush(); - - allow_smaller_maxphyaddr = false; + __vmx_exit(); } module_exit(vmx_exit); @@ -8503,56 +8557,29 @@ static int __init vmx_init(void) { int r, cpu; -#if IS_ENABLED(CONFIG_HYPERV) + if (!kvm_is_vmx_supported()) + return -EOPNOTSUPP; + /* - * Enlightened VMCS usage should be recommended and the host needs - * to support eVMCS v1 or above. We can also disable eVMCS support - * with module parameter. + * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing + * to unwind if a later step fails. */ - if (enlightened_vmcs && - ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && - (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= - KVM_EVMCS_VERSION) { + hv_init_evmcs(); - /* Check that we have assist pages on all online CPUs */ - for_each_online_cpu(cpu) { - if (!hv_get_vp_assist_page(cpu)) { - enlightened_vmcs = false; - break; - } - } - - if (enlightened_vmcs) { - pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); - static_branch_enable(&enable_evmcs); - } - - if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) - vmx_x86_ops.enable_l2_tlb_flush - = hv_enable_l2_tlb_flush; - - } else { - enlightened_vmcs = false; - } -#endif - - r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + r = kvm_x86_vendor_init(&vmx_init_ops); if (r) return r; /* - * Must be called after kvm_init() so enable_ept is properly set + * Must be called after common x86 init so enable_ept is properly set * up. 
Hand the parameter mitigation value in which was stored in * the pre module init parser. If no parameter was given, it will * contain 'auto' which will be turned into the default 'cond' * mitigation mode. */ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); - if (r) { - vmx_exit(); - return r; - } + if (r) + goto err_l1d_flush; vmx_setup_fb_clear_ctrl(); @@ -8576,6 +8603,21 @@ static int __init vmx_init(void) if (!enable_ept) allow_smaller_maxphyaddr = true; + /* + * Common KVM initialization _must_ come last, after this, /dev/kvm is + * exposed to userspace! + */ + r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), + THIS_MODULE); + if (r) + goto err_kvm_init; + return 0; + +err_kvm_init: + __vmx_exit(); +err_l1d_flush: + kvm_x86_vendor_exit(); + return r; } module_init(vmx_init); diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 842dc898c972..a5282014616c 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -100,8 +100,8 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) return value; do_fail: - WARN_ONCE(1, "kvm: vmread failed: field=%lx\n", field); - pr_warn_ratelimited("kvm: vmread failed: field=%lx\n", field); + WARN_ONCE(1, KBUILD_MODNAME ": vmread failed: field=%lx\n", field); + pr_warn_ratelimited(KBUILD_MODNAME ": vmread failed: field=%lx\n", field); return 0; do_exception: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5becce5bd45a..508074e47bc0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -15,6 +15,7 @@ * Amit Shah <amit.shah@qumranet.com> * Ben-Ami Yassour <benami@il.ibm.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "irq.h" @@ -128,6 +129,7 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu); static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); +static DEFINE_MUTEX(vendor_module_lock); struct kvm_x86_ops kvm_x86_ops __read_mostly; #define KVM_X86_OP(func) \ @@ -1557,7 +1559,7 @@ static const u32 msr_based_features_all[] = { MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_VMFUNC, - MSR_F10H_DECFG, + MSR_AMD64_DE_CFG, MSR_IA32_UCODE_REV, MSR_IA32_ARCH_CAPABILITIES, MSR_IA32_PERF_CAPABILITIES, @@ -2086,7 +2088,7 @@ static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) return kvm_handle_invalid_op(vcpu); - pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn); + pr_warn_once("%s instruction emulated as NOP!\n", insn); return kvm_emulate_as_nop(vcpu); } int kvm_emulate_mwait(struct kvm_vcpu *vcpu) @@ -2433,7 +2435,8 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) { - pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi); + pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n", + user_tsc_khz, thresh_lo, thresh_hi); use_scaling = 1; } return set_tsc_khz(vcpu, user_tsc_khz, use_scaling); @@ -7701,7 +7704,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; emul_write: - printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); + pr_warn_once("emulating exchange as write\n"); return emulator_write_emulated(ctxt, addr, new, bytes, exception); } @@ 
-8262,7 +8265,7 @@ static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu) ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT); if (!ctxt) { - pr_err("kvm: failed to allocate vcpu's emulator\n"); + pr_err("failed to allocate vcpu's emulator\n"); return NULL; } @@ -9273,35 +9276,66 @@ static struct notifier_block pvclock_gtod_notifier = { }; #endif -int kvm_arch_init(void *opaque) +static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) +{ + memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); + +#define __KVM_X86_OP(func) \ + static_call_update(kvm_x86_##func, kvm_x86_ops.func); +#define KVM_X86_OP(func) \ + WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) +#define KVM_X86_OP_OPTIONAL __KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0(func) \ + static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ + (void *)__static_call_return0); +#include <asm/kvm-x86-ops.h> +#undef __KVM_X86_OP + + kvm_pmu_ops_update(ops->pmu_ops); +} + +static int kvm_x86_check_processor_compatibility(void) +{ + int cpu = smp_processor_id(); + struct cpuinfo_x86 *c = &cpu_data(cpu); + + /* + * Compatibility checks are done when loading KVM and when enabling + * hardware, e.g. during CPU hotplug, to ensure all online CPUs are + * compatible, i.e. KVM should never perform a compatibility check on + * an offline CPU. + */ + WARN_ON(!cpu_online(cpu)); + + if (__cr4_reserved_bits(cpu_has, c) != + __cr4_reserved_bits(cpu_has, &boot_cpu_data)) + return -EIO; + + return static_call(kvm_x86_check_processor_compatibility)(); +} + +static void kvm_x86_check_cpu_compat(void *ret) +{ + *(int *)ret = kvm_x86_check_processor_compatibility(); +} + +static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) { - struct kvm_x86_init_ops *ops = opaque; u64 host_pat; - int r; + int r, cpu; if (kvm_x86_ops.hardware_enable) { - pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name); + pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name); return -EEXIST; } - if (!ops->cpu_has_kvm_support()) { - pr_err_ratelimited("kvm: no hardware support for '%s'\n", - ops->runtime_ops->name); - return -EOPNOTSUPP; - } - if (ops->disabled_by_bios()) { - pr_err_ratelimited("kvm: support for '%s' disabled by bios\n", - ops->runtime_ops->name); - return -EOPNOTSUPP; - } - /* * KVM explicitly assumes that the guest has an FPU and * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the * vCPU's FPU state as a fxregs_state struct. 
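Rough, non-authoritative analogue of what kvm_ops_update() above achieves: the vendor module hands the common layer an ops table, which is copied into a global and used for dispatch. The real code additionally patches static calls so the indirect branches disappear; that part is elided, and all names below are local stand-ins.

/* Vendor ops table handover, sketched in plain C. */
#include <stdio.h>
#include <string.h>

struct x86_ops {
        const char *name;
        int (*hardware_enable)(void);
};

static struct x86_ops kvm_ops;                  /* filled in at vendor init */

static int vmx_hardware_enable(void) { puts("vmx: hardware enabled"); return 0; }

static const struct x86_ops vmx_ops = {
        .name = "kvm_intel",
        .hardware_enable = vmx_hardware_enable,
};

static void ops_update(const struct x86_ops *ops)
{
        memcpy(&kvm_ops, ops, sizeof(kvm_ops)); /* kvm_ops_update() analogue */
}

int main(void)
{
        ops_update(&vmx_ops);
        return kvm_ops.hardware_enable();       /* dispatch through the table */
}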
*/ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) { - printk(KERN_ERR "kvm: inadequate fpu\n"); + pr_err("inadequate fpu\n"); return -EOPNOTSUPP; } @@ -9319,19 +9353,19 @@ int kvm_arch_init(void *opaque) */ if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) || (host_pat & GENMASK(2, 0)) != 6) { - pr_err("kvm: host PAT[0] is not WB\n"); + pr_err("host PAT[0] is not WB\n"); return -EIO; } x86_emulator_cache = kvm_alloc_emulator_cache(); if (!x86_emulator_cache) { - pr_err("kvm: failed to allocate cache for x86 emulator\n"); + pr_err("failed to allocate cache for x86 emulator\n"); return -ENOMEM; } user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); if (!user_return_msrs) { - printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); + pr_err("failed to allocate percpu kvm_user_return_msrs\n"); r = -ENOMEM; goto out_free_x86_emulator_cache; } @@ -9341,13 +9375,37 @@ int kvm_arch_init(void *opaque) if (r) goto out_free_percpu; - kvm_timer_init(); - if (boot_cpu_has(X86_FEATURE_XSAVE)) { host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; } + rdmsrl_safe(MSR_EFER, &host_efer); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + + kvm_init_pmu_capability(); + + r = ops->hardware_setup(); + if (r != 0) + goto out_mmu_exit; + + kvm_ops_update(ops); + + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1); + if (r < 0) + goto out_unwind_ops; + } + + /* + * Point of no return! DO NOT add error paths below this point unless + * absolutely necessary, as most operations from this point forward + * require unwinding. + */ + kvm_timer_init(); + if (pi_inject_timer == -1) pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER); #ifdef CONFIG_X86_64 @@ -9357,8 +9415,35 @@ int kvm_arch_init(void *opaque) set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif + kvm_register_perf_callbacks(ops->handle_intel_pt_intr); + + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) + kvm_caps.supported_xss = 0; + +#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) + cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); +#undef __kvm_cpu_cap_has + + if (kvm_caps.has_tsc_control) { + /* + * Make sure the user can only configure tsc_khz values that + * fit into a signed integer. + * A min value is not calculated because it will always + * be 1 on all machines. 
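Self-contained sketch of the clamp described in the comment above (the actual code follows in the hunk): the scaled maximum guest TSC rate is capped so it still fits in a signed 32-bit value. scale_tsc_khz() is a deliberately simplified stand-in for the kernel's __scale_tsc().

/* Cap the scaled guest TSC frequency at INT_MAX. */
#include <stdint.h>
#include <stdio.h>

static uint64_t scale_tsc_khz(uint64_t ratio, uint64_t tsc_khz)
{
        return tsc_khz * ratio;                 /* simplified __scale_tsc() */
}

static uint32_t max_guest_tsc_khz(uint64_t max_ratio, uint64_t tsc_khz)
{
        uint64_t max = scale_tsc_khz(max_ratio, tsc_khz);

        if (max > 0x7fffffffULL)
                max = 0x7fffffffULL;            /* keep it a positive s32 */
        return (uint32_t)max;
}

int main(void)
{
        /* a large scaling ratio: the result is clamped to INT_MAX */
        printf("%u\n", max_guest_tsc_khz(4096, 2400000));
        return 0;
}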
+ */ + u64 max = min(0x7fffffffULL, + __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz)); + kvm_caps.max_guest_tsc_khz = max; + } + kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits; + kvm_init_msr_list(); return 0; +out_unwind_ops: + kvm_x86_ops.hardware_enable = NULL; + static_call(kvm_x86_hardware_unsetup)(); +out_mmu_exit: + kvm_mmu_vendor_module_exit(); out_free_percpu: free_percpu(user_return_msrs); out_free_x86_emulator_cache: @@ -9366,8 +9451,22 @@ out_free_x86_emulator_cache: return r; } -void kvm_arch_exit(void) +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) +{ + int r; + + mutex_lock(&vendor_module_lock); + r = __kvm_x86_vendor_init(ops); + mutex_unlock(&vendor_module_lock); + + return r; +} +EXPORT_SYMBOL_GPL(kvm_x86_vendor_init); + +void kvm_x86_vendor_exit(void) { + kvm_unregister_perf_callbacks(); + #ifdef CONFIG_X86_64 if (hypervisor_is_type(X86_HYPER_MS_HYPERV)) clear_hv_tscchange_cb(); @@ -9384,7 +9483,7 @@ void kvm_arch_exit(void) irq_work_sync(&pvclock_irq_work); cancel_work_sync(&pvclock_gtod_work); #endif - kvm_x86_ops.hardware_enable = NULL; + static_call(kvm_x86_hardware_unsetup)(); kvm_mmu_vendor_module_exit(); free_percpu(user_return_msrs); kmem_cache_destroy(x86_emulator_cache); @@ -9392,7 +9491,11 @@ void kvm_arch_exit(void) static_key_deferred_flush(&kvm_xen_enabled); WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); #endif + mutex_lock(&vendor_module_lock); + kvm_x86_ops.hardware_enable = NULL; + mutex_unlock(&vendor_module_lock); } +EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) { @@ -11556,7 +11659,7 @@ static int sync_regs(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { if (kvm_check_tsc_unstable() && kvm->created_vcpus) - pr_warn_once("kvm: SMP vm created on host with unstable TSC; " + pr_warn_once("SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); if (!kvm->arch.max_vcpu_ids) @@ -11633,7 +11736,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) goto free_wbinvd_dirty_mask; if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) { - pr_err("kvm: failed to allocate vcpu's fpu\n"); + pr_err("failed to allocate vcpu's fpu\n"); goto free_emulate_ctxt; } @@ -11907,6 +12010,11 @@ int kvm_arch_hardware_enable(void) bool stable, backwards_tsc = false; kvm_user_return_msr_cpu_online(); + + ret = kvm_x86_check_processor_compatibility(); + if (ret) + return ret; + ret = static_call(kvm_x86_hardware_enable)(); if (ret != 0) return ret; @@ -11993,88 +12101,6 @@ void kvm_arch_hardware_disable(void) drop_user_return_notifiers(); } -static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) -{ - memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); - -#define __KVM_X86_OP(func) \ - static_call_update(kvm_x86_##func, kvm_x86_ops.func); -#define KVM_X86_OP(func) \ - WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) -#define KVM_X86_OP_OPTIONAL __KVM_X86_OP -#define KVM_X86_OP_OPTIONAL_RET0(func) \ - static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? 
: \ - (void *)__static_call_return0); -#include <asm/kvm-x86-ops.h> -#undef __KVM_X86_OP - - kvm_pmu_ops_update(ops->pmu_ops); -} - -int kvm_arch_hardware_setup(void *opaque) -{ - struct kvm_x86_init_ops *ops = opaque; - int r; - - rdmsrl_safe(MSR_EFER, &host_efer); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - - kvm_init_pmu_capability(); - - r = ops->hardware_setup(); - if (r != 0) - return r; - - kvm_ops_update(ops); - - kvm_register_perf_callbacks(ops->handle_intel_pt_intr); - - if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) - kvm_caps.supported_xss = 0; - -#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) - cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); -#undef __kvm_cpu_cap_has - - if (kvm_caps.has_tsc_control) { - /* - * Make sure the user can only configure tsc_khz values that - * fit into a signed integer. - * A min value is not calculated because it will always - * be 1 on all machines. - */ - u64 max = min(0x7fffffffULL, - __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz)); - kvm_caps.max_guest_tsc_khz = max; - } - kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits; - kvm_init_msr_list(); - return 0; -} - -void kvm_arch_hardware_unsetup(void) -{ - kvm_unregister_perf_callbacks(); - - static_call(kvm_x86_hardware_unsetup)(); -} - -int kvm_arch_check_processor_compat(void *opaque) -{ - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); - struct kvm_x86_init_ops *ops = opaque; - - WARN_ON(!irqs_disabled()); - - if (__cr4_reserved_bits(cpu_has, c) != - __cr4_reserved_bits(cpu_has, &boot_cpu_data)) - return -EIO; - - return ops->check_processor_compatibility(); -} - bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index b178f40bd863..2681e2007e39 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -5,6 +5,7 @@ * * KVM Xen emulation */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "x86.h" #include "xen.h" @@ -271,7 +272,15 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * Attempt to obtain the GPC lock on *both* (if there are two) * gfn_to_pfn caches that cover the region. */ - read_lock_irqsave(&gpc1->lock, flags); + if (atomic) { + local_irq_save(flags); + if (!read_trylock(&gpc1->lock)) { + local_irq_restore(flags); + return; + } + } else { + read_lock_irqsave(&gpc1->lock, flags); + } while (!kvm_gpc_check(gpc1, user_len1)) { read_unlock_irqrestore(&gpc1->lock, flags); @@ -304,9 +313,18 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * The guest's runstate_info is split across two pages and we * need to hold and validate both GPCs simultaneously. We can * declare a lock ordering GPC1 > GPC2 because nothing else - * takes them more than one at a time. + * takes them more than one at a time. Set a subclass on the + * gpc1 lock to make lockdep shut up about it. 
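Userspace analogue (pthreads, not the kernel rwlock API) of the pattern introduced above for the runstate update: in a context that must not sleep, only try-lock and bail out on contention; otherwise block on the lock normally.

/* Try-lock in atomic context, blocking lock otherwise. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t gpc_lock = PTHREAD_RWLOCK_INITIALIZER;

static bool read_lock_maybe_atomic(bool atomic)
{
        if (atomic)
                return pthread_rwlock_tryrdlock(&gpc_lock) == 0;
        return pthread_rwlock_rdlock(&gpc_lock) == 0;
}

int main(void)
{
        if (read_lock_maybe_atomic(true)) {
                puts("got the lock without sleeping");
                pthread_rwlock_unlock(&gpc_lock);
        } else {
                puts("contended: give up rather than sleep");
        }
        return 0;
}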
*/ - read_lock(&gpc2->lock); + lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_); + if (atomic) { + if (!read_trylock(&gpc2->lock)) { + read_unlock_irqrestore(&gpc1->lock, flags); + return; + } + } else { + read_lock(&gpc2->lock); + } if (!kvm_gpc_check(gpc2, user_len2)) { read_unlock(&gpc2->lock); @@ -590,26 +608,26 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) { r = -EINVAL; } else { - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.long_mode = !!data->u.long_mode; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; } break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); break; case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: if (data->u.vector && data->u.vector < 0x10) r = -EINVAL; else { - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.upcall_vector = data->u.vector; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; } break; @@ -619,9 +637,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_XEN_VERSION: - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.xen_version = data->u.xen_version; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; break; @@ -630,9 +648,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) r = -EOPNOTSUPP; break; } - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; break; @@ -647,7 +665,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) { int r = -ENOENT; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); switch (data->type) { case KVM_XEN_ATTR_TYPE_LONG_MODE: @@ -686,7 +704,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; } - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); return r; } @@ -694,7 +712,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) { int idx, r = -ENOENT; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.xen.xen_lock); idx = srcu_read_lock(&vcpu->kvm->srcu); switch (data->type) { @@ -922,7 +940,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) } srcu_read_unlock(&vcpu->kvm->srcu, idx); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.xen.xen_lock); return r; } @@ -930,7 +948,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) { int r = -ENOENT; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.xen.xen_lock); switch (data->type) { case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: @@ -1013,7 +1031,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.xen.xen_lock); return r; } @@ -1106,7 +1124,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) xhc->blob_size_32 || xhc->blob_size_64)) return -EINVAL; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); if (xhc->msr && !kvm->arch.xen_hvm_config.msr) 
static_branch_inc(&kvm_xen_enabled.key); @@ -1115,7 +1133,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc)); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); return 0; } @@ -1658,15 +1676,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) mm_borrowed = true; } - /* - * For the irqfd workqueue, using the main kvm->lock mutex is - * fine since this function is invoked from kvm_set_irq() with - * no other lock held, no srcu. In future if it will be called - * directly from a vCPU thread (e.g. on hypercall for an IPI) - * then it may need to switch to using a leaf-node mutex for - * serializing the shared_info mapping. - */ - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); /* * It is theoretically possible for the page to be unmapped @@ -1695,7 +1705,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) srcu_read_unlock(&kvm->srcu, idx); } while(!rc); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); if (mm_borrowed) kthread_unuse_mm(kvm->mm); @@ -1811,7 +1821,7 @@ static int kvm_xen_eventfd_update(struct kvm *kvm, int ret; /* Protect writes to evtchnfd as well as the idr lookup. */ - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port); ret = -ENOENT; @@ -1842,7 +1852,7 @@ static int kvm_xen_eventfd_update(struct kvm *kvm, } ret = 0; out_unlock: - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); return ret; } @@ -1905,10 +1915,10 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm, evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; } - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1, GFP_KERNEL); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); if (ret >= 0) return 0; @@ -1926,9 +1936,9 @@ static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port) { struct evtchnfd *evtchnfd; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); if (!evtchnfd) return -ENOENT; @@ -1942,18 +1952,42 @@ static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port) static int kvm_xen_eventfd_reset(struct kvm *kvm) { - struct evtchnfd *evtchnfd; + struct evtchnfd *evtchnfd, **all_evtchnfds; int i; + int n = 0; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); + + /* + * Because synchronize_srcu() cannot be called inside the + * critical section, first collect all the evtchnfd objects + * in an array as they are removed from evtchn_ports. 
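Sketch of the reset pattern implemented in the following hunk, with stand-ins: gather the objects while holding the lock, drop the lock, wait out existing readers (synchronize_srcu() in the real code), and only then free them. wait_for_readers() and the fixed-size array below are illustrative only.

/* Collect under the lock, free after the readers are gone. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct evtchn { int port; };

static pthread_mutex_t xen_lock = PTHREAD_MUTEX_INITIALIZER;
static struct evtchn *ports[4];                 /* stand-in for the IDR */

static void wait_for_readers(void) { /* synchronize_srcu() stand-in */ }

static void reset_all(void)
{
        struct evtchn *gathered[4];
        int i, n = 0;

        pthread_mutex_lock(&xen_lock);
        for (i = 0; i < 4; i++) {
                if (!ports[i])
                        continue;
                gathered[n++] = ports[i];       /* idr_remove() equivalent */
                ports[i] = NULL;
        }
        pthread_mutex_unlock(&xen_lock);

        wait_for_readers();                     /* must not run under the lock */

        while (n--)
                free(gathered[n]);
}

int main(void)
{
        ports[1] = calloc(1, sizeof(*ports[1]));
        reset_all();
        return 0;
}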
+ */ + idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) + n++; + + all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL); + if (!all_evtchnfds) { + mutex_unlock(&kvm->arch.xen.xen_lock); + return -ENOMEM; + } + + n = 0; idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { + all_evtchnfds[n++] = evtchnfd; idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port); - synchronize_srcu(&kvm->srcu); + } + mutex_unlock(&kvm->arch.xen.xen_lock); + + synchronize_srcu(&kvm->srcu); + + while (n--) { + evtchnfd = all_evtchnfds[n]; if (!evtchnfd->deliver.port.port) eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); kfree(evtchnfd); } - mutex_unlock(&kvm->lock); + kfree(all_evtchnfds); return 0; } @@ -2045,6 +2079,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) void kvm_xen_init_vm(struct kvm *kvm) { + mutex_init(&kvm->arch.xen.xen_lock); idr_init(&kvm->arch.xen.evtchn_ports); kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN); } diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 7ba5f61d7273..4f1a40a86534 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -60,6 +60,7 @@ ifeq ($(CONFIG_X86_32),y) lib-y += checksum_32.o lib-y += strstr_32.o lib-y += string_32.o + lib-y += memmove_32.o ifneq ($(CONFIG_X86_CMPXCHG64),y) lib-y += cmpxchg8b_emu.o atomic64_386_32.o endif diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c index 1e3de0769b81..b5a6d83106bc 100644 --- a/arch/x86/lib/error-inject.c +++ b/arch/x86/lib/error-inject.c @@ -11,6 +11,7 @@ asm( ".text\n" ".type just_return_func, @function\n" ".globl just_return_func\n" + ASM_FUNC_ALIGN "just_return_func:\n" ANNOTATE_NOENDBR ASM_RET diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 21104c41cba0..558a605929db 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -1595,16 +1595,16 @@ bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs, * Returns: * * Type of the instruction. Size of the memory operand is stored in - * @bytes. If decode failed, MMIO_DECODE_FAILED returned. + * @bytes. If decode failed, INSN_MMIO_DECODE_FAILED returned. 
*/ -enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes) +enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes) { - enum mmio_type type = MMIO_DECODE_FAILED; + enum insn_mmio_type type = INSN_MMIO_DECODE_FAILED; *bytes = 0; if (insn_get_opcode(insn)) - return MMIO_DECODE_FAILED; + return INSN_MMIO_DECODE_FAILED; switch (insn->opcode.bytes[0]) { case 0x88: /* MOV m8,r8 */ @@ -1613,7 +1613,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes) case 0x89: /* MOV m16/m32/m64, r16/m32/m64 */ if (!*bytes) *bytes = insn->opnd_bytes; - type = MMIO_WRITE; + type = INSN_MMIO_WRITE; break; case 0xc6: /* MOV m8, imm8 */ @@ -1622,7 +1622,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes) case 0xc7: /* MOV m16/m32/m64, imm16/imm32/imm64 */ if (!*bytes) *bytes = insn->opnd_bytes; - type = MMIO_WRITE_IMM; + type = INSN_MMIO_WRITE_IMM; break; case 0x8a: /* MOV r8, m8 */ @@ -1631,7 +1631,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes) case 0x8b: /* MOV r16/r32/r64, m16/m32/m64 */ if (!*bytes) *bytes = insn->opnd_bytes; - type = MMIO_READ; + type = INSN_MMIO_READ; break; case 0xa4: /* MOVS m8, m8 */ @@ -1640,7 +1640,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes) case 0xa5: /* MOVS m16/m32/m64, m16/m32/m64 */ if (!*bytes) *bytes = insn->opnd_bytes; - type = MMIO_MOVS; + type = INSN_MMIO_MOVS; break; case 0x0f: /* Two-byte instruction */ @@ -1651,7 +1651,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes) case 0xb7: /* MOVZX r32/r64, m16 */ if (!*bytes) *bytes = 2; - type = MMIO_READ_ZERO_EXTEND; + type = INSN_MMIO_READ_ZERO_EXTEND; break; case 0xbe: /* MOVSX r16/r32/r64, m8 */ @@ -1660,7 +1660,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes) case 0xbf: /* MOVSX r32/r64, m16 */ if (!*bytes) *bytes = 2; - type = MMIO_READ_SIGN_EXTEND; + type = INSN_MMIO_READ_SIGN_EXTEND; break; } break; diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S index a1f9416bf67a..6ff2f56cb0f7 100644 --- a/arch/x86/lib/iomap_copy_64.S +++ b/arch/x86/lib/iomap_copy_64.S @@ -10,6 +10,6 @@ */ SYM_FUNC_START(__iowrite32_copy) movl %edx,%ecx - rep movsd + rep movsl RET SYM_FUNC_END(__iowrite32_copy) diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index ef3af7ff2c8a..a29b64befb93 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -17,190 +17,3 @@ __visible void *memset(void *s, int c, size_t count) return __memset(s, c, count); } EXPORT_SYMBOL(memset); - -__visible void *memmove(void *dest, const void *src, size_t n) -{ - int d0,d1,d2,d3,d4,d5; - char *ret = dest; - - __asm__ __volatile__( - /* Handle more 16 bytes in loop */ - "cmp $0x10, %0\n\t" - "jb 1f\n\t" - - /* Decide forward/backward copy mode */ - "cmp %2, %1\n\t" - "jb 2f\n\t" - - /* - * movs instruction have many startup latency - * so we handle small size by general register. - */ - "cmp $680, %0\n\t" - "jb 3f\n\t" - /* - * movs instruction is only good for aligned case. - */ - "mov %1, %3\n\t" - "xor %2, %3\n\t" - "and $0xff, %3\n\t" - "jz 4f\n\t" - "3:\n\t" - "sub $0x10, %0\n\t" - - /* - * We gobble 16 bytes forward in each loop. 
- */ - "3:\n\t" - "sub $0x10, %0\n\t" - "mov 0*4(%1), %3\n\t" - "mov 1*4(%1), %4\n\t" - "mov %3, 0*4(%2)\n\t" - "mov %4, 1*4(%2)\n\t" - "mov 2*4(%1), %3\n\t" - "mov 3*4(%1), %4\n\t" - "mov %3, 2*4(%2)\n\t" - "mov %4, 3*4(%2)\n\t" - "lea 0x10(%1), %1\n\t" - "lea 0x10(%2), %2\n\t" - "jae 3b\n\t" - "add $0x10, %0\n\t" - "jmp 1f\n\t" - - /* - * Handle data forward by movs. - */ - ".p2align 4\n\t" - "4:\n\t" - "mov -4(%1, %0), %3\n\t" - "lea -4(%2, %0), %4\n\t" - "shr $2, %0\n\t" - "rep movsl\n\t" - "mov %3, (%4)\n\t" - "jmp 11f\n\t" - /* - * Handle data backward by movs. - */ - ".p2align 4\n\t" - "6:\n\t" - "mov (%1), %3\n\t" - "mov %2, %4\n\t" - "lea -4(%1, %0), %1\n\t" - "lea -4(%2, %0), %2\n\t" - "shr $2, %0\n\t" - "std\n\t" - "rep movsl\n\t" - "mov %3,(%4)\n\t" - "cld\n\t" - "jmp 11f\n\t" - - /* - * Start to prepare for backward copy. - */ - ".p2align 4\n\t" - "2:\n\t" - "cmp $680, %0\n\t" - "jb 5f\n\t" - "mov %1, %3\n\t" - "xor %2, %3\n\t" - "and $0xff, %3\n\t" - "jz 6b\n\t" - - /* - * Calculate copy position to tail. - */ - "5:\n\t" - "add %0, %1\n\t" - "add %0, %2\n\t" - "sub $0x10, %0\n\t" - - /* - * We gobble 16 bytes backward in each loop. - */ - "7:\n\t" - "sub $0x10, %0\n\t" - - "mov -1*4(%1), %3\n\t" - "mov -2*4(%1), %4\n\t" - "mov %3, -1*4(%2)\n\t" - "mov %4, -2*4(%2)\n\t" - "mov -3*4(%1), %3\n\t" - "mov -4*4(%1), %4\n\t" - "mov %3, -3*4(%2)\n\t" - "mov %4, -4*4(%2)\n\t" - "lea -0x10(%1), %1\n\t" - "lea -0x10(%2), %2\n\t" - "jae 7b\n\t" - /* - * Calculate copy position to head. - */ - "add $0x10, %0\n\t" - "sub %0, %1\n\t" - "sub %0, %2\n\t" - - /* - * Move data from 8 bytes to 15 bytes. - */ - ".p2align 4\n\t" - "1:\n\t" - "cmp $8, %0\n\t" - "jb 8f\n\t" - "mov 0*4(%1), %3\n\t" - "mov 1*4(%1), %4\n\t" - "mov -2*4(%1, %0), %5\n\t" - "mov -1*4(%1, %0), %1\n\t" - - "mov %3, 0*4(%2)\n\t" - "mov %4, 1*4(%2)\n\t" - "mov %5, -2*4(%2, %0)\n\t" - "mov %1, -1*4(%2, %0)\n\t" - "jmp 11f\n\t" - - /* - * Move data from 4 bytes to 7 bytes. - */ - ".p2align 4\n\t" - "8:\n\t" - "cmp $4, %0\n\t" - "jb 9f\n\t" - "mov 0*4(%1), %3\n\t" - "mov -1*4(%1, %0), %4\n\t" - "mov %3, 0*4(%2)\n\t" - "mov %4, -1*4(%2, %0)\n\t" - "jmp 11f\n\t" - - /* - * Move data from 2 bytes to 3 bytes. - */ - ".p2align 4\n\t" - "9:\n\t" - "cmp $2, %0\n\t" - "jb 10f\n\t" - "movw 0*2(%1), %%dx\n\t" - "movw -1*2(%1, %0), %%bx\n\t" - "movw %%dx, 0*2(%2)\n\t" - "movw %%bx, -1*2(%2, %0)\n\t" - "jmp 11f\n\t" - - /* - * Move data for 1 byte. - */ - ".p2align 4\n\t" - "10:\n\t" - "cmp $1, %0\n\t" - "jb 11f\n\t" - "movb (%1), %%cl\n\t" - "movb %%cl, (%2)\n\t" - ".p2align 4\n\t" - "11:" - : "=&c" (d0), "=&S" (d1), "=&D" (d2), - "=r" (d3),"=r" (d4), "=r"(d5) - :"0" (n), - "1" (src), - "2" (dest) - :"memory"); - - return ret; - -} -EXPORT_SYMBOL(memmove); diff --git a/arch/x86/lib/memmove_32.S b/arch/x86/lib/memmove_32.S new file mode 100644 index 000000000000..0588b2c0fc95 --- /dev/null +++ b/arch/x86/lib/memmove_32.S @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/linkage.h> +#include <asm/export.h> + +SYM_FUNC_START(memmove) +/* + * void *memmove(void *dest_in, const void *src_in, size_t n) + * -mregparm=3 passes these in registers: + * dest_in: %eax + * src_in: %edx + * n: %ecx + * See also: arch/x86/entry/calling.h for description of the calling convention. + * + * n can remain in %ecx, but for `rep movsl`, we'll need dest in %edi and src + * in %esi. 
+ */ +.set dest_in, %eax +.set dest, %edi +.set src_in, %edx +.set src, %esi +.set n, %ecx +.set tmp0, %edx +.set tmp0w, %dx +.set tmp1, %ebx +.set tmp1w, %bx +.set tmp2, %eax +.set tmp3b, %cl + +/* + * Save all callee-saved registers, because this function is going to clobber + * all of them: + */ + pushl %ebp + movl %esp, %ebp // set standard frame pointer + + pushl %ebx + pushl %edi + pushl %esi + pushl %eax // save 'dest_in' parameter [eax] as the return value + + movl src_in, src + movl dest_in, dest + + /* Handle more 16 bytes in loop */ + cmpl $0x10, n + jb .Lmove_16B + + /* Decide forward/backward copy mode */ + cmpl dest, src + jb .Lbackwards_header + + /* + * movs instruction have many startup latency + * so we handle small size by general register. + */ + cmpl $680, n + jb .Ltoo_small_forwards + /* movs instruction is only good for aligned case. */ + movl src, tmp0 + xorl dest, tmp0 + andl $0xff, tmp0 + jz .Lforward_movs +.Ltoo_small_forwards: + subl $0x10, n + + /* We gobble 16 bytes forward in each loop. */ +.Lmove_16B_forwards_loop: + subl $0x10, n + movl 0*4(src), tmp0 + movl 1*4(src), tmp1 + movl tmp0, 0*4(dest) + movl tmp1, 1*4(dest) + movl 2*4(src), tmp0 + movl 3*4(src), tmp1 + movl tmp0, 2*4(dest) + movl tmp1, 3*4(dest) + leal 0x10(src), src + leal 0x10(dest), dest + jae .Lmove_16B_forwards_loop + addl $0x10, n + jmp .Lmove_16B + + /* Handle data forward by movs. */ +.p2align 4 +.Lforward_movs: + movl -4(src, n), tmp0 + leal -4(dest, n), tmp1 + shrl $2, n + rep movsl + movl tmp0, (tmp1) + jmp .Ldone + + /* Handle data backward by movs. */ +.p2align 4 +.Lbackwards_movs: + movl (src), tmp0 + movl dest, tmp1 + leal -4(src, n), src + leal -4(dest, n), dest + shrl $2, n + std + rep movsl + movl tmp0,(tmp1) + cld + jmp .Ldone + + /* Start to prepare for backward copy. */ +.p2align 4 +.Lbackwards_header: + cmpl $680, n + jb .Ltoo_small_backwards + movl src, tmp0 + xorl dest, tmp0 + andl $0xff, tmp0 + jz .Lbackwards_movs + + /* Calculate copy position to tail. */ +.Ltoo_small_backwards: + addl n, src + addl n, dest + subl $0x10, n + + /* We gobble 16 bytes backward in each loop. */ +.Lmove_16B_backwards_loop: + subl $0x10, n + + movl -1*4(src), tmp0 + movl -2*4(src), tmp1 + movl tmp0, -1*4(dest) + movl tmp1, -2*4(dest) + movl -3*4(src), tmp0 + movl -4*4(src), tmp1 + movl tmp0, -3*4(dest) + movl tmp1, -4*4(dest) + leal -0x10(src), src + leal -0x10(dest), dest + jae .Lmove_16B_backwards_loop + /* Calculate copy position to head. */ + addl $0x10, n + subl n, src + subl n, dest + + /* Move data from 8 bytes to 15 bytes. */ +.p2align 4 +.Lmove_16B: + cmpl $8, n + jb .Lmove_8B + movl 0*4(src), tmp0 + movl 1*4(src), tmp1 + movl -2*4(src, n), tmp2 + movl -1*4(src, n), src + + movl tmp0, 0*4(dest) + movl tmp1, 1*4(dest) + movl tmp2, -2*4(dest, n) + movl src, -1*4(dest, n) + jmp .Ldone + + /* Move data from 4 bytes to 7 bytes. */ +.p2align 4 +.Lmove_8B: + cmpl $4, n + jb .Lmove_4B + movl 0*4(src), tmp0 + movl -1*4(src, n), tmp1 + movl tmp0, 0*4(dest) + movl tmp1, -1*4(dest, n) + jmp .Ldone + + /* Move data from 2 bytes to 3 bytes. */ +.p2align 4 +.Lmove_4B: + cmpl $2, n + jb .Lmove_1B + movw 0*2(src), tmp0w + movw -1*2(src, n), tmp1w + movw tmp0w, 0*2(dest) + movw tmp1w, -1*2(dest, n) + jmp .Ldone + + /* Move data for 1 byte. 
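Plain C sketch of the policy the new memmove_32.S implements: copy forward when the destination starts below the source, backward otherwise, so an overlapping region is never clobbered before it is read. The 16-byte unrolled loops and the rep movsl fast path are deliberately omitted.

/* Direction-aware copy for overlapping buffers. */
#include <stddef.h>
#include <stdio.h>

static void *memmove_sketch(void *dest, const void *src, size_t n)
{
        unsigned char *d = dest;
        const unsigned char *s = src;

        if (d < s) {
                while (n--)
                        *d++ = *s++;            /* forward copy */
        } else {
                while (n--)
                        d[n] = s[n];            /* backward copy */
        }
        return dest;
}

int main(void)
{
        char buf[16] = "abcdefgh";

        memmove_sketch(buf + 2, buf, 8);        /* overlapping move */
        printf("%s\n", buf + 2);                /* prints "abcdefgh" */
        return 0;
}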
*/ +.p2align 4 +.Lmove_1B: + cmpl $1, n + jb .Ldone + movb (src), tmp3b + movb tmp3b, (dest) +.p2align 4 +.Ldone: + popl dest_in // restore 'dest_in' [eax] as the return value + /* Restore all callee-saved registers: */ + popl %esi + popl %edi + popl %ebx + popl %ebp + + RET +SYM_FUNC_END(memmove) +EXPORT_SYMBOL(memmove) diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index b7dfd60243b7..32125224fcca 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -47,8 +47,6 @@ SYM_FUNC_START(__put_user_1) LOAD_TASK_SIZE_MINUS_N(0) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user -SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL) - ENDBR ASM_STAC 1: movb %al,(%_ASM_CX) xor %ecx,%ecx @@ -56,54 +54,87 @@ SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL) RET SYM_FUNC_END(__put_user_1) EXPORT_SYMBOL(__put_user_1) + +SYM_FUNC_START(__put_user_nocheck_1) + ENDBR + ASM_STAC +2: movb %al,(%_ASM_CX) + xor %ecx,%ecx + ASM_CLAC + RET +SYM_FUNC_END(__put_user_nocheck_1) EXPORT_SYMBOL(__put_user_nocheck_1) SYM_FUNC_START(__put_user_2) LOAD_TASK_SIZE_MINUS_N(1) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user -SYM_INNER_LABEL(__put_user_nocheck_2, SYM_L_GLOBAL) - ENDBR ASM_STAC -2: movw %ax,(%_ASM_CX) +3: movw %ax,(%_ASM_CX) xor %ecx,%ecx ASM_CLAC RET SYM_FUNC_END(__put_user_2) EXPORT_SYMBOL(__put_user_2) + +SYM_FUNC_START(__put_user_nocheck_2) + ENDBR + ASM_STAC +4: movw %ax,(%_ASM_CX) + xor %ecx,%ecx + ASM_CLAC + RET +SYM_FUNC_END(__put_user_nocheck_2) EXPORT_SYMBOL(__put_user_nocheck_2) SYM_FUNC_START(__put_user_4) LOAD_TASK_SIZE_MINUS_N(3) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user -SYM_INNER_LABEL(__put_user_nocheck_4, SYM_L_GLOBAL) - ENDBR ASM_STAC -3: movl %eax,(%_ASM_CX) +5: movl %eax,(%_ASM_CX) xor %ecx,%ecx ASM_CLAC RET SYM_FUNC_END(__put_user_4) EXPORT_SYMBOL(__put_user_4) + +SYM_FUNC_START(__put_user_nocheck_4) + ENDBR + ASM_STAC +6: movl %eax,(%_ASM_CX) + xor %ecx,%ecx + ASM_CLAC + RET +SYM_FUNC_END(__put_user_nocheck_4) EXPORT_SYMBOL(__put_user_nocheck_4) SYM_FUNC_START(__put_user_8) LOAD_TASK_SIZE_MINUS_N(7) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user -SYM_INNER_LABEL(__put_user_nocheck_8, SYM_L_GLOBAL) - ENDBR ASM_STAC -4: mov %_ASM_AX,(%_ASM_CX) +7: mov %_ASM_AX,(%_ASM_CX) #ifdef CONFIG_X86_32 -5: movl %edx,4(%_ASM_CX) +8: movl %edx,4(%_ASM_CX) #endif xor %ecx,%ecx ASM_CLAC RET SYM_FUNC_END(__put_user_8) EXPORT_SYMBOL(__put_user_8) + +SYM_FUNC_START(__put_user_nocheck_8) + ENDBR + ASM_STAC +9: mov %_ASM_AX,(%_ASM_CX) +#ifdef CONFIG_X86_32 +10: movl %edx,4(%_ASM_CX) +#endif + xor %ecx,%ecx + ASM_CLAC + RET +SYM_FUNC_END(__put_user_nocheck_8) EXPORT_SYMBOL(__put_user_nocheck_8) SYM_CODE_START_LOCAL(.Lbad_put_user_clac) @@ -117,6 +148,11 @@ SYM_CODE_END(.Lbad_put_user_clac) _ASM_EXTABLE_UA(2b, .Lbad_put_user_clac) _ASM_EXTABLE_UA(3b, .Lbad_put_user_clac) _ASM_EXTABLE_UA(4b, .Lbad_put_user_clac) -#ifdef CONFIG_X86_32 _ASM_EXTABLE_UA(5b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(6b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(7b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(9b, .Lbad_put_user_clac) +#ifdef CONFIG_X86_32 + _ASM_EXTABLE_UA(8b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(10b, .Lbad_put_user_clac) #endif diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 073289a55f84..5f61c65322be 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -5,24 +5,27 @@ #include <asm/dwarf2.h> #include <asm/cpufeatures.h> #include <asm/alternative.h> +#include <asm/asm-offsets.h> #include <asm/export.h> #include <asm/nospec-branch.h> #include <asm/unwind_hints.h> +#include 
<asm/percpu.h> #include <asm/frame.h> .section .text.__x86.indirect_thunk -.macro RETPOLINE reg + +.macro POLINE reg ANNOTATE_INTRA_FUNCTION_CALL call .Ldo_rop_\@ -.Lspec_trap_\@: - UNWIND_HINT_EMPTY - pause - lfence - jmp .Lspec_trap_\@ + int3 .Ldo_rop_\@: mov %\reg, (%_ASM_SP) UNWIND_HINT_FUNC +.endm + +.macro RETPOLINE reg + POLINE \reg RET .endm @@ -52,7 +55,6 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL) */ #define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) -#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) .align RETPOLINE_THUNK_SIZE SYM_CODE_START(__x86_indirect_thunk_array) @@ -64,10 +66,65 @@ SYM_CODE_START(__x86_indirect_thunk_array) .align RETPOLINE_THUNK_SIZE SYM_CODE_END(__x86_indirect_thunk_array) -#define GEN(reg) EXPORT_THUNK(reg) +#define GEN(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) +#include <asm/GEN-for-each-reg.h> +#undef GEN + +#ifdef CONFIG_CALL_DEPTH_TRACKING +.macro CALL_THUNK reg + .align RETPOLINE_THUNK_SIZE + +SYM_INNER_LABEL(__x86_indirect_call_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR + + CALL_DEPTH_ACCOUNT + POLINE \reg + ANNOTATE_UNRET_SAFE + ret + int3 +.endm + + .align RETPOLINE_THUNK_SIZE +SYM_CODE_START(__x86_indirect_call_thunk_array) + +#define GEN(reg) CALL_THUNK reg #include <asm/GEN-for-each-reg.h> #undef GEN + .align RETPOLINE_THUNK_SIZE +SYM_CODE_END(__x86_indirect_call_thunk_array) + +#define GEN(reg) __EXPORT_THUNK(__x86_indirect_call_thunk_ ## reg) +#include <asm/GEN-for-each-reg.h> +#undef GEN + +.macro JUMP_THUNK reg + .align RETPOLINE_THUNK_SIZE + +SYM_INNER_LABEL(__x86_indirect_jump_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR + POLINE \reg + ANNOTATE_UNRET_SAFE + ret + int3 +.endm + + .align RETPOLINE_THUNK_SIZE +SYM_CODE_START(__x86_indirect_jump_thunk_array) + +#define GEN(reg) JUMP_THUNK reg +#include <asm/GEN-for-each-reg.h> +#undef GEN + + .align RETPOLINE_THUNK_SIZE +SYM_CODE_END(__x86_indirect_jump_thunk_array) + +#define GEN(reg) __EXPORT_THUNK(__x86_indirect_jump_thunk_ ## reg) +#include <asm/GEN-for-each-reg.h> +#undef GEN +#endif /* * This function name is magical and is used by -mfunction-return=thunk-extern * for the compiler to generate JMPs to it. @@ -140,3 +197,37 @@ __EXPORT_THUNK(zen_untrain_ret) EXPORT_SYMBOL(__x86_return_thunk) #endif /* CONFIG_RETHUNK */ + +#ifdef CONFIG_CALL_DEPTH_TRACKING + + .align 64 +SYM_FUNC_START(__x86_return_skl) + ANNOTATE_NOENDBR + /* + * Keep the hotpath in a 16byte I-fetch for the non-debug + * case. + */ + CALL_THUNKS_DEBUG_INC_RETS + shlq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth) + jz 1f + ANNOTATE_UNRET_SAFE + ret + int3 +1: + CALL_THUNKS_DEBUG_INC_STUFFS + .rept 16 + ANNOTATE_INTRA_FUNCTION_CALL + call 2f + int3 +2: + .endr + add $(8*16), %rsp + + CREDIT_CALL_DEPTH + + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_FUNC_END(__x86_return_skl) + +#endif /* CONFIG_CALL_DEPTH_TRACKING */ diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index f1bb18617156..24b48af27417 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -6,6 +6,7 @@ #include <linux/uaccess.h> #include <linux/export.h> +#include <linux/instrumented.h> #include <asm/tlbflush.h> @@ -44,7 +45,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) * called from other contexts. 
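Minimal mock of the instrumentation added to copy_from_user_nmi() in the next hunk: the sanitizer hooks are called before and after the raw copy so tools such as KMSAN observe the access. The hook bodies and raw_copy() here are empty stand-ins, not the kernel helpers.

/* Wrap a raw copy with before/after instrumentation hooks. */
#include <stddef.h>
#include <stdio.h>
#include <string.h>

static void instrument_copy_before(void *to, const void *from, size_t n)
{
        (void)to; (void)from; (void)n;          /* sanitizer hook stand-in */
}

static void instrument_copy_after(void *to, const void *from, size_t n, size_t left)
{
        (void)to; (void)from; (void)n; (void)left;
}

static size_t raw_copy(void *to, const void *from, size_t n)
{
        memcpy(to, from, n);
        return 0;                               /* bytes left uncopied */
}

static size_t instrumented_copy(void *to, const void *from, size_t n)
{
        size_t left;

        instrument_copy_before(to, from, n);
        left = raw_copy(to, from, n);
        instrument_copy_after(to, from, n, left);
        return left;
}

int main(void)
{
        char dst[8];

        if (instrumented_copy(dst, "hello", 6) == 0)
                puts(dst);
        return 0;
}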
*/ pagefault_disable(); + instrument_copy_from_user_before(to, from, n); ret = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, ret); pagefault_enable(); return ret; diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 6c2f1b76a0b6..7316a8224259 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -9,22 +9,60 @@ #include <asm/cpu_entry_area.h> #include <asm/fixmap.h> #include <asm/desc.h> +#include <asm/kasan.h> static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); #ifdef CONFIG_X86_64 static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); -#endif -#ifdef CONFIG_X86_32 +static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, _cea_offset); + +static __always_inline unsigned int cea_offset(unsigned int cpu) +{ + return per_cpu(_cea_offset, cpu); +} + +static __init void init_cea_offsets(void) +{ + unsigned int max_cea; + unsigned int i, j; + + max_cea = (CPU_ENTRY_AREA_MAP_SIZE - PAGE_SIZE) / CPU_ENTRY_AREA_SIZE; + + /* O(sodding terrible) */ + for_each_possible_cpu(i) { + unsigned int cea; + +again: + cea = get_random_u32_below(max_cea); + + for_each_possible_cpu(j) { + if (cea_offset(j) == cea) + goto again; + + if (i == j) + break; + } + + per_cpu(_cea_offset, i) = cea; + } +} +#else /* !X86_64 */ DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack); + +static __always_inline unsigned int cea_offset(unsigned int cpu) +{ + return cpu; +} +static inline void init_cea_offsets(void) { } #endif /* Is called from entry code, so must be noinstr */ noinstr struct cpu_entry_area *get_cpu_entry_area(int cpu) { - unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; + unsigned long va = CPU_ENTRY_AREA_PER_CPU + cea_offset(cpu) * CPU_ENTRY_AREA_SIZE; BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); return (struct cpu_entry_area *) va; @@ -138,20 +176,19 @@ static void __init setup_cpu_entry_area(unsigned int cpu) pgprot_t tss_prot = PAGE_KERNEL_RO; #else /* - * On native 32-bit systems, the GDT cannot be read-only because + * On 32-bit systems, the GDT cannot be read-only because * our double fault handler uses a task gate, and entering through * a task gate needs to change an available TSS to busy. If the * GDT is read-only, that will triple fault. The TSS cannot be * read-only because the CPU writes to it on task switches. - * - * On Xen PV, the GDT must be read-only because the hypervisor - * requires it. */ - pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? 
- PAGE_KERNEL_RO : PAGE_KERNEL; + pgprot_t gdt_prot = PAGE_KERNEL; pgprot_t tss_prot = PAGE_KERNEL; #endif + kasan_populate_shadow_for_vaddr(cea, CPU_ENTRY_AREA_SIZE, + early_cpu_to_node(cpu)); + cea_set_pte(&cea->gdt, get_cpu_gdt_paddr(cpu), gdt_prot); cea_map_percpu_pages(&cea->entry_stack_page, @@ -205,7 +242,6 @@ static __init void setup_cpu_entry_area_ptes(void) /* The +1 is for the readonly IDT: */ BUILD_BUG_ON((CPU_ENTRY_AREA_PAGES+1)*PAGE_SIZE != CPU_ENTRY_AREA_MAP_SIZE); - BUILD_BUG_ON(CPU_ENTRY_AREA_TOTAL_SIZE != CPU_ENTRY_AREA_MAP_SIZE); BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); start = CPU_ENTRY_AREA_BASE; @@ -221,6 +257,8 @@ void __init setup_cpu_entry_areas(void) { unsigned int cpu; + init_cea_offsets(); + setup_cpu_entry_area_ptes(); for_each_possible_cpu(cpu) diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 6b3033845c6d..5804bbae4f01 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -37,8 +37,12 @@ int pmd_huge(pmd_t pmd) */ int pud_huge(pud_t pud) { +#if CONFIG_PGTABLE_LEVELS > 2 return !pud_none(pud) && (pud_val(pud) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; +#else + return 0; +#endif } #ifdef CONFIG_HUGETLB_PAGE diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 9121bc1b9453..d3987359d441 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -801,7 +801,7 @@ void __init poking_init(void) spinlock_t *ptl; pte_t *ptep; - poking_mm = copy_init_mm(); + poking_mm = mm_alloc(); BUG_ON(!poking_mm); /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3f040c6e5d13..a190aae8ceaf 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1416,47 +1416,6 @@ void mark_rodata_ro(void) debug_checkwx(); } -int kern_addr_valid(unsigned long addr) -{ - unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - if (above != 0 && above != -1UL) - return 0; - - pgd = pgd_offset_k(addr); - if (pgd_none(*pgd)) - return 0; - - p4d = p4d_offset(pgd, addr); - if (!p4d_present(*p4d)) - return 0; - - pud = pud_offset(p4d, addr); - if (!pud_present(*pud)) - return 0; - - if (pud_large(*pud)) - return pfn_valid(pud_pfn(*pud)); - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - return 0; - - if (pmd_large(*pmd)) - return pfn_valid(pmd_pfn(*pmd)); - - pte = pte_offset_kernel(pmd, addr); - if (pte_none(*pte)) - return 0; - - return pfn_valid(pte_pfn(*pte)); -} - /* * Block size is the minimum amount of memory which can be hotplugged or * hotremoved. 
It must be power of two and must be equal or larger than @@ -1533,72 +1492,44 @@ static long __meminitdata addr_start, addr_end; static void __meminitdata *p_start, *p_end; static int __meminitdata node_start; -static int __meminit vmemmap_populate_hugepages(unsigned long start, - unsigned long end, int node, struct vmem_altmap *altmap) +void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next) { - unsigned long addr; - unsigned long next; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - - for (addr = start; addr < end; addr = next) { - next = pmd_addr_end(addr, end); - - pgd = vmemmap_pgd_populate(addr, node); - if (!pgd) - return -ENOMEM; - - p4d = vmemmap_p4d_populate(pgd, addr, node); - if (!p4d) - return -ENOMEM; - - pud = vmemmap_pud_populate(p4d, addr, node); - if (!pud) - return -ENOMEM; - - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) { - void *p; - - p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); - if (p) { - pte_t entry; - - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, - PAGE_KERNEL_LARGE); - set_pmd(pmd, __pmd(pte_val(entry))); + pte_t entry; + + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE); + set_pmd(pmd, __pmd(pte_val(entry))); + + /* check to see if we have contiguous blocks */ + if (p_end != p || node_start != node) { + if (p_start) + pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + addr_start = addr; + node_start = node; + p_start = p; + } - /* check to see if we have contiguous blocks */ - if (p_end != p || node_start != node) { - if (p_start) - pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", - addr_start, addr_end-1, p_start, p_end-1, node_start); - addr_start = addr; - node_start = node; - p_start = p; - } + addr_end = addr + PMD_SIZE; + p_end = p + PMD_SIZE; - addr_end = addr + PMD_SIZE; - p_end = p + PMD_SIZE; + if (!IS_ALIGNED(addr, PMD_SIZE) || + !IS_ALIGNED(next, PMD_SIZE)) + vmemmap_use_new_sub_pmd(addr, next); +} - if (!IS_ALIGNED(addr, PMD_SIZE) || - !IS_ALIGNED(next, PMD_SIZE)) - vmemmap_use_new_sub_pmd(addr, next); +int __meminit vmemmap_check_pmd(pmd_t *pmd, int node, + unsigned long addr, unsigned long next) +{ + int large = pmd_large(*pmd); - continue; - } else if (altmap) - return -ENOMEM; /* no fallback */ - } else if (pmd_large(*pmd)) { - vmemmap_verify((pte_t *)pmd, node, addr, next); - vmemmap_use_sub_pmd(addr, next); - continue; - } - if (vmemmap_populate_basepages(addr, next, node, NULL)) - return -ENOMEM; + if (pmd_large(*pmd)) { + vmemmap_verify((pte_t *)pmd, node, addr, next); + vmemmap_use_sub_pmd(addr, next); } - return 0; + + return large; } int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 78c5bc654cff..6453fbaedb08 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -217,9 +217,15 @@ __ioremap_caller(resource_size_t phys_addr, unsigned long size, * Mappings have to be page-aligned */ offset = phys_addr & ~PAGE_MASK; - phys_addr &= PHYSICAL_PAGE_MASK; + phys_addr &= PAGE_MASK; size = PAGE_ALIGN(last_addr+1) - phys_addr; + /* + * Mask out any bits not part of the actual physical + * address, like memory encryption bits. 
+ */ + phys_addr &= PHYSICAL_PAGE_MASK; + retval = memtype_reserve(phys_addr, (u64)phys_addr + size, pcm, &new_pcm); if (retval) { diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index e7b9b464a82f..0302491d799d 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -316,10 +316,33 @@ void __init kasan_early_init(void) kasan_map_early_shadow(init_top_pgt); } +static unsigned long kasan_mem_to_shadow_align_down(unsigned long va) +{ + unsigned long shadow = (unsigned long)kasan_mem_to_shadow((void *)va); + + return round_down(shadow, PAGE_SIZE); +} + +static unsigned long kasan_mem_to_shadow_align_up(unsigned long va) +{ + unsigned long shadow = (unsigned long)kasan_mem_to_shadow((void *)va); + + return round_up(shadow, PAGE_SIZE); +} + +void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid) +{ + unsigned long shadow_start, shadow_end; + + shadow_start = kasan_mem_to_shadow_align_down((unsigned long)va); + shadow_end = kasan_mem_to_shadow_align_up((unsigned long)va + size); + kasan_populate_shadow(shadow_start, shadow_end, nid); +} + void __init kasan_init(void) { + unsigned long shadow_cea_begin, shadow_cea_per_cpu_begin, shadow_cea_end; int i; - void *shadow_cpu_entry_begin, *shadow_cpu_entry_end; memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt)); @@ -360,16 +383,10 @@ void __init kasan_init(void) map_range(&pfn_mapped[i]); } - shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE; - shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); - shadow_cpu_entry_begin = (void *)round_down( - (unsigned long)shadow_cpu_entry_begin, PAGE_SIZE); - - shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE + - CPU_ENTRY_AREA_MAP_SIZE); - shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); - shadow_cpu_entry_end = (void *)round_up( - (unsigned long)shadow_cpu_entry_end, PAGE_SIZE); + shadow_cea_begin = kasan_mem_to_shadow_align_down(CPU_ENTRY_AREA_BASE); + shadow_cea_per_cpu_begin = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_PER_CPU); + shadow_cea_end = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_BASE + + CPU_ENTRY_AREA_MAP_SIZE); kasan_populate_early_shadow( kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), @@ -391,12 +408,18 @@ void __init kasan_init(void) kasan_populate_early_shadow( kasan_mem_to_shadow((void *)VMALLOC_END + 1), - shadow_cpu_entry_begin); + (void *)shadow_cea_begin); - kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, - (unsigned long)shadow_cpu_entry_end, 0); + /* + * Populate the shadow for the shared portion of the CPU entry area. + * Shadows for the per-CPU areas are mapped on-demand, as each CPU's + * area is randomly placed somewhere in the 512GiB range and mapping + * the entire 512GiB range is prohibitively expensive. + */ + kasan_populate_shadow(shadow_cea_begin, + shadow_cea_per_cpu_begin, 0); - kasan_populate_early_shadow(shadow_cpu_entry_end, + kasan_populate_early_shadow((void *)shadow_cea_end, kasan_mem_to_shadow((void *)__START_KERNEL_map)); kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index d3efbc5b3449..9f82019179e1 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -62,7 +62,13 @@ struct kmmio_context { int active; }; -static DEFINE_SPINLOCK(kmmio_lock); +/* + * The kmmio_lock is taken in int3 context, which is treated as NMI context. + * This causes lockdep to complain about it bein in both NMI and normal + * context. 
Hide it from lockdep, as it should not have any other locks + * taken under it, and this is only enabled for debugging mmio anyway. + */ +static arch_spinlock_t kmmio_lock = __ARCH_SPIN_LOCK_UNLOCKED; /* Protected by kmmio_lock */ unsigned int kmmio_count; @@ -240,15 +246,14 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) page_base &= page_level_mask(l); /* - * Preemption is now disabled to prevent process switch during - * single stepping. We can only handle one active kmmio trace + * Hold the RCU read lock over single stepping to avoid looking + * up the probe and kmmio_fault_page again. The rcu_read_lock_sched() + * also disables preemption and prevents process switch during + * the single stepping. We can only handle one active kmmio trace * per cpu, so ensure that we finish it before something else - * gets to run. We also hold the RCU read lock over single - * stepping to avoid looking up the probe and kmmio_fault_page - * again. + * gets to run. */ - preempt_disable(); - rcu_read_lock(); + rcu_read_lock_sched_notrace(); faultpage = get_kmmio_fault_page(page_base); if (!faultpage) { @@ -317,8 +322,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) return 1; /* fault handled */ no_kmmio: - rcu_read_unlock(); - preempt_enable_no_resched(); + rcu_read_unlock_sched_notrace(); return ret; } @@ -346,10 +350,10 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) ctx->probe->post_handler(ctx->probe, condition, regs); /* Prevent racing against release_kmmio_fault_page(). */ - spin_lock(&kmmio_lock); + arch_spin_lock(&kmmio_lock); if (ctx->fpage->count) arm_kmmio_fault_page(ctx->fpage); - spin_unlock(&kmmio_lock); + arch_spin_unlock(&kmmio_lock); regs->flags &= ~X86_EFLAGS_TF; regs->flags |= ctx->saved_flags; @@ -357,8 +361,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) /* These were acquired in kmmio_handler(). */ ctx->active--; BUG_ON(ctx->active); - rcu_read_unlock(); - preempt_enable_no_resched(); + rcu_read_unlock_sched_notrace(); /* * if somebody else is singlestepping across a probe point, flags @@ -440,7 +443,8 @@ int register_kmmio_probe(struct kmmio_probe *p) unsigned int l; pte_t *pte; - spin_lock_irqsave(&kmmio_lock, flags); + local_irq_save(flags); + arch_spin_lock(&kmmio_lock); if (get_kmmio_probe(addr)) { ret = -EEXIST; goto out; @@ -460,7 +464,9 @@ int register_kmmio_probe(struct kmmio_probe *p) size += page_level_size(l); } out: - spin_unlock_irqrestore(&kmmio_lock, flags); + arch_spin_unlock(&kmmio_lock); + local_irq_restore(flags); + /* * XXX: What should I do here? * Here was a call to global_flush_tlb(), but it does not exist @@ -494,7 +500,8 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) struct kmmio_fault_page **prevp = &dr->release_list; unsigned long flags; - spin_lock_irqsave(&kmmio_lock, flags); + local_irq_save(flags); + arch_spin_lock(&kmmio_lock); while (f) { if (!f->count) { list_del_rcu(&f->list); @@ -506,7 +513,8 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) } f = *prevp; } - spin_unlock_irqrestore(&kmmio_lock, flags); + arch_spin_unlock(&kmmio_lock); + local_irq_restore(flags); /* This is the real RCU destroy call. 
*/ call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); @@ -540,14 +548,16 @@ void unregister_kmmio_probe(struct kmmio_probe *p) if (!pte) return; - spin_lock_irqsave(&kmmio_lock, flags); + local_irq_save(flags); + arch_spin_lock(&kmmio_lock); while (size < size_lim) { release_kmmio_fault_page(addr + size, &release_list); size += page_level_size(l); } list_del_rcu(&p->list); kmmio_count--; - spin_unlock_irqrestore(&kmmio_lock, flags); + arch_spin_unlock(&kmmio_lock); + local_irq_restore(flags); if (!release_list) return; diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index 9de3d900bc92..e25288ee33c2 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -26,7 +26,7 @@ SYM_FUNC_START(sme_encrypt_execute) * RCX - virtual address of the encryption workarea, including: * - stack page (PAGE_SIZE) * - encryption routine page (PAGE_SIZE) - * - intermediate copy buffer (PMD_PAGE_SIZE) + * - intermediate copy buffer (PMD_SIZE) * R8 - physical address of the pagetables to use for encryption */ @@ -123,7 +123,7 @@ SYM_FUNC_START(__enc_copy) wbinvd /* Invalidate any cache entries */ /* Copy/encrypt up to 2MB at a time */ - movq $PMD_PAGE_SIZE, %r12 + movq $PMD_SIZE, %r12 1: cmpq %r12, %r9 jnb 2f diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index f415498d3175..88cccd65029d 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -93,7 +93,7 @@ struct sme_populate_pgd_data { * section is 2MB aligned to allow for simple pagetable setup using only * PMD entries (see vmlinux.lds.S). */ -static char sme_workarea[2 * PMD_PAGE_SIZE] __section(".init.scratch"); +static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch"); static char sme_cmdline_arg[] __initdata = "mem_encrypt"; static char sme_cmdline_on[] __initdata = "on"; @@ -198,8 +198,8 @@ static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) while (ppd->vaddr < ppd->vaddr_end) { sme_populate_pgd_large(ppd); - ppd->vaddr += PMD_PAGE_SIZE; - ppd->paddr += PMD_PAGE_SIZE; + ppd->vaddr += PMD_SIZE; + ppd->paddr += PMD_SIZE; } } @@ -225,11 +225,11 @@ static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, vaddr_end = ppd->vaddr_end; /* If start is not 2MB aligned, create PTE entries */ - ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE); + ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_SIZE); __sme_map_range_pte(ppd); /* Create PMD entries */ - ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK; + ppd->vaddr_end = vaddr_end & PMD_MASK; __sme_map_range_pmd(ppd); /* If end is not 2MB aligned, create PTE entries */ @@ -325,7 +325,7 @@ void __init sme_encrypt_kernel(struct boot_params *bp) /* Physical addresses gives us the identity mapped virtual addresses */ kernel_start = __pa_symbol(_text); - kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); + kernel_end = ALIGN(__pa_symbol(_end), PMD_SIZE); kernel_len = kernel_end - kernel_start; initrd_start = 0; @@ -355,12 +355,12 @@ void __init sme_encrypt_kernel(struct boot_params *bp) * executable encryption area size: * stack page (PAGE_SIZE) * encryption routine page (PAGE_SIZE) - * intermediate copy buffer (PMD_PAGE_SIZE) + * intermediate copy buffer (PMD_SIZE) * pagetable structures for the encryption of the kernel * pagetable structures for workarea (in case not currently mapped) */ execute_start = workarea_start; - execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE; + execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE; execute_len = 
execute_end - execute_start; /* @@ -383,7 +383,7 @@ void __init sme_encrypt_kernel(struct boot_params *bp) * before it is mapped. */ workarea_len = execute_len + pgtable_area_len; - workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE); + workarea_end = ALIGN(workarea_start + workarea_len, PMD_SIZE); /* * Set the address to the start of where newly created pagetable diff --git a/arch/x86/mm/pat/cpa-test.c b/arch/x86/mm/pat/cpa-test.c index 423b21e80929..3d2f7f0a6ed1 100644 --- a/arch/x86/mm/pat/cpa-test.c +++ b/arch/x86/mm/pat/cpa-test.c @@ -136,10 +136,10 @@ static int pageattr_test(void) failed += print_split(&sa); for (i = 0; i < NTEST; i++) { - unsigned long pfn = prandom_u32_max(max_pfn_mapped); + unsigned long pfn = get_random_u32_below(max_pfn_mapped); addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT); - len[i] = prandom_u32_max(NPAGES); + len[i] = get_random_u32_below(NPAGES); len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1); if (len[i] == 0) diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 66a209f7eb86..46de9cf5c91d 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -43,6 +43,7 @@ #include <linux/rbtree.h> #include <asm/cacheflush.h> +#include <asm/cacheinfo.h> #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/x86_init.h> @@ -60,41 +61,34 @@ #undef pr_fmt #define pr_fmt(fmt) "" fmt -static bool __read_mostly pat_bp_initialized; static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT); -static bool __initdata pat_force_disabled = !IS_ENABLED(CONFIG_X86_PAT); -static bool __read_mostly pat_bp_enabled; -static bool __read_mostly pat_cm_initialized; +static u64 __ro_after_init pat_msr_val; /* * PAT support is enabled by default, but can be disabled for * various user-requested or hardware-forced reasons: */ -void pat_disable(const char *msg_reason) +static void __init pat_disable(const char *msg_reason) { if (pat_disabled) return; - if (pat_bp_initialized) { - WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n"); - return; - } - pat_disabled = true; pr_info("x86/PAT: %s\n", msg_reason); + + memory_caching_control &= ~CACHE_PAT; } static int __init nopat(char *str) { pat_disable("PAT support disabled via boot option."); - pat_force_disabled = true; return 0; } early_param("nopat", nopat); bool pat_enabled(void) { - return pat_bp_enabled; + return !pat_disabled; } EXPORT_SYMBOL_GPL(pat_enabled); @@ -192,7 +186,8 @@ enum { #define CM(c) (_PAGE_CACHE_MODE_ ## c) -static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) +static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val, + char *msg) { enum page_cache_mode cache; char *cache_mode; @@ -219,14 +214,12 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) * configuration. * Using lower indices is preferred, so we start with highest index. 
*/ -static void __init_cache_modes(u64 pat) +static void __init init_cache_modes(u64 pat) { enum page_cache_mode cache; char pat_msg[33]; int i; - WARN_ON_ONCE(pat_cm_initialized); - pat_msg[32] = 0; for (i = 7; i >= 0; i--) { cache = pat_get_cache_mode((pat >> (i * 8)) & 7, @@ -234,34 +227,9 @@ static void __init_cache_modes(u64 pat) update_cache_mode_entry(i, cache); } pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg); - - pat_cm_initialized = true; } -#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) - -static void pat_bp_init(u64 pat) -{ - u64 tmp_pat; - - if (!boot_cpu_has(X86_FEATURE_PAT)) { - pat_disable("PAT not supported by the CPU."); - return; - } - - rdmsrl(MSR_IA32_CR_PAT, tmp_pat); - if (!tmp_pat) { - pat_disable("PAT support disabled by the firmware."); - return; - } - - wrmsrl(MSR_IA32_CR_PAT, pat); - pat_bp_enabled = true; - - __init_cache_modes(pat); -} - -static void pat_ap_init(u64 pat) +void pat_cpu_init(void) { if (!boot_cpu_has(X86_FEATURE_PAT)) { /* @@ -271,30 +239,39 @@ static void pat_ap_init(u64 pat) panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n"); } - wrmsrl(MSR_IA32_CR_PAT, pat); + wrmsrl(MSR_IA32_CR_PAT, pat_msr_val); } -void __init init_cache_modes(void) +/** + * pat_bp_init - Initialize the PAT MSR value and PAT table + * + * This function initializes PAT MSR value and PAT table with an OS-defined + * value to enable additional cache attributes, WC, WT and WP. + * + * This function prepares the calls of pat_cpu_init() via cache_cpu_init() + * on all CPUs. + */ +void __init pat_bp_init(void) { - u64 pat = 0; + struct cpuinfo_x86 *c = &boot_cpu_data; +#define PAT(p0, p1, p2, p3, p4, p5, p6, p7) \ + (((u64)PAT_ ## p0) | ((u64)PAT_ ## p1 << 8) | \ + ((u64)PAT_ ## p2 << 16) | ((u64)PAT_ ## p3 << 24) | \ + ((u64)PAT_ ## p4 << 32) | ((u64)PAT_ ## p5 << 40) | \ + ((u64)PAT_ ## p6 << 48) | ((u64)PAT_ ## p7 << 56)) - if (pat_cm_initialized) - return; - if (boot_cpu_has(X86_FEATURE_PAT)) { - /* - * CPU supports PAT. Set PAT table to be consistent with - * PAT MSR. This case supports "nopat" boot option, and - * virtual machine environments which support PAT without - * MTRRs. In specific, Xen has unique setup to PAT MSR. - * - * If PAT MSR returns 0, it is considered invalid and emulates - * as No PAT. - */ - rdmsrl(MSR_IA32_CR_PAT, pat); - } + if (!IS_ENABLED(CONFIG_X86_PAT)) + pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n"); + + if (!cpu_feature_enabled(X86_FEATURE_PAT)) + pat_disable("PAT not supported by the CPU."); + else + rdmsrl(MSR_IA32_CR_PAT, pat_msr_val); + + if (!pat_msr_val) { + pat_disable("PAT support disabled by the firmware."); - if (!pat) { /* * No PAT. Emulate the PAT table that corresponds to the two * cache bits, PWT (Write Through) and PCD (Cache Disable). @@ -313,40 +290,22 @@ void __init init_cache_modes(void) * NOTE: When WC or WP is used, it is redirected to UC- per * the default setup in __cachemode2pte_tbl[]. */ - pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) | - PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC); - } else if (!pat_force_disabled && cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) { - /* - * Clearly PAT is enabled underneath. Allow pat_enabled() to - * reflect this. 
- */ - pat_bp_enabled = true; + pat_msr_val = PAT(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC); } - __init_cache_modes(pat); -} - -/** - * pat_init - Initialize the PAT MSR and PAT table on the current CPU - * - * This function initializes PAT MSR and PAT table with an OS-defined value - * to enable additional cache attributes, WC, WT and WP. - * - * This function must be called on all CPUs using the specific sequence of - * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this - * procedure for PAT. - */ -void pat_init(void) -{ - u64 pat; - struct cpuinfo_x86 *c = &boot_cpu_data; - -#ifndef CONFIG_X86_PAT - pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n"); -#endif - - if (pat_disabled) + /* + * Xen PV doesn't allow to set PAT MSR, but all cache modes are + * supported. + * When running as TDX guest setting the PAT MSR won't work either + * due to the requirement to set CR0.CD when doing so. Rely on + * firmware to have set the PAT MSR correctly. + */ + if (pat_disabled || + cpu_feature_enabled(X86_FEATURE_XENPV) || + cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + init_cache_modes(pat_msr_val); return; + } if ((c->x86_vendor == X86_VENDOR_INTEL) && (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || @@ -371,8 +330,7 @@ void pat_init(void) * NOTE: When WT or WP is used, it is redirected to UC- per * the default setup in __cachemode2pte_tbl[]. */ - pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | - PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); + pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC); } else { /* * Full PAT support. We put WT in slot 7 to improve @@ -400,19 +358,14 @@ void pat_init(void) * The reserved slots are unused, but mapped to their * corresponding types in the presence of PAT errata. */ - pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | - PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT); + pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT); } - if (!pat_bp_initialized) { - pat_bp_init(pat); - pat_bp_initialized = true; - } else { - pat_ap_init(pat); - } -} + memory_caching_control |= CACHE_PAT; + init_cache_modes(pat_msr_val); #undef PAT +} static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */ diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 2e5a045731de..356758b7d4b4 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -20,6 +20,7 @@ #include <linux/kernel.h> #include <linux/cc_platform.h> #include <linux/set_memory.h> +#include <linux/memregion.h> #include <asm/e820/api.h> #include <asm/processor.h> @@ -219,6 +220,23 @@ within_inclusive(unsigned long addr, unsigned long start, unsigned long end) #ifdef CONFIG_X86_64 +/* + * The kernel image is mapped into two places in the virtual address space + * (addresses without KASLR, of course): + * + * 1. The kernel direct map (0xffff880000000000) + * 2. The "high kernel map" (0xffffffff81000000) + * + * We actually execute out of #2. If we get the address of a kernel symbol, it + * points to #2, but almost all physical-to-virtual translations point to #1. + * + * This is so that we can have both a directmap of all physical memory *and* + * take full advantage of the the limited (s32) immediate addressing range (2G) + * of x86_64. + * + * See Documentation/x86/x86_64/mm.rst for more detail. 
+ */ + static inline unsigned long highmap_start_pfn(void) { return __pa_symbol(_text) >> PAGE_SHIFT; @@ -330,6 +348,23 @@ void arch_invalidate_pmem(void *addr, size_t size) EXPORT_SYMBOL_GPL(arch_invalidate_pmem); #endif +#ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION +bool cpu_cache_has_invalidate_memregion(void) +{ + return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR); +} +EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, DEVMEM); + +int cpu_cache_invalidate_memregion(int res_desc) +{ + if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion())) + return -ENXIO; + wbinvd_on_all_cpus(); + return 0; +} +EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, DEVMEM); +#endif + static void __cpa_flush_all(void *arg) { unsigned long cache = (unsigned long)arg; @@ -587,10 +622,6 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star { unsigned long end; - /* Kernel text is rw at boot up */ - if (system_state == SYSTEM_BOOTING) - return new; - /* * 32-bit has some unfixable W+X issues, like EFI code * and writeable data being in the same page. Disable @@ -747,11 +778,11 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr) switch (level) { case PG_LEVEL_1G: phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; - offset = virt_addr & ~PUD_PAGE_MASK; + offset = virt_addr & ~PUD_MASK; break; case PG_LEVEL_2M: phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; - offset = virt_addr & ~PMD_PAGE_MASK; + offset = virt_addr & ~PMD_MASK; break; default: phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; @@ -1041,7 +1072,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, case PG_LEVEL_1G: ref_prot = pud_pgprot(*(pud_t *)kpte); ref_pfn = pud_pfn(*(pud_t *)kpte); - pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; + pfninc = PMD_SIZE >> PAGE_SHIFT; lpaddr = address & PUD_MASK; lpinc = PMD_SIZE; /* @@ -1628,8 +1659,11 @@ repeat: return err; } -static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); +static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary); +/* + * Check the directmap and "high kernel map" 'aliases'. + */ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; @@ -1653,6 +1687,12 @@ static int cpa_process_alias(struct cpa_data *cpa) alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; + /* Directmap always has NX set, do not modify. */ + if (__supported_pte_mask & _PAGE_NX) { + alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; + alias_cpa.mask_set.pgprot &= ~_PAGE_NX; + } + cpa->force_flush_all = 1; ret = __change_page_attr_set_clr(&alias_cpa, 0); @@ -1675,6 +1715,15 @@ static int cpa_process_alias(struct cpa_data *cpa) alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; + /* + * [_text, _brk_end) also covers data, do not modify NX except + * in cases where the highmap is the primary target. + */ + if (__supported_pte_mask & _PAGE_NX) { + alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; + alias_cpa.mask_set.pgprot &= ~_PAGE_NX; + } + cpa->force_flush_all = 1; /* * The high mapping range is imprecise, so ignore the @@ -1687,12 +1736,19 @@ static int cpa_process_alias(struct cpa_data *cpa) return 0; } -static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) +static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary) { unsigned long numpages = cpa->numpages; unsigned long rempages = numpages; int ret = 0; + /* + * No changes, easy! 
+ */ + if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) && + !cpa->force_split) + return ret; + while (rempages) { /* * Store the remaining nr of pages for the large page @@ -1705,13 +1761,13 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); - ret = __change_page_attr(cpa, checkalias); + ret = __change_page_attr(cpa, primary); if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); if (ret) goto out; - if (checkalias) { + if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) { ret = cpa_process_alias(cpa); if (ret) goto out; @@ -1739,7 +1795,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, struct page **pages) { struct cpa_data cpa; - int ret, cache, checkalias; + int ret, cache; memset(&cpa, 0, sizeof(cpa)); @@ -1785,20 +1841,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, cpa.numpages = numpages; cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; - cpa.flags = 0; + cpa.flags = in_flag; cpa.curpage = 0; cpa.force_split = force_split; - if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY)) - cpa.flags |= in_flag; - - /* No alias checking for _NX bit modifications */ - checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; - /* Has caller explicitly disabled alias checking? */ - if (in_flag & CPA_NO_CHECK_ALIAS) - checkalias = 0; - - ret = __change_page_attr_set_clr(&cpa, checkalias); + ret = __change_page_attr_set_clr(&cpa, 1); /* * Check whether we really changed something: @@ -2029,6 +2076,16 @@ int set_memory_ro(unsigned long addr, int numpages) return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); } +int set_memory_rox(unsigned long addr, int numpages) +{ + pgprot_t clr = __pgprot(_PAGE_RW); + + if (__supported_pte_mask & _PAGE_NX) + clr.pgprot |= _PAGE_NX; + + return change_page_attr_clear(&addr, numpages, clr, 0); +} + int set_memory_rw(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); @@ -2041,11 +2098,9 @@ int set_memory_np(unsigned long addr, int numpages) int set_memory_np_noalias(unsigned long addr, int numpages) { - int cpa_flags = CPA_NO_CHECK_ALIAS; - return change_page_attr_set_clr(&addr, numpages, __pgprot(0), __pgprot(_PAGE_PRESENT), 0, - cpa_flags, NULL); + CPA_NO_CHECK_ALIAS, NULL); } int set_memory_4k(unsigned long addr, int numpages) @@ -2262,7 +2317,7 @@ static int __set_pages_p(struct page *page, int numpages) .numpages = numpages, .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), .mask_clr = __pgprot(0), - .flags = 0}; + .flags = CPA_NO_CHECK_ALIAS }; /* * No alias checking needed for setting present flag. otherwise, @@ -2270,7 +2325,7 @@ static int __set_pages_p(struct page *page, int numpages) * mappings (this adds to complexity if we want to do this from * atomic context especially). Let's keep it simple! */ - return __change_page_attr_set_clr(&cpa, 0); + return __change_page_attr_set_clr(&cpa, 1); } static int __set_pages_np(struct page *page, int numpages) @@ -2281,7 +2336,7 @@ static int __set_pages_np(struct page *page, int numpages) .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), - .flags = 0}; + .flags = CPA_NO_CHECK_ALIAS }; /* * No alias checking needed for setting not present flag. otherwise, @@ -2289,7 +2344,7 @@ static int __set_pages_np(struct page *page, int numpages) * mappings (this adds to complexity if we want to do this from * atomic context especially). Let's keep it simple! 
*/ - return __change_page_attr_set_clr(&cpa, 0); + return __change_page_attr_set_clr(&cpa, 1); } int set_direct_map_invalid_noflush(struct page *page) @@ -2360,7 +2415,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)), - .flags = 0, + .flags = CPA_NO_CHECK_ALIAS, }; WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); @@ -2373,7 +2428,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); - retval = __change_page_attr_set_clr(&cpa, 0); + retval = __change_page_attr_set_clr(&cpa, 1); __flush_tlb_all(); out: @@ -2403,12 +2458,12 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), - .flags = 0, + .flags = CPA_NO_CHECK_ALIAS, }; WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); - retval = __change_page_attr_set_clr(&cpa, 0); + retval = __change_page_attr_set_clr(&cpa, 1); __flush_tlb_all(); return retval; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8525f2876fb4..e4f499eb0f29 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -299,9 +299,6 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) pud_t *pud; int i; - if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ - return; - p4d = p4d_offset(pgd, 0); pud = pud_offset(p4d, 0); @@ -434,10 +431,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm) mm->pgd = pgd; - if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) + if (sizeof(pmds) != 0 && + preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) goto out_free_pgd; - if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) + if (sizeof(u_pmds) != 0 && + preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) goto out_free_pmds; if (paravirt_pgd_alloc(mm) != 0) @@ -451,17 +450,22 @@ pgd_t *pgd_alloc(struct mm_struct *mm) spin_lock(&pgd_lock); pgd_ctor(mm, pgd); - pgd_prepopulate_pmd(mm, pgd, pmds); - pgd_prepopulate_user_pmd(mm, pgd, u_pmds); + if (sizeof(pmds) != 0) + pgd_prepopulate_pmd(mm, pgd, pmds); + + if (sizeof(u_pmds) != 0) + pgd_prepopulate_user_pmd(mm, pgd, u_pmds); spin_unlock(&pgd_lock); return pgd; out_free_user_pmds: - free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); + if (sizeof(u_pmds) != 0) + free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); out_free_pmds: - free_pmds(mm, pmds, PREALLOCATED_PMDS); + if (sizeof(pmds) != 0) + free_pmds(mm, pmds, PREALLOCATED_PMDS); out_free_pgd: _pgd_free(pgd); out: diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index ffe3b3a087fe..78414c6d1b5e 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -592,7 +592,7 @@ static void pti_set_kernel_image_nonglobal(void) * of the image. */ unsigned long start = PFN_ALIGN(_text); - unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE); + unsigned long end = ALIGN((unsigned long)_end, PMD_SIZE); /* * This clears _PAGE_GLOBAL from the entire kernel image. 
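Editorial sketch, not part of the patch content above or below: the set_memory.c hunks earlier in this diff add a set_memory_rox() helper and rework the change_page_attr() alias handling so that NX modifications are never propagated to the direct-map and high-kernel-map aliases. Under those assumptions, the fragment below shows how a hypothetical caller might use the new helper to make a freshly written, page-aligned code buffer read-only and executable in one pass. seal_generated_code() is invented for this illustration; set_memory_rox(), PAGE_ALIGN() and PAGE_SHIFT are existing kernel interfaces.

#include <linux/mm.h>
#include <linux/set_memory.h>

/* Hypothetical example caller, not taken from this patch.  Assumes 'buf'
 * is page-aligned (e.g. allocated with vmalloc()). */
static int seal_generated_code(void *buf, size_t size)
{
	int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;

	/*
	 * Clear _PAGE_RW and, when NX is supported, _PAGE_NX in a single
	 * CPA operation.  With the reworked cpa_process_alias() shown
	 * above, the RW change is mirrored into the direct-map alias of
	 * 'buf', while the NX change deliberately is not.
	 */
	return set_memory_rox((unsigned long)buf, nr_pages);
}

As the hunks suggest, the helper is intended to let such callers replace back-to-back set_memory_ro() + set_memory_x() calls with a single CPA pass. End of editorial sketch; the diff resumes below.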
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 00127abd89ee..b808be77635e 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -11,8 +11,8 @@ #include <linux/bpf.h> #include <linux/memory.h> #include <linux/sort.h> -#include <linux/init.h> #include <asm/extable.h> +#include <asm/ftrace.h> #include <asm/set_memory.h> #include <asm/nospec-branch.h> #include <asm/text-patching.h> @@ -341,6 +341,13 @@ static int emit_call(u8 **pprog, void *func, void *ip) return emit_patch(pprog, func, ip, 0xE8); } +static int emit_rsb_call(u8 **pprog, void *func, void *ip) +{ + OPTIMIZER_HIDE_VAR(func); + x86_call_depth_emit_accounting(pprog, func); + return emit_patch(pprog, func, ip, 0xE8); +} + static int emit_jump(u8 **pprog, void *func, void *ip) { return emit_patch(pprog, func, ip, 0xE9); @@ -389,18 +396,6 @@ out: return ret; } -int __init bpf_arch_init_dispatcher_early(void *ip) -{ - const u8 *nop_insn = x86_nops[5]; - - if (is_endbr(*(u32 *)ip)) - ip += ENDBR_INSN_SIZE; - - if (memcmp(ip, nop_insn, X86_PATCH_SIZE)) - text_poke_early(ip, nop_insn, X86_PATCH_SIZE); - return 0; -} - int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *old_addr, void *new_addr) { @@ -430,7 +425,10 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) EMIT2(0xFF, 0xE0 + reg); } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { OPTIMIZER_HIDE_VAR(reg); - emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); + if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) + emit_jump(&prog, &__x86_indirect_jump_thunk_array[reg], ip); + else + emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); } else { EMIT2(0xFF, 0xE0 + reg); /* jmp *%\reg */ if (IS_ENABLED(CONFIG_RETPOLINE) || IS_ENABLED(CONFIG_SLS)) @@ -445,7 +443,7 @@ static void emit_return(u8 **pprog, u8 *ip) u8 *prog = *pprog; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { - emit_jump(&prog, &__x86_return_thunk, ip); + emit_jump(&prog, x86_return_thunk, ip); } else { EMIT1(0xC3); /* ret */ if (IS_ENABLED(CONFIG_SLS)) @@ -904,6 +902,65 @@ static void emit_nops(u8 **pprog, int len) *pprog = prog; } +/* emit the 3-byte VEX prefix + * + * r: same as rex.r, extra bit for ModRM reg field + * x: same as rex.x, extra bit for SIB index field + * b: same as rex.b, extra bit for ModRM r/m, or SIB base + * m: opcode map select, encoding escape bytes e.g. 
0x0f38 + * w: same as rex.w (32 bit or 64 bit) or opcode specific + * src_reg2: additional source reg (encoded as BPF reg) + * l: vector length (128 bit or 256 bit) or reserved + * pp: opcode prefix (none, 0x66, 0xf2 or 0xf3) + */ +static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m, + bool w, u8 src_reg2, bool l, u8 pp) +{ + u8 *prog = *pprog; + const u8 b0 = 0xc4; /* first byte of 3-byte VEX prefix */ + u8 b1, b2; + u8 vvvv = reg2hex[src_reg2]; + + /* reg2hex gives only the lower 3 bit of vvvv */ + if (is_ereg(src_reg2)) + vvvv |= 1 << 3; + + /* + * 2nd byte of 3-byte VEX prefix + * ~ means bit inverted encoding + * + * 7 0 + * +---+---+---+---+---+---+---+---+ + * |~R |~X |~B | m | + * +---+---+---+---+---+---+---+---+ + */ + b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f); + /* + * 3rd byte of 3-byte VEX prefix + * + * 7 0 + * +---+---+---+---+---+---+---+---+ + * | W | ~vvvv | L | pp | + * +---+---+---+---+---+---+---+---+ + */ + b2 = (w << 7) | ((~vvvv & 0xf) << 3) | (l << 2) | (pp & 3); + + EMIT3(b0, b1, b2); + *pprog = prog; +} + +/* emit BMI2 shift instruction */ +static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op) +{ + u8 *prog = *pprog; + bool r = is_ereg(dst_reg); + u8 m = 2; /* escape code 0f38 */ + + emit_3vex(&prog, r, false, r, m, is64, src_reg, false, op); + EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg)); + *pprog = prog; +} + #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, @@ -1150,17 +1207,38 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image case BPF_ALU64 | BPF_LSH | BPF_X: case BPF_ALU64 | BPF_RSH | BPF_X: case BPF_ALU64 | BPF_ARSH | BPF_X: + /* BMI2 shifts aren't better when shift count is already in rcx */ + if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) { + /* shrx/sarx/shlx dst_reg, dst_reg, src_reg */ + bool w = (BPF_CLASS(insn->code) == BPF_ALU64); + u8 op; + + switch (BPF_OP(insn->code)) { + case BPF_LSH: + op = 1; /* prefix 0x66 */ + break; + case BPF_RSH: + op = 3; /* prefix 0xf2 */ + break; + case BPF_ARSH: + op = 2; /* prefix 0xf3 */ + break; + } + + emit_shiftx(&prog, dst_reg, src_reg, w, op); - /* Check for bad case when dst_reg == rcx */ - if (dst_reg == BPF_REG_4) { - /* mov r11, dst_reg */ - EMIT_mov(AUX_REG, dst_reg); - dst_reg = AUX_REG; + break; } if (src_reg != BPF_REG_4) { /* common case */ - EMIT1(0x51); /* push rcx */ - + /* Check for bad case when dst_reg == rcx */ + if (dst_reg == BPF_REG_4) { + /* mov r11, dst_reg */ + EMIT_mov(AUX_REG, dst_reg); + dst_reg = AUX_REG; + } else { + EMIT1(0x51); /* push rcx */ + } /* mov rcx, src_reg */ EMIT_mov(BPF_REG_4, src_reg); } @@ -1172,12 +1250,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image b3 = simple_alu_opcodes[BPF_OP(insn->code)]; EMIT2(0xD3, add_1reg(b3, dst_reg)); - if (src_reg != BPF_REG_4) - EMIT1(0x59); /* pop rcx */ + if (src_reg != BPF_REG_4) { + if (insn->dst_reg == BPF_REG_4) + /* mov dst_reg, r11 */ + EMIT_mov(insn->dst_reg, AUX_REG); + else + EMIT1(0x59); /* pop rcx */ + } - if (insn->dst_reg == BPF_REG_4) - /* mov dst_reg, r11 */ - EMIT_mov(insn->dst_reg, AUX_REG); break; case BPF_ALU | BPF_END | BPF_FROM_BE: @@ -1239,8 +1319,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image /* speculation barrier */ case BPF_ST | BPF_NOSPEC: - if (boot_cpu_has(X86_FEATURE_XMM2)) - EMIT_LFENCE(); + EMIT_LFENCE(); break; /* ST: *(u8*)(dst_reg + 
off) = imm */ @@ -1446,19 +1525,26 @@ st: if (is_imm8(insn->off)) break; /* call */ - case BPF_JMP | BPF_CALL: + case BPF_JMP | BPF_CALL: { + int offs; + func = (u8 *) __bpf_call_base + imm32; if (tail_call_reachable) { /* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */ EMIT3_off32(0x48, 0x8B, 0x85, -round_up(bpf_prog->aux->stack_depth, 8) - 8); - if (!imm32 || emit_call(&prog, func, image + addrs[i - 1] + 7)) + if (!imm32) return -EINVAL; + offs = 7 + x86_call_depth_emit_accounting(&prog, func); } else { - if (!imm32 || emit_call(&prog, func, image + addrs[i - 1])) + if (!imm32) return -EINVAL; + offs = x86_call_depth_emit_accounting(&prog, func); } + if (emit_call(&prog, func, image + addrs[i - 1] + offs)) + return -EINVAL; break; + } case BPF_JMP | BPF_TAIL_CALL: if (imm32) @@ -1826,10 +1912,6 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_link *l, int stack_size, int run_ctx_off, bool save_ret) { - void (*exit)(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_exit; - u64 (*enter)(struct bpf_prog *prog, - struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_enter; u8 *prog = *pprog; u8 *jmp_insn; int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); @@ -1848,23 +1930,12 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1, -run_ctx_off + ctx_cookie_off); - if (p->aux->sleepable) { - enter = __bpf_prog_enter_sleepable; - exit = __bpf_prog_exit_sleepable; - } else if (p->type == BPF_PROG_TYPE_STRUCT_OPS) { - enter = __bpf_prog_enter_struct_ops; - exit = __bpf_prog_exit_struct_ops; - } else if (p->expected_attach_type == BPF_LSM_CGROUP) { - enter = __bpf_prog_enter_lsm_cgroup; - exit = __bpf_prog_exit_lsm_cgroup; - } - /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); /* arg2: lea rsi, [rbp - ctx_cookie_off] */ EMIT4(0x48, 0x8D, 0x75, -run_ctx_off); - if (emit_call(&prog, enter, prog)) + if (emit_rsb_call(&prog, bpf_trampoline_enter(p), prog)) return -EINVAL; /* remember prog start time returned by __bpf_prog_enter */ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); @@ -1885,7 +1956,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, (long) p->insnsi >> 32, (u32) (long) p->insnsi); /* call JITed bpf program or interpreter */ - if (emit_call(&prog, p->bpf_func, prog)) + if (emit_rsb_call(&prog, p->bpf_func, prog)) return -EINVAL; /* @@ -1909,7 +1980,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); /* arg3: lea rdx, [rbp - run_ctx_off] */ EMIT4(0x48, 0x8D, 0x55, -run_ctx_off); - if (emit_call(&prog, exit, prog)) + if (emit_rsb_call(&prog, bpf_trampoline_exit(p), prog)) return -EINVAL; *pprog = prog; @@ -2131,6 +2202,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i prog = image; EMIT_ENDBR(); + /* + * This is the direct-call trampoline, as such it needs accounting + * for the __fentry__ call. 
+ */ + x86_call_depth_emit_accounting(&prog, NULL); EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */ @@ -2157,7 +2233,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i if (flags & BPF_TRAMP_F_CALL_ORIG) { /* arg1: mov rdi, im */ emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); - if (emit_call(&prog, __bpf_tramp_enter, prog)) { + if (emit_rsb_call(&prog, __bpf_tramp_enter, prog)) { ret = -EINVAL; goto cleanup; } @@ -2189,7 +2265,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i EMIT2(0xff, 0xd0); /* call *rax */ } else { /* call original function */ - if (emit_call(&prog, orig_call, prog)) { + if (emit_rsb_call(&prog, orig_call, prog)) { ret = -EINVAL; goto cleanup; } @@ -2233,7 +2309,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i im->ip_epilogue = prog; /* arg1: mov rdi, im */ emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); - if (emit_call(&prog, __bpf_tramp_exit, prog)) { + if (emit_rsb_call(&prog, __bpf_tramp_exit, prog)) { ret = -EINVAL; goto cleanup; } diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 2f82480fd430..ea2eb2ec90e2 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -1,4 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "PCI: " fmt + #include <linux/pci.h> #include <linux/acpi.h> #include <linux/init.h> @@ -37,15 +40,15 @@ static int __init set_nouse_crs(const struct dmi_system_id *id) static int __init set_ignore_seg(const struct dmi_system_id *id) { - printk(KERN_INFO "PCI: %s detected: ignoring ACPI _SEG\n", id->ident); + pr_info("%s detected: ignoring ACPI _SEG\n", id->ident); pci_ignore_seg = true; return 0; } static int __init set_no_e820(const struct dmi_system_id *id) { - printk(KERN_INFO "PCI: %s detected: not clipping E820 regions from _CRS\n", - id->ident); + pr_info("%s detected: not clipping E820 regions from _CRS\n", + id->ident); pci_use_e820 = false; return 0; } @@ -231,10 +234,9 @@ void __init pci_acpi_crs_quirks(void) else if (pci_probe & PCI_USE__CRS) pci_use_crs = true; - printk(KERN_INFO "PCI: %s host bridge windows from ACPI; " - "if necessary, use \"pci=%s\" and report a bug\n", - pci_use_crs ? "Using" : "Ignoring", - pci_use_crs ? "nocrs" : "use_crs"); + pr_info("%s host bridge windows from ACPI; if necessary, use \"pci=%s\" and report a bug\n", + pci_use_crs ? "Using" : "Ignoring", + pci_use_crs ? "nocrs" : "use_crs"); /* "pci=use_e820"/"pci=no_e820" on the kernel cmdline takes precedence */ if (pci_probe & PCI_NO_E820) @@ -242,19 +244,17 @@ void __init pci_acpi_crs_quirks(void) else if (pci_probe & PCI_USE_E820) pci_use_e820 = true; - printk(KERN_INFO "PCI: %s E820 reservations for host bridge windows\n", - pci_use_e820 ? "Using" : "Ignoring"); + pr_info("%s E820 reservations for host bridge windows\n", + pci_use_e820 ? 
"Using" : "Ignoring"); if (pci_probe & (PCI_NO_E820 | PCI_USE_E820)) - printk(KERN_INFO "PCI: Please notify linux-pci@vger.kernel.org so future kernels can this automatically\n"); + pr_info("Please notify linux-pci@vger.kernel.org so future kernels can do this automatically\n"); } #ifdef CONFIG_PCI_MMCONFIG static int check_segment(u16 seg, struct device *dev, char *estr) { if (seg) { - dev_err(dev, - "%s can't access PCI configuration " - "space under this host bridge.\n", + dev_err(dev, "%s can't access configuration space under this host bridge\n", estr); return -EIO; } @@ -264,9 +264,7 @@ static int check_segment(u16 seg, struct device *dev, char *estr) * just can't access extended configuration space of * devices under this host bridge. */ - dev_warn(dev, - "%s can't access extended PCI configuration " - "space under this bridge.\n", + dev_warn(dev, "%s can't access extended configuration space under this bridge\n", estr); return 0; @@ -421,9 +419,8 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) root->segment = domain = 0; if (domain && !pci_domains_supported) { - printk(KERN_WARNING "pci_bus %04x:%02x: " - "ignored (multiple domains not supported)\n", - domain, busnum); + pr_warn("pci_bus %04x:%02x: ignored (multiple domains not supported)\n", + domain, busnum); return NULL; } @@ -491,7 +488,7 @@ int __init pci_acpi_init(void) if (acpi_noirq) return -ENODEV; - printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n"); + pr_info("Using ACPI for IRQ routing\n"); acpi_irq_penalty_init(); pcibios_enable_irq = acpi_pci_irq_enable; pcibios_disable_irq = acpi_pci_irq_disable; @@ -503,7 +500,7 @@ int __init pci_acpi_init(void) * also do it here in case there are still broken drivers that * don't use pci_enable_device(). */ - printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n"); + pr_info("Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n"); for_each_pci_dev(dev) acpi_pci_irq_enable(dev); } diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile index a50245157685..543df9a1379d 100644 --- a/arch/x86/platform/efi/Makefile +++ b/arch/x86/platform/efi/Makefile @@ -2,5 +2,8 @@ KASAN_SANITIZE := n GCOV_PROFILE := n -obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o +obj-$(CONFIG_EFI) += memmap.o quirks.o efi.o efi_$(BITS).o \ + efi_stub_$(BITS).o obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o +obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_mem.o +obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index ebc98a68c400..55d9caf66401 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -214,9 +214,11 @@ int __init efi_memblock_x86_reserve_range(void) data.desc_size = e->efi_memdesc_size; data.desc_version = e->efi_memdesc_version; - rv = efi_memmap_init_early(&data); - if (rv) - return rv; + if (!efi_enabled(EFI_PARAVIRT)) { + rv = efi_memmap_init_early(&data); + if (rv) + return rv; + } if (add_efi_memmap || do_efi_soft_reserve()) do_add_efi_memmap(); @@ -303,6 +305,50 @@ static void __init efi_clean_memmap(void) } } +/* + * Firmware can use EfiMemoryMappedIO to request that MMIO regions be + * mapped by the OS so they can be accessed by EFI runtime services, but + * should have no other significance to the OS (UEFI r2.10, sec 7.2). 
+ * However, most bootloaders and EFI stubs convert EfiMemoryMappedIO + * regions to E820_TYPE_RESERVED entries, which prevent Linux from + * allocating space from them (see remove_e820_regions()). + * + * Some platforms use EfiMemoryMappedIO entries for PCI MMCONFIG space and + * PCI host bridge windows, which means Linux can't allocate BAR space for + * hot-added devices. + * + * Remove large EfiMemoryMappedIO regions from the E820 map to avoid this + * problem. + * + * Retain small EfiMemoryMappedIO regions because on some platforms, these + * describe non-window space that's included in host bridge _CRS. If we + * assign that space to PCI devices, they don't work. + */ +static void __init efi_remove_e820_mmio(void) +{ + efi_memory_desc_t *md; + u64 size, start, end; + int i = 0; + + for_each_efi_memory_desc(md) { + if (md->type == EFI_MEMORY_MAPPED_IO) { + size = md->num_pages << EFI_PAGE_SHIFT; + start = md->phys_addr; + end = start + size - 1; + if (size >= 256*1024) { + pr_info("Remove mem%02u: MMIO range=[0x%08llx-0x%08llx] (%lluMB) from e820 map\n", + i, start, end, size >> 20); + e820__range_remove(start, size, + E820_TYPE_RESERVED, 1); + } else { + pr_info("Not removing mem%02u: MMIO range=[0x%08llx-0x%08llx] (%lluKB) from e820 map\n", + i, start, end, size >> 10); + } + } + i++; + } +} + void __init efi_print_memmap(void) { efi_memory_desc_t *md; @@ -474,6 +520,8 @@ void __init efi_init(void) set_bit(EFI_RUNTIME_SERVICES, &efi.flags); efi_clean_memmap(); + efi_remove_e820_mmio(); + if (efi_enabled(EFI_DBG)) efi_print_memmap(); } diff --git a/arch/x86/platform/efi/fake_mem.c b/arch/x86/platform/efi/fake_mem.c new file mode 100644 index 000000000000..41d57cad3d84 --- /dev/null +++ b/arch/x86/platform/efi/fake_mem.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fake_mem.c + * + * Copyright (C) 2015 FUJITSU LIMITED + * Author: Taku Izumi <izumi.taku@jp.fujitsu.com> + * + * This code introduces new boot option named "efi_fake_mem" + * By specifying this parameter, you can add arbitrary attribute to + * specific memory range by updating original (firmware provided) EFI + * memmap. 
+ */ + +#include <linux/kernel.h> +#include <linux/efi.h> +#include <linux/init.h> +#include <linux/memblock.h> +#include <linux/types.h> +#include <linux/sort.h> +#include <asm/e820/api.h> +#include <asm/efi.h> + +#define EFI_MAX_FAKEMEM CONFIG_EFI_MAX_FAKE_MEM + +static struct efi_mem_range efi_fake_mems[EFI_MAX_FAKEMEM]; +static int nr_fake_mem; + +static int __init cmp_fake_mem(const void *x1, const void *x2) +{ + const struct efi_mem_range *m1 = x1; + const struct efi_mem_range *m2 = x2; + + if (m1->range.start < m2->range.start) + return -1; + if (m1->range.start > m2->range.start) + return 1; + return 0; +} + +static void __init efi_fake_range(struct efi_mem_range *efi_range) +{ + struct efi_memory_map_data data = { 0 }; + int new_nr_map = efi.memmap.nr_map; + efi_memory_desc_t *md; + void *new_memmap; + + /* count up the number of EFI memory descriptor */ + for_each_efi_memory_desc(md) + new_nr_map += efi_memmap_split_count(md, &efi_range->range); + + /* allocate memory for new EFI memmap */ + if (efi_memmap_alloc(new_nr_map, &data) != 0) + return; + + /* create new EFI memmap */ + new_memmap = early_memremap(data.phys_map, data.size); + if (!new_memmap) { + __efi_memmap_free(data.phys_map, data.size, data.flags); + return; + } + + efi_memmap_insert(&efi.memmap, new_memmap, efi_range); + + /* swap into new EFI memmap */ + early_memunmap(new_memmap, data.size); + + efi_memmap_install(&data); +} + +void __init efi_fake_memmap(void) +{ + int i; + + if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem) + return; + + for (i = 0; i < nr_fake_mem; i++) + efi_fake_range(&efi_fake_mems[i]); + + /* print new EFI memmap */ + efi_print_memmap(); +} + +static int __init setup_fake_mem(char *p) +{ + u64 start = 0, mem_size = 0, attribute = 0; + int i; + + if (!p) + return -EINVAL; + + while (*p != '\0') { + mem_size = memparse(p, &p); + if (*p == '@') + start = memparse(p+1, &p); + else + break; + + if (*p == ':') + attribute = simple_strtoull(p+1, &p, 0); + else + break; + + if (nr_fake_mem >= EFI_MAX_FAKEMEM) + break; + + efi_fake_mems[nr_fake_mem].range.start = start; + efi_fake_mems[nr_fake_mem].range.end = start + mem_size - 1; + efi_fake_mems[nr_fake_mem].attribute = attribute; + nr_fake_mem++; + + if (*p == ',') + p++; + } + + sort(efi_fake_mems, nr_fake_mem, sizeof(struct efi_mem_range), + cmp_fake_mem, NULL); + + for (i = 0; i < nr_fake_mem; i++) + pr_info("efi_fake_mem: add attr=0x%016llx to [mem 0x%016llx-0x%016llx]", + efi_fake_mems[i].attribute, efi_fake_mems[i].range.start, + efi_fake_mems[i].range.end); + + return *p == '\0' ? 0 : -EINVAL; +} + +early_param("efi_fake_mem", setup_fake_mem); + +void __init efi_fake_memmap_early(void) +{ + int i; + + /* + * The late efi_fake_mem() call can handle all requests if + * EFI_MEMORY_SP support is disabled. + */ + if (!efi_soft_reserve_enabled()) + return; + + if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem) + return; + + /* + * Given that efi_fake_memmap() needs to perform memblock + * allocations it needs to run after e820__memblock_setup(). + * However, if efi_fake_mem specifies EFI_MEMORY_SP for a given + * address range that potentially needs to mark the memory as + * reserved prior to e820__memblock_setup(). Update e820 + * directly if EFI_MEMORY_SP is specified for an + * EFI_CONVENTIONAL_MEMORY descriptor. 
+ */ + for (i = 0; i < nr_fake_mem; i++) { + struct efi_mem_range *mem = &efi_fake_mems[i]; + efi_memory_desc_t *md; + u64 m_start, m_end; + + if ((mem->attribute & EFI_MEMORY_SP) == 0) + continue; + + m_start = mem->range.start; + m_end = mem->range.end; + for_each_efi_memory_desc(md) { + u64 start, end, size; + + if (md->type != EFI_CONVENTIONAL_MEMORY) + continue; + + start = md->phys_addr; + end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1; + + if (m_start <= end && m_end >= start) + /* fake range overlaps descriptor */; + else + continue; + + /* + * Trim the boundary of the e820 update to the + * descriptor in case the fake range overlaps + * !EFI_CONVENTIONAL_MEMORY + */ + start = max(start, m_start); + end = min(end, m_end); + size = end - start + 1; + + if (end <= start) + continue; + + /* + * Ensure each efi_fake_mem instance results in + * a unique e820 resource + */ + e820__range_remove(start, size, E820_TYPE_RAM, 1); + e820__range_add(start, size, E820_TYPE_SOFT_RESERVED); + e820__update_table(e820_table); + } + } +} diff --git a/arch/x86/platform/efi/memmap.c b/arch/x86/platform/efi/memmap.c new file mode 100644 index 000000000000..c69f8471e6d0 --- /dev/null +++ b/arch/x86/platform/efi/memmap.c @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common EFI memory map functions. + */ + +#define pr_fmt(fmt) "efi: " fmt + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/efi.h> +#include <linux/io.h> +#include <asm/early_ioremap.h> +#include <asm/efi.h> +#include <linux/memblock.h> +#include <linux/slab.h> + +static phys_addr_t __init __efi_memmap_alloc_early(unsigned long size) +{ + return memblock_phys_alloc(size, SMP_CACHE_BYTES); +} + +static phys_addr_t __init __efi_memmap_alloc_late(unsigned long size) +{ + unsigned int order = get_order(size); + struct page *p = alloc_pages(GFP_KERNEL, order); + + if (!p) + return 0; + + return PFN_PHYS(page_to_pfn(p)); +} + +void __init __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags) +{ + if (flags & EFI_MEMMAP_MEMBLOCK) { + if (slab_is_available()) + memblock_free_late(phys, size); + else + memblock_phys_free(phys, size); + } else if (flags & EFI_MEMMAP_SLAB) { + struct page *p = pfn_to_page(PHYS_PFN(phys)); + unsigned int order = get_order(size); + + free_pages((unsigned long) page_address(p), order); + } +} + +/** + * efi_memmap_alloc - Allocate memory for the EFI memory map + * @num_entries: Number of entries in the allocated map. + * @data: efi memmap installation parameters + * + * Depending on whether mm_init() has already been invoked or not, + * either memblock or "normal" page allocation is used. + * + * Returns zero on success, a negative error code on failure. 
+ */ +int __init efi_memmap_alloc(unsigned int num_entries, + struct efi_memory_map_data *data) +{ + /* Expect the allocation parameters to be zero-initialized */ + WARN_ON(data->phys_map || data->size); + + data->size = num_entries * efi.memmap.desc_size; + data->desc_version = efi.memmap.desc_version; + data->desc_size = efi.memmap.desc_size; + data->flags &= ~(EFI_MEMMAP_SLAB | EFI_MEMMAP_MEMBLOCK); + data->flags |= efi.memmap.flags & EFI_MEMMAP_LATE; + + if (slab_is_available()) { + data->flags |= EFI_MEMMAP_SLAB; + data->phys_map = __efi_memmap_alloc_late(data->size); + } else { + data->flags |= EFI_MEMMAP_MEMBLOCK; + data->phys_map = __efi_memmap_alloc_early(data->size); + } + + if (!data->phys_map) + return -ENOMEM; + return 0; +} + +/** + * efi_memmap_install - Install a new EFI memory map in efi.memmap + * @data: map allocation parameters (address, size, flags) + * + * Unlike efi_memmap_init_*(), this function does not allow the caller + * to switch from early to late mappings. It simply uses the existing + * mapping function and installs the new memmap. + * + * Returns zero on success, a negative error code on failure. + */ +int __init efi_memmap_install(struct efi_memory_map_data *data) +{ + efi_memmap_unmap(); + + if (efi_enabled(EFI_PARAVIRT)) + return 0; + + return __efi_memmap_init(data); +} + +/** + * efi_memmap_split_count - Count number of additional EFI memmap entries + * @md: EFI memory descriptor to split + * @range: Address range (start, end) to split around + * + * Returns the number of additional EFI memmap entries required to + * accommodate @range. + */ +int __init efi_memmap_split_count(efi_memory_desc_t *md, struct range *range) +{ + u64 m_start, m_end; + u64 start, end; + int count = 0; + + start = md->phys_addr; + end = start + (md->num_pages << EFI_PAGE_SHIFT) - 1; + + /* modifying range */ + m_start = range->start; + m_end = range->end; + + if (m_start <= start) { + /* split into 2 parts */ + if (start < m_end && m_end < end) + count++; + } + + if (start < m_start && m_start < end) { + /* split into 3 parts */ + if (m_end < end) + count += 2; + /* split into 2 parts */ + if (end <= m_end) + count++; + } + + return count; +} + +/** + * efi_memmap_insert - Insert a memory region in an EFI memmap + * @old_memmap: The existing EFI memory map structure + * @buf: Address of buffer to store new map + * @mem: Memory map entry to insert + * + * It is suggested that you call efi_memmap_split_count() first + * to see how large @buf needs to be. + */ +void __init efi_memmap_insert(struct efi_memory_map *old_memmap, void *buf, + struct efi_mem_range *mem) +{ + u64 m_start, m_end, m_attr; + efi_memory_desc_t *md; + u64 start, end; + void *old, *new; + + /* modifying range */ + m_start = mem->range.start; + m_end = mem->range.end; + m_attr = mem->attribute; + + /* + * The EFI memory map deals with regions in EFI_PAGE_SIZE + * units. Ensure that the region described by 'mem' is aligned + * correctly.
+ */ + if (!IS_ALIGNED(m_start, EFI_PAGE_SIZE) || + !IS_ALIGNED(m_end + 1, EFI_PAGE_SIZE)) { + WARN_ON(1); + return; + } + + for (old = old_memmap->map, new = buf; + old < old_memmap->map_end; + old += old_memmap->desc_size, new += old_memmap->desc_size) { + + /* copy original EFI memory descriptor */ + memcpy(new, old, old_memmap->desc_size); + md = new; + start = md->phys_addr; + end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1; + + if (m_start <= start && end <= m_end) + md->attribute |= m_attr; + + if (m_start <= start && + (start < m_end && m_end < end)) { + /* first part */ + md->attribute |= m_attr; + md->num_pages = (m_end - md->phys_addr + 1) >> + EFI_PAGE_SHIFT; + /* latter part */ + new += old_memmap->desc_size; + memcpy(new, old, old_memmap->desc_size); + md = new; + md->phys_addr = m_end + 1; + md->num_pages = (end - md->phys_addr + 1) >> + EFI_PAGE_SHIFT; + } + + if ((start < m_start && m_start < end) && m_end < end) { + /* first part */ + md->num_pages = (m_start - md->phys_addr) >> + EFI_PAGE_SHIFT; + /* middle part */ + new += old_memmap->desc_size; + memcpy(new, old, old_memmap->desc_size); + md = new; + md->attribute |= m_attr; + md->phys_addr = m_start; + md->num_pages = (m_end - m_start + 1) >> + EFI_PAGE_SHIFT; + /* last part */ + new += old_memmap->desc_size; + memcpy(new, old, old_memmap->desc_size); + md = new; + md->phys_addr = m_end + 1; + md->num_pages = (end - m_end) >> + EFI_PAGE_SHIFT; + } + + if ((start < m_start && m_start < end) && + (end <= m_end)) { + /* first part */ + md->num_pages = (m_start - md->phys_addr) >> + EFI_PAGE_SHIFT; + /* latter part */ + new += old_memmap->desc_size; + memcpy(new, old, old_memmap->desc_size); + md = new; + md->phys_addr = m_start; + md->num_pages = (end - md->phys_addr + 1) >> + EFI_PAGE_SHIFT; + md->attribute |= m_attr; + } + } +} diff --git a/arch/x86/platform/efi/runtime-map.c b/arch/x86/platform/efi/runtime-map.c new file mode 100644 index 000000000000..bbee682ef8cd --- /dev/null +++ b/arch/x86/platform/efi/runtime-map.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2013 Red Hat, Inc., Dave Young <dyoung@redhat.com> + */ + +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/efi.h> +#include <linux/slab.h> + +#include <asm/efi.h> +#include <asm/setup.h> + +struct efi_runtime_map_entry { + efi_memory_desc_t md; + struct kobject kobj; /* kobject for each entry */ +}; + +static struct efi_runtime_map_entry **map_entries; + +struct map_attribute { + struct attribute attr; + ssize_t (*show)(struct efi_runtime_map_entry *entry, char *buf); +}; + +static inline struct map_attribute *to_map_attr(struct attribute *attr) +{ + return container_of(attr, struct map_attribute, attr); +} + +static ssize_t type_show(struct efi_runtime_map_entry *entry, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0x%x\n", entry->md.type); +} + +#define EFI_RUNTIME_FIELD(var) entry->md.var + +#define EFI_RUNTIME_U64_ATTR_SHOW(name) \ +static ssize_t name##_show(struct efi_runtime_map_entry *entry, char *buf) \ +{ \ + return snprintf(buf, PAGE_SIZE, "0x%llx\n", EFI_RUNTIME_FIELD(name)); \ +} + +EFI_RUNTIME_U64_ATTR_SHOW(phys_addr); +EFI_RUNTIME_U64_ATTR_SHOW(virt_addr); +EFI_RUNTIME_U64_ATTR_SHOW(num_pages); +EFI_RUNTIME_U64_ATTR_SHOW(attribute); + +static inline struct efi_runtime_map_entry *to_map_entry(struct kobject *kobj) +{ + return container_of(kobj, struct efi_runtime_map_entry, kobj); +} + +static ssize_t map_attr_show(struct 
kobject *kobj, struct attribute *attr, + char *buf) +{ + struct efi_runtime_map_entry *entry = to_map_entry(kobj); + struct map_attribute *map_attr = to_map_attr(attr); + + return map_attr->show(entry, buf); +} + +static struct map_attribute map_type_attr = __ATTR_RO_MODE(type, 0400); +static struct map_attribute map_phys_addr_attr = __ATTR_RO_MODE(phys_addr, 0400); +static struct map_attribute map_virt_addr_attr = __ATTR_RO_MODE(virt_addr, 0400); +static struct map_attribute map_num_pages_attr = __ATTR_RO_MODE(num_pages, 0400); +static struct map_attribute map_attribute_attr = __ATTR_RO_MODE(attribute, 0400); + +/* + * These are default attributes that are added for every memmap entry. + */ +static struct attribute *def_attrs[] = { + &map_type_attr.attr, + &map_phys_addr_attr.attr, + &map_virt_addr_attr.attr, + &map_num_pages_attr.attr, + &map_attribute_attr.attr, + NULL +}; +ATTRIBUTE_GROUPS(def); + +static const struct sysfs_ops map_attr_ops = { + .show = map_attr_show, +}; + +static void map_release(struct kobject *kobj) +{ + struct efi_runtime_map_entry *entry; + + entry = to_map_entry(kobj); + kfree(entry); +} + +static struct kobj_type __refdata map_ktype = { + .sysfs_ops = &map_attr_ops, + .default_groups = def_groups, + .release = map_release, +}; + +static struct kset *map_kset; + +static struct efi_runtime_map_entry * +add_sysfs_runtime_map_entry(struct kobject *kobj, int nr, + efi_memory_desc_t *md) +{ + int ret; + struct efi_runtime_map_entry *entry; + + if (!map_kset) { + map_kset = kset_create_and_add("runtime-map", NULL, kobj); + if (!map_kset) + return ERR_PTR(-ENOMEM); + } + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + kset_unregister(map_kset); + map_kset = NULL; + return ERR_PTR(-ENOMEM); + } + + memcpy(&entry->md, md, sizeof(efi_memory_desc_t)); + + kobject_init(&entry->kobj, &map_ktype); + entry->kobj.kset = map_kset; + ret = kobject_add(&entry->kobj, NULL, "%d", nr); + if (ret) { + kobject_put(&entry->kobj); + kset_unregister(map_kset); + map_kset = NULL; + return ERR_PTR(ret); + } + + return entry; +} + +int efi_get_runtime_map_size(void) +{ + return efi.memmap.nr_map * efi.memmap.desc_size; +} + +int efi_get_runtime_map_desc_size(void) +{ + return efi.memmap.desc_size; +} + +int efi_runtime_map_copy(void *buf, size_t bufsz) +{ + size_t sz = efi_get_runtime_map_size(); + + if (sz > bufsz) + sz = bufsz; + + memcpy(buf, efi.memmap.map, sz); + return 0; +} + +static int __init efi_runtime_map_init(void) +{ + int i, j, ret = 0; + struct efi_runtime_map_entry *entry; + efi_memory_desc_t *md; + + if (!efi_enabled(EFI_MEMMAP) || !efi_kobj) + return 0; + + map_entries = kcalloc(efi.memmap.nr_map, sizeof(entry), GFP_KERNEL); + if (!map_entries) { + ret = -ENOMEM; + goto out; + } + + i = 0; + for_each_efi_memory_desc(md) { + entry = add_sysfs_runtime_map_entry(efi_kobj, i, md); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + goto out_add_entry; + } + *(map_entries + i++) = entry; + } + + return 0; +out_add_entry: + for (j = i - 1; j >= 0; j--) { + entry = *(map_entries + j); + kobject_put(&entry->kobj); + } +out: + return ret; +} +subsys_initcall_sync(efi_runtime_map_init); diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 994a229cb79f..68244a3422d1 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -183,13 +183,12 @@ err_sysfs: return r; } -static int xo15_sci_remove(struct acpi_device *device) +static void xo15_sci_remove(struct acpi_device *device) { 
acpi_disable_gpe(NULL, xo15_sci_gpe); acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler); cancel_work_sync(&sci_work); sysfs_remove_file(&device->dev.kobj, &lid_wake_on_close_attr.attr); - return 0; } #ifdef CONFIG_PM_SLEEP diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index bb176c72891c..236447ee9beb 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -23,6 +23,7 @@ #include <asm/fpu/api.h> #include <asm/debugreg.h> #include <asm/cpu.h> +#include <asm/cacheinfo.h> #include <asm/mmu_context.h> #include <asm/cpu_device_id.h> #include <asm/microcode.h> @@ -261,7 +262,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) do_fpu_end(); tsc_verify_tsc_adjust(true); x86_platform.restore_sched_clock_state(); - mtrr_bp_restore(); + cache_bp_restore(); perf_restore_debug_store(); c = &cpu_data(smp_processor_id()); @@ -513,15 +514,23 @@ static int pm_cpu_check(const struct x86_cpu_id *c) static void pm_save_spec_msr(void) { - u32 spec_msr_id[] = { - MSR_IA32_SPEC_CTRL, - MSR_IA32_TSX_CTRL, - MSR_TSX_FORCE_ABORT, - MSR_IA32_MCU_OPT_CTRL, - MSR_AMD64_LS_CFG, + struct msr_enumeration { + u32 msr_no; + u32 feature; + } msr_enum[] = { + { MSR_IA32_SPEC_CTRL, X86_FEATURE_MSR_SPEC_CTRL }, + { MSR_IA32_TSX_CTRL, X86_FEATURE_MSR_TSX_CTRL }, + { MSR_TSX_FORCE_ABORT, X86_FEATURE_TSX_FORCE_ABORT }, + { MSR_IA32_MCU_OPT_CTRL, X86_FEATURE_SRBDS_CTRL }, + { MSR_AMD64_LS_CFG, X86_FEATURE_LS_CFG_SSBD }, + { MSR_AMD64_DE_CFG, X86_FEATURE_LFENCE_RDTSC }, }; + int i; - msr_build_context(spec_msr_id, ARRAY_SIZE(spec_msr_id)); + for (i = 0; i < ARRAY_SIZE(msr_enum); i++) { + if (boot_cpu_has(msr_enum[i].feature)) + msr_build_context(&msr_enum[i].msr_no, 1); + } } static int pm_check_save_msr(void) diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index e94e0050a583..6f955eb1e163 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -159,7 +159,7 @@ int relocate_restore_code(void) if (!relocated_restore_code) return -ENOMEM; - memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); + __memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); /* Make the page containing the relocated code executable */ pgd = (pgd_t *)__va(read_cr3_pa()) + diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 41d7669a97ad..af565816d2ba 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -200,14 +200,18 @@ static void __init set_real_mode_permissions(void) set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT); } -static int __init init_real_mode(void) +void __init init_real_mode(void) { if (!real_mode_header) panic("Real mode trampoline was not allocated"); setup_real_mode(); set_real_mode_permissions(); +} +static int __init do_init_real_mode(void) +{ + x86_platform.realmode_init(); return 0; } -early_initcall(init_real_mode); +early_initcall(do_init_real_mode); diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index dcaf3b38a9e0..6523eb7c3bd1 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h @@ -201,10 +201,6 @@ typedef struct user_i387_struct elf_fpregset_t; struct task_struct; -extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu); - -#define ELF_CORE_COPY_FPREGS(t, fpu) elf_core_copy_fpregs(t, fpu) - #define ELF_EXEC_PAGESIZE 4096 #define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index f82857e48815..5b1379662877 100644 --- a/arch/x86/xen/enlighten_pv.c 
+++ b/arch/x86/xen/enlighten_pv.c @@ -23,6 +23,7 @@ #include <linux/start_kernel.h> #include <linux/sched.h> #include <linux/kprobes.h> +#include <linux/kstrtox.h> #include <linux/memblock.h> #include <linux/export.h> #include <linux/mm.h> @@ -32,6 +33,7 @@ #include <linux/edd.h> #include <linux/reboot.h> #include <linux/virtio_anchor.h> +#include <linux/stackprotector.h> #include <xen/xen.h> #include <xen/events.h> @@ -64,7 +66,6 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/reboot.h> -#include <asm/stackprotector.h> #include <asm/hypervisor.h> #include <asm/mach_traps.h> #include <asm/mwait.h> @@ -113,7 +114,7 @@ static __read_mostly bool xen_msr_safe = IS_ENABLED(CONFIG_XEN_PV_MSR_SAFE); static int __init parse_xen_msr_safe(char *str) { if (str) - return strtobool(str, &xen_msr_safe); + return kstrtobool(str, &xen_msr_safe); return -EINVAL; } early_param("xen_msr_safe", parse_xen_msr_safe); @@ -1209,7 +1210,7 @@ static void __init xen_setup_gdt(int cpu) pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot; pv_ops.cpu.load_gdt = xen_load_gdt_boot; - switch_to_new_gdt(cpu); + switch_gdt_and_percpu_base(cpu); pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry; pv_ops.cpu.load_gdt = xen_load_gdt; @@ -1265,6 +1266,8 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) xen_vcpu_info_reset(0); x86_platform.get_nmi_reason = xen_get_nmi_reason; + x86_platform.realmode_reserve = x86_init_noop; + x86_platform.realmode_init = x86_init_noop; x86_init.resources.memory_setup = xen_memory_setup; x86_init.irqs.intr_mode_select = x86_init_noop; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 4f4309500559..8db26f10fb1d 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -7,6 +7,7 @@ #include <linux/init.h> #include <linux/sched.h> +#include <linux/kstrtox.h> #include <linux/mm.h> #include <linux/pm.h> #include <linux/memblock.h> @@ -85,7 +86,7 @@ static void __init xen_parse_512gb(void) arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit="); if (!arg) val = true; - else if (strtobool(arg + strlen("xen_512gb_limit="), &val)) + else if (kstrtobool(arg + strlen("xen_512gb_limit="), &val)) return; xen_512gb_limit = val; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index c3e1f9a7d43a..4b0d6fff88de 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -32,30 +32,30 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) void xen_smp_intr_free(unsigned int cpu) { + kfree(per_cpu(xen_resched_irq, cpu).name); + per_cpu(xen_resched_irq, cpu).name = NULL; if (per_cpu(xen_resched_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL); per_cpu(xen_resched_irq, cpu).irq = -1; - kfree(per_cpu(xen_resched_irq, cpu).name); - per_cpu(xen_resched_irq, cpu).name = NULL; } + kfree(per_cpu(xen_callfunc_irq, cpu).name); + per_cpu(xen_callfunc_irq, cpu).name = NULL; if (per_cpu(xen_callfunc_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu).irq, NULL); per_cpu(xen_callfunc_irq, cpu).irq = -1; - kfree(per_cpu(xen_callfunc_irq, cpu).name); - per_cpu(xen_callfunc_irq, cpu).name = NULL; } + kfree(per_cpu(xen_debug_irq, cpu).name); + per_cpu(xen_debug_irq, cpu).name = NULL; if (per_cpu(xen_debug_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu).irq, NULL); per_cpu(xen_debug_irq, cpu).irq = -1; - kfree(per_cpu(xen_debug_irq, cpu).name); - per_cpu(xen_debug_irq, cpu).name = NULL; } + kfree(per_cpu(xen_callfuncsingle_irq, cpu).name); + 
per_cpu(xen_callfuncsingle_irq, cpu).name = NULL; if (per_cpu(xen_callfuncsingle_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu).irq, NULL); per_cpu(xen_callfuncsingle_irq, cpu).irq = -1; - kfree(per_cpu(xen_callfuncsingle_irq, cpu).name); - per_cpu(xen_callfuncsingle_irq, cpu).name = NULL; } } @@ -65,6 +65,7 @@ int xen_smp_intr_init(unsigned int cpu) char *resched_name, *callfunc_name, *debug_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); + per_cpu(xen_resched_irq, cpu).name = resched_name; rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, cpu, xen_reschedule_interrupt, @@ -74,9 +75,9 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_resched_irq, cpu).irq = rc; - per_cpu(xen_resched_irq, cpu).name = resched_name; callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); + per_cpu(xen_callfunc_irq, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, cpu, xen_call_function_interrupt, @@ -86,10 +87,10 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_callfunc_irq, cpu).irq = rc; - per_cpu(xen_callfunc_irq, cpu).name = callfunc_name; if (!xen_fifo_events) { debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); + per_cpu(xen_debug_irq, cpu).name = debug_name; rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, IRQF_PERCPU | IRQF_NOBALANCING, @@ -97,10 +98,10 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_debug_irq, cpu).irq = rc; - per_cpu(xen_debug_irq, cpu).name = debug_name; } callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); + per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, cpu, xen_call_function_single_interrupt, @@ -110,7 +111,6 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_callfuncsingle_irq, cpu).irq = rc; - per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; return 0; diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 480be82e9b7b..6175f2c5c822 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -97,18 +97,18 @@ asmlinkage __visible void cpu_bringup_and_idle(void) void xen_smp_intr_free_pv(unsigned int cpu) { + kfree(per_cpu(xen_irq_work, cpu).name); + per_cpu(xen_irq_work, cpu).name = NULL; if (per_cpu(xen_irq_work, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL); per_cpu(xen_irq_work, cpu).irq = -1; - kfree(per_cpu(xen_irq_work, cpu).name); - per_cpu(xen_irq_work, cpu).name = NULL; } + kfree(per_cpu(xen_pmu_irq, cpu).name); + per_cpu(xen_pmu_irq, cpu).name = NULL; if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); per_cpu(xen_pmu_irq, cpu).irq = -1; - kfree(per_cpu(xen_pmu_irq, cpu).name); - per_cpu(xen_pmu_irq, cpu).name = NULL; } } @@ -118,6 +118,7 @@ int xen_smp_intr_init_pv(unsigned int cpu) char *callfunc_name, *pmu_name; callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); + per_cpu(xen_irq_work, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, cpu, xen_irq_work_interrupt, @@ -127,10 +128,10 @@ int xen_smp_intr_init_pv(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_irq_work, cpu).irq = rc; - per_cpu(xen_irq_work, cpu).name = callfunc_name; if (is_xen_pmu) { pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); + per_cpu(xen_pmu_irq, cpu).name = pmu_name; rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, xen_pmu_irq_handler, 
IRQF_PERCPU|IRQF_NOBALANCING, @@ -138,7 +139,6 @@ int xen_smp_intr_init_pv(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_pmu_irq, cpu).irq = rc; - per_cpu(xen_pmu_irq, cpu).name = pmu_name; } return 0; diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 043c73dfd2c9..5c6fc16e4b92 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -75,6 +75,7 @@ void xen_init_lock_cpu(int cpu) cpu, per_cpu(lock_kicker_irq, cpu)); name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); + per_cpu(irq_name, cpu) = name; irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, cpu, dummy_handler, @@ -85,7 +86,6 @@ void xen_init_lock_cpu(int cpu) if (irq >= 0) { disable_irq(irq); /* make sure it's never delivered */ per_cpu(lock_kicker_irq, cpu) = irq; - per_cpu(irq_name, cpu) = name; } printk("cpu %d spinlock event irq %d\n", cpu, irq); @@ -98,6 +98,8 @@ void xen_uninit_lock_cpu(int cpu) if (!xen_pvspin) return; + kfree(per_cpu(irq_name, cpu)); + per_cpu(irq_name, cpu) = NULL; /* * When booting the kernel with 'mitigations=auto,nosmt', the secondary * CPUs are not activated, and lock_kicker_irq is not initialized. @@ -108,8 +110,6 @@ void xen_uninit_lock_cpu(int cpu) unbind_from_irqhandler(irq, NULL); per_cpu(lock_kicker_irq, cpu) = -1; - kfree(per_cpu(irq_name, cpu)); - per_cpu(irq_name, cpu) = NULL; } PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen); diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 6b4fdf6b9542..4a184f6e4e4d 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -262,10 +262,10 @@ SYM_CODE_START(xen_entry_SYSCALL_compat) /* * Neither Xen nor the kernel really knows what the old SS and - * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * CS were. The kernel expects __USER_DS and __USER32_CS, so * report those values even though Xen will guess its own values. */ - movq $__USER32_DS, 4*8(%rsp) + movq $__USER_DS, 4*8(%rsp) movq $__USER32_CS, 1*8(%rsp) jmp entry_SYSCALL_compat_after_hwframe @@ -284,10 +284,10 @@ SYM_CODE_START(xen_entry_SYSENTER_compat) /* * Neither Xen nor the kernel really knows what the old SS and - * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * CS were. The kernel expects __USER_DS and __USER32_CS, so * report those values even though Xen will guess its own values. */ - movq $__USER32_DS, 4*8(%rsp) + movq $__USER_DS, 4*8(%rsp) movq $__USER32_CS, 1*8(%rsp) jmp entry_SYSENTER_compat_after_hwframe |
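
The splitting arithmetic in efi_memmap_split_count() above is easiest to see with concrete numbers. What follows is a minimal, self-contained userspace sketch, not kernel code: the helper name split_count and the sample addresses are illustrative only. It mirrors the same inclusive-range comparisons and prints how many extra descriptors each overlap case requires.

/*
 * Illustrative userspace sketch of the efi_memmap_split_count() logic.
 * Ranges are inclusive [start, end], as in the kernel code above; the
 * descriptor and modifying ranges below are hypothetical, page-aligned
 * values chosen to hit each overlap case.
 */
#include <stdio.h>
#include <stdint.h>

static int split_count(uint64_t start, uint64_t end,     /* existing descriptor */
                       uint64_t m_start, uint64_t m_end) /* modifying range */
{
	int count = 0;

	if (m_start <= start) {
		/* range covers the head: descriptor splits into 2 parts */
		if (start < m_end && m_end < end)
			count++;
	}

	if (start < m_start && m_start < end) {
		/* range sits in the middle: descriptor splits into 3 parts */
		if (m_end < end)
			count += 2;
		/* range covers the tail: descriptor splits into 2 parts */
		if (end <= m_end)
			count++;
	}

	return count;
}

int main(void)
{
	/* descriptor covering [0x1000, 0x4fff] */
	printf("head overlap:   %d\n", split_count(0x1000, 0x4fff, 0x0000, 0x1fff)); /* 1 */
	printf("middle overlap: %d\n", split_count(0x1000, 0x4fff, 0x2000, 0x2fff)); /* 2 */
	printf("tail overlap:   %d\n", split_count(0x1000, 0x4fff, 0x3000, 0x5fff)); /* 1 */
	printf("full cover:     %d\n", split_count(0x1000, 0x4fff, 0x0000, 0x5fff)); /* 0 */
	return 0;
}

These counts are exactly what efi_fake_range() above adds to efi.memmap.nr_map before allocating the replacement map, and efi_memmap_insert() then performs the corresponding copies: an overridden middle piece costs two extra entries, an overridden head or tail costs one, and a fully covered descriptor costs none because its attribute is updated in place.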