Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 138
-rw-r--r--  arch/x86/Makefile | 8
-rw-r--r--  arch/x86/boot/Makefile | 2
-rw-r--r--  arch/x86/boot/compressed/Makefile | 10
-rw-r--r--  arch/x86/boot/compressed/efi_mixed.S | 345
-rw-r--r--  arch/x86/boot/compressed/efi_thunk_64.S | 195
-rw-r--r--  arch/x86/boot/compressed/head_32.S | 4
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 311
-rw-r--r--  arch/x86/boot/compressed/kaslr.c | 2
-rw-r--r--  arch/x86/boot/compressed/mem_encrypt.S | 152
-rw-r--r--  arch/x86/boot/cpuflags.c | 15
-rw-r--r--  arch/x86/boot/header.S | 5
-rw-r--r--  arch/x86/boot/string.c | 2
-rw-r--r--  arch/x86/boot/tools/build.c | 2
-rw-r--r--  arch/x86/coco/tdx/tdx.c | 66
-rw-r--r--  arch/x86/crypto/Makefile | 3
-rw-r--r--  arch/x86/crypto/aegis128-aesni-asm.S | 9
-rw-r--r--  arch/x86/crypto/aria-aesni-avx-asm_64.S | 13
-rw-r--r--  arch/x86/crypto/camellia-aesni-avx-asm_64.S | 2
-rw-r--r--  arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 4
-rw-r--r--  arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 2
-rw-r--r--  arch/x86/crypto/crct10dif-pcl-asm_64.S | 1
-rw-r--r--  arch/x86/crypto/nh-avx2-x86_64.S | 5
-rw-r--r--  arch/x86/crypto/nh-sse2-x86_64.S | 5
-rw-r--r--  arch/x86/crypto/nhpoly1305-avx2-glue.c | 11
-rw-r--r--  arch/x86/crypto/nhpoly1305-sse2-glue.c | 11
-rw-r--r--  arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 1
-rw-r--r--  arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 2
-rw-r--r--  arch/x86/crypto/serpent-avx2-asm_64.S | 2
-rw-r--r--  arch/x86/crypto/sha1_ni_asm.S | 4
-rw-r--r--  arch/x86/crypto/sha1_ssse3_asm.S | 3
-rw-r--r--  arch/x86/crypto/sha256-avx-asm.S | 4
-rw-r--r--  arch/x86/crypto/sha256-avx2-asm.S | 4
-rw-r--r--  arch/x86/crypto/sha256-ssse3-asm.S | 4
-rw-r--r--  arch/x86/crypto/sha256_ni_asm.S | 4
-rw-r--r--  arch/x86/crypto/sha512-avx-asm.S | 3
-rw-r--r--  arch/x86/crypto/sha512-avx2-asm.S | 3
-rw-r--r--  arch/x86/crypto/sha512-ssse3-asm.S | 3
-rw-r--r--  arch/x86/crypto/sm3-avx-asm_64.S | 4
-rw-r--r--  arch/x86/crypto/sm4-aesni-avx-asm_64.S | 14
-rw-r--r--  arch/x86/crypto/sm4-aesni-avx2-asm_64.S | 13
-rw-r--r--  arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 2
-rw-r--r--  arch/x86/crypto/twofish_glue.c | 2
-rw-r--r--  arch/x86/entry/entry_32.S | 4
-rw-r--r--  arch/x86/entry/entry_64.S | 50
-rw-r--r--  arch/x86/entry/entry_64_compat.S | 11
-rw-r--r--  arch/x86/entry/thunk_64.S | 4
-rw-r--r--  arch/x86/entry/vdso/Makefile | 14
-rw-r--r--  arch/x86/entry/vdso/vdso.lds.S | 2
-rw-r--r--  arch/x86/entry/vdso/vma.c | 32
-rw-r--r--  arch/x86/events/amd/brs.c | 2
-rw-r--r--  arch/x86/events/amd/core.c | 7
-rw-r--r--  arch/x86/events/amd/ibs.c | 4
-rw-r--r--  arch/x86/events/amd/lbr.c | 6
-rw-r--r--  arch/x86/events/amd/uncore.c | 1
-rw-r--r--  arch/x86/events/core.c | 48
-rw-r--r--  arch/x86/events/intel/core.c | 23
-rw-r--r--  arch/x86/events/intel/ds.c | 4
-rw-r--r--  arch/x86/events/intel/lbr.c | 30
-rw-r--r--  arch/x86/events/intel/p4.c | 2
-rw-r--r--  arch/x86/events/intel/pt.c | 9
-rw-r--r--  arch/x86/events/intel/uncore.h | 24
-rw-r--r--  arch/x86/events/intel/uncore_snb.c | 3
-rw-r--r--  arch/x86/events/intel/uncore_snbep.c | 495
-rw-r--r--  arch/x86/events/perf_event.h | 31
-rw-r--r--  arch/x86/events/rapl.c | 5
-rw-r--r--  arch/x86/hyperv/hv_init.c | 77
-rw-r--r--  arch/x86/ia32/Makefile | 2
-rw-r--r--  arch/x86/include/asm/alternative.h | 68
-rw-r--r--  arch/x86/include/asm/apic.h | 3
-rw-r--r--  arch/x86/include/asm/cacheinfo.h | 13
-rw-r--r--  arch/x86/include/asm/cmpxchg_32.h | 28
-rw-r--r--  arch/x86/include/asm/cmpxchg_64.h | 5
-rw-r--r--  arch/x86/include/asm/cpu.h | 2
-rw-r--r--  arch/x86/include/asm/cpu_entry_area.h | 4
-rw-r--r--  arch/x86/include/asm/cpufeatures.h | 3
-rw-r--r--  arch/x86/include/asm/cpuid.h | 141
-rw-r--r--  arch/x86/include/asm/current.h | 32
-rw-r--r--  arch/x86/include/asm/debugreg.h | 2
-rw-r--r--  arch/x86/include/asm/disabled-features.h | 17
-rw-r--r--  arch/x86/include/asm/efi.h | 109
-rw-r--r--  arch/x86/include/asm/elf.h | 5
-rw-r--r--  arch/x86/include/asm/entry-common.h | 4
-rw-r--r--  arch/x86/include/asm/fpu/signal.h | 7
-rw-r--r--  arch/x86/include/asm/ftrace.h | 49
-rw-r--r--  arch/x86/include/asm/hardirq.h | 3
-rw-r--r--  arch/x86/include/asm/hyperv-tlfs.h | 11
-rw-r--r--  arch/x86/include/asm/hyperv_timer.h | 9
-rw-r--r--  arch/x86/include/asm/insn-eval.h | 18
-rw-r--r--  arch/x86/include/asm/irq_remapping.h | 4
-rw-r--r--  arch/x86/include/asm/irq_stack.h | 12
-rw-r--r--  arch/x86/include/asm/irqdomain.h | 4
-rw-r--r--  arch/x86/include/asm/kasan.h | 3
-rw-r--r--  arch/x86/include/asm/kvm-x86-ops.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 9
-rw-r--r--  arch/x86/include/asm/linkage.h | 63
-rw-r--r--  arch/x86/include/asm/memtype.h | 5
-rw-r--r--  arch/x86/include/asm/microcode.h | 4
-rw-r--r--  arch/x86/include/asm/microcode_intel.h | 5
-rw-r--r--  arch/x86/include/asm/mshyperv.h | 2
-rw-r--r--  arch/x86/include/asm/msi.h | 6
-rw-r--r--  arch/x86/include/asm/msr-index.h | 30
-rw-r--r--  arch/x86/include/asm/mtrr.h | 16
-rw-r--r--  arch/x86/include/asm/nospec-branch.h | 178
-rw-r--r--  arch/x86/include/asm/page_types.h | 12
-rw-r--r--  arch/x86/include/asm/paravirt.h | 17
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 89
-rw-r--r--  arch/x86/include/asm/pci.h | 5
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 171
-rw-r--r--  arch/x86/include/asm/pgtable-3level_types.h | 7
-rw-r--r--  arch/x86/include/asm/pgtable.h | 27
-rw-r--r--  arch/x86/include/asm/pgtable_32.h | 9
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 1
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h | 1
-rw-r--r--  arch/x86/include/asm/pgtable_areas.h | 8
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 4
-rw-r--r--  arch/x86/include/asm/preempt.h | 27
-rw-r--r--  arch/x86/include/asm/processor-flags.h | 2
-rw-r--r--  arch/x86/include/asm/processor.h | 144
-rw-r--r--  arch/x86/include/asm/qspinlock_paravirt.h | 47
-rw-r--r--  arch/x86/include/asm/realmode.h | 1
-rw-r--r--  arch/x86/include/asm/resctrl.h | 8
-rw-r--r--  arch/x86/include/asm/segment.h | 2
-rw-r--r--  arch/x86/include/asm/set_memory.h | 3
-rw-r--r--  arch/x86/include/asm/sgx.h | 33
-rw-r--r--  arch/x86/include/asm/sighandling.h | 9
-rw-r--r--  arch/x86/include/asm/signal.h | 5
-rw-r--r--  arch/x86/include/asm/smp.h | 12
-rw-r--r--  arch/x86/include/asm/stackprotector.h | 14
-rw-r--r--  arch/x86/include/asm/switch_to.h | 7
-rw-r--r--  arch/x86/include/asm/tdx.h | 2
-rw-r--r--  arch/x86/include/asm/text-patching.h | 1
-rw-r--r--  arch/x86/include/asm/x86_init.h | 4
-rw-r--r--  arch/x86/kernel/Makefile | 6
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 24
-rw-r--r--  arch/x86/kernel/alternative.c | 541
-rw-r--r--  arch/x86/kernel/amd_gart_64.c | 2
-rw-r--r--  arch/x86/kernel/apic/apic.c | 13
-rw-r--r--  arch/x86/kernel/apic/msi.c | 211
-rw-r--r--  arch/x86/kernel/apic/vector.c | 4
-rw-r--r--  arch/x86/kernel/asm-offsets.c | 5
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 2
-rw-r--r--  arch/x86/kernel/callthunks.c | 388
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 3
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 8
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 162
-rw-r--r--  arch/x86/kernel/cpu/cacheinfo.c | 179
-rw-r--r--  arch/x86/kernel/cpu/common.c | 101
-rw-r--r--  arch/x86/kernel/cpu/cpuid-deps.c | 1
-rw-r--r--  arch/x86/kernel/cpu/hygon.c | 6
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 207
-rw-r--r--  arch/x86/kernel/cpu/intel_epb.c | 7
-rw-r--r--  arch/x86/kernel/cpu/mce/amd.c | 33
-rw-r--r--  arch/x86/kernel/cpu/mce/severity.c | 8
-rw-r--r--  arch/x86/kernel/cpu/microcode/amd.c | 5
-rw-r--r--  arch/x86/kernel/cpu/microcode/core.c | 205
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel.c | 167
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/amd.c | 8
-rw-r--r--  arch/x86/kernel/cpu/mtrr/centaur.c | 8
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cyrix.c | 42
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 107
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.c | 173
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.h | 15
-rw-r--r--  arch/x86/kernel/cpu/resctrl/core.c | 4
-rw-r--r--  arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 3
-rw-r--r--  arch/x86/kernel/cpu/resctrl/internal.h | 10
-rw-r--r--  arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 8
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 1
-rw-r--r--  arch/x86/kernel/cpu/sgx/encl.c | 39
-rw-r--r--  arch/x86/kernel/cpu/sgx/ioctl.c | 9
-rw-r--r--  arch/x86/kernel/cpu/sgx/main.c | 8
-rw-r--r--  arch/x86/kernel/cpu/tsx.c | 38
-rw-r--r--  arch/x86/kernel/cpuid.c | 2
-rw-r--r--  arch/x86/kernel/crash.c | 4
-rw-r--r--  arch/x86/kernel/crash_dump_64.c | 2
-rw-r--r--  arch/x86/kernel/devicetree.c | 18
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 4
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 2
-rw-r--r--  arch/x86/kernel/espfix_64.c | 12
-rw-r--r--  arch/x86/kernel/fpu/core.c | 21
-rw-r--r--  arch/x86/kernel/fpu/init.c | 7
-rw-r--r--  arch/x86/kernel/fpu/regset.c | 2
-rw-r--r--  arch/x86/kernel/fpu/signal.c | 2
-rw-r--r--  arch/x86/kernel/fpu/xstate.c | 64
-rw-r--r--  arch/x86/kernel/fpu/xstate.h | 4
-rw-r--r--  arch/x86/kernel/ftrace.c | 28
-rw-r--r--  arch/x86/kernel/ftrace_64.S | 37
-rw-r--r--  arch/x86/kernel/head64.c | 2
-rw-r--r--  arch/x86/kernel/head_32.S | 22
-rw-r--r--  arch/x86/kernel/head_64.S | 1
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 2
-rw-r--r--  arch/x86/kernel/i8259.c | 2
-rw-r--r--  arch/x86/kernel/irq_32.c | 13
-rw-r--r--  arch/x86/kernel/irq_64.c | 6
-rw-r--r--  arch/x86/kernel/kprobes/core.c | 21
-rw-r--r--  arch/x86/kernel/kprobes/opt.c | 28
-rw-r--r--  arch/x86/kernel/kvm.c | 18
-rw-r--r--  arch/x86/kernel/module.c | 60
-rw-r--r--  arch/x86/kernel/msr.c | 2
-rw-r--r--  arch/x86/kernel/paravirt.c | 21
-rw-r--r--  arch/x86/kernel/process.c | 4
-rw-r--r--  arch/x86/kernel/process_32.c | 6
-rw-r--r--  arch/x86/kernel/process_64.c | 10
-rw-r--r--  arch/x86/kernel/ptrace.c | 174
-rw-r--r--  arch/x86/kernel/relocate_kernel_64.S | 5
-rw-r--r--  arch/x86/kernel/resource.c | 12
-rw-r--r--  arch/x86/kernel/setup.c | 17
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 9
-rw-r--r--  arch/x86/kernel/sev.c | 18
-rw-r--r--  arch/x86/kernel/signal.c | 654
-rw-r--r--  arch/x86/kernel/signal_32.c (renamed from arch/x86/ia32/ia32_signal.c) | 117
-rw-r--r--  arch/x86/kernel/signal_64.c | 383
-rw-r--r--  arch/x86/kernel/smpboot.c | 20
-rw-r--r--  arch/x86/kernel/static_call.c | 3
-rw-r--r--  arch/x86/kernel/topology.c | 2
-rw-r--r--  arch/x86/kernel/traps.c | 15
-rw-r--r--  arch/x86/kernel/tsc.c | 2
-rw-r--r--  arch/x86/kernel/unwind_orc.c | 21
-rw-r--r--  arch/x86/kernel/uprobes.c | 4
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 37
-rw-r--r--  arch/x86/kernel/x86_init.c | 3
-rw-r--r--  arch/x86/kvm/Kconfig | 1
-rw-r--r--  arch/x86/kvm/cpuid.c | 39
-rw-r--r--  arch/x86/kvm/debugfs.c | 2
-rw-r--r--  arch/x86/kvm/emulate.c | 1
-rw-r--r--  arch/x86/kvm/hyperv.c | 1
-rw-r--r--  arch/x86/kvm/i8254.c | 4
-rw-r--r--  arch/x86/kvm/i8259.c | 4
-rw-r--r--  arch/x86/kvm/ioapic.c | 1
-rw-r--r--  arch/x86/kvm/irq.c | 1
-rw-r--r--  arch/x86/kvm/irq_comm.c | 7
-rw-r--r--  arch/x86/kvm/kvm_onhyperv.c | 1
-rw-r--r--  arch/x86/kvm/lapic.c | 8
-rw-r--r--  arch/x86/kvm/mmu/mmu.c | 11
-rw-r--r--  arch/x86/kvm/mmu/page_track.c | 1
-rw-r--r--  arch/x86/kvm/mmu/spte.c | 4
-rw-r--r--  arch/x86/kvm/mmu/spte.h | 4
-rw-r--r--  arch/x86/kvm/mmu/tdp_iter.c | 1
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c | 1
-rw-r--r--  arch/x86/kvm/mtrr.c | 1
-rw-r--r--  arch/x86/kvm/pmu.c | 1
-rw-r--r--  arch/x86/kvm/reverse_cpuid.h | 3
-rw-r--r--  arch/x86/kvm/smm.c | 1
-rw-r--r--  arch/x86/kvm/svm/avic.c | 2
-rw-r--r--  arch/x86/kvm/svm/nested.c | 14
-rw-r--r--  arch/x86/kvm/svm/pmu.c | 2
-rw-r--r--  arch/x86/kvm/svm/sev.c | 1
-rw-r--r--  arch/x86/kvm/svm/svm.c | 99
-rw-r--r--  arch/x86/kvm/svm/svm_onhyperv.c | 1
-rw-r--r--  arch/x86/kvm/svm/svm_onhyperv.h | 4
-rw-r--r--  arch/x86/kvm/svm/vmenter.S | 1
-rw-r--r--  arch/x86/kvm/vmx/capabilities.h | 4
-rw-r--r--  arch/x86/kvm/vmx/hyperv.c | 5
-rw-r--r--  arch/x86/kvm/vmx/hyperv.h | 6
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 3
-rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c | 5
-rw-r--r--  arch/x86/kvm/vmx/posted_intr.c | 2
-rw-r--r--  arch/x86/kvm/vmx/sgx.c | 5
-rw-r--r--  arch/x86/kvm/vmx/vmcs12.c | 1
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 436
-rw-r--r--  arch/x86/kvm/vmx/vmx_ops.h | 4
-rw-r--r--  arch/x86/kvm/x86.c | 250
-rw-r--r--  arch/x86/kvm/xen.c | 117
-rw-r--r--  arch/x86/lib/Makefile | 1
-rw-r--r--  arch/x86/lib/error-inject.c | 1
-rw-r--r--  arch/x86/lib/insn-eval.c | 20
-rw-r--r--  arch/x86/lib/iomap_copy_64.S | 2
-rw-r--r--  arch/x86/lib/memcpy_32.c | 187
-rw-r--r--  arch/x86/lib/memmove_32.S | 200
-rw-r--r--  arch/x86/lib/putuser.S | 62
-rw-r--r--  arch/x86/lib/retpoline.S | 107
-rw-r--r--  arch/x86/lib/usercopy.c | 3
-rw-r--r--  arch/x86/mm/cpu_entry_area.c | 58
-rw-r--r--  arch/x86/mm/hugetlbpage.c | 4
-rw-r--r--  arch/x86/mm/init.c | 2
-rw-r--r--  arch/x86/mm/init_64.c | 133
-rw-r--r--  arch/x86/mm/ioremap.c | 8
-rw-r--r--  arch/x86/mm/kasan_init_64.c | 53
-rw-r--r--  arch/x86/mm/kmmio.c | 50
-rw-r--r--  arch/x86/mm/mem_encrypt_boot.S | 4
-rw-r--r--  arch/x86/mm/mem_encrypt_identity.c | 18
-rw-r--r--  arch/x86/mm/pat/cpa-test.c | 4
-rw-r--r--  arch/x86/mm/pat/memtype.c | 157
-rw-r--r--  arch/x86/mm/pat/set_memory.c | 123
-rw-r--r--  arch/x86/mm/pgtable.c | 22
-rw-r--r--  arch/x86/mm/pti.c | 2
-rw-r--r--  arch/x86/net/bpf_jit_comp.c | 182
-rw-r--r--  arch/x86/pci/acpi.c | 39
-rw-r--r--  arch/x86/platform/efi/Makefile | 5
-rw-r--r--  arch/x86/platform/efi/efi.c | 54
-rw-r--r--  arch/x86/platform/efi/fake_mem.c | 197
-rw-r--r--  arch/x86/platform/efi/memmap.c | 239
-rw-r--r--  arch/x86/platform/efi/runtime-map.c | 194
-rw-r--r--  arch/x86/platform/olpc/olpc-xo15-sci.c | 3
-rw-r--r--  arch/x86/power/cpu.c | 25
-rw-r--r--  arch/x86/power/hibernate.c | 2
-rw-r--r--  arch/x86/realmode/init.c | 8
-rw-r--r--  arch/x86/um/asm/elf.h | 4
-rw-r--r--  arch/x86/xen/enlighten_pv.c | 9
-rw-r--r--  arch/x86/xen/setup.c | 3
-rw-r--r--  arch/x86/xen/smp.c | 24
-rw-r--r--  arch/x86/xen/smp_pv.c | 12
-rw-r--r--  arch/x86/xen/spinlock.c | 6
-rw-r--r--  arch/x86/xen/xen-asm.S | 8
305 files changed, 7185 insertions, 4632 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 67745ceab0db..3604074a878b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -69,6 +69,7 @@ config X86
select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
select ARCH_HAS_CACHE_LINE_SIZE
+ select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE
@@ -81,6 +82,7 @@ config X86
select ARCH_HAS_KCOV if X86_64
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_MEMBARRIER_SYNC_CORE
+ select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_PTE_DEVMAP if X86_64
@@ -157,7 +159,7 @@ config X86
select GENERIC_TIME_VSYSCALL
select GENERIC_GETTIMEOFDAY
select GENERIC_VDSO_TIME_NS
- select GUP_GET_PTE_LOW_HIGH if X86_PAE
+ select GUP_GET_PXX_LOW_HIGH if X86_PAE
select HARDIRQS_SW_RESEND
select HARDLOCKUP_CHECK_TIMESTAMP if X86_64
select HAVE_ACPI_APEI if ACPI
@@ -195,6 +197,7 @@ config X86
select HAVE_CONTEXT_TRACKING_USER_OFFSTACK if HAVE_CONTEXT_TRACKING_USER
select HAVE_C_RECORDMCOUNT
select HAVE_OBJTOOL_MCOUNT if HAVE_OBJTOOL
+ select HAVE_OBJTOOL_NOP_MCOUNT if HAVE_OBJTOOL_MCOUNT
select HAVE_BUILDTIME_MCOUNT_SORT
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_CONTIGUOUS
@@ -290,6 +293,8 @@ config X86
select X86_FEATURE_NAMES if PROC_FS
select PROC_PID_ARCH_STATUS if PROC_FS
select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX
+ select FUNCTION_ALIGNMENT_16B if X86_64 || X86_ALIGNMENT_16
+ select FUNCTION_ALIGNMENT_4B
imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI
select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
@@ -357,11 +362,6 @@ config ARCH_HAS_CPU_RELAX
config ARCH_HIBERNATION_POSSIBLE
def_bool y
-config ARCH_NR_GPIO
- int
- default 1024 if X86_64
- default 512
-
config ARCH_SUSPEND_POSSIBLE
def_bool y
@@ -462,8 +462,8 @@ config X86_X2APIC
Some Intel systems circa 2022 and later are locked into x2APIC mode
and can not fall back to the legacy APIC modes if SGX or TDX are
- enabled in the BIOS. They will be unable to boot without enabling
- this option.
+ enabled in the BIOS. They will boot with very reduced functionality
+ without enabling this option.
If you don't know what to do here, say N.
@@ -1109,7 +1109,6 @@ config X86_LOCAL_APIC
def_bool y
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI
select IRQ_DOMAIN_HIERARCHY
- select PCI_MSI_IRQ_DOMAIN if PCI_MSI
config X86_IO_APIC
def_bool y
@@ -1854,7 +1853,7 @@ config CC_HAS_IBT
config X86_KERNEL_IBT
prompt "Indirect Branch Tracking"
- bool
+ def_bool y
depends on X86_64 && CC_HAS_IBT && HAVE_OBJTOOL
# https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f
depends on !LD_IS_LLD || LLD_VERSION >= 140000
@@ -1980,6 +1979,23 @@ config EFI_STUB
See Documentation/admin-guide/efi-stub.rst for more information.
+config EFI_HANDOVER_PROTOCOL
+ bool "EFI handover protocol (DEPRECATED)"
+ depends on EFI_STUB
+ default y
+ help
+ Select this in order to include support for the deprecated EFI
+ handover protocol, which defines alternative entry points into the
+ EFI stub. This is a practice that has no basis in the UEFI
+ specification, and requires a priori knowledge on the part of the
+ bootloader about Linux/x86 specific ways of passing the command line
+ and initrd, and where in memory those assets may be loaded.
+
+ If in doubt, say Y. Even though the corresponding support is not
+ present in upstream GRUB or other bootloaders, most distros build
+ GRUB with numerous downstream patches applied, and may rely on the
+ handover protocol as a result.
+
config EFI_MIXED
bool "EFI mixed-mode support"
depends on EFI_STUB && X86_64
@@ -1994,6 +2010,37 @@ config EFI_MIXED
If unsure, say N.
+config EFI_FAKE_MEMMAP
+ bool "Enable EFI fake memory map"
+ depends on EFI
+ help
+ Saying Y here will enable the "efi_fake_mem" boot option. By specifying
+ this parameter, you can add arbitrary attributes to a specific memory
+ range by updating the original (firmware provided) EFI memmap. This is
+ useful for debugging EFI memmap related features, e.g., the Address
+ Range Mirroring feature.
+
+config EFI_MAX_FAKE_MEM
+ int "maximum allowable number of ranges in efi_fake_mem boot option"
+ depends on EFI_FAKE_MEMMAP
+ range 1 128
+ default 8
+ help
+ Maximum allowable number of ranges in efi_fake_mem boot option.
+ Ranges can be set up to this value using comma-separated list.
+ The default value is 8.
+
+config EFI_RUNTIME_MAP
+ bool "Export EFI runtime maps to sysfs" if EXPERT
+ depends on EFI
+ default KEXEC_CORE
+ help
+ Export EFI runtime memory regions to /sys/firmware/efi/runtime-map.
+ That memory map is required by the 2nd kernel to set up EFI virtual
+ mappings after kexec, but can also be used for debugging purposes.
+
+ See also Documentation/ABI/testing/sysfs-firmware-efi-runtime-map.
+
source "kernel/Kconfig.hz"
config KEXEC
@@ -2443,6 +2490,46 @@ config CC_HAS_SLS
config CC_HAS_RETURN_THUNK
def_bool $(cc-option,-mfunction-return=thunk-extern)
+config CC_HAS_ENTRY_PADDING
+ def_bool $(cc-option,-fpatchable-function-entry=16,16)
+
+config FUNCTION_PADDING_CFI
+ int
+ default 59 if FUNCTION_ALIGNMENT_64B
+ default 27 if FUNCTION_ALIGNMENT_32B
+ default 11 if FUNCTION_ALIGNMENT_16B
+ default 3 if FUNCTION_ALIGNMENT_8B
+ default 0
+
+# Basically: FUNCTION_ALIGNMENT - 5*CFI_CLANG
+# except Kconfig can't do arithmetic :/
+config FUNCTION_PADDING_BYTES
+ int
+ default FUNCTION_PADDING_CFI if CFI_CLANG
+ default FUNCTION_ALIGNMENT
+
+config CALL_PADDING
+ def_bool n
+ depends on CC_HAS_ENTRY_PADDING && OBJTOOL
+ select FUNCTION_ALIGNMENT_16B
+
+config FINEIBT
+ def_bool y
+ depends on X86_KERNEL_IBT && CFI_CLANG && RETPOLINE
+ select CALL_PADDING
+
+config HAVE_CALL_THUNKS
+ def_bool y
+ depends on CC_HAS_ENTRY_PADDING && RETHUNK && OBJTOOL
+
+config CALL_THUNKS
+ def_bool n
+ select CALL_PADDING
+
+config PREFIX_SYMBOLS
+ def_bool y
+ depends on CALL_PADDING && !CFI_CLANG
+
menuconfig SPECULATION_MITIGATIONS
bool "Mitigations for speculative execution vulnerabilities"
default y
@@ -2494,6 +2581,37 @@ config CPU_UNRET_ENTRY
help
Compile the kernel with support for the retbleed=unret mitigation.
+config CALL_DEPTH_TRACKING
+ bool "Mitigate RSB underflow with call depth tracking"
+ depends on CPU_SUP_INTEL && HAVE_CALL_THUNKS
+ select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
+ select CALL_THUNKS
+ default y
+ help
+ Compile the kernel with call depth tracking to mitigate the Intel
+ SKL Return-Stack-Buffer (RSB) underflow issue. The
+ mitigation is off by default and needs to be enabled on the
+ kernel command line via the retbleed=stuff option. For
+ non-affected systems the overhead of this option is marginal as
+ the call depth tracking uses run-time generated call thunks
+ in a compiler-generated padding area and call patching. This
+ increases text size by ~5%. For non-affected systems this space
+ is unused. On affected SKL systems this results in a significant
+ performance gain over the IBRS mitigation.
+
+config CALL_THUNKS_DEBUG
+ bool "Enable call thunks and call depth tracking debugging"
+ depends on CALL_DEPTH_TRACKING
+ select FUNCTION_ALIGNMENT_32B
+ default n
+ help
+ Enable call/ret counters for imbalance detection and build in
+ noisy dmesg output about call thunk generation and call patching for
+ troubleshooting. The debug prints need to be enabled on the
+ kernel command line with 'debug-callthunks'.
+ Only enable this when you are debugging call thunks, as this
+ creates a noticeable runtime overhead. If unsure say N.
+
config CPU_IBPB_ENTRY
bool "Enable IBPB on kernel entry"
depends on CPU_SUP_AMD && X86_64
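The FUNCTION_PADDING_CFI defaults introduced above encode the "FUNCTION_ALIGNMENT - 5*CFI_CLANG" rule from the Kconfig comment (the 5 presumably being the 5-byte kCFI type-hash instruction in the function preamble, since Kconfig cannot do the subtraction itself). A minimal standalone sketch, not kernel code, that reproduces the table of defaults:

/* Sketch only: check the FUNCTION_PADDING_CFI defaults, i.e. alignment - 5. */
#include <stdio.h>

int main(void)
{
	static const int alignment[] = { 64, 32, 16, 8 };

	for (int i = 0; i < 4; i++)
		printf("FUNCTION_ALIGNMENT_%dB -> FUNCTION_PADDING_CFI = %d\n",
		       alignment[i], alignment[i] - 5);

	return 0;	/* prints 59, 27, 11 and 3, matching the defaults above */
}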
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 415a5d138de4..9cf07322875a 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -208,10 +208,16 @@ ifdef CONFIG_SLS
KBUILD_CFLAGS += -mharden-sls=all
endif
+ifdef CONFIG_CALL_PADDING
+PADDING_CFLAGS := -fpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES)
+KBUILD_CFLAGS += $(PADDING_CFLAGS)
+export PADDING_CFLAGS
+endif
+
KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE)
ifdef CONFIG_LTO_CLANG
-ifeq ($(shell test $(CONFIG_LLD_VERSION) -lt 130000; echo $$?),0)
+ifeq ($(call test-lt, $(CONFIG_LLD_VERSION), 130000),y)
KBUILD_LDFLAGS += -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8)
endif
endif
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 9860ca5979f8..9e38ffaadb5d 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -83,7 +83,7 @@ cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
$(call if_changed,image)
- @$(kecho) 'Kernel: $@ is ready' ' (#'`cat .version`')'
+ @$(kecho) 'Kernel: $@ is ready' ' (#'$(or $(KBUILD_BUILD_VERSION),`cat .version`)')'
OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S
$(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 3a261abb6d15..1acff356d97a 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -68,7 +68,7 @@ KBUILD_LDFLAGS += $(call ld-option,--no-ld-generated-unwind-info)
# address by the bootloader.
LDFLAGS_vmlinux := -pie $(call ld-option, --no-dynamic-linker)
ifdef CONFIG_LD_ORPHAN_WARN
-LDFLAGS_vmlinux += --orphan-handling=warn
+LDFLAGS_vmlinux += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL)
endif
LDFLAGS_vmlinux += -z noexecstack
ifeq ($(CONFIG_LD_IS_BFD),y)
@@ -100,7 +100,7 @@ vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
ifdef CONFIG_X86_64
vmlinux-objs-y += $(obj)/ident_map_64.o
vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o
- vmlinux-objs-y += $(obj)/mem_encrypt.o
+ vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/mem_encrypt.o
vmlinux-objs-y += $(obj)/pgtable_64.o
vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o
endif
@@ -108,11 +108,11 @@ endif
vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o
-vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o
vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o
-efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a
+vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_mixed.o
+vmlinux-objs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
-$(obj)/vmlinux: $(vmlinux-objs-y) $(efi-obj-y) FORCE
+$(obj)/vmlinux: $(vmlinux-objs-y) FORCE
$(call if_changed,ld)
OBJCOPYFLAGS_vmlinux.bin := -R .comment -S
diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S
new file mode 100644
index 000000000000..4ca70bf93dc0
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_mixed.S
@@ -0,0 +1,345 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming
+ *
+ * Early support for invoking 32-bit EFI services from a 64-bit kernel.
+ *
+ * Because this thunking occurs before ExitBootServices() we have to
+ * restore the firmware's 32-bit GDT and IDT before we make EFI service
+ * calls.
+ *
+ * On the plus side, we don't have to worry about mangling 64-bit
+ * addresses into 32-bits because we're executing with an identity
+ * mapped pagetable and haven't transitioned to 64-bit virtual addresses
+ * yet.
+ */
+
+#include <linux/linkage.h>
+#include <asm/msr.h>
+#include <asm/page_types.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+
+ .code64
+ .text
+/*
+ * When booting in 64-bit mode on 32-bit EFI firmware, startup_64_mixed_mode()
+ * is the first thing that runs after switching to long mode. Depending on
+ * whether the EFI handover protocol or the compat entry point was used to
+ * enter the kernel, it will either branch to the 64-bit EFI handover
+ * entrypoint at offset 0x390 in the image, or to the 64-bit EFI PE/COFF
+ * entrypoint efi_pe_entry(). In the former case, the bootloader must provide a
+ * struct bootparams pointer as the third argument, so the presence of such a
+ * pointer is used to disambiguate.
+ *
+ * +--------------+
+ * +------------------+ +------------+ +------>| efi_pe_entry |
+ * | efi32_pe_entry |---->| | | +-----------+--+
+ * +------------------+ | | +------+----------------+ |
+ * | startup_32 |---->| startup_64_mixed_mode | |
+ * +------------------+ | | +------+----------------+ V
+ * | efi32_stub_entry |---->| | | +------------------+
+ * +------------------+ +------------+ +---->| efi64_stub_entry |
+ * +-------------+----+
+ * +------------+ +----------+ |
+ * | startup_64 |<----| efi_main |<--------------+
+ * +------------+ +----------+
+ */
+SYM_FUNC_START(startup_64_mixed_mode)
+ lea efi32_boot_args(%rip), %rdx
+ mov 0(%rdx), %edi
+ mov 4(%rdx), %esi
+ mov 8(%rdx), %edx // saved bootparams pointer
+ test %edx, %edx
+ jnz efi64_stub_entry
+ /*
+ * efi_pe_entry uses MS calling convention, which requires 32 bytes of
+ * shadow space on the stack even if all arguments are passed in
+ * registers. We also need an additional 8 bytes for the space that
+ * would be occupied by the return address, and this also results in
+ * the correct stack alignment for entry.
+ */
+ sub $40, %rsp
+ mov %rdi, %rcx // MS calling convention
+ mov %rsi, %rdx
+ jmp efi_pe_entry
+SYM_FUNC_END(startup_64_mixed_mode)
+
+SYM_FUNC_START(__efi64_thunk)
+ push %rbp
+ push %rbx
+
+ movl %ds, %eax
+ push %rax
+ movl %es, %eax
+ push %rax
+ movl %ss, %eax
+ push %rax
+
+ /* Copy args passed on stack */
+ movq 0x30(%rsp), %rbp
+ movq 0x38(%rsp), %rbx
+ movq 0x40(%rsp), %rax
+
+ /*
+ * Convert x86-64 ABI params to i386 ABI
+ */
+ subq $64, %rsp
+ movl %esi, 0x0(%rsp)
+ movl %edx, 0x4(%rsp)
+ movl %ecx, 0x8(%rsp)
+ movl %r8d, 0xc(%rsp)
+ movl %r9d, 0x10(%rsp)
+ movl %ebp, 0x14(%rsp)
+ movl %ebx, 0x18(%rsp)
+ movl %eax, 0x1c(%rsp)
+
+ leaq 0x20(%rsp), %rbx
+ sgdt (%rbx)
+ sidt 16(%rbx)
+
+ leaq 1f(%rip), %rbp
+
+ /*
+ * Switch to IDT and GDT with 32-bit segments. These are the firmware
+ * GDT and IDT that were installed when the kernel started executing.
+ * The pointers were saved by the efi32_entry() routine below.
+ *
+ * Pass the saved DS selector to the 32-bit code, and use far return to
+ * restore the saved CS selector.
+ */
+ lidt efi32_boot_idt(%rip)
+ lgdt efi32_boot_gdt(%rip)
+
+ movzwl efi32_boot_ds(%rip), %edx
+ movzwq efi32_boot_cs(%rip), %rax
+ pushq %rax
+ leaq efi_enter32(%rip), %rax
+ pushq %rax
+ lretq
+
+1: addq $64, %rsp
+ movq %rdi, %rax
+
+ pop %rbx
+ movl %ebx, %ss
+ pop %rbx
+ movl %ebx, %es
+ pop %rbx
+ movl %ebx, %ds
+ /* Clear out 32-bit selector from FS and GS */
+ xorl %ebx, %ebx
+ movl %ebx, %fs
+ movl %ebx, %gs
+
+ pop %rbx
+ pop %rbp
+ RET
+SYM_FUNC_END(__efi64_thunk)
+
+ .code32
+/*
+ * EFI service pointer must be in %edi.
+ *
+ * The stack should represent the 32-bit calling convention.
+ */
+SYM_FUNC_START_LOCAL(efi_enter32)
+ /* Load firmware selector into data and stack segment registers */
+ movl %edx, %ds
+ movl %edx, %es
+ movl %edx, %fs
+ movl %edx, %gs
+ movl %edx, %ss
+
+ /* Reload pgtables */
+ movl %cr3, %eax
+ movl %eax, %cr3
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Disable long mode via EFER */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btrl $_EFER_LME, %eax
+ wrmsr
+
+ call *%edi
+
+ /* We must preserve return value */
+ movl %eax, %edi
+
+ /*
+ * Some firmware will return with interrupts enabled. Be sure to
+ * disable them before we switch GDTs and IDTs.
+ */
+ cli
+
+ lidtl 16(%ebx)
+ lgdtl (%ebx)
+
+ movl %cr4, %eax
+ btsl $(X86_CR4_PAE_BIT), %eax
+ movl %eax, %cr4
+
+ movl %cr3, %eax
+ movl %eax, %cr3
+
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+
+ xorl %eax, %eax
+ lldt %ax
+
+ pushl $__KERNEL_CS
+ pushl %ebp
+
+ /* Enable paging */
+ movl %cr0, %eax
+ btsl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+ lret
+SYM_FUNC_END(efi_enter32)
+
+/*
+ * This is the common EFI stub entry point for mixed mode.
+ *
+ * Arguments: %ecx image handle
+ * %edx EFI system table pointer
+ * %esi struct bootparams pointer (or NULL when not using
+ * the EFI handover protocol)
+ *
+ * Since this is the point of no return for ordinary execution, no registers
+ * are considered live except for the function parameters. [Note that the EFI
+ * stub may still exit and return to the firmware using the Exit() EFI boot
+ * service.]
+ */
+SYM_FUNC_START(efi32_entry)
+ call 1f
+1: pop %ebx
+
+ /* Save firmware GDTR and code/data selectors */
+ sgdtl (efi32_boot_gdt - 1b)(%ebx)
+ movw %cs, (efi32_boot_cs - 1b)(%ebx)
+ movw %ds, (efi32_boot_ds - 1b)(%ebx)
+
+ /* Store firmware IDT descriptor */
+ sidtl (efi32_boot_idt - 1b)(%ebx)
+
+ /* Store boot arguments */
+ leal (efi32_boot_args - 1b)(%ebx), %ebx
+ movl %ecx, 0(%ebx)
+ movl %edx, 4(%ebx)
+ movl %esi, 8(%ebx)
+ movb $0x0, 12(%ebx) // efi_is64
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ jmp startup_32
+SYM_FUNC_END(efi32_entry)
+
+#define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime)
+#define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol)
+#define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base)
+
+/*
+ * efi_status_t efi32_pe_entry(efi_handle_t image_handle,
+ * efi_system_table_32_t *sys_table)
+ */
+SYM_FUNC_START(efi32_pe_entry)
+ pushl %ebp
+ movl %esp, %ebp
+ pushl %eax // dummy push to allocate loaded_image
+
+ pushl %ebx // save callee-save registers
+ pushl %edi
+
+ call verify_cpu // check for long mode support
+ testl %eax, %eax
+ movl $0x80000003, %eax // EFI_UNSUPPORTED
+ jnz 2f
+
+ call 1f
+1: pop %ebx
+
+ /* Get the loaded image protocol pointer from the image handle */
+ leal -4(%ebp), %eax
+ pushl %eax // &loaded_image
+ leal (loaded_image_proto - 1b)(%ebx), %eax
+ pushl %eax // pass the GUID address
+ pushl 8(%ebp) // pass the image handle
+
+ /*
+ * Note the alignment of the stack frame.
+ * sys_table
+ * handle <-- 16-byte aligned on entry by ABI
+ * return address
+ * frame pointer
+ * loaded_image <-- local variable
+ * saved %ebx <-- 16-byte aligned here
+ * saved %edi
+ * &loaded_image
+ * &loaded_image_proto
+ * handle <-- 16-byte aligned for call to handle_protocol
+ */
+
+ movl 12(%ebp), %eax // sys_table
+ movl ST32_boottime(%eax), %eax // sys_table->boottime
+ call *BS32_handle_protocol(%eax) // sys_table->boottime->handle_protocol
+ addl $12, %esp // restore argument space
+ testl %eax, %eax
+ jnz 2f
+
+ movl 8(%ebp), %ecx // image_handle
+ movl 12(%ebp), %edx // sys_table
+ movl -4(%ebp), %esi // loaded_image
+ movl LI32_image_base(%esi), %esi // loaded_image->image_base
+ leal (startup_32 - 1b)(%ebx), %ebp // runtime address of startup_32
+ /*
+ * We need to set the image_offset variable here since startup_32() will
+ * use it before we get to the 64-bit efi_pe_entry() in C code.
+ */
+ subl %esi, %ebp // calculate image_offset
+ movl %ebp, (image_offset - 1b)(%ebx) // save image_offset
+ xorl %esi, %esi
+ jmp efi32_entry // pass %ecx, %edx, %esi
+ // no other registers remain live
+
+2: popl %edi // restore callee-save registers
+ popl %ebx
+ leave
+ RET
+SYM_FUNC_END(efi32_pe_entry)
+
+ .section ".rodata"
+ /* EFI loaded image protocol GUID */
+ .balign 4
+SYM_DATA_START_LOCAL(loaded_image_proto)
+ .long 0x5b1b31a1
+ .word 0x9562, 0x11d2
+ .byte 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b
+SYM_DATA_END(loaded_image_proto)
+
+ .data
+ .balign 8
+SYM_DATA_START_LOCAL(efi32_boot_gdt)
+ .word 0
+ .quad 0
+SYM_DATA_END(efi32_boot_gdt)
+
+SYM_DATA_START_LOCAL(efi32_boot_idt)
+ .word 0
+ .quad 0
+SYM_DATA_END(efi32_boot_idt)
+
+SYM_DATA_LOCAL(efi32_boot_cs, .word 0)
+SYM_DATA_LOCAL(efi32_boot_ds, .word 0)
+SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0)
+SYM_DATA(efi_is64, .byte 1)
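The loaded_image_proto blob at the end of the new efi_mixed.S above is the well-known EFI_LOADED_IMAGE_PROTOCOL GUID, 5b1b31a1-9562-11d2-8e3f-00a0c969723b, spelled out with .long/.word/.byte directives. A hedged sketch of the equivalent C layout (the struct and field names are illustrative, not taken from the EFI headers):

/* Illustration only: how the directives above map onto the usual GUID layout. */
#include <stdint.h>
#include <stdio.h>

struct guid {
	uint32_t data1;
	uint16_t data2, data3;
	uint8_t  data4[8];
};

int main(void)
{
	static const struct guid loaded_image_proto = {
		.data1 = 0x5b1b31a1,
		.data2 = 0x9562,
		.data3 = 0x11d2,
		.data4 = { 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b },
	};

	printf("%08x-%04x-%04x-%02x%02x-", loaded_image_proto.data1,
	       loaded_image_proto.data2, loaded_image_proto.data3,
	       loaded_image_proto.data4[0], loaded_image_proto.data4[1]);
	for (int i = 2; i < 8; i++)
		printf("%02x", loaded_image_proto.data4[i]);
	printf("\n");

	return 0;	/* prints the GUID string quoted above */
}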
diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S
deleted file mode 100644
index 67e7edcdfea8..000000000000
--- a/arch/x86/boot/compressed/efi_thunk_64.S
+++ /dev/null
@@ -1,195 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming
- *
- * Early support for invoking 32-bit EFI services from a 64-bit kernel.
- *
- * Because this thunking occurs before ExitBootServices() we have to
- * restore the firmware's 32-bit GDT and IDT before we make EFI service
- * calls.
- *
- * On the plus side, we don't have to worry about mangling 64-bit
- * addresses into 32-bits because we're executing with an identity
- * mapped pagetable and haven't transitioned to 64-bit virtual addresses
- * yet.
- */
-
-#include <linux/linkage.h>
-#include <asm/msr.h>
-#include <asm/page_types.h>
-#include <asm/processor-flags.h>
-#include <asm/segment.h>
-
- .code64
- .text
-SYM_FUNC_START(__efi64_thunk)
- push %rbp
- push %rbx
-
- movl %ds, %eax
- push %rax
- movl %es, %eax
- push %rax
- movl %ss, %eax
- push %rax
-
- /* Copy args passed on stack */
- movq 0x30(%rsp), %rbp
- movq 0x38(%rsp), %rbx
- movq 0x40(%rsp), %rax
-
- /*
- * Convert x86-64 ABI params to i386 ABI
- */
- subq $64, %rsp
- movl %esi, 0x0(%rsp)
- movl %edx, 0x4(%rsp)
- movl %ecx, 0x8(%rsp)
- movl %r8d, 0xc(%rsp)
- movl %r9d, 0x10(%rsp)
- movl %ebp, 0x14(%rsp)
- movl %ebx, 0x18(%rsp)
- movl %eax, 0x1c(%rsp)
-
- leaq 0x20(%rsp), %rbx
- sgdt (%rbx)
-
- addq $16, %rbx
- sidt (%rbx)
-
- leaq 1f(%rip), %rbp
-
- /*
- * Switch to IDT and GDT with 32-bit segments. This is the firmware GDT
- * and IDT that was installed when the kernel started executing. The
- * pointers were saved at the EFI stub entry point in head_64.S.
- *
- * Pass the saved DS selector to the 32-bit code, and use far return to
- * restore the saved CS selector.
- */
- leaq efi32_boot_idt(%rip), %rax
- lidt (%rax)
- leaq efi32_boot_gdt(%rip), %rax
- lgdt (%rax)
-
- movzwl efi32_boot_ds(%rip), %edx
- movzwq efi32_boot_cs(%rip), %rax
- pushq %rax
- leaq efi_enter32(%rip), %rax
- pushq %rax
- lretq
-
-1: addq $64, %rsp
- movq %rdi, %rax
-
- pop %rbx
- movl %ebx, %ss
- pop %rbx
- movl %ebx, %es
- pop %rbx
- movl %ebx, %ds
- /* Clear out 32-bit selector from FS and GS */
- xorl %ebx, %ebx
- movl %ebx, %fs
- movl %ebx, %gs
-
- /*
- * Convert 32-bit status code into 64-bit.
- */
- roll $1, %eax
- rorq $1, %rax
-
- pop %rbx
- pop %rbp
- RET
-SYM_FUNC_END(__efi64_thunk)
-
- .code32
-/*
- * EFI service pointer must be in %edi.
- *
- * The stack should represent the 32-bit calling convention.
- */
-SYM_FUNC_START_LOCAL(efi_enter32)
- /* Load firmware selector into data and stack segment registers */
- movl %edx, %ds
- movl %edx, %es
- movl %edx, %fs
- movl %edx, %gs
- movl %edx, %ss
-
- /* Reload pgtables */
- movl %cr3, %eax
- movl %eax, %cr3
-
- /* Disable paging */
- movl %cr0, %eax
- btrl $X86_CR0_PG_BIT, %eax
- movl %eax, %cr0
-
- /* Disable long mode via EFER */
- movl $MSR_EFER, %ecx
- rdmsr
- btrl $_EFER_LME, %eax
- wrmsr
-
- call *%edi
-
- /* We must preserve return value */
- movl %eax, %edi
-
- /*
- * Some firmware will return with interrupts enabled. Be sure to
- * disable them before we switch GDTs and IDTs.
- */
- cli
-
- lidtl (%ebx)
- subl $16, %ebx
-
- lgdtl (%ebx)
-
- movl %cr4, %eax
- btsl $(X86_CR4_PAE_BIT), %eax
- movl %eax, %cr4
-
- movl %cr3, %eax
- movl %eax, %cr3
-
- movl $MSR_EFER, %ecx
- rdmsr
- btsl $_EFER_LME, %eax
- wrmsr
-
- xorl %eax, %eax
- lldt %ax
-
- pushl $__KERNEL_CS
- pushl %ebp
-
- /* Enable paging */
- movl %cr0, %eax
- btsl $X86_CR0_PG_BIT, %eax
- movl %eax, %cr0
- lret
-SYM_FUNC_END(efi_enter32)
-
- .data
- .balign 8
-SYM_DATA_START(efi32_boot_gdt)
- .word 0
- .quad 0
-SYM_DATA_END(efi32_boot_gdt)
-
-SYM_DATA_START(efi32_boot_idt)
- .word 0
- .quad 0
-SYM_DATA_END(efi32_boot_idt)
-
-SYM_DATA_START(efi32_boot_cs)
- .word 0
-SYM_DATA_END(efi32_boot_cs)
-
-SYM_DATA_START(efi32_boot_ds)
- .word 0
-SYM_DATA_END(efi32_boot_ds)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 3b354eb9516d..6589ddd4cfaf 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -208,10 +208,6 @@ SYM_DATA_START_LOCAL(gdt)
.quad 0x00cf92000000ffff /* __KERNEL_DS */
SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end)
-#ifdef CONFIG_EFI_STUB
-SYM_DATA(image_offset, .long 0)
-#endif
-
/*
* Stack and heap for uncompression
*/
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index d33f060900d2..a75712991df3 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -38,6 +38,14 @@
#include "pgtable.h"
/*
+ * Fix alignment at 16 bytes. Following CONFIG_FUNCTION_ALIGNMENT will result
+ * in assembly errors due to trying to move .org backward due to the excessive
+ * alignment.
+ */
+#undef __ALIGN
+#define __ALIGN .balign 16, 0x90
+
+/*
* Locally defined symbols should be marked hidden:
*/
.hidden _bss
@@ -118,7 +126,9 @@ SYM_FUNC_START(startup_32)
1:
/* Setup Exception handling for SEV-ES */
+#ifdef CONFIG_AMD_MEM_ENCRYPT
call startup32_load_idt
+#endif
/* Make sure cpu supports long mode. */
call verify_cpu
@@ -178,12 +188,13 @@ SYM_FUNC_START(startup_32)
*/
/*
* If SEV is active then set the encryption mask in the page tables.
- * This will insure that when the kernel is copied and decompressed
+ * This will ensure that when the kernel is copied and decompressed
* it will be done so encrypted.
*/
- call get_sev_encryption_bit
xorl %edx, %edx
#ifdef CONFIG_AMD_MEM_ENCRYPT
+ call get_sev_encryption_bit
+ xorl %edx, %edx
testl %eax, %eax
jz 1f
subl $32, %eax /* Encryption bit is always above bit 31 */
@@ -249,6 +260,11 @@ SYM_FUNC_START(startup_32)
movl $__BOOT_TSS, %eax
ltr %ax
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /* Check if the C-bit position is correct when SEV is active */
+ call startup32_check_sev_cbit
+#endif
+
/*
* Setup for the jump to 64bit mode
*
@@ -261,29 +277,11 @@ SYM_FUNC_START(startup_32)
*/
leal rva(startup_64)(%ebp), %eax
#ifdef CONFIG_EFI_MIXED
- movl rva(efi32_boot_args)(%ebp), %edi
- testl %edi, %edi
- jz 1f
- leal rva(efi64_stub_entry)(%ebp), %eax
- movl rva(efi32_boot_args+4)(%ebp), %esi
- movl rva(efi32_boot_args+8)(%ebp), %edx // saved bootparams pointer
- testl %edx, %edx
- jnz 1f
- /*
- * efi_pe_entry uses MS calling convention, which requires 32 bytes of
- * shadow space on the stack even if all arguments are passed in
- * registers. We also need an additional 8 bytes for the space that
- * would be occupied by the return address, and this also results in
- * the correct stack alignment for entry.
- */
- subl $40, %esp
- leal rva(efi_pe_entry)(%ebp), %eax
- movl %edi, %ecx // MS calling convention
- movl %esi, %edx
+ cmpb $1, rva(efi_is64)(%ebp)
+ je 1f
+ leal rva(startup_64_mixed_mode)(%ebp), %eax
1:
#endif
- /* Check if the C-bit position is correct when SEV is active */
- call startup32_check_sev_cbit
pushl $__KERNEL_CS
pushl %eax
@@ -296,38 +294,14 @@ SYM_FUNC_START(startup_32)
lret
SYM_FUNC_END(startup_32)
-#ifdef CONFIG_EFI_MIXED
+#if IS_ENABLED(CONFIG_EFI_MIXED) && IS_ENABLED(CONFIG_EFI_HANDOVER_PROTOCOL)
.org 0x190
SYM_FUNC_START(efi32_stub_entry)
add $0x4, %esp /* Discard return address */
popl %ecx
popl %edx
popl %esi
-
- call 1f
-1: pop %ebp
- subl $ rva(1b), %ebp
-
- movl %esi, rva(efi32_boot_args+8)(%ebp)
-SYM_INNER_LABEL(efi32_pe_stub_entry, SYM_L_LOCAL)
- movl %ecx, rva(efi32_boot_args)(%ebp)
- movl %edx, rva(efi32_boot_args+4)(%ebp)
- movb $0, rva(efi_is64)(%ebp)
-
- /* Save firmware GDTR and code/data selectors */
- sgdtl rva(efi32_boot_gdt)(%ebp)
- movw %cs, rva(efi32_boot_cs)(%ebp)
- movw %ds, rva(efi32_boot_ds)(%ebp)
-
- /* Store firmware IDT descriptor */
- sidtl rva(efi32_boot_idt)(%ebp)
-
- /* Disable paging */
- movl %cr0, %eax
- btrl $X86_CR0_PG_BIT, %eax
- movl %eax, %cr0
-
- jmp startup_32
+ jmp efi32_entry
SYM_FUNC_END(efi32_stub_entry)
#endif
@@ -550,7 +524,9 @@ trampoline_return:
SYM_CODE_END(startup_64)
#ifdef CONFIG_EFI_STUB
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
.org 0x390
+#endif
SYM_FUNC_START(efi64_stub_entry)
and $~0xf, %rsp /* realign the stack */
movq %rdx, %rbx /* save boot_params pointer */
@@ -713,6 +689,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode)
jmp 1b
SYM_FUNC_END(.Lno_longmode)
+ .globl verify_cpu
#include "../../kernel/verify_cpu.S"
.data
@@ -744,242 +721,6 @@ SYM_DATA_START(boot_idt)
.endr
SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end)
-#ifdef CONFIG_AMD_MEM_ENCRYPT
-SYM_DATA_START(boot32_idt_desc)
- .word boot32_idt_end - boot32_idt - 1
- .long 0
-SYM_DATA_END(boot32_idt_desc)
- .balign 8
-SYM_DATA_START(boot32_idt)
- .rept 32
- .quad 0
- .endr
-SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end)
-#endif
-
-#ifdef CONFIG_EFI_STUB
-SYM_DATA(image_offset, .long 0)
-#endif
-#ifdef CONFIG_EFI_MIXED
-SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0)
-SYM_DATA(efi_is64, .byte 1)
-
-#define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime)
-#define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol)
-#define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base)
-
- __HEAD
- .code32
-SYM_FUNC_START(efi32_pe_entry)
-/*
- * efi_status_t efi32_pe_entry(efi_handle_t image_handle,
- * efi_system_table_32_t *sys_table)
- */
-
- pushl %ebp
- movl %esp, %ebp
- pushl %eax // dummy push to allocate loaded_image
-
- pushl %ebx // save callee-save registers
- pushl %edi
-
- call verify_cpu // check for long mode support
- testl %eax, %eax
- movl $0x80000003, %eax // EFI_UNSUPPORTED
- jnz 2f
-
- call 1f
-1: pop %ebx
- subl $ rva(1b), %ebx
-
- /* Get the loaded image protocol pointer from the image handle */
- leal -4(%ebp), %eax
- pushl %eax // &loaded_image
- leal rva(loaded_image_proto)(%ebx), %eax
- pushl %eax // pass the GUID address
- pushl 8(%ebp) // pass the image handle
-
- /*
- * Note the alignment of the stack frame.
- * sys_table
- * handle <-- 16-byte aligned on entry by ABI
- * return address
- * frame pointer
- * loaded_image <-- local variable
- * saved %ebx <-- 16-byte aligned here
- * saved %edi
- * &loaded_image
- * &loaded_image_proto
- * handle <-- 16-byte aligned for call to handle_protocol
- */
-
- movl 12(%ebp), %eax // sys_table
- movl ST32_boottime(%eax), %eax // sys_table->boottime
- call *BS32_handle_protocol(%eax) // sys_table->boottime->handle_protocol
- addl $12, %esp // restore argument space
- testl %eax, %eax
- jnz 2f
-
- movl 8(%ebp), %ecx // image_handle
- movl 12(%ebp), %edx // sys_table
- movl -4(%ebp), %esi // loaded_image
- movl LI32_image_base(%esi), %esi // loaded_image->image_base
- movl %ebx, %ebp // startup_32 for efi32_pe_stub_entry
- /*
- * We need to set the image_offset variable here since startup_32() will
- * use it before we get to the 64-bit efi_pe_entry() in C code.
- */
- subl %esi, %ebx
- movl %ebx, rva(image_offset)(%ebp) // save image_offset
- jmp efi32_pe_stub_entry
-
-2: popl %edi // restore callee-save registers
- popl %ebx
- leave
- RET
-SYM_FUNC_END(efi32_pe_entry)
-
- .section ".rodata"
- /* EFI loaded image protocol GUID */
- .balign 4
-SYM_DATA_START_LOCAL(loaded_image_proto)
- .long 0x5b1b31a1
- .word 0x9562, 0x11d2
- .byte 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b
-SYM_DATA_END(loaded_image_proto)
-#endif
-
-#ifdef CONFIG_AMD_MEM_ENCRYPT
- __HEAD
- .code32
-/*
- * Write an IDT entry into boot32_idt
- *
- * Parameters:
- *
- * %eax: Handler address
- * %edx: Vector number
- *
- * Physical offset is expected in %ebp
- */
-SYM_FUNC_START(startup32_set_idt_entry)
- push %ebx
- push %ecx
-
- /* IDT entry address to %ebx */
- leal rva(boot32_idt)(%ebp), %ebx
- shl $3, %edx
- addl %edx, %ebx
-
- /* Build IDT entry, lower 4 bytes */
- movl %eax, %edx
- andl $0x0000ffff, %edx # Target code segment offset [15:0]
- movl $__KERNEL32_CS, %ecx # Target code segment selector
- shl $16, %ecx
- orl %ecx, %edx
-
- /* Store lower 4 bytes to IDT */
- movl %edx, (%ebx)
-
- /* Build IDT entry, upper 4 bytes */
- movl %eax, %edx
- andl $0xffff0000, %edx # Target code segment offset [31:16]
- orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate
-
- /* Store upper 4 bytes to IDT */
- movl %edx, 4(%ebx)
-
- pop %ecx
- pop %ebx
- RET
-SYM_FUNC_END(startup32_set_idt_entry)
-#endif
-
-SYM_FUNC_START(startup32_load_idt)
-#ifdef CONFIG_AMD_MEM_ENCRYPT
- /* #VC handler */
- leal rva(startup32_vc_handler)(%ebp), %eax
- movl $X86_TRAP_VC, %edx
- call startup32_set_idt_entry
-
- /* Load IDT */
- leal rva(boot32_idt)(%ebp), %eax
- movl %eax, rva(boot32_idt_desc+2)(%ebp)
- lidt rva(boot32_idt_desc)(%ebp)
-#endif
- RET
-SYM_FUNC_END(startup32_load_idt)
-
-/*
- * Check for the correct C-bit position when the startup_32 boot-path is used.
- *
- * The check makes use of the fact that all memory is encrypted when paging is
- * disabled. The function creates 64 bits of random data using the RDRAND
- * instruction. RDRAND is mandatory for SEV guests, so always available. If the
- * hypervisor violates that the kernel will crash right here.
- *
- * The 64 bits of random data are stored to a memory location and at the same
- * time kept in the %eax and %ebx registers. Since encryption is always active
- * when paging is off the random data will be stored encrypted in main memory.
- *
- * Then paging is enabled. When the C-bit position is correct all memory is
- * still mapped encrypted and comparing the register values with memory will
- * succeed. An incorrect C-bit position will map all memory unencrypted, so that
- * the compare will use the encrypted random data and fail.
- */
-SYM_FUNC_START(startup32_check_sev_cbit)
-#ifdef CONFIG_AMD_MEM_ENCRYPT
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
-
- /* Check for non-zero sev_status */
- movl rva(sev_status)(%ebp), %eax
- testl %eax, %eax
- jz 4f
-
- /*
- * Get two 32-bit random values - Don't bail out if RDRAND fails
- * because it is better to prevent forward progress if no random value
- * can be gathered.
- */
-1: rdrand %eax
- jnc 1b
-2: rdrand %ebx
- jnc 2b
-
- /* Store to memory and keep it in the registers */
- movl %eax, rva(sev_check_data)(%ebp)
- movl %ebx, rva(sev_check_data+4)(%ebp)
-
- /* Enable paging to see if encryption is active */
- movl %cr0, %edx /* Backup %cr0 in %edx */
- movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */
- movl %ecx, %cr0
-
- cmpl %eax, rva(sev_check_data)(%ebp)
- jne 3f
- cmpl %ebx, rva(sev_check_data+4)(%ebp)
- jne 3f
-
- movl %edx, %cr0 /* Restore previous %cr0 */
-
- jmp 4f
-
-3: /* Check failed - hlt the machine */
- hlt
- jmp 3b
-
-4:
- popl %edx
- popl %ecx
- popl %ebx
- popl %eax
-#endif
- RET
-SYM_FUNC_END(startup32_check_sev_cbit)
-
/*
* Stack and heap for uncompression
*/
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index e476bcbd9b42..454757fbdfe5 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -668,7 +668,7 @@ static bool process_mem_region(struct mem_vector *region,
}
}
#endif
- return 0;
+ return false;
}
#ifdef CONFIG_EFI
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
index a73e4d783cae..32f7cc8a8625 100644
--- a/arch/x86/boot/compressed/mem_encrypt.S
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -12,16 +12,13 @@
#include <asm/processor-flags.h>
#include <asm/msr.h>
#include <asm/asm-offsets.h>
+#include <asm/segment.h>
+#include <asm/trapnr.h>
.text
.code32
SYM_FUNC_START(get_sev_encryption_bit)
- xor %eax, %eax
-
-#ifdef CONFIG_AMD_MEM_ENCRYPT
push %ebx
- push %ecx
- push %edx
movl $0x80000000, %eax /* CPUID to check the highest leaf */
cpuid
@@ -52,12 +49,7 @@ SYM_FUNC_START(get_sev_encryption_bit)
xor %eax, %eax
.Lsev_exit:
- pop %edx
- pop %ecx
pop %ebx
-
-#endif /* CONFIG_AMD_MEM_ENCRYPT */
-
RET
SYM_FUNC_END(get_sev_encryption_bit)
@@ -98,7 +90,7 @@ SYM_CODE_START_LOCAL(sev_es_req_cpuid)
jmp 1b
SYM_CODE_END(sev_es_req_cpuid)
-SYM_CODE_START(startup32_vc_handler)
+SYM_CODE_START_LOCAL(startup32_vc_handler)
pushl %eax
pushl %ebx
pushl %ecx
@@ -184,15 +176,149 @@ SYM_CODE_START(startup32_vc_handler)
jmp .Lfail
SYM_CODE_END(startup32_vc_handler)
+/*
+ * Write an IDT entry into boot32_idt
+ *
+ * Parameters:
+ *
+ * %eax: Handler address
+ * %edx: Vector number
+ * %ecx: IDT address
+ */
+SYM_FUNC_START_LOCAL(startup32_set_idt_entry)
+ /* IDT entry address to %ecx */
+ leal (%ecx, %edx, 8), %ecx
+
+ /* Build IDT entry, lower 4 bytes */
+ movl %eax, %edx
+ andl $0x0000ffff, %edx # Target code segment offset [15:0]
+ orl $(__KERNEL32_CS << 16), %edx # Target code segment selector
+
+ /* Store lower 4 bytes to IDT */
+ movl %edx, (%ecx)
+
+ /* Build IDT entry, upper 4 bytes */
+ movl %eax, %edx
+ andl $0xffff0000, %edx # Target code segment offset [31:16]
+ orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate
+
+ /* Store upper 4 bytes to IDT */
+ movl %edx, 4(%ecx)
+
+ RET
+SYM_FUNC_END(startup32_set_idt_entry)
+
+SYM_FUNC_START(startup32_load_idt)
+ push %ebp
+ push %ebx
+
+ call 1f
+1: pop %ebp
+
+ leal (boot32_idt - 1b)(%ebp), %ebx
+
+ /* #VC handler */
+ leal (startup32_vc_handler - 1b)(%ebp), %eax
+ movl $X86_TRAP_VC, %edx
+ movl %ebx, %ecx
+ call startup32_set_idt_entry
+
+ /* Load IDT */
+ leal (boot32_idt_desc - 1b)(%ebp), %ecx
+ movl %ebx, 2(%ecx)
+ lidt (%ecx)
+
+ pop %ebx
+ pop %ebp
+ RET
+SYM_FUNC_END(startup32_load_idt)
+
+/*
+ * Check for the correct C-bit position when the startup_32 boot-path is used.
+ *
+ * The check makes use of the fact that all memory is encrypted when paging is
+ * disabled. The function creates 64 bits of random data using the RDRAND
+ * instruction. RDRAND is mandatory for SEV guests, so always available. If the
+ * hypervisor violates that the kernel will crash right here.
+ *
+ * The 64 bits of random data are stored to a memory location and at the same
+ * time kept in the %eax and %ebx registers. Since encryption is always active
+ * when paging is off the random data will be stored encrypted in main memory.
+ *
+ * Then paging is enabled. When the C-bit position is correct all memory is
+ * still mapped encrypted and comparing the register values with memory will
+ * succeed. An incorrect C-bit position will map all memory unencrypted, so that
+ * the compare will use the encrypted random data and fail.
+ */
+SYM_FUNC_START(startup32_check_sev_cbit)
+ pushl %ebx
+ pushl %ebp
+
+ call 0f
+0: popl %ebp
+
+ /* Check for non-zero sev_status */
+ movl (sev_status - 0b)(%ebp), %eax
+ testl %eax, %eax
+ jz 4f
+
+ /*
+ * Get two 32-bit random values - Don't bail out if RDRAND fails
+ * because it is better to prevent forward progress if no random value
+ * can be gathered.
+ */
+1: rdrand %eax
+ jnc 1b
+2: rdrand %ebx
+ jnc 2b
+
+ /* Store to memory and keep it in the registers */
+ leal (sev_check_data - 0b)(%ebp), %ebp
+ movl %eax, 0(%ebp)
+ movl %ebx, 4(%ebp)
+
+ /* Enable paging to see if encryption is active */
+ movl %cr0, %edx /* Backup %cr0 in %edx */
+ movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */
+ movl %ecx, %cr0
+
+ cmpl %eax, 0(%ebp)
+ jne 3f
+ cmpl %ebx, 4(%ebp)
+ jne 3f
+
+ movl %edx, %cr0 /* Restore previous %cr0 */
+
+ jmp 4f
+
+3: /* Check failed - hlt the machine */
+ hlt
+ jmp 3b
+
+4:
+ popl %ebp
+ popl %ebx
+ RET
+SYM_FUNC_END(startup32_check_sev_cbit)
+
.code64
#include "../../kernel/sev_verify_cbit.S"
.data
-#ifdef CONFIG_AMD_MEM_ENCRYPT
.balign 8
SYM_DATA(sme_me_mask, .quad 0)
SYM_DATA(sev_status, .quad 0)
SYM_DATA(sev_check_data, .quad 0)
-#endif
+
+SYM_DATA_START_LOCAL(boot32_idt)
+ .rept 32
+ .quad 0
+ .endr
+SYM_DATA_END(boot32_idt)
+
+SYM_DATA_START_LOCAL(boot32_idt_desc)
+ .word . - boot32_idt - 1
+ .long 0
+SYM_DATA_END(boot32_idt_desc)
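The two dwords that startup32_set_idt_entry() assembles above follow the standard 32-bit interrupt-gate layout: the handler offset split across both words, the code segment selector in bits 16-31 of the low word, and 0x8e00 marking a present 32-bit interrupt gate. A quick userspace sketch of that encoding (the 0x18 selector and handler address are only stand-ins, not the real __KERNEL32_CS or #VC handler):

/* Sketch only: assemble the two dwords of a 32-bit interrupt gate. */
#include <stdint.h>
#include <stdio.h>

static void build_idt_entry(uint32_t handler, uint16_t selector,
			    uint32_t *lo, uint32_t *hi)
{
	*lo = (handler & 0x0000ffff) | ((uint32_t)selector << 16); /* offset[15:0] | CS */
	*hi = (handler & 0xffff0000) | 0x00008e00; /* offset[31:16] | present, 32-bit int gate */
}

int main(void)
{
	uint32_t lo, hi;

	build_idt_entry(0x00105ab0 /* hypothetical handler address */, 0x18, &lo, &hi);
	printf("IDT entry: lo=%#010x hi=%#010x\n", lo, hi);

	return 0;
}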
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
index a83d67ec627d..d75237ba7ce9 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -64,20 +64,11 @@ int has_eflag(unsigned long mask)
return !!((f0^f1) & mask);
}
-/* Handle x86_32 PIC using ebx. */
-#if defined(__i386__) && defined(__PIC__)
-# define EBX_REG "=r"
-#else
-# define EBX_REG "=b"
-#endif
-
void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d)
{
- asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t"
- "cpuid \n\t"
- ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t"
- : "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b)
- : "a" (id), "c" (count)
+ asm volatile("cpuid"
+ : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
+ : "0" (id), "2" (count)
);
}
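The simplified cpuid_count() above lets the compiler allocate %ebx directly instead of the old save/swap dance for 32-bit PIC builds. A standalone approximation of the helper, assuming a toolchain that permits %ebx as an inline-asm output:

/* Userspace sketch of the boot cpuid_count() helper shown above. */
#include <stdio.h>

static void cpuid_count(unsigned int id, unsigned int count,
			unsigned int *a, unsigned int *b,
			unsigned int *c, unsigned int *d)
{
	asm volatile("cpuid"
		     : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
		     : "0" (id), "2" (count));
}

int main(void)
{
	unsigned int a, b, c, d;

	cpuid_count(0x7, 0, &a, &b, &c, &d);	/* structured extended feature flags */
	printf("CPUID(7,0): ebx=%#010x ecx=%#010x edx=%#010x\n", b, c, d);

	return 0;
}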
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index f912d7770130..9338c68e7413 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -80,10 +80,11 @@ bs_die:
ljmp $0xf000,$0xfff0
#ifdef CONFIG_EFI_STUB
- .org 0x3c
+ .org 0x38
#
# Offset to the PE header.
#
+ .long LINUX_PE_MAGIC
.long pe_header
#endif /* CONFIG_EFI_STUB */
@@ -406,7 +407,7 @@ xloadflags:
# define XLF1 0
#endif
-#ifdef CONFIG_EFI_STUB
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
# ifdef CONFIG_EFI_MIXED
# define XLF23 (XLF_EFI_HANDOVER_32|XLF_EFI_HANDOVER_64)
# else
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 8a3fff9128bb..1c8541ae3b3a 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -350,7 +350,7 @@ static int _kstrtoul(const char *s, unsigned int base, unsigned long *res)
}
/**
- * kstrtoul - convert a string to an unsigned long
+ * boot_kstrtoul - convert a string to an unsigned long
* @s: The start of the string. The string must be null-terminated, and may also
* include a single newline before its terminating null. The first character
* may also be a plus sign, but not a minus sign.
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index a3725ad46c5a..bd247692b701 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -290,6 +290,7 @@ static void efi_stub_entry_update(void)
{
unsigned long addr = efi32_stub_entry;
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
#ifdef CONFIG_X86_64
/* Yes, this is really how we defined it :( */
addr = efi64_stub_entry - 0x200;
@@ -299,6 +300,7 @@ static void efi_stub_entry_update(void)
if (efi32_stub_entry != addr)
die("32-bit and 64-bit EFI entry points do not match\n");
#endif
+#endif
put_unaligned_le32(addr, &buf[0x264]);
}
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index b8998cf0508a..669d9e4f2901 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -5,6 +5,8 @@
#define pr_fmt(fmt) "tdx: " fmt
#include <linux/cpufeature.h>
+#include <linux/export.h>
+#include <linux/io.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
@@ -15,6 +17,7 @@
/* TDX module Call Leaf IDs */
#define TDX_GET_INFO 1
#define TDX_GET_VEINFO 3
+#define TDX_GET_REPORT 4
#define TDX_ACCEPT_PAGE 6
/* TDX hypercall Leaf IDs */
@@ -36,6 +39,12 @@
#define ATTR_SEPT_VE_DISABLE BIT(28)
+/* TDX Module call error codes */
+#define TDCALL_RETURN_CODE(a) ((a) >> 32)
+#define TDCALL_INVALID_OPERAND 0xc0000100
+
+#define TDREPORT_SUBTYPE_0 0
+
/*
* Wrapper for standard use of __tdx_hypercall with no output aside from
* return code.
@@ -100,6 +109,37 @@ static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
}
+/**
+ * tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT
+ * subtype 0) using TDG.MR.REPORT TDCALL.
+ * @reportdata: Address of the input buffer which contains user-defined
+ * REPORTDATA to be included into TDREPORT.
+ * @tdreport: Address of the output buffer to store TDREPORT.
+ *
+ * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module
+ * v1.0 specification for more information on TDG.MR.REPORT TDCALL.
+ * It is used in the TDX guest driver module to get the TDREPORT0.
+ *
+ * Return 0 on success, -EINVAL for invalid operands, or -EIO on
+ * other TDCALL failures.
+ */
+int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
+{
+ u64 ret;
+
+ ret = __tdx_module_call(TDX_GET_REPORT, virt_to_phys(tdreport),
+ virt_to_phys(reportdata), TDREPORT_SUBTYPE_0,
+ 0, NULL);
+ if (ret) {
+ if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
+ return -EINVAL;
+ return -EIO;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
+
static void tdx_parse_tdinfo(u64 *cc_mask)
{
struct tdx_module_output out;
@@ -346,8 +386,8 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
{
unsigned long *reg, val, vaddr;
char buffer[MAX_INSN_SIZE];
+ enum insn_mmio_type mmio;
struct insn insn = {};
- enum mmio_type mmio;
int size, extend_size;
u8 extend_val = 0;
@@ -362,10 +402,10 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
return -EINVAL;
mmio = insn_decode_mmio(&insn, &size);
- if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED))
+ if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED))
return -EINVAL;
- if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) {
+ if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
reg = insn_get_modrm_reg_ptr(&insn, regs);
if (!reg)
return -EINVAL;
@@ -386,23 +426,23 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
/* Handle writes first */
switch (mmio) {
- case MMIO_WRITE:
+ case INSN_MMIO_WRITE:
memcpy(&val, reg, size);
if (!mmio_write(size, ve->gpa, val))
return -EIO;
return insn.length;
- case MMIO_WRITE_IMM:
+ case INSN_MMIO_WRITE_IMM:
val = insn.immediate.value;
if (!mmio_write(size, ve->gpa, val))
return -EIO;
return insn.length;
- case MMIO_READ:
- case MMIO_READ_ZERO_EXTEND:
- case MMIO_READ_SIGN_EXTEND:
+ case INSN_MMIO_READ:
+ case INSN_MMIO_READ_ZERO_EXTEND:
+ case INSN_MMIO_READ_SIGN_EXTEND:
/* Reads are handled below */
break;
- case MMIO_MOVS:
- case MMIO_DECODE_FAILED:
+ case INSN_MMIO_MOVS:
+ case INSN_MMIO_DECODE_FAILED:
/*
* MMIO was accessed with an instruction that could not be
* decoded or handled properly. It was likely not using io.h
@@ -419,15 +459,15 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
return -EIO;
switch (mmio) {
- case MMIO_READ:
+ case INSN_MMIO_READ:
/* Zero-extend for 32-bit operation */
extend_size = size == 4 ? sizeof(*reg) : 0;
break;
- case MMIO_READ_ZERO_EXTEND:
+ case INSN_MMIO_READ_ZERO_EXTEND:
/* Zero extend based on operand size */
extend_size = insn.opnd_bytes;
break;
- case MMIO_READ_SIGN_EXTEND:
+ case INSN_MMIO_READ_SIGN_EXTEND:
/* Sign extend based on operand size */
extend_size = insn.opnd_bytes;
if (size == 1 && val & BIT(7))
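[Editor's sketch] To make the calling convention of the newly exported tdx_mcall_get_report0() concrete, here is a hypothetical caller in the spirit of a TDX guest driver. This is a sketch, not part of the patch: the buffer sizes follow the 64-byte REPORTDATA and 1024-byte TDREPORT structures described in the TDX Module v1.0 specification, the declaration is assumed to live in <asm/tdx.h>, and every function and variable name below is illustrative.

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <asm/tdx.h>

static int example_fetch_tdreport0(const u8 nonce[64])
{
	u8 *reportdata, *tdreport;
	int ret = -ENOMEM;

	/* kmalloc() memory sits in the direct map, so virt_to_phys() in the helper works */
	reportdata = kmalloc(64, GFP_KERNEL);
	tdreport = kzalloc(1024, GFP_KERNEL);
	if (!reportdata || !tdreport)
		goto out;

	/* user-defined REPORTDATA, e.g. an attestation nonce */
	memcpy(reportdata, nonce, 64);

	ret = tdx_mcall_get_report0(reportdata, tdreport);
	/* on success, tdreport now holds TDREPORT0 for the caller to consume */
out:
	kfree(tdreport);
	kfree(reportdata);
	return ret;
}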
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3b1d701a4f6c..3e7a329235bd 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -107,3 +107,6 @@ quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $< > $@
$(obj)/%.S: $(src)/%.pl FORCE
$(call if_changed,perlasm)
+
+# Disable GCOV in odd or sensitive code
+GCOV_PROFILE_curve25519-x86_64.o := n
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index b48ddebb4748..cdf3215ec272 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -7,6 +7,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/frame.h>
#define STATE0 %xmm0
@@ -402,7 +403,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_ad)
* void crypto_aegis128_aesni_enc(void *state, unsigned int length,
* const void *src, void *dst);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_enc)
+SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc)
FRAME_BEGIN
cmp $0x10, LEN
@@ -499,7 +500,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc)
* void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
* const void *src, void *dst);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_enc_tail)
+SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail)
FRAME_BEGIN
/* load the state: */
@@ -556,7 +557,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
* void crypto_aegis128_aesni_dec(void *state, unsigned int length,
* const void *src, void *dst);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_dec)
+SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec)
FRAME_BEGIN
cmp $0x10, LEN
@@ -653,7 +654,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_dec)
* void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
* const void *src, void *dst);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_dec_tail)
+SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail)
FRAME_BEGIN
/* load the state: */
diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S
index c75fd7d015ed..03ae4cd1d976 100644
--- a/arch/x86/crypto/aria-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S
@@ -7,6 +7,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/frame.h>
/* struct aria_ctx: */
@@ -913,7 +914,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
RET;
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
-SYM_FUNC_START(aria_aesni_avx_encrypt_16way)
+SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -938,7 +939,7 @@ SYM_FUNC_START(aria_aesni_avx_encrypt_16way)
RET;
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
-SYM_FUNC_START(aria_aesni_avx_decrypt_16way)
+SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -1039,7 +1040,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
RET;
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
-SYM_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
+SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
/* input:
* %rdi: ctx
* %rsi: dst
@@ -1208,7 +1209,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
RET;
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
-SYM_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
+SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -1233,7 +1234,7 @@ SYM_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
RET;
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
-SYM_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
+SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -1258,7 +1259,7 @@ SYM_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
RET;
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
-SYM_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
+SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
/* input:
* %rdi: ctx
* %rsi: dst
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index 2e1658ddbe1a..4a30618281ec 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -712,7 +712,6 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
.text
-.align 8
SYM_FUNC_START_LOCAL(__camellia_enc_blk16)
/* input:
* %rdi: ctx, CTX
@@ -799,7 +798,6 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk16)
jmp .Lenc_done;
SYM_FUNC_END(__camellia_enc_blk16)
-.align 8
SYM_FUNC_START_LOCAL(__camellia_dec_blk16)
/* input:
* %rdi: ctx, CTX
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index 0e4e9abbf4de..deaf62aa73a6 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -221,7 +221,6 @@
* Size optimization... with inlined roundsm32 binary would be over 5 times
 * larger and would only be marginally faster.
*/
-.align 8
SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
@@ -229,7 +228,6 @@ SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_c
RET;
SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
-.align 8
SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
%ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
@@ -748,7 +746,6 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
.text
-.align 8
SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
/* input:
* %rdi: ctx, CTX
@@ -835,7 +832,6 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
jmp .Lenc_done;
SYM_FUNC_END(__camellia_enc_blk32)
-.align 8
SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
/* input:
* %rdi: ctx, CTX
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index b258af420c92..0326a01503c3 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -208,7 +208,6 @@
.text
-.align 16
SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
/* input:
* %rdi: ctx
@@ -282,7 +281,6 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
RET;
SYM_FUNC_END(__cast5_enc_blk16)
-.align 16
SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
/* input:
* %rdi: ctx
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S
index 721474abfb71..5286db5b8165 100644
--- a/arch/x86/crypto/crct10dif-pcl-asm_64.S
+++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -94,7 +94,6 @@
#
# Assumes len >= 16.
#
-.align 16
SYM_FUNC_START(crc_t10dif_pcl)
movdqa .Lbswap_mask(%rip), BSWAP_MASK
diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S
index 6a0b15e7196a..ef73a3ab8726 100644
--- a/arch/x86/crypto/nh-avx2-x86_64.S
+++ b/arch/x86/crypto/nh-avx2-x86_64.S
@@ -8,6 +8,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#define PASS0_SUMS %ymm0
#define PASS1_SUMS %ymm1
@@ -65,11 +66,11 @@
/*
* void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
- * u8 hash[NH_HASH_BYTES])
+ * __le64 hash[NH_NUM_PASSES])
*
* It's guaranteed that message_len % 16 == 0.
*/
-SYM_FUNC_START(nh_avx2)
+SYM_TYPED_FUNC_START(nh_avx2)
vmovdqu 0x00(KEY), K0
vmovdqu 0x10(KEY), K1
diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S
index 34c567bbcb4f..75fb994b6d17 100644
--- a/arch/x86/crypto/nh-sse2-x86_64.S
+++ b/arch/x86/crypto/nh-sse2-x86_64.S
@@ -8,6 +8,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#define PASS0_SUMS %xmm0
#define PASS1_SUMS %xmm1
@@ -67,11 +68,11 @@
/*
* void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
- * u8 hash[NH_HASH_BYTES])
+ * __le64 hash[NH_NUM_PASSES])
*
* It's guaranteed that message_len % 16 == 0.
*/
-SYM_FUNC_START(nh_sse2)
+SYM_TYPED_FUNC_START(nh_sse2)
movdqu 0x00(KEY), K0
movdqu 0x10(KEY), K1
diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c
index 8ea5ab0f1ca7..46b036204ed9 100644
--- a/arch/x86/crypto/nhpoly1305-avx2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c
@@ -14,14 +14,7 @@
#include <asm/simd.h>
asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
- u8 hash[NH_HASH_BYTES]);
-
-/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
-static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len,
- __le64 hash[NH_NUM_PASSES])
-{
- nh_avx2(key, message, message_len, (u8 *)hash);
-}
+ __le64 hash[NH_NUM_PASSES]);
static int nhpoly1305_avx2_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
@@ -33,7 +26,7 @@ static int nhpoly1305_avx2_update(struct shash_desc *desc,
unsigned int n = min_t(unsigned int, srclen, SZ_4K);
kernel_fpu_begin();
- crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2);
+ crypto_nhpoly1305_update_helper(desc, src, n, nh_avx2);
kernel_fpu_end();
src += n;
srclen -= n;
diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c
index 2b353d42ed13..4a4970d75107 100644
--- a/arch/x86/crypto/nhpoly1305-sse2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c
@@ -14,14 +14,7 @@
#include <asm/simd.h>
asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
- u8 hash[NH_HASH_BYTES]);
-
-/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
-static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len,
- __le64 hash[NH_NUM_PASSES])
-{
- nh_sse2(key, message, message_len, (u8 *)hash);
-}
+ __le64 hash[NH_NUM_PASSES]);
static int nhpoly1305_sse2_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
@@ -33,7 +26,7 @@ static int nhpoly1305_sse2_update(struct shash_desc *desc,
unsigned int n = min_t(unsigned int, srclen, SZ_4K);
kernel_fpu_begin();
- crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2);
+ crypto_nhpoly1305_update_helper(desc, src, n, nh_sse2);
kernel_fpu_end();
src += n;
srclen -= n;
diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
index 2077ce7a5647..b9abcd79c1f4 100644
--- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
+++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
@@ -108,7 +108,6 @@ if (!$kernel) {
sub declare_function() {
my ($name, $align, $nargs) = @_;
if($kernel) {
- $code .= ".align $align\n";
$code .= "SYM_FUNC_START($name)\n";
$code .= ".L$name:\n";
} else {
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index 82f2313f512b..97e283621851 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -550,7 +550,6 @@
#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
-.align 8
SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx)
/* input:
* %rdi: ctx, CTX
@@ -604,7 +603,6 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx)
RET;
SYM_FUNC_END(__serpent_enc_blk8_avx)
-.align 8
SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx)
/* input:
* %rdi: ctx, CTX
diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S
index 8ea34c9b9316..6d60c50593a9 100644
--- a/arch/x86/crypto/serpent-avx2-asm_64.S
+++ b/arch/x86/crypto/serpent-avx2-asm_64.S
@@ -550,7 +550,6 @@
#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
-.align 8
SYM_FUNC_START_LOCAL(__serpent_enc_blk16)
/* input:
* %rdi: ctx, CTX
@@ -604,7 +603,6 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk16)
RET;
SYM_FUNC_END(__serpent_enc_blk16)
-.align 8
SYM_FUNC_START_LOCAL(__serpent_dec_blk16)
/* input:
* %rdi: ctx, CTX
diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
index 2f94ec0e763b..cade913d4882 100644
--- a/arch/x86/crypto/sha1_ni_asm.S
+++ b/arch/x86/crypto/sha1_ni_asm.S
@@ -54,6 +54,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#define DIGEST_PTR %rdi /* 1st arg */
#define DATA_PTR %rsi /* 2nd arg */
@@ -92,8 +93,7 @@
* numBlocks: Number of blocks to process
*/
.text
-.align 32
-SYM_FUNC_START(sha1_ni_transform)
+SYM_TYPED_FUNC_START(sha1_ni_transform)
push %rbp
mov %rsp, %rbp
sub $FRAME_SIZE, %rsp
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index 263f916362e0..f54988c80eb4 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -25,6 +25,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#define CTX %rdi // arg1
#define BUF %rsi // arg2
@@ -67,7 +68,7 @@
* param: function's name
*/
.macro SHA1_VECTOR_ASM name
- SYM_FUNC_START(\name)
+ SYM_TYPED_FUNC_START(\name)
push %rbx
push %r12
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index 3baa1ec39097..5555b5d5215a 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -48,6 +48,7 @@
########################################################################
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
## assume buffers not aligned
#define VMOVDQ vmovdqu
@@ -346,8 +347,7 @@ a = TMP_
## arg 3 : Num blocks
########################################################################
.text
-SYM_FUNC_START(sha256_transform_avx)
-.align 32
+SYM_TYPED_FUNC_START(sha256_transform_avx)
pushq %rbx
pushq %r12
pushq %r13
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 9bcdbc47b8b4..3eada9416852 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -49,6 +49,7 @@
########################################################################
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
## assume buffers not aligned
#define VMOVDQ vmovdqu
@@ -523,8 +524,7 @@ STACK_SIZE = _CTX + _CTX_SIZE
## arg 3 : Num blocks
########################################################################
.text
-SYM_FUNC_START(sha256_transform_rorx)
-.align 32
+SYM_TYPED_FUNC_START(sha256_transform_rorx)
pushq %rbx
pushq %r12
pushq %r13
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index c4a5db612c32..959288eecc68 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -47,6 +47,7 @@
########################################################################
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
## assume buffers not aligned
#define MOVDQ movdqu
@@ -355,8 +356,7 @@ a = TMP_
## arg 3 : Num blocks
########################################################################
.text
-SYM_FUNC_START(sha256_transform_ssse3)
-.align 32
+SYM_TYPED_FUNC_START(sha256_transform_ssse3)
pushq %rbx
pushq %r12
pushq %r13
diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index 94d50dd27cb5..537b6dcd7ed8 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -54,6 +54,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#define DIGEST_PTR %rdi /* 1st arg */
#define DATA_PTR %rsi /* 2nd arg */
@@ -96,8 +97,7 @@
*/
.text
-.align 32
-SYM_FUNC_START(sha256_ni_transform)
+SYM_TYPED_FUNC_START(sha256_ni_transform)
shl $6, NUM_BLKS /* convert to bytes */
jz .Ldone_hash
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index 1fefe6dd3a9e..b0984f19fdb4 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -48,6 +48,7 @@
########################################################################
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
.text
@@ -273,7 +274,7 @@ frame_size = frame_WK + WK_SIZE
# of SHA512 message blocks.
# "blocks" is the message length in SHA512 blocks
########################################################################
-SYM_FUNC_START(sha512_transform_avx)
+SYM_TYPED_FUNC_START(sha512_transform_avx)
test msglen, msglen
je nowork
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 5cdaab7d6901..b1ca99055ef9 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -50,6 +50,7 @@
########################################################################
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
.text
@@ -565,7 +566,7 @@ frame_size = frame_CTX + CTX_SIZE
# of SHA512 message blocks.
# "blocks" is the message length in SHA512 blocks
########################################################################
-SYM_FUNC_START(sha512_transform_rorx)
+SYM_TYPED_FUNC_START(sha512_transform_rorx)
# Save GPRs
push %rbx
push %r12
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index b84c22e06c5f..c06afb5270e5 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -48,6 +48,7 @@
########################################################################
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
.text
@@ -274,7 +275,7 @@ frame_size = frame_WK + WK_SIZE
# of SHA512 message blocks.
# "blocks" is the message length in SHA512 blocks.
########################################################################
-SYM_FUNC_START(sha512_transform_ssse3)
+SYM_TYPED_FUNC_START(sha512_transform_ssse3)
test msglen, msglen
je nowork
diff --git a/arch/x86/crypto/sm3-avx-asm_64.S b/arch/x86/crypto/sm3-avx-asm_64.S
index b12b9efb5ec5..503bab450a91 100644
--- a/arch/x86/crypto/sm3-avx-asm_64.S
+++ b/arch/x86/crypto/sm3-avx-asm_64.S
@@ -12,6 +12,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/frame.h>
/* Context structure */
@@ -327,8 +328,7 @@
* void sm3_transform_avx(struct sm3_state *state,
* const u8 *data, int nblocks);
*/
-.align 16
-SYM_FUNC_START(sm3_transform_avx)
+SYM_TYPED_FUNC_START(sm3_transform_avx)
/* input:
* %rdi: ctx, CTX
* %rsi: data (64*nblks bytes)
diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
index 4767ab61ff48..e2668d2fe6ce 100644
--- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
@@ -14,6 +14,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/frame.h>
#define rRIP (%rip)
@@ -139,13 +140,11 @@
.text
-.align 16
/*
* void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
* const u8 *src, int nblocks)
*/
-.align 8
SYM_FUNC_START(sm4_aesni_avx_crypt4)
/* input:
* %rdi: round key array, CTX
@@ -249,7 +248,6 @@ SYM_FUNC_START(sm4_aesni_avx_crypt4)
RET;
SYM_FUNC_END(sm4_aesni_avx_crypt4)
-.align 8
SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
/* input:
* %rdi: round key array, CTX
@@ -363,7 +361,6 @@ SYM_FUNC_END(__sm4_crypt_blk8)
* void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
* const u8 *src, int nblocks)
*/
-.align 8
SYM_FUNC_START(sm4_aesni_avx_crypt8)
/* input:
* %rdi: round key array, CTX
@@ -419,8 +416,7 @@ SYM_FUNC_END(sm4_aesni_avx_crypt8)
* void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
* const u8 *src, u8 *iv)
*/
-.align 8
-SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
+SYM_TYPED_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
/* input:
* %rdi: round key array, CTX
* %rsi: dst (8 blocks)
@@ -494,8 +490,7 @@ SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)
* void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
* const u8 *src, u8 *iv)
*/
-.align 8
-SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
+SYM_TYPED_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
/* input:
* %rdi: round key array, CTX
* %rsi: dst (8 blocks)
@@ -544,8 +539,7 @@ SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)
* void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
* const u8 *src, u8 *iv)
*/
-.align 8
-SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
+SYM_TYPED_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
/* input:
* %rdi: round key array, CTX
* %rsi: dst (8 blocks)
diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
index 4732fe8bb65b..98ede9459287 100644
--- a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
@@ -14,6 +14,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/frame.h>
#define rRIP (%rip)
@@ -153,9 +154,6 @@
.long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
.text
-.align 16
-
-.align 8
SYM_FUNC_START_LOCAL(__sm4_crypt_blk16)
/* input:
* %rdi: round key array, CTX
@@ -281,8 +279,7 @@ SYM_FUNC_END(__sm4_crypt_blk16)
* void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst,
* const u8 *src, u8 *iv)
*/
-.align 8
-SYM_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16)
+SYM_TYPED_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16)
/* input:
* %rdi: round key array, CTX
* %rsi: dst (16 blocks)
@@ -394,8 +391,7 @@ SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16)
* void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst,
* const u8 *src, u8 *iv)
*/
-.align 8
-SYM_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16)
+SYM_TYPED_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16)
/* input:
* %rdi: round key array, CTX
* %rsi: dst (16 blocks)
@@ -448,8 +444,7 @@ SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16)
* void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst,
* const u8 *src, u8 *iv)
*/
-.align 8
-SYM_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16)
+SYM_TYPED_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16)
/* input:
* %rdi: round key array, CTX
* %rsi: dst (16 blocks)
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 31f9b2ec3857..12fde271cd3f 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -228,7 +228,6 @@
vpxor x2, wkey, x2; \
vpxor x3, wkey, x3;
-.align 8
SYM_FUNC_START_LOCAL(__twofish_enc_blk8)
/* input:
* %rdi: ctx, CTX
@@ -270,7 +269,6 @@ SYM_FUNC_START_LOCAL(__twofish_enc_blk8)
RET;
SYM_FUNC_END(__twofish_enc_blk8)
-.align 8
SYM_FUNC_START_LOCAL(__twofish_dec_blk8)
/* input:
* %rdi: ctx, CTX
diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c
index f9c4adc27404..0614beece279 100644
--- a/arch/x86/crypto/twofish_glue.c
+++ b/arch/x86/crypto/twofish_glue.c
@@ -38,8 +38,8 @@
* Third Edition.
*/
+#include <crypto/algapi.h>
#include <crypto/twofish.h>
-#include <linux/crypto.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/types.h>
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index e309e7156038..91397f58ac30 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1181,7 +1181,7 @@ SYM_CODE_START(asm_exc_nmi)
* is using the thread stack right now, so it's safe for us to use it.
*/
movl %esp, %ebx
- movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
+ movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esp
call exc_nmi
movl %ebx, %esp
@@ -1243,7 +1243,7 @@ SYM_CODE_START(rewind_stack_and_make_dead)
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
- movl PER_CPU_VAR(cpu_current_top_of_stack), %esi
+ movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esi
leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp
call make_task_dead
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9953d966d124..15739a2c0983 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -92,7 +92,7 @@ SYM_CODE_START(entry_SYSCALL_64)
/* tss.sp2 is scratch space. */
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
@@ -252,7 +252,7 @@ SYM_FUNC_START(__switch_to_asm)
#ifdef CONFIG_STACKPROTECTOR
movq TASK_stack_canary(%rsi), %rbx
- movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
+ movq %rbx, PER_CPU_VAR(fixed_percpu_data) + FIXED_stack_canary
#endif
/*
@@ -284,9 +284,11 @@ SYM_FUNC_END(__switch_to_asm)
* r12: kernel thread arg
*/
.pushsection .text, "ax"
-SYM_CODE_START(ret_from_fork)
+ __FUNC_ALIGN
+SYM_CODE_START_NOALIGN(ret_from_fork)
UNWIND_HINT_EMPTY
ANNOTATE_NOENDBR // copy_thread
+ CALL_DEPTH_ACCOUNT
movq %rax, %rdi
call schedule_tail /* rdi: 'prev' task parameter */
@@ -326,11 +328,12 @@ SYM_CODE_END(ret_from_fork)
#endif
.endm
-SYM_CODE_START_LOCAL(xen_error_entry)
+SYM_CODE_START(xen_error_entry)
+ ANNOTATE_NOENDBR
UNWIND_HINT_FUNC
PUSH_AND_CLEAR_REGS save_ret=1
ENCODE_FRAME_POINTER 8
- UNTRAIN_RET
+ UNTRAIN_RET_FROM_CALL
RET
SYM_CODE_END(xen_error_entry)
@@ -600,13 +603,13 @@ SYM_CODE_END(\asmsym)
* shared between 32 and 64 bit and emit the __irqentry_text_* markers
* so the stacktrace boundary checks work.
*/
- .align 16
+ __ALIGN
.globl __irqentry_text_start
__irqentry_text_start:
#include <asm/idtentry.h>
- .align 16
+ __ALIGN
.globl __irqentry_text_end
__irqentry_text_end:
ANNOTATE_NOENDBR
@@ -828,7 +831,8 @@ EXPORT_SYMBOL(asm_load_gs_index)
*
* C calling convention: exc_xen_hypervisor_callback(struct *pt_regs)
*/
-SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)
+ __FUNC_ALIGN
+SYM_CODE_START_LOCAL_NOALIGN(exc_xen_hypervisor_callback)
/*
 * Since we don't modify %rdi, evtchn_do_upcall(struct *pt_regs) will
@@ -856,7 +860,8 @@ SYM_CODE_END(exc_xen_hypervisor_callback)
* We distinguish between categories by comparing each saved segment register
 * with its current contents: any discrepancy means we are in category 1.
*/
-SYM_CODE_START(xen_failsafe_callback)
+ __FUNC_ALIGN
+SYM_CODE_START_NOALIGN(xen_failsafe_callback)
UNWIND_HINT_EMPTY
ENDBR
movl %ds, %ecx
@@ -903,7 +908,8 @@ SYM_CODE_END(xen_failsafe_callback)
* R14 - old CR3
* R15 - old SPEC_CTRL
*/
-SYM_CODE_START_LOCAL(paranoid_entry)
+SYM_CODE_START(paranoid_entry)
+ ANNOTATE_NOENDBR
UNWIND_HINT_FUNC
PUSH_AND_CLEAR_REGS save_ret=1
ENCODE_FRAME_POINTER 8
@@ -972,7 +978,7 @@ SYM_CODE_START_LOCAL(paranoid_entry)
* CR3 above, keep the old value in a callee saved register.
*/
IBRS_ENTER save_reg=%r15
- UNTRAIN_RET
+ UNTRAIN_RET_FROM_CALL
RET
SYM_CODE_END(paranoid_entry)
@@ -1038,7 +1044,8 @@ SYM_CODE_END(paranoid_exit)
/*
* Switch GS and CR3 if needed.
*/
-SYM_CODE_START_LOCAL(error_entry)
+SYM_CODE_START(error_entry)
+ ANNOTATE_NOENDBR
UNWIND_HINT_FUNC
PUSH_AND_CLEAR_REGS save_ret=1
@@ -1056,14 +1063,11 @@ SYM_CODE_START_LOCAL(error_entry)
/* We have user CR3. Change to kernel CR3. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
IBRS_ENTER
- UNTRAIN_RET
+ UNTRAIN_RET_FROM_CALL
leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
-.Lerror_entry_from_usermode_after_swapgs:
-
/* Put us onto the real thread stack. */
- call sync_regs
- RET
+ jmp sync_regs
/*
* There are two places in the kernel that can potentially fault with
@@ -1094,6 +1098,7 @@ SYM_CODE_START_LOCAL(error_entry)
*/
.Lerror_entry_done_lfence:
FENCE_SWAPGS_KERNEL_ENTRY
+ CALL_DEPTH_ACCOUNT
leaq 8(%rsp), %rax /* return pt_regs pointer */
ANNOTATE_UNRET_END
RET
@@ -1112,7 +1117,7 @@ SYM_CODE_START_LOCAL(error_entry)
FENCE_SWAPGS_USER_ENTRY
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
IBRS_ENTER
- UNTRAIN_RET
+ UNTRAIN_RET_FROM_CALL
/*
* Pretend that the exception came from user mode: set up pt_regs
@@ -1121,7 +1126,7 @@ SYM_CODE_START_LOCAL(error_entry)
leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
call fixup_bad_iret
mov %rax, %rdi
- jmp .Lerror_entry_from_usermode_after_swapgs
+ jmp sync_regs
SYM_CODE_END(error_entry)
SYM_CODE_START_LOCAL(error_return)
@@ -1206,7 +1211,7 @@ SYM_CODE_START(asm_exc_nmi)
FENCE_SWAPGS_USER_ENTRY
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
movq %rsp, %rdx
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
UNWIND_HINT_IRET_REGS base=%rdx offset=8
pushq 5*8(%rdx) /* pt_regs->ss */
pushq 4*8(%rdx) /* pt_regs->rsp */
@@ -1516,12 +1521,13 @@ SYM_CODE_END(ignore_sysret)
#endif
.pushsection .text, "ax"
-SYM_CODE_START(rewind_stack_and_make_dead)
+ __FUNC_ALIGN
+SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead)
UNWIND_HINT_FUNC
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
+ movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rax
leaq -PTREGS_SIZE(%rax), %rsp
UNWIND_HINT_REGS
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 4dd19819053a..70150298f8bd 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -58,10 +58,10 @@ SYM_CODE_START(entry_SYSENTER_compat)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
popq %rax
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
/* Construct struct pt_regs on stack */
- pushq $__USER32_DS /* pt_regs->ss */
+ pushq $__USER_DS /* pt_regs->ss */
pushq $0 /* pt_regs->sp = 0 (placeholder) */
/*
@@ -128,7 +128,6 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
popfq
jmp .Lsysenter_flags_fixed
SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL)
- ANNOTATE_NOENDBR // is_sysenter_singlestep
SYM_CODE_END(entry_SYSENTER_compat)
/*
@@ -191,13 +190,13 @@ SYM_CODE_START(entry_SYSCALL_compat)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
/* Switch to the kernel stack */
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
/* Construct struct pt_regs on stack */
- pushq $__USER32_DS /* pt_regs->ss */
+ pushq $__USER_DS /* pt_regs->ss */
pushq %r8 /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER32_CS /* pt_regs->cs */
@@ -332,7 +331,7 @@ SYM_CODE_START(entry_INT80_compat)
ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
movq %rsp, %rax
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
pushq 5*8(%rax) /* regs->ss */
pushq 4*8(%rax) /* regs->rsp */
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index f38b07d2768b..5e37f41e5f14 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -11,7 +11,7 @@
/* rdi: arg1 ... normal C conventions. rax is saved/restored. */
.macro THUNK name, func
-SYM_FUNC_START_NOALIGN(\name)
+SYM_FUNC_START(\name)
pushq %rbp
movq %rsp, %rbp
@@ -36,7 +36,7 @@ SYM_FUNC_END(\name)
EXPORT_SYMBOL(preempt_schedule_thunk)
EXPORT_SYMBOL(preempt_schedule_notrace_thunk)
-SYM_CODE_START_LOCAL_NOALIGN(__thunk_restore)
+SYM_CODE_START_LOCAL(__thunk_restore)
popq %r11
popq %r10
popq %r9
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 3e88b9df8c8f..838613ac15b8 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -33,11 +33,12 @@ vobjs32-y += vdso32/vclock_gettime.o
vobjs-$(CONFIG_X86_SGX) += vsgx.o
# files to link into kernel
-obj-y += vma.o extable.o
-KASAN_SANITIZE_vma.o := y
-UBSAN_SANITIZE_vma.o := y
-KCSAN_SANITIZE_vma.o := y
-OBJECT_FILES_NON_STANDARD_vma.o := n
+obj-y += vma.o extable.o
+KASAN_SANITIZE_vma.o := y
+UBSAN_SANITIZE_vma.o := y
+KCSAN_SANITIZE_vma.o := y
+OBJECT_FILES_NON_STANDARD_vma.o := n
+OBJECT_FILES_NON_STANDARD_extable.o := n
# vDSO images to build
vdso_img-$(VDSO64-y) += 64
@@ -94,7 +95,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),)
endif
endif
-$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
$(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO
#
@@ -157,6 +158,7 @@ KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_CFI),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(PADDING_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
KBUILD_CFLAGS_32 += -fno-stack-protector
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index 4bf48462fca7..e8c60ae7a7c8 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -27,7 +27,9 @@ VERSION {
__vdso_time;
clock_getres;
__vdso_clock_getres;
+#ifdef CONFIG_X86_SGX
__vdso_sgx_enter_enclave;
+#endif
local: *;
};
}
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 311eae30e089..b8f3f9b9e53c 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -98,24 +98,6 @@ static int vdso_mremap(const struct vm_special_mapping *sm,
}
#ifdef CONFIG_TIME_NS
-static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
-{
- if (likely(vma->vm_mm == current->mm))
- return current->nsproxy->time_ns->vvar_page;
-
- /*
- * VM_PFNMAP | VM_IO protect .fault() handler from being called
- * through interfaces like /proc/$pid/mem or
- * process_vm_{readv,writev}() as long as there's no .access()
- * in special_mapping_vmops().
- * For more details check_vma_flags() and __access_remote_vm()
- */
-
- WARN(1, "vvar_page accessed remotely");
-
- return NULL;
-}
-
/*
* The vvar page layout depends on whether a task belongs to the root or
* non-root time namespace. Whenever a task changes its namespace, the VVAR
@@ -140,11 +122,6 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
return 0;
}
-#else
-static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma)
-{
- return NULL;
-}
#endif
static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
@@ -210,11 +187,10 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
pgprot_decrypted(vma->vm_page_prot));
}
} else if (sym_offset == image->sym_hvclock_page) {
- struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();
+ pfn = hv_get_tsc_pfn();
- if (tsc_pg && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK))
- return vmf_insert_pfn(vma, vmf->address,
- virt_to_phys(tsc_pg) >> PAGE_SHIFT);
+ if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK))
+ return vmf_insert_pfn(vma, vmf->address, pfn);
} else if (sym_offset == image->sym_timens_page) {
struct page *timens_page = find_timens_vvar_page(vma);
@@ -327,7 +303,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
end -= len;
if (end > start) {
- offset = prandom_u32_max(((end - start) >> PAGE_SHIFT) + 1);
+ offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1);
addr = start + (offset << PAGE_SHIFT);
} else {
addr = start;
diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c
index f1bff153d945..58461fa18b6f 100644
--- a/arch/x86/events/amd/brs.c
+++ b/arch/x86/events/amd/brs.c
@@ -384,7 +384,7 @@ static void amd_brs_poison_buffer(void)
* On ctxswin, sched_in = true, called after the PMU has started
* On ctxswout, sched_in = false, called before the PMU is stopped
*/
-void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in)
+void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 8b70237c33f7..4386b10682ce 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -861,8 +861,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
pmu_enabled = cpuc->enabled;
cpuc->enabled = 0;
- /* stop everything (includes BRS) */
- amd_pmu_disable_all();
+ amd_brs_disable_all();
	/* Drain BRS if in use (could be inactive) */
if (cpuc->lbr_users)
@@ -873,7 +872,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
cpuc->enabled = pmu_enabled;
if (pmu_enabled)
- amd_pmu_enable_all(0);
+ amd_brs_enable_all();
return amd_pmu_adjust_nmi_window(handled);
}
@@ -1388,7 +1387,7 @@ static int __init amd_core_pmu_init(void)
* numbered counter following it.
*/
for (i = 0; i < x86_pmu.num_counters - 1; i += 2)
- even_ctr_mask |= 1 << i;
+ even_ctr_mask |= BIT_ULL(i);
pair_constraint = (struct event_constraint)
__EVENT_CONSTRAINT(0, even_ctr_mask, 0,
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 4cb710efbdd9..da3f5ebac4e1 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -631,7 +631,7 @@ static const struct attribute_group *op_attr_update[] = {
static struct perf_ibs perf_ibs_fetch = {
.pmu = {
- .task_ctx_nr = perf_invalid_context,
+ .task_ctx_nr = perf_hw_context,
.event_init = perf_ibs_init,
.add = perf_ibs_add,
@@ -655,7 +655,7 @@ static struct perf_ibs perf_ibs_fetch = {
static struct perf_ibs perf_ibs_op = {
.pmu = {
- .task_ctx_nr = perf_invalid_context,
+ .task_ctx_nr = perf_hw_context,
.event_init = perf_ibs_init,
.add = perf_ibs_add,
diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c
index 38a75216c12c..eb31f850841a 100644
--- a/arch/x86/events/amd/lbr.c
+++ b/arch/x86/events/amd/lbr.c
@@ -352,7 +352,7 @@ void amd_pmu_lbr_add(struct perf_event *event)
cpuc->br_sel = reg->reg;
}
- perf_sched_cb_inc(event->ctx->pmu);
+ perf_sched_cb_inc(event->pmu);
if (!cpuc->lbr_users++ && !event->total_time_running)
amd_pmu_lbr_reset();
@@ -370,10 +370,10 @@ void amd_pmu_lbr_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
- perf_sched_cb_dec(event->ctx->pmu);
+ perf_sched_cb_dec(event->pmu);
}
-void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index d568afc705d2..83f15fe411b3 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -553,6 +553,7 @@ static void uncore_clean_online(void)
hlist_for_each_entry_safe(uncore, n, &uncore_unused_list, node) {
hlist_del(&uncore->node);
+ kfree(uncore->events);
kfree(uncore);
}
}
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index b30b8bbcd1e2..85a63a41c471 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -90,6 +90,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);
DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
+DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter);
+
/*
* This one is magic, it will get called even when PMU init fails (because
* there is no PMU), in which case it should simply return NULL.
@@ -2031,6 +2033,7 @@ static void x86_pmu_static_call_update(void)
static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs);
+ static_call_update(x86_pmu_filter, x86_pmu.filter);
}
static void _x86_pmu_read(struct perf_event *event)
@@ -2052,23 +2055,6 @@ void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
pr_info("... event mask: %016Lx\n", intel_ctrl);
}
-/*
- * The generic code is not hybrid friendly. The hybrid_pmu->pmu
- * of the first registered PMU is unconditionally assigned to
- * each possible cpuctx->ctx.pmu.
- * Update the correct hybrid PMU to the cpuctx->ctx.pmu.
- */
-void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu)
-{
- struct perf_cpu_context *cpuctx;
-
- if (!pmu->pmu_cpu_context)
- return;
-
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- cpuctx->ctx.pmu = pmu;
-}
-
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
@@ -2175,13 +2161,9 @@ static int __init init_hw_perf_events(void)
if (err)
goto out2;
} else {
- u8 cpu_type = get_this_hybrid_cpu_type();
struct x86_hybrid_pmu *hybrid_pmu;
int i, j;
- if (!cpu_type && x86_pmu.get_hybrid_cpu_type)
- cpu_type = x86_pmu.get_hybrid_cpu_type();
-
for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
hybrid_pmu = &x86_pmu.hybrid_pmu[i];
@@ -2195,9 +2177,6 @@ static int __init init_hw_perf_events(void)
(hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
if (err)
break;
-
- if (cpu_type == hybrid_pmu->cpu_type)
- x86_pmu_update_cpu_context(&hybrid_pmu->pmu, raw_smp_processor_id());
}
if (i < x86_pmu.num_hybrid_pmus) {
@@ -2646,15 +2625,15 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
NULL,
};
-static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
+static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
- static_call_cond(x86_pmu_sched_task)(ctx, sched_in);
+ static_call_cond(x86_pmu_sched_task)(pmu_ctx, sched_in);
}
-static void x86_pmu_swap_task_ctx(struct perf_event_context *prev,
- struct perf_event_context *next)
+static void x86_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
+ struct perf_event_pmu_context *next_epc)
{
- static_call_cond(x86_pmu_swap_task_ctx)(prev, next);
+ static_call_cond(x86_pmu_swap_task_ctx)(prev_epc, next_epc);
}
void perf_check_microcode(void)
@@ -2689,12 +2668,13 @@ static int x86_pmu_aux_output_match(struct perf_event *event)
return 0;
}
-static int x86_pmu_filter_match(struct perf_event *event)
+static bool x86_pmu_filter(struct pmu *pmu, int cpu)
{
- if (x86_pmu.filter_match)
- return x86_pmu.filter_match(event);
+ bool ret = false;
- return 1;
+ static_call_cond(x86_pmu_filter)(pmu, cpu, &ret);
+
+ return ret;
}
static struct pmu pmu = {
@@ -2725,7 +2705,7 @@ static struct pmu pmu = {
.aux_output_match = x86_pmu_aux_output_match,
- .filter_match = x86_pmu_filter_match,
+ .filter = x86_pmu_filter,
};
void arch_perf_update_userpage(struct perf_event *event,
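[Editor's sketch] An aside on the ->filter() conversion above: static_call_cond() compiles to a NOP while no target is installed, so (per the documented static-call rules) the hook must return void, and its result travels through an out-parameter whose default the caller chooses. A minimal sketch of that pattern, with purely illustrative names, under the assumption that the static-call API behaves as described in include/linux/static_call.h:

#include <linux/static_call.h>
#include <linux/types.h>

struct pmu;

/* function pointer only used to give the static call its type */
static void (*example_filter_fn)(struct pmu *pmu, int cpu, bool *ret);
DEFINE_STATIC_CALL_NULL(example_filter, *example_filter_fn);

static bool example_filter(struct pmu *pmu, int cpu)
{
	bool ret = false;	/* default used until a hook is registered */

	/* NOP until static_call_update(example_filter, <hook>) installs one */
	static_call_cond(example_filter)(pmu, cpu, &ret);
	return ret;
}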
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 1b92bf05fd65..dfd2c124cdf8 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4536,8 +4536,6 @@ end:
cpumask_set_cpu(cpu, &pmu->supported_cpus);
cpuc->pmu = &pmu->pmu;
- x86_pmu_update_cpu_context(&pmu->pmu, cpu);
-
return true;
}
@@ -4671,17 +4669,17 @@ static void intel_pmu_cpu_dead(int cpu)
cpumask_clear_cpu(cpu, &hybrid_pmu(cpuc->pmu)->supported_cpus);
}
-static void intel_pmu_sched_task(struct perf_event_context *ctx,
+static void intel_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
bool sched_in)
{
- intel_pmu_pebs_sched_task(ctx, sched_in);
- intel_pmu_lbr_sched_task(ctx, sched_in);
+ intel_pmu_pebs_sched_task(pmu_ctx, sched_in);
+ intel_pmu_lbr_sched_task(pmu_ctx, sched_in);
}
-static void intel_pmu_swap_task_ctx(struct perf_event_context *prev,
- struct perf_event_context *next)
+static void intel_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
+ struct perf_event_pmu_context *next_epc)
{
- intel_pmu_lbr_swap_task_ctx(prev, next);
+ intel_pmu_lbr_swap_task_ctx(prev_epc, next_epc);
}
static int intel_pmu_check_period(struct perf_event *event, u64 value)
@@ -4705,12 +4703,11 @@ static int intel_pmu_aux_output_match(struct perf_event *event)
return is_intel_pt_event(event);
}
-static int intel_pmu_filter_match(struct perf_event *event)
+static void intel_pmu_filter(struct pmu *pmu, int cpu, bool *ret)
{
- struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
- unsigned int cpu = smp_processor_id();
+ struct x86_hybrid_pmu *hpmu = hybrid_pmu(pmu);
- return cpumask_test_cpu(cpu, &pmu->supported_cpus);
+ *ret = !cpumask_test_cpu(cpu, &hpmu->supported_cpus);
}
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
@@ -6413,7 +6410,7 @@ __init int intel_pmu_init(void)
static_call_update(intel_pmu_set_topdown_event_period,
&adl_set_topdown_event_period);
- x86_pmu.filter_match = intel_pmu_filter_match;
+ x86_pmu.filter = intel_pmu_filter;
x86_pmu.get_event_constraints = adl_get_event_constraints;
x86_pmu.hw_config = adl_hw_config;
x86_pmu.limit_period = spr_limit_period;
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 446d2833efa7..88e58b6ee73c 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1069,7 +1069,7 @@ static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
}
-void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
+void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1177,7 +1177,7 @@ static void
pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
struct perf_event *event, bool add)
{
- struct pmu *pmu = event->ctx->pmu;
+ struct pmu *pmu = event->pmu;
/*
* Make sure we get updated with the first PEBS
* event. It will trigger also during removal, but
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 4dbde69c423b..1f21f576ca77 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -515,21 +515,21 @@ static void __intel_pmu_lbr_save(void *ctx)
cpuc->last_log_id = ++task_context_opt(ctx)->log_id;
}
-void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
- struct perf_event_context *next)
+void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
+ struct perf_event_pmu_context *next_epc)
{
void *prev_ctx_data, *next_ctx_data;
- swap(prev->task_ctx_data, next->task_ctx_data);
+ swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
/*
- * Architecture specific synchronization makes sense in
- * case both prev->task_ctx_data and next->task_ctx_data
+ * Architecture specific synchronization makes sense in case
+ * both prev_epc->task_ctx_data and next_epc->task_ctx_data
* pointers are allocated.
*/
- prev_ctx_data = next->task_ctx_data;
- next_ctx_data = prev->task_ctx_data;
+ prev_ctx_data = next_epc->task_ctx_data;
+ next_ctx_data = prev_epc->task_ctx_data;
if (!prev_ctx_data || !next_ctx_data)
return;
@@ -538,7 +538,7 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
task_context_opt(next_ctx_data)->lbr_callstack_users);
}
-void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
void *task_ctx;
@@ -551,7 +551,7 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
* the task was scheduled out, restore the stack. Otherwise flush
* the LBR stack.
*/
- task_ctx = ctx ? ctx->task_ctx_data : NULL;
+ task_ctx = pmu_ctx ? pmu_ctx->task_ctx_data : NULL;
if (task_ctx) {
if (sched_in)
__intel_pmu_lbr_restore(task_ctx);
@@ -587,8 +587,8 @@ void intel_pmu_lbr_add(struct perf_event *event)
cpuc->br_sel = event->hw.branch_reg.reg;
- if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data)
- task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++;
+ if (branch_user_callstack(cpuc->br_sel) && event->pmu_ctx->task_ctx_data)
+ task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users++;
/*
* Request pmu::sched_task() callback, which will fire inside the
@@ -611,7 +611,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
*/
if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
cpuc->lbr_pebs_users++;
- perf_sched_cb_inc(event->ctx->pmu);
+ perf_sched_cb_inc(event->pmu);
if (!cpuc->lbr_users++ && !event->total_time_running)
intel_pmu_lbr_reset();
}
@@ -664,8 +664,8 @@ void intel_pmu_lbr_del(struct perf_event *event)
return;
if (branch_user_callstack(cpuc->br_sel) &&
- event->ctx->task_ctx_data)
- task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--;
+ event->pmu_ctx->task_ctx_data)
+ task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users--;
if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT)
cpuc->lbr_select = 0;
@@ -675,7 +675,7 @@ void intel_pmu_lbr_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
- perf_sched_cb_dec(event->ctx->pmu);
+ perf_sched_cb_dec(event->pmu);
}
static inline bool vlbr_exclude_host(void)
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index 03bbcc2fa2ff..35936188db01 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -24,7 +24,7 @@ struct p4_event_bind {
unsigned int escr_msr[2]; /* ESCR MSR for this event */
unsigned int escr_emask; /* valid ESCR EventMask bits */
unsigned int shared; /* event is shared across threads */
- char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */
+ signed char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */
};
struct p4_pebs_bind {
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 82ef87e9a897..42a55794004a 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -1263,6 +1263,15 @@ static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages)
if (1 << order != nr_pages)
goto out;
+ /*
+ * Some processors cannot always support single range for more than
+ * 4KB - refer to errata TGL052, ADL037 and RPL017. Future processors might
+ * also be affected, so for now rather than trying to keep track of
+ * which ones, just disable it for all.
+ */
+ if (nr_pages > 1)
+ goto out;
+
buf->single = true;
buf->nr_pages = nr_pages;
ret = 0;
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 2adeaf4de4df..e278e2e7c051 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -2,6 +2,7 @@
#include <linux/slab.h>
#include <linux/pci.h>
#include <asm/apicdef.h>
+#include <asm/intel-family.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/perf_event.h>
@@ -88,12 +89,12 @@ struct intel_uncore_type {
* to identify which platform component each PMON block of that type is
* supposed to monitor.
*/
- struct intel_uncore_topology *topology;
+ struct intel_uncore_topology **topology;
/*
* Optional callbacks for managing mapping of Uncore units to PMONs
*/
int (*get_topology)(struct intel_uncore_type *type);
- int (*set_mapping)(struct intel_uncore_type *type);
+ void (*set_mapping)(struct intel_uncore_type *type);
void (*cleanup_mapping)(struct intel_uncore_type *type);
};
@@ -178,11 +179,26 @@ struct freerunning_counters {
unsigned *box_offsets;
};
-struct intel_uncore_topology {
- u64 configuration;
+struct uncore_iio_topology {
+ int pci_bus_no;
int segment;
};
+struct uncore_upi_topology {
+ int die_to;
+ int pmu_idx_to;
+ int enabled;
+};
+
+struct intel_uncore_topology {
+ int pmu_idx;
+ union {
+ void *untyped;
+ struct uncore_iio_topology *iio;
+ struct uncore_upi_topology *upi;
+ };
+};
+
struct pci2phy_map {
struct list_head list;
int segment;
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 1ef4f7861e2e..1f4869227efb 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -1338,6 +1338,7 @@ static void __uncore_imc_init_box(struct intel_uncore_box *box,
/* MCHBAR is disabled */
if (!(mch_bar & BIT(0))) {
pr_warn("perf uncore: MCHBAR is disabled. Failed to map IMC free-running counters.\n");
+ pci_dev_put(pdev);
return;
}
mch_bar &= ~BIT(0);
@@ -1352,6 +1353,8 @@ static void __uncore_imc_init_box(struct intel_uncore_box *box,
box->io_addr = ioremap(addr, type->mmio_map_size);
if (!box->io_addr)
pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
+
+ pci_dev_put(pdev);
}
static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index ed869443efb2..44c2f879f708 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -445,6 +445,7 @@
#define ICX_UPI_PCI_PMON_CTR0 0x320
#define ICX_UPI_PCI_PMON_BOX_CTL 0x318
#define ICX_UPI_CTL_UMASK_EXT 0xffffff
+#define ICX_UBOX_DID 0x3450
/* ICX M3UPI*/
#define ICX_M3UPI_PCI_PMON_CTL0 0xd8
@@ -457,6 +458,7 @@
/* SPR */
#define SPR_RAW_EVENT_MASK_EXT 0xffffff
+#define SPR_UBOX_DID 0x3250
/* SPR CHA */
#define SPR_CHA_PMON_CTL_TID_EN (1 << 16)
@@ -1372,6 +1374,28 @@ static struct pci_driver snbep_uncore_pci_driver = {
#define NODE_ID_MASK 0x7
+/* Every three bits from 0 to 23 of the GIDNIDMAP register correspond to a Node ID. */
+#define GIDNIDMAP(config, id) (((config) >> (3 * (id))) & 0x7)
+
+static int upi_nodeid_groupid(struct pci_dev *ubox_dev, int nodeid_loc, int idmap_loc,
+ int *nodeid, int *groupid)
+{
+ int ret;
+
+ /* get the Node ID of the local register */
+ ret = pci_read_config_dword(ubox_dev, nodeid_loc, nodeid);
+ if (ret)
+ goto err;
+
+ *nodeid = *nodeid & NODE_ID_MASK;
+ /* get the Node ID mapping */
+ ret = pci_read_config_dword(ubox_dev, idmap_loc, groupid);
+ if (ret)
+ goto err;
+err:
+ return ret;
+}
+
/*
* build pci bus to socket mapping
*/
@@ -1397,13 +1421,8 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
* the topology.
*/
if (nr_node_ids <= 8) {
- /* get the Node ID of the local register */
- err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
- if (err)
- break;
- nodeid = config & NODE_ID_MASK;
- /* get the Node ID mapping */
- err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
+ err = upi_nodeid_groupid(ubox_dev, nodeid_loc, idmap_loc,
+ &nodeid, &config);
if (err)
break;
@@ -1421,7 +1440,7 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
* to a particular node.
*/
for (i = 0; i < 8; i++) {
- if (nodeid == ((config >> (3 * i)) & 0x7)) {
+ if (nodeid == GIDNIDMAP(config, i)) {
if (topology_max_die_per_package() > 1)
die_id = i;
else
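[Editor's sketch] For clarity, the GIDNIDMAP decode introduced above amounts to scanning eight 3-bit fields for the local node ID, exactly as the loop does. A self-contained sketch, with an illustrative helper name that is not part of the patch:

#include <linux/types.h>

#define GIDNIDMAP(config, id)	(((config) >> (3 * (id))) & 0x7)

static int example_nodeid_to_groupid(u64 gidnidmap, int nodeid)
{
	int i;

	for (i = 0; i < 8; i++) {
		/* field i holds the node ID that group/die i belongs to */
		if (GIDNIDMAP(gidnidmap, i) == nodeid)
			return i;
	}
	return -1;	/* no group advertises this node ID */
}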
@@ -2891,6 +2910,7 @@ static bool hswep_has_limit_sbox(unsigned int device)
return false;
pci_read_config_dword(dev, HSWEP_PCU_CAPID4_OFFET, &capid4);
+ pci_dev_put(dev);
if (!hswep_get_chop(capid4))
return true;
@@ -3699,10 +3719,16 @@ static struct intel_uncore_ops skx_uncore_iio_ops = {
.read_counter = uncore_msr_read_counter,
};
-static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die)
+static struct intel_uncore_topology *pmu_topology(struct intel_uncore_pmu *pmu, int die)
{
- return pmu->type->topology[die].configuration >>
- (pmu->pmu_idx * BUS_NUM_STRIDE);
+ int idx;
+
+ for (idx = 0; idx < pmu->type->num_boxes; idx++) {
+ if (pmu->type->topology[die][idx].pmu_idx == pmu->pmu_idx)
+ return &pmu->type->topology[die][idx];
+ }
+
+ return NULL;
}
static umode_t
@@ -3710,8 +3736,9 @@ pmu_iio_mapping_visible(struct kobject *kobj, struct attribute *attr,
int die, int zero_bus_pmu)
{
struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj));
+ struct intel_uncore_topology *pmut = pmu_topology(pmu, die);
- return (!skx_iio_stack(pmu, die) && pmu->pmu_idx != zero_bus_pmu) ? 0 : attr->mode;
+ return (pmut && !pmut->iio->pci_bus_no && pmu->pmu_idx != zero_bus_pmu) ? 0 : attr->mode;
}
static umode_t
@@ -3727,9 +3754,10 @@ static ssize_t skx_iio_mapping_show(struct device *dev,
struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev);
struct dev_ext_attribute *ea = to_dev_ext_attribute(attr);
long die = (long)ea->var;
+ struct intel_uncore_topology *pmut = pmu_topology(pmu, die);
- return sprintf(buf, "%04x:%02x\n", pmu->type->topology[die].segment,
- skx_iio_stack(pmu, die));
+ return sprintf(buf, "%04x:%02x\n", pmut ? pmut->iio->segment : 0,
+ pmut ? pmut->iio->pci_bus_no : 0);
}
static int skx_msr_cpu_bus_read(int cpu, u64 *topology)
@@ -3764,18 +3792,79 @@ static int die_to_cpu(int die)
return res;
}
-static int skx_iio_get_topology(struct intel_uncore_type *type)
+enum {
+ IIO_TOPOLOGY_TYPE,
+ UPI_TOPOLOGY_TYPE,
+ TOPOLOGY_MAX
+};
+
+static const size_t topology_size[TOPOLOGY_MAX] = {
+ sizeof(*((struct intel_uncore_topology *)NULL)->iio),
+ sizeof(*((struct intel_uncore_topology *)NULL)->upi)
+};
+
+static int pmu_alloc_topology(struct intel_uncore_type *type, int topology_type)
{
- int die, ret = -EPERM;
+ int die, idx;
+ struct intel_uncore_topology **topology;
+
+ if (!type->num_boxes)
+ return -EPERM;
- type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology),
- GFP_KERNEL);
- if (!type->topology)
- return -ENOMEM;
+ topology = kcalloc(uncore_max_dies(), sizeof(*topology), GFP_KERNEL);
+ if (!topology)
+ goto err;
for (die = 0; die < uncore_max_dies(); die++) {
- ret = skx_msr_cpu_bus_read(die_to_cpu(die),
- &type->topology[die].configuration);
+ topology[die] = kcalloc(type->num_boxes, sizeof(**topology), GFP_KERNEL);
+ if (!topology[die])
+ goto clear;
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ topology[die][idx].untyped = kcalloc(type->num_boxes,
+ topology_size[topology_type],
+ GFP_KERNEL);
+ if (!topology[die][idx].untyped)
+ goto clear;
+ }
+ }
+
+ type->topology = topology;
+
+ return 0;
+clear:
+ for (; die >= 0; die--) {
+ for (idx = 0; idx < type->num_boxes; idx++)
+ kfree(topology[die][idx].untyped);
+ kfree(topology[die]);
+ }
+ kfree(topology);
+err:
+ return -ENOMEM;
+}
+
+static void pmu_free_topology(struct intel_uncore_type *type)
+{
+ int die, idx;
+
+ if (type->topology) {
+ for (die = 0; die < uncore_max_dies(); die++) {
+ for (idx = 0; idx < type->num_boxes; idx++)
+ kfree(type->topology[die][idx].untyped);
+ kfree(type->topology[die]);
+ }
+ kfree(type->topology);
+ type->topology = NULL;
+ }
+}
+
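
The allocation scheme introduced by pmu_alloc_topology() is a per-die array of per-box entries, each carrying a separately allocated payload, with the partially built structure unwound on failure. Below is a rough user-space sketch of the same shape; the struct and function names are made up for illustration, and the kernel code uses kcalloc()/kfree() on the intel_uncore_topology types instead.

#include <stdio.h>
#include <stdlib.h>

struct entry {
        int pmu_idx;
        void *payload;
};

static struct entry **alloc_topology(int dies, int boxes, size_t payload_size)
{
        struct entry **topo;
        int die, idx;

        topo = calloc(dies, sizeof(*topo));
        if (!topo)
                return NULL;

        for (die = 0; die < dies; die++) {
                topo[die] = calloc(boxes, sizeof(**topo));
                if (!topo[die])
                        goto clear;
                for (idx = 0; idx < boxes; idx++) {
                        topo[die][idx].payload = calloc(1, payload_size);
                        if (!topo[die][idx].payload)
                                goto clear;
                }
        }
        return topo;

clear:  /* unwind everything allocated so far, including the failing die */
        for (; die >= 0; die--) {
                if (!topo[die])
                        continue;
                for (idx = 0; idx < boxes; idx++)
                        free(topo[die][idx].payload);
                free(topo[die]);
        }
        free(topo);
        return NULL;
}

int main(void)
{
        struct entry **t = alloc_topology(2, 4, 16);

        printf("%s\n", t ? "allocated" : "failed");
        return 0;       /* the sketch lets the OS reclaim the memory on exit */
}
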
+static int skx_pmu_get_topology(struct intel_uncore_type *type,
+ int (*topology_cb)(struct intel_uncore_type*, int, int, u64))
+{
+ int die, ret = -EPERM;
+ u64 cpu_bus_msr;
+
+ for (die = 0; die < uncore_max_dies(); die++) {
+ ret = skx_msr_cpu_bus_read(die_to_cpu(die), &cpu_bus_msr);
if (ret)
break;
@@ -3783,15 +3872,33 @@ static int skx_iio_get_topology(struct intel_uncore_type *type)
if (ret < 0)
break;
- type->topology[die].segment = ret;
+ ret = topology_cb(type, ret, die, cpu_bus_msr);
+ if (ret)
+ break;
}
- if (ret < 0) {
- kfree(type->topology);
- type->topology = NULL;
+ return ret;
+}
+
+static int skx_iio_topology_cb(struct intel_uncore_type *type, int segment,
+ int die, u64 cpu_bus_msr)
+{
+ int idx;
+ struct intel_uncore_topology *t;
+
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ t = &type->topology[die][idx];
+ t->pmu_idx = idx;
+ t->iio->segment = segment;
+ t->iio->pci_bus_no = (cpu_bus_msr >> (idx * BUS_NUM_STRIDE)) & 0xff;
}
- return ret;
+ return 0;
+}
+
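
skx_iio_topology_cb() above slices one PCI bus number per IIO stack out of the bus-number MSR, one byte per stack. The tiny sketch below shows that extraction with an invented MSR value, assuming BUS_NUM_STRIDE is 8 bits (one bus number per byte), which is what the existing masking with 0xff implies.

#include <stdio.h>
#include <stdint.h>

#define BUS_NUM_STRIDE  8       /* assumed: one 8-bit bus number per stack */

int main(void)
{
        /* hypothetical SKX bus-number MSR image: byte i = bus of stack i */
        uint64_t cpu_bus_msr = 0x8d7a674533221100ULL;
        int idx;

        for (idx = 0; idx < 6; idx++)
                printf("IIO stack %d -> bus 0x%02x\n", idx,
                       (unsigned int)((cpu_bus_msr >> (idx * BUS_NUM_STRIDE)) & 0xff));
        return 0;
}
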
+static int skx_iio_get_topology(struct intel_uncore_type *type)
+{
+ return skx_pmu_get_topology(type, skx_iio_topology_cb);
}
static struct attribute_group skx_iio_mapping_group = {
@@ -3803,8 +3910,25 @@ static const struct attribute_group *skx_iio_attr_update[] = {
NULL,
};
-static int
-pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
+static void pmu_clear_mapping_attr(const struct attribute_group **groups,
+ struct attribute_group *ag)
+{
+ int i;
+
+ for (i = 0; groups[i]; i++) {
+ if (groups[i] == ag) {
+ for (i++; groups[i]; i++)
+ groups[i - 1] = groups[i];
+ groups[i - 1] = NULL;
+ break;
+ }
+ }
+}
+
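
pmu_clear_mapping_attr() drops a single attribute group from a NULL-terminated pointer array by sliding the tail entries down over it. The same idea in a stand-alone form, with placeholder string names instead of attribute groups:

#include <stdio.h>

static void clear_entry(const char **arr, const char *victim)
{
        int i;

        for (i = 0; arr[i]; i++) {
                if (arr[i] == victim) {
                        for (i++; arr[i]; i++)
                                arr[i - 1] = arr[i];
                        arr[i - 1] = NULL;
                        break;
                }
        }
}

int main(void)
{
        const char *a = "alias", *b = "mapping", *c = "other";
        const char *groups[] = { a, b, c, NULL };
        int i;

        clear_entry(groups, b);
        for (i = 0; groups[i]; i++)
                printf("%s\n", groups[i]);      /* prints: alias, other */
        return 0;
}
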
+static void
+pmu_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag,
+ ssize_t (*show)(struct device*, struct device_attribute*, char*),
+ int topology_type)
{
char buf[64];
int ret;
@@ -3812,11 +3936,13 @@ pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
struct attribute **attrs = NULL;
struct dev_ext_attribute *eas = NULL;
- ret = type->get_topology(type);
+ ret = pmu_alloc_topology(type, topology_type);
if (ret < 0)
goto clear_attr_update;
- ret = -ENOMEM;
+ ret = type->get_topology(type);
+ if (ret < 0)
+ goto clear_topology;
/* One more for NULL. */
attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL);
@@ -3828,20 +3954,20 @@ pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
goto clear_attrs;
for (die = 0; die < uncore_max_dies(); die++) {
- sprintf(buf, "die%ld", die);
+ snprintf(buf, sizeof(buf), "die%ld", die);
sysfs_attr_init(&eas[die].attr.attr);
eas[die].attr.attr.name = kstrdup(buf, GFP_KERNEL);
if (!eas[die].attr.attr.name)
goto err;
eas[die].attr.attr.mode = 0444;
- eas[die].attr.show = skx_iio_mapping_show;
+ eas[die].attr.show = show;
eas[die].attr.store = NULL;
eas[die].var = (void *)die;
attrs[die] = &eas[die].attr.attr;
}
ag->attrs = attrs;
- return 0;
+ return;
err:
for (; die >= 0; die--)
kfree(eas[die].attr.attr.name);
@@ -3849,14 +3975,13 @@ err:
clear_attrs:
kfree(attrs);
clear_topology:
- kfree(type->topology);
+ pmu_free_topology(type);
clear_attr_update:
- type->attr_update = NULL;
- return ret;
+ pmu_clear_mapping_attr(type->attr_update, ag);
}
static void
-pmu_iio_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
+pmu_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
{
struct attribute **attr = ag->attrs;
@@ -3868,17 +3993,23 @@ pmu_iio_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *
kfree(attr_to_ext_attr(*ag->attrs));
kfree(ag->attrs);
ag->attrs = NULL;
- kfree(type->topology);
+ pmu_free_topology(type);
+}
+
+static void
+pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
+{
+ pmu_set_mapping(type, ag, skx_iio_mapping_show, IIO_TOPOLOGY_TYPE);
}
-static int skx_iio_set_mapping(struct intel_uncore_type *type)
+static void skx_iio_set_mapping(struct intel_uncore_type *type)
{
- return pmu_iio_set_mapping(type, &skx_iio_mapping_group);
+ pmu_iio_set_mapping(type, &skx_iio_mapping_group);
}
static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)
{
- pmu_iio_cleanup_mapping(type, &skx_iio_mapping_group);
+ pmu_cleanup_mapping(type, &skx_iio_mapping_group);
}
static struct intel_uncore_type skx_uncore_iio = {
@@ -4139,6 +4270,132 @@ static struct intel_uncore_ops skx_upi_uncore_pci_ops = {
.read_counter = snbep_uncore_pci_read_counter,
};
+static umode_t
+skx_upi_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+{
+ struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj));
+
+ return pmu->type->topology[die][pmu->pmu_idx].upi->enabled ? attr->mode : 0;
+}
+
+static ssize_t skx_upi_mapping_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev);
+ struct dev_ext_attribute *ea = to_dev_ext_attribute(attr);
+ long die = (long)ea->var;
+ struct uncore_upi_topology *upi = pmu->type->topology[die][pmu->pmu_idx].upi;
+
+ return sysfs_emit(buf, "upi_%d,die_%d\n", upi->pmu_idx_to, upi->die_to);
+}
+
+#define SKX_UPI_REG_DID 0x2058
+#define SKX_UPI_REGS_ADDR_DEVICE_LINK0 0x0e
+#define SKX_UPI_REGS_ADDR_FUNCTION 0x00
+
+/*
+ * UPI Link Parameter 0
+ * | Bit | Default | Description
+ * | 19:16 | 0h | base_nodeid - The NodeID of the sending socket.
+ * | 12:8 | 00h | sending_port - The processor die port number of the sending port.
+ */
+#define SKX_KTILP0_OFFSET 0x94
+
+/*
+ * UPI Pcode Status. This register is used by PCode to store the link training status.
+ * | Bit | Default | Description
+ * | 4 | 0h | ll_status_valid — Bit indicates the valid training status
+ * logged from PCode to the BIOS.
+ */
+#define SKX_KTIPCSTS_OFFSET 0x120
+
+static int upi_fill_topology(struct pci_dev *dev, struct intel_uncore_topology *tp,
+ int pmu_idx)
+{
+ int ret;
+ u32 upi_conf;
+ struct uncore_upi_topology *upi = tp->upi;
+
+ tp->pmu_idx = pmu_idx;
+ ret = pci_read_config_dword(dev, SKX_KTIPCSTS_OFFSET, &upi_conf);
+ if (ret) {
+ ret = pcibios_err_to_errno(ret);
+ goto err;
+ }
+ upi->enabled = (upi_conf >> 4) & 1;
+ if (upi->enabled) {
+ ret = pci_read_config_dword(dev, SKX_KTILP0_OFFSET,
+ &upi_conf);
+ if (ret) {
+ ret = pcibios_err_to_errno(ret);
+ goto err;
+ }
+ upi->die_to = (upi_conf >> 16) & 0xf;
+ upi->pmu_idx_to = (upi_conf >> 8) & 0x1f;
+ }
+err:
+ return ret;
+}
+
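
The field extraction in upi_fill_topology() follows the register layouts documented above: bit 4 of KTIPCSTS is the link-status-valid flag used here as "enabled", while KTILP0 carries base_nodeid in bits 19:16 and sending_port in bits 12:8. A short sketch with made-up register values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* hypothetical register images for one UPI link */
        uint32_t ktipcsts = 1u << 4;                    /* ll_status_valid set */
        uint32_t ktilp0   = (3u << 16) | (2u << 8);     /* base_nodeid 3, sending_port 2 */

        int enabled    = (ktipcsts >> 4) & 1;
        int die_to     = (ktilp0 >> 16) & 0xf;
        int pmu_idx_to = (ktilp0 >> 8) & 0x1f;

        printf("enabled=%d die_to=%d pmu_idx_to=%d\n", enabled, die_to, pmu_idx_to);
        return 0;
}
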
+static int skx_upi_topology_cb(struct intel_uncore_type *type, int segment,
+ int die, u64 cpu_bus_msr)
+{
+ int idx, ret;
+ struct intel_uncore_topology *upi;
+ unsigned int devfn;
+ struct pci_dev *dev = NULL;
+ u8 bus = cpu_bus_msr >> (3 * BUS_NUM_STRIDE);
+
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ upi = &type->topology[die][idx];
+ devfn = PCI_DEVFN(SKX_UPI_REGS_ADDR_DEVICE_LINK0 + idx,
+ SKX_UPI_REGS_ADDR_FUNCTION);
+ dev = pci_get_domain_bus_and_slot(segment, bus, devfn);
+ if (dev) {
+ ret = upi_fill_topology(dev, upi, idx);
+ if (ret)
+ break;
+ }
+ }
+
+ pci_dev_put(dev);
+ return ret;
+}
+
+static int skx_upi_get_topology(struct intel_uncore_type *type)
+{
+ /* CPX case is not supported */
+ if (boot_cpu_data.x86_stepping == 11)
+ return -EPERM;
+
+ return skx_pmu_get_topology(type, skx_upi_topology_cb);
+}
+
+static struct attribute_group skx_upi_mapping_group = {
+ .is_visible = skx_upi_mapping_visible,
+};
+
+static const struct attribute_group *skx_upi_attr_update[] = {
+ &skx_upi_mapping_group,
+ NULL
+};
+
+static void
+pmu_upi_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
+{
+ pmu_set_mapping(type, ag, skx_upi_mapping_show, UPI_TOPOLOGY_TYPE);
+}
+
+static void skx_upi_set_mapping(struct intel_uncore_type *type)
+{
+ pmu_upi_set_mapping(type, &skx_upi_mapping_group);
+}
+
+static void skx_upi_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_cleanup_mapping(type, &skx_upi_mapping_group);
+}
+
static struct intel_uncore_type skx_uncore_upi = {
.name = "upi",
.num_counters = 4,
@@ -4151,6 +4408,10 @@ static struct intel_uncore_type skx_uncore_upi = {
.box_ctl = SKX_UPI_PCI_PMON_BOX_CTL,
.ops = &skx_upi_uncore_pci_ops,
.format_group = &skx_upi_uncore_format_group,
+ .attr_update = skx_upi_attr_update,
+ .get_topology = skx_upi_get_topology,
+ .set_mapping = skx_upi_set_mapping,
+ .cleanup_mapping = skx_upi_cleanup_mapping,
};
static void skx_m2m_uncore_pci_init_box(struct intel_uncore_box *box)
@@ -4461,11 +4722,6 @@ static int sad_cfg_iio_topology(struct intel_uncore_type *type, u8 *sad_pmon_map
int die, stack_id, ret = -EPERM;
struct pci_dev *dev = NULL;
- type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology),
- GFP_KERNEL);
- if (!type->topology)
- return -ENOMEM;
-
while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, SNR_ICX_MESH2IIO_MMAP_DID, dev))) {
ret = pci_read_config_dword(dev, SNR_ICX_SAD_CONTROL_CFG, &sad_cfg);
if (ret) {
@@ -4483,14 +4739,12 @@ static int sad_cfg_iio_topology(struct intel_uncore_type *type, u8 *sad_pmon_map
/* Convert stack id from SAD_CONTROL to PMON notation. */
stack_id = sad_pmon_mapping[stack_id];
- ((u8 *)&(type->topology[die].configuration))[stack_id] = dev->bus->number;
- type->topology[die].segment = pci_domain_nr(dev->bus);
+ type->topology[die][stack_id].iio->segment = pci_domain_nr(dev->bus);
+ type->topology[die][stack_id].pmu_idx = stack_id;
+ type->topology[die][stack_id].iio->pci_bus_no = dev->bus->number;
}
- if (ret) {
- kfree(type->topology);
- type->topology = NULL;
- }
+ pci_dev_put(dev);
return ret;
}
@@ -4519,14 +4773,14 @@ static int snr_iio_get_topology(struct intel_uncore_type *type)
return sad_cfg_iio_topology(type, snr_sad_pmon_mapping);
}
-static int snr_iio_set_mapping(struct intel_uncore_type *type)
+static void snr_iio_set_mapping(struct intel_uncore_type *type)
{
- return pmu_iio_set_mapping(type, &snr_iio_mapping_group);
+ pmu_iio_set_mapping(type, &snr_iio_mapping_group);
}
static void snr_iio_cleanup_mapping(struct intel_uncore_type *type)
{
- pmu_iio_cleanup_mapping(type, &snr_iio_mapping_group);
+ pmu_cleanup_mapping(type, &snr_iio_mapping_group);
}
static struct event_constraint snr_uncore_iio_constraints[] = {
@@ -4857,6 +5111,8 @@ static int snr_uncore_mmio_map(struct intel_uncore_box *box,
addr += box_ctl;
+ pci_dev_put(pdev);
+
box->io_addr = ioremap(addr, type->mmio_map_size);
if (!box->io_addr) {
pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
@@ -5137,14 +5393,19 @@ static int icx_iio_get_topology(struct intel_uncore_type *type)
return sad_cfg_iio_topology(type, icx_sad_pmon_mapping);
}
-static int icx_iio_set_mapping(struct intel_uncore_type *type)
+static void icx_iio_set_mapping(struct intel_uncore_type *type)
{
- return pmu_iio_set_mapping(type, &icx_iio_mapping_group);
+ /* Detect ICX-D system. This case is not supported */
+ if (boot_cpu_data.x86_model == INTEL_FAM6_ICELAKE_D) {
+ pmu_clear_mapping_attr(type->attr_update, &icx_iio_mapping_group);
+ return;
+ }
+ pmu_iio_set_mapping(type, &icx_iio_mapping_group);
}
static void icx_iio_cleanup_mapping(struct intel_uncore_type *type)
{
- pmu_iio_cleanup_mapping(type, &icx_iio_mapping_group);
+ pmu_cleanup_mapping(type, &icx_iio_mapping_group);
}
static struct intel_uncore_type icx_uncore_iio = {
@@ -5337,6 +5598,76 @@ static const struct attribute_group icx_upi_uncore_format_group = {
.attrs = icx_upi_uncore_formats_attr,
};
+#define ICX_UPI_REGS_ADDR_DEVICE_LINK0 0x02
+#define ICX_UPI_REGS_ADDR_FUNCTION 0x01
+
+static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, int dev_link0)
+{
+ struct pci_dev *ubox = NULL;
+ struct pci_dev *dev = NULL;
+ u32 nid, gid;
+ int i, idx, ret = -EPERM;
+ struct intel_uncore_topology *upi;
+ unsigned int devfn;
+
+	/* The GIDNIDMAP method supports machines with no more than 8 sockets. */
+ if (uncore_max_dies() > 8)
+ goto err;
+
+ while ((ubox = pci_get_device(PCI_VENDOR_ID_INTEL, ubox_did, ubox))) {
+ ret = upi_nodeid_groupid(ubox, SKX_CPUNODEID, SKX_GIDNIDMAP, &nid, &gid);
+ if (ret) {
+ ret = pcibios_err_to_errno(ret);
+ break;
+ }
+
+ for (i = 0; i < 8; i++) {
+ if (nid != GIDNIDMAP(gid, i))
+ continue;
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ upi = &type->topology[nid][idx];
+ devfn = PCI_DEVFN(dev_link0 + idx, ICX_UPI_REGS_ADDR_FUNCTION);
+ dev = pci_get_domain_bus_and_slot(pci_domain_nr(ubox->bus),
+ ubox->bus->number,
+ devfn);
+ if (dev) {
+ ret = upi_fill_topology(dev, upi, idx);
+ if (ret)
+ goto err;
+ }
+ }
+ }
+ }
+err:
+ pci_dev_put(ubox);
+ pci_dev_put(dev);
+ return ret;
+}
+
+static int icx_upi_get_topology(struct intel_uncore_type *type)
+{
+ return discover_upi_topology(type, ICX_UBOX_DID, ICX_UPI_REGS_ADDR_DEVICE_LINK0);
+}
+
+static struct attribute_group icx_upi_mapping_group = {
+ .is_visible = skx_upi_mapping_visible,
+};
+
+static const struct attribute_group *icx_upi_attr_update[] = {
+ &icx_upi_mapping_group,
+ NULL
+};
+
+static void icx_upi_set_mapping(struct intel_uncore_type *type)
+{
+ pmu_upi_set_mapping(type, &icx_upi_mapping_group);
+}
+
+static void icx_upi_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_cleanup_mapping(type, &icx_upi_mapping_group);
+}
+
static struct intel_uncore_type icx_uncore_upi = {
.name = "upi",
.num_counters = 4,
@@ -5349,6 +5680,10 @@ static struct intel_uncore_type icx_uncore_upi = {
.box_ctl = ICX_UPI_PCI_PMON_BOX_CTL,
.ops = &skx_upi_uncore_pci_ops,
.format_group = &icx_upi_uncore_format_group,
+ .attr_update = icx_upi_attr_update,
+ .get_topology = icx_upi_get_topology,
+ .set_mapping = icx_upi_set_mapping,
+ .cleanup_mapping = icx_upi_cleanup_mapping,
};
static struct event_constraint icx_uncore_m3upi_constraints[] = {
@@ -5780,9 +6115,43 @@ static struct intel_uncore_type spr_uncore_m2m = {
.name = "m2m",
};
+static struct attribute_group spr_upi_mapping_group = {
+ .is_visible = skx_upi_mapping_visible,
+};
+
+static const struct attribute_group *spr_upi_attr_update[] = {
+ &uncore_alias_group,
+ &spr_upi_mapping_group,
+ NULL
+};
+
+#define SPR_UPI_REGS_ADDR_DEVICE_LINK0 0x01
+
+static void spr_upi_set_mapping(struct intel_uncore_type *type)
+{
+ pmu_upi_set_mapping(type, &spr_upi_mapping_group);
+}
+
+static void spr_upi_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_cleanup_mapping(type, &spr_upi_mapping_group);
+}
+
+static int spr_upi_get_topology(struct intel_uncore_type *type)
+{
+ return discover_upi_topology(type, SPR_UBOX_DID, SPR_UPI_REGS_ADDR_DEVICE_LINK0);
+}
+
static struct intel_uncore_type spr_uncore_upi = {
- SPR_UNCORE_PCI_COMMON_FORMAT(),
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SPR_RAW_EVENT_MASK_EXT,
+ .format_group = &spr_uncore_raw_format_group,
+ .ops = &spr_uncore_pci_ops,
.name = "upi",
+ .attr_update = spr_upi_attr_update,
+ .get_topology = spr_upi_get_topology,
+ .set_mapping = spr_upi_set_mapping,
+ .cleanup_mapping = spr_upi_cleanup_mapping,
};
static struct intel_uncore_type spr_uncore_m3upi = {
@@ -5986,6 +6355,12 @@ static void uncore_type_customized_copy(struct intel_uncore_type *to_type,
to_type->format_group = from_type->format_group;
if (from_type->attr_update)
to_type->attr_update = from_type->attr_update;
+ if (from_type->set_mapping)
+ to_type->set_mapping = from_type->set_mapping;
+ if (from_type->get_topology)
+ to_type->get_topology = from_type->get_topology;
+ if (from_type->cleanup_mapping)
+ to_type->cleanup_mapping = from_type->cleanup_mapping;
}
static struct intel_uncore_type **
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 332d2e6d8ae4..0e849f28a5c1 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -811,7 +811,7 @@ struct x86_pmu {
void (*cpu_dead)(int cpu);
void (*check_microcode)(void);
- void (*sched_task)(struct perf_event_context *ctx,
+ void (*sched_task)(struct perf_event_pmu_context *pmu_ctx,
bool sched_in);
/*
@@ -894,12 +894,12 @@ struct x86_pmu {
int num_topdown_events;
/*
- * perf task context (i.e. struct perf_event_context::task_ctx_data)
+ * perf task context (i.e. struct perf_event_pmu_context::task_ctx_data)
* switch helper to bridge calls from perf/core to perf/x86.
* See struct pmu::swap_task_ctx() usage for examples;
*/
- void (*swap_task_ctx)(struct perf_event_context *prev,
- struct perf_event_context *next);
+ void (*swap_task_ctx)(struct perf_event_pmu_context *prev_epc,
+ struct perf_event_pmu_context *next_epc);
/*
* AMD bits
@@ -925,7 +925,7 @@ struct x86_pmu {
int (*aux_output_match) (struct perf_event *event);
- int (*filter_match)(struct perf_event *event);
+ void (*filter)(struct pmu *pmu, int cpu, bool *ret);
/*
* Hybrid support
*
@@ -1180,8 +1180,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs);
void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
u64 intel_ctrl);
-void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu);
-
extern struct event_constraint emptyconstraint;
extern struct event_constraint unconstrained;
@@ -1306,7 +1304,7 @@ void amd_pmu_lbr_reset(void);
void amd_pmu_lbr_read(void);
void amd_pmu_lbr_add(struct perf_event *event);
void amd_pmu_lbr_del(struct perf_event *event);
-void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
void amd_pmu_lbr_enable_all(void);
void amd_pmu_lbr_disable_all(void);
int amd_pmu_lbr_hw_config(struct perf_event *event);
@@ -1322,7 +1320,6 @@ void amd_brs_enable_all(void);
void amd_brs_disable_all(void);
void amd_brs_drain(void);
void amd_brs_lopwr_init(void);
-void amd_brs_disable_all(void);
int amd_brs_hw_config(struct perf_event *event);
void amd_brs_reset(void);
@@ -1330,7 +1327,7 @@ static inline void amd_pmu_brs_add(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- perf_sched_cb_inc(event->ctx->pmu);
+ perf_sched_cb_inc(event->pmu);
cpuc->lbr_users++;
/*
* No need to reset BRS because it is reset
@@ -1345,10 +1342,10 @@ static inline void amd_pmu_brs_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
- perf_sched_cb_dec(event->ctx->pmu);
+ perf_sched_cb_dec(event->pmu);
}
-void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in);
+void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
#else
static inline int amd_brs_init(void)
{
@@ -1373,7 +1370,7 @@ static inline void amd_pmu_brs_del(struct perf_event *event)
{
}
-static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in)
+static inline void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
}
@@ -1533,7 +1530,7 @@ void intel_pmu_pebs_enable_all(void);
void intel_pmu_pebs_disable_all(void);
-void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
+void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
void intel_pmu_auto_reload_read(struct perf_event *event);
@@ -1541,10 +1538,10 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr);
void intel_ds_init(void);
-void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
- struct perf_event_context *next);
+void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
+ struct perf_event_pmu_context *next_epc);
-void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
u64 lbr_from_signext_quirk_wr(u64 val);
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index a829492bca4c..52e6e7ed4f78 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -800,13 +800,18 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx),
X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr),
+ X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &model_spr),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &model_skl),
{},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 29774126e931..41ef036ebb7b 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -77,7 +77,7 @@ static int hyperv_init_ghcb(void)
static int hv_cpu_init(unsigned int cpu)
{
union hv_vp_assist_msr_contents msr = { 0 };
- struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()];
+ struct hv_vp_assist_page **hvp = &hv_vp_assist_page[cpu];
int ret;
ret = hv_common_cpu_init(cpu);
@@ -87,34 +87,32 @@ static int hv_cpu_init(unsigned int cpu)
if (!hv_vp_assist_page)
return 0;
- if (!*hvp) {
- if (hv_root_partition) {
- /*
- * For root partition we get the hypervisor provided VP assist
- * page, instead of allocating a new page.
- */
- rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
- *hvp = memremap(msr.pfn <<
- HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT,
- PAGE_SIZE, MEMREMAP_WB);
- } else {
- /*
- * The VP assist page is an "overlay" page (see Hyper-V TLFS's
- * Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed
- * out to make sure we always write the EOI MSR in
- * hv_apic_eoi_write() *after* the EOI optimization is disabled
- * in hv_cpu_die(), otherwise a CPU may not be stopped in the
- * case of CPU offlining and the VM will hang.
- */
+ if (hv_root_partition) {
+ /*
+ * For root partition we get the hypervisor provided VP assist
+ * page, instead of allocating a new page.
+ */
+ rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
+ *hvp = memremap(msr.pfn << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT,
+ PAGE_SIZE, MEMREMAP_WB);
+ } else {
+ /*
+ * The VP assist page is an "overlay" page (see Hyper-V TLFS's
+ * Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed
+ * out to make sure we always write the EOI MSR in
+ * hv_apic_eoi_write() *after* the EOI optimization is disabled
+ * in hv_cpu_die(), otherwise a CPU may not be stopped in the
+ * case of CPU offlining and the VM will hang.
+ */
+ if (!*hvp)
*hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
- if (*hvp)
- msr.pfn = vmalloc_to_pfn(*hvp);
- }
- WARN_ON(!(*hvp));
- if (*hvp) {
- msr.enable = 1;
- wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
- }
+ if (*hvp)
+ msr.pfn = vmalloc_to_pfn(*hvp);
+
+ }
+ if (!WARN_ON(!(*hvp))) {
+ msr.enable = 1;
+ wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
}
return hyperv_init_ghcb();
@@ -444,7 +442,7 @@ void __init hyperv_init(void)
if (hv_root_partition) {
struct page *pg;
- void *src, *dst;
+ void *src;
/*
* For the root partition, the hypervisor will set up its
@@ -459,13 +457,13 @@ void __init hyperv_init(void)
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
pg = vmalloc_to_page(hv_hypercall_pg);
- dst = kmap_local_page(pg);
src = memremap(hypercall_msr.guest_physical_address << PAGE_SHIFT, PAGE_SIZE,
MEMREMAP_WB);
- BUG_ON(!(src && dst));
- memcpy(dst, src, HV_HYP_PAGE_SIZE);
+ BUG_ON(!src);
+ memcpy_to_page(pg, 0, src, HV_HYP_PAGE_SIZE);
memunmap(src);
- kunmap_local(dst);
+
+ hv_remap_tsc_clocksource();
} else {
hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
@@ -537,8 +535,7 @@ common_free:
void hyperv_cleanup(void)
{
union hv_x64_msr_hypercall_contents hypercall_msr;
-
- unregister_syscore_ops(&hv_syscore_ops);
+ union hv_reference_tsc_msr tsc_msr;
/* Reset our OS id */
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
@@ -552,12 +549,14 @@ void hyperv_cleanup(void)
hv_hypercall_pg = NULL;
/* Reset the hypercall page */
- hypercall_msr.as_uint64 = 0;
- wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+ hypercall_msr.as_uint64 = hv_get_register(HV_X64_MSR_HYPERCALL);
+ hypercall_msr.enable = 0;
+ hv_set_register(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
/* Reset the TSC page */
- hypercall_msr.as_uint64 = 0;
- wrmsrl(HV_X64_MSR_REFERENCE_TSC, hypercall_msr.as_uint64);
+ tsc_msr.as_uint64 = hv_get_register(HV_X64_MSR_REFERENCE_TSC);
+ tsc_msr.enable = 0;
+ hv_set_register(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
}
void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die)
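
The hyperv_cleanup() change above stops zeroing the hypercall and reference-TSC MSRs wholesale and instead clears only their enable bit, so the guest physical address programmed into the upper bits survives. Assuming the enable flag sits in bit 0 of the MSR layout, the read-modify-write looks like this in miniature:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* hypothetical MSR image: enable bit set, page address in the upper bits */
        uint64_t tsc_msr = (0x12345ULL << 12) | 1;

        tsc_msr &= ~1ULL;       /* drop only the enable bit, keep the address */
        printf("%#llx\n", (unsigned long long)tsc_msr); /* 0x12345000 */
        return 0;
}
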
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index e481056698de..333556a86b2a 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,7 +3,5 @@
# Makefile for the ia32 kernel emulation subsystem.
#
-obj-$(CONFIG_IA32_EMULATION) := ia32_signal.o
-
audit-class-$(CONFIG_AUDIT) := audit.o
obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y)
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 9542c582d546..7659217f4d49 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -78,8 +78,43 @@ extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
extern void apply_retpolines(s32 *start, s32 *end);
extern void apply_returns(s32 *start, s32 *end);
extern void apply_ibt_endbr(s32 *start, s32 *end);
+extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine,
+ s32 *start_cfi, s32 *end_cfi);
struct module;
+struct paravirt_patch_site;
+
+struct callthunk_sites {
+ s32 *call_start, *call_end;
+ struct paravirt_patch_site *pv_start, *pv_end;
+};
+
+#ifdef CONFIG_CALL_THUNKS
+extern void callthunks_patch_builtin_calls(void);
+extern void callthunks_patch_module_calls(struct callthunk_sites *sites,
+ struct module *mod);
+extern void *callthunks_translate_call_dest(void *dest);
+extern bool is_callthunk(void *addr);
+extern int x86_call_depth_emit_accounting(u8 **pprog, void *func);
+#else
+static __always_inline void callthunks_patch_builtin_calls(void) {}
+static __always_inline void
+callthunks_patch_module_calls(struct callthunk_sites *sites,
+ struct module *mod) {}
+static __always_inline void *callthunks_translate_call_dest(void *dest)
+{
+ return dest;
+}
+static __always_inline bool is_callthunk(void *addr)
+{
+ return false;
+}
+static __always_inline int x86_call_depth_emit_accounting(u8 **pprog,
+ void *func)
+{
+ return 0;
+}
+#endif
#ifdef CONFIG_SMP
extern void alternatives_smp_module_add(struct module *mod, char *name,
@@ -347,6 +382,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define old_len 141b-140b
#define new_len1 144f-143f
#define new_len2 145f-144f
+#define new_len3 146f-145f
/*
* gas compatible max based on the idea from:
@@ -354,7 +390,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
*
* The additional "-" is needed because gas uses a "true" value of -1.
*/
-#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
+#define alt_max_2(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
+#define alt_max_3(a, b, c) (alt_max_2(alt_max_2(a, b), c))
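
The alt_max_2() expression is the classic branchless maximum; the extra '-' only matters in gas, where a true comparison evaluates to -1 rather than 1. The C flavour below uses a single negation to build the all-ones mask and can be used to convince yourself the trick really yields max(a, b):

#include <stdio.h>

/* C version of the macro above: -((a) < (b)) is 0 or ~0 here */
#define c_alt_max_2(a, b) ((a) ^ (((a) ^ (b)) & -((a) < (b))))

int main(void)
{
        printf("%d %d %d\n",
               c_alt_max_2(3, 7),       /* 7 */
               c_alt_max_2(7, 3),       /* 7 */
               c_alt_max_2(5, 5));      /* 5 */
        return 0;
}
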
/*
@@ -366,13 +403,36 @@ static inline int alternatives_text_reserved(void *start, void *end)
140:
\oldinstr
141:
- .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \
- (alt_max_short(new_len1, new_len2) - (old_len)),0x90
+ .skip -((alt_max_2(new_len1, new_len2) - (old_len)) > 0) * \
+ (alt_max_2(new_len1, new_len2) - (old_len)),0x90
+142:
+
+ .pushsection .altinstructions,"a"
+ altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f
+ altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr1
+144:
+ \newinstr2
+145:
+ .popsection
+.endm
+
+.macro ALTERNATIVE_3 oldinstr, newinstr1, feature1, newinstr2, feature2, newinstr3, feature3
+140:
+ \oldinstr
+141:
+ .skip -((alt_max_3(new_len1, new_len2, new_len3) - (old_len)) > 0) * \
+ (alt_max_3(new_len1, new_len2, new_len3) - (old_len)),0x90
142:
.pushsection .altinstructions,"a"
altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f
altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f
+ altinstruction_entry 140b,145f,\feature3,142b-140b,146f-145f
.popsection
.pushsection .altinstr_replacement,"ax"
@@ -381,6 +441,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
144:
\newinstr2
145:
+ \newinstr3
+146:
.popsection
.endm
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 3415321c8240..3216da7074ba 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -249,7 +249,6 @@ static inline u64 native_x2apic_icr_read(void)
extern int x2apic_mode;
extern int x2apic_phys;
extern void __init x2apic_set_max_apicid(u32 apicid);
-extern void __init check_x2apic(void);
extern void x2apic_setup(void);
static inline int x2apic_enabled(void)
{
@@ -258,13 +257,13 @@ static inline int x2apic_enabled(void)
#define x2apic_supported() (boot_cpu_has(X86_FEATURE_X2APIC))
#else /* !CONFIG_X86_X2APIC */
-static inline void check_x2apic(void) { }
static inline void x2apic_setup(void) { }
static inline int x2apic_enabled(void) { return 0; }
#define x2apic_mode (0)
#define x2apic_supported() (0)
#endif /* !CONFIG_X86_X2APIC */
+extern void __init check_x2apic(void);
struct irq_data;
diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h
index 86b2e0dcc4bf..ce9685fc78d8 100644
--- a/arch/x86/include/asm/cacheinfo.h
+++ b/arch/x86/include/asm/cacheinfo.h
@@ -2,7 +2,20 @@
#ifndef _ASM_X86_CACHEINFO_H
#define _ASM_X86_CACHEINFO_H
+/* Kernel controls MTRR and/or PAT MSRs. */
+extern unsigned int memory_caching_control;
+#define CACHE_MTRR 0x01
+#define CACHE_PAT 0x02
+
void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu);
void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu);
+void cache_disable(void);
+void cache_enable(void);
+void set_cache_aps_delayed_init(bool val);
+bool get_cache_aps_delayed_init(void);
+void cache_bp_init(void);
+void cache_bp_restore(void);
+void cache_aps_init(void);
+
#endif /* _ASM_X86_CACHEINFO_H */
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 215f5a65790f..6ba80ce9438d 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -7,34 +7,6 @@
* you need to test for the feature in boot_cpu_data.
*/
-/*
- * CMPXCHG8B only writes to the target if we had the previous
- * value in registers, otherwise it acts as a read and gives us the
- * "new previous" value. That is why there is a loop. Preloading
- * EDX:EAX is a performance optimization: in the common case it means
- * we need only one locked operation.
- *
- * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very
- * least an FPU save and/or %cr0.ts manipulation.
- *
- * cmpxchg8b must be used with the lock prefix here to allow the
- * instruction to be executed atomically. We need to have the reader
- * side to see the coherent 64bit value.
- */
-static inline void set_64bit(volatile u64 *ptr, u64 value)
-{
- u32 low = value;
- u32 high = value >> 32;
- u64 prev = *ptr;
-
- asm volatile("\n1:\t"
- LOCK_PREFIX "cmpxchg8b %0\n\t"
- "jnz 1b"
- : "=m" (*ptr), "+A" (prev)
- : "b" (low), "c" (high)
- : "memory");
-}
-
#ifdef CONFIG_X86_CMPXCHG64
#define arch_cmpxchg64(ptr, o, n) \
((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 250187ac8248..0d3beb27b7fe 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -2,11 +2,6 @@
#ifndef _ASM_X86_CMPXCHG_64_H
#define _ASM_X86_CMPXCHG_64_H
-static inline void set_64bit(volatile u64 *ptr, u64 val)
-{
- *ptr = val;
-}
-
#define arch_cmpxchg64(ptr, o, n) \
({ \
BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index b472ef76826a..78796b98a544 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -95,5 +95,7 @@ static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1,
}
extern u64 x86_read_arch_cap_msr(void);
+int intel_find_matching_signature(void *mc, unsigned int csig, int cpf);
+int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type);
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 75efc4c6f076..462fc34f1317 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -130,10 +130,6 @@ struct cpu_entry_area {
};
#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
-#define CPU_ENTRY_AREA_ARRAY_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS)
-
-/* Total size includes the readonly IDT mapping page as well: */
-#define CPU_ENTRY_AREA_TOTAL_SIZE (CPU_ENTRY_AREA_ARRAY_SIZE + PAGE_SIZE)
DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks);
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 1419c4e04d45..61012476d66e 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -304,6 +304,9 @@
#define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */
#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */
#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */
+#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* "" SGX EDECCSSA user leaf function */
+#define X86_FEATURE_CALL_DEPTH (11*32+19) /* "" Call depth tracking for RSB stuffing */
+#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h
index 70b2db18165e..9bee3e7bf973 100644
--- a/arch/x86/include/asm/cpuid.h
+++ b/arch/x86/include/asm/cpuid.h
@@ -1,13 +1,132 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* CPUID-related helpers/definitions
- *
- * Derived from arch/x86/kvm/cpuid.c
*/
#ifndef _ASM_X86_CPUID_H
#define _ASM_X86_CPUID_H
+#include <asm/string.h>
+
+struct cpuid_regs {
+ u32 eax, ebx, ecx, edx;
+};
+
+enum cpuid_regs_idx {
+ CPUID_EAX = 0,
+ CPUID_EBX,
+ CPUID_ECX,
+ CPUID_EDX,
+};
+
+#ifdef CONFIG_X86_32
+extern int have_cpuid_p(void);
+#else
+static inline int have_cpuid_p(void)
+{
+ return 1;
+}
+#endif
+static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm volatile("cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx)
+ : "memory");
+}
+
+#define native_cpuid_reg(reg) \
+static inline unsigned int native_cpuid_##reg(unsigned int op) \
+{ \
+ unsigned int eax = op, ebx, ecx = 0, edx; \
+ \
+ native_cpuid(&eax, &ebx, &ecx, &edx); \
+ \
+ return reg; \
+}
+
+/*
+ * Native CPUID functions returning a single datum.
+ */
+native_cpuid_reg(eax)
+native_cpuid_reg(ebx)
+native_cpuid_reg(ecx)
+native_cpuid_reg(edx)
+
+#ifdef CONFIG_PARAVIRT_XXL
+#include <asm/paravirt.h>
+#else
+#define __cpuid native_cpuid
+#endif
+
+/*
+ * Generic CPUID function
+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
+ * resulting in stale register contents being returned.
+ */
+static inline void cpuid(unsigned int op,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = 0;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ecx */
+static inline void cpuid_count(unsigned int op, int count,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = count;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * CPUID functions returning a single datum
+ */
+static inline unsigned int cpuid_eax(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return eax;
+}
+
+static inline unsigned int cpuid_ebx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ebx;
+}
+
+static inline unsigned int cpuid_ecx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ecx;
+}
+
+static inline unsigned int cpuid_edx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return edx;
+}
+
static __always_inline bool cpuid_function_is_indexed(u32 function)
{
switch (function) {
@@ -31,4 +150,22 @@ static __always_inline bool cpuid_function_is_indexed(u32 function)
return false;
}
+#define for_each_possible_hypervisor_cpuid_base(function) \
+ for (function = 0x40000000; function < 0x40010000; function += 0x100)
+
+static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
+{
+ uint32_t base, eax, signature[3];
+
+ for_each_possible_hypervisor_cpuid_base(base) {
+ cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);
+
+ if (!memcmp(sig, signature, 12) &&
+ (leaves == 0 || ((eax - base) >= leaves)))
+ return base;
+ }
+
+ return 0;
+}
+
#endif /* _ASM_X86_CPUID_H */
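
A user-space approximation of the new hypervisor_cpuid_base() helper, using the compiler-provided <cpuid.h> (GCC/clang, x86 only) instead of the kernel's cpuid() wrappers; the leaves check is omitted for brevity and the KVM signature string is used only as a familiar example.

#include <stdio.h>
#include <string.h>
#include <cpuid.h>              /* __cpuid_count() from GCC/clang, x86 only */

static unsigned int find_hypervisor_base(const char *sig)
{
        unsigned int base, eax, regs[3];

        for (base = 0x40000000; base < 0x40010000; base += 0x100) {
                __cpuid_count(base, 0, eax, regs[0], regs[1], regs[2]);
                (void)eax;      /* number of leaves; ignored in this sketch */
                if (!memcmp(sig, regs, 12))
                        return base;
        }
        return 0;
}

int main(void)
{
        /* typically prints 0x40000000 in a KVM guest, 0 on bare metal */
        printf("%#x\n", find_hypervisor_base("KVMKVMKVM\0\0\0"));
        return 0;
}
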
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 3e204e6140b5..a1168e7b69e5 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -3,16 +3,42 @@
#define _ASM_X86_CURRENT_H
#include <linux/compiler.h>
-#include <asm/percpu.h>
#ifndef __ASSEMBLY__
+
+#include <linux/cache.h>
+#include <asm/percpu.h>
+
struct task_struct;
-DECLARE_PER_CPU(struct task_struct *, current_task);
+struct pcpu_hot {
+ union {
+ struct {
+ struct task_struct *current_task;
+ int preempt_count;
+ int cpu_number;
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+ u64 call_depth;
+#endif
+ unsigned long top_of_stack;
+ void *hardirq_stack_ptr;
+ u16 softirq_pending;
+#ifdef CONFIG_X86_64
+ bool hardirq_stack_inuse;
+#else
+ void *softirq_stack_ptr;
+#endif
+ };
+ u8 pad[64];
+ };
+};
+static_assert(sizeof(struct pcpu_hot) == 64);
+
+DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot);
static __always_inline struct task_struct *get_current(void)
{
- return this_cpu_read_stable(current_task);
+ return this_cpu_read_stable(pcpu_hot.current_task);
}
#define current get_current()
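
The new pcpu_hot structure keeps the hottest per-CPU fields together and pads them to exactly one cache line, which the static_assert() enforces. The stand-alone equivalent of that union-with-pad pattern, with the field list trimmed and sizes assumed for a typical 64-bit build:

#include <stdio.h>

struct hot {
        union {
                struct {
                        void *current_task;
                        int preempt_count;
                        int cpu_number;
                        unsigned long top_of_stack;
                        void *hardirq_stack_ptr;
                        unsigned short softirq_pending;
                };
                unsigned char pad[64];
        };
};

_Static_assert(sizeof(struct hot) == 64, "hot data must fit one cache line");

int main(void)
{
        printf("sizeof(struct hot) = %zu\n", sizeof(struct hot));
        return 0;
}
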
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index cfdf307ddc01..b049d950612f 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -2,8 +2,8 @@
#ifndef _ASM_X86_DEBUGREG_H
#define _ASM_X86_DEBUGREG_H
-
#include <linux/bug.h>
+#include <linux/percpu.h>
#include <uapi/asm/debugreg.h>
DECLARE_PER_CPU(unsigned long, cpu_dr7);
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 33d2cd04d254..c44b56f7ffba 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -69,6 +69,12 @@
# define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31))
#endif
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+# define DISABLE_CALL_DEPTH_TRACKING 0
+#else
+# define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31))
+#endif
+
#ifdef CONFIG_INTEL_IOMMU_SVM
# define DISABLE_ENQCMD 0
#else
@@ -81,6 +87,12 @@
# define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31))
#endif
+#ifdef CONFIG_XEN_PV
+# define DISABLE_XENPV 0
+#else
+# define DISABLE_XENPV (1 << (X86_FEATURE_XENPV & 31))
+#endif
+
#ifdef CONFIG_INTEL_TDX_GUEST
# define DISABLE_TDX_GUEST 0
#else
@@ -98,10 +110,11 @@
#define DISABLED_MASK5 0
#define DISABLED_MASK6 0
#define DISABLED_MASK7 (DISABLE_PTI)
-#define DISABLED_MASK8 (DISABLE_TDX_GUEST)
+#define DISABLED_MASK8 (DISABLE_XENPV|DISABLE_TDX_GUEST)
#define DISABLED_MASK9 (DISABLE_SGX)
#define DISABLED_MASK10 0
-#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET)
+#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
+ DISABLE_CALL_DEPTH_TRACKING)
#define DISABLED_MASK12 0
#define DISABLED_MASK13 0
#define DISABLED_MASK14 0
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 233ae6986d6f..a63154e049d7 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -178,7 +178,7 @@ struct efi_setup_data {
extern u64 efi_setup;
#ifdef CONFIG_EFI
-extern efi_status_t __efi64_thunk(u32, ...);
+extern u64 __efi64_thunk(u32, ...);
#define efi64_thunk(...) ({ \
u64 __pad[3]; /* must have space for 3 args on the stack */ \
@@ -228,16 +228,15 @@ static inline bool efi_is_native(void)
return efi_is_64bit();
}
-#define efi_mixed_mode_cast(attr) \
- __builtin_choose_expr( \
- __builtin_types_compatible_p(u32, __typeof__(attr)), \
- (unsigned long)(attr), (attr))
-
#define efi_table_attr(inst, attr) \
- (efi_is_native() \
- ? inst->attr \
- : (__typeof__(inst->attr)) \
- efi_mixed_mode_cast(inst->mixed_mode.attr))
+ (efi_is_native() ? (inst)->attr \
+ : efi_mixed_table_attr((inst), attr))
+
+#define efi_mixed_table_attr(inst, attr) \
+ (__typeof__(inst->attr)) \
+ _Generic(inst->mixed_mode.attr, \
+ u32: (unsigned long)(inst->mixed_mode.attr), \
+ default: (inst->mixed_mode.attr))
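
The efi_mixed_table_attr() macro relies on _Generic to widen 32-bit mixed-mode fields through unsigned long while leaving native-width fields alone. A minimal demo of that dispatch, with arbitrary types and values standing in for the EFI table members:

#include <stdio.h>

#define widen(x) _Generic((x), unsigned int: (unsigned long)(x), default: (x))

int main(void)
{
        unsigned int attr32 = 0x89abcdefu;
        void *native = &attr32;

        unsigned long widened = widen(attr32);  /* u32 -> zero-extended */
        void *untouched = widen(native);        /* other types pass through */

        printf("%#lx %p\n", widened, untouched);
        return 0;
}
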
/*
* The following macros allow translating arguments if necessary from native to
@@ -325,6 +324,17 @@ static inline u32 efi64_convert_status(efi_status_t status)
#define __efi64_argmap_set_memory_space_attributes(phys, size, flags) \
(__efi64_split(phys), __efi64_split(size), __efi64_split(flags))
+/* file protocol */
+#define __efi64_argmap_open(prot, newh, fname, mode, attr) \
+ ((prot), efi64_zero_upper(newh), (fname), __efi64_split(mode), \
+ __efi64_split(attr))
+
+#define __efi64_argmap_set_position(pos) (__efi64_split(pos))
+
+/* file system protocol */
+#define __efi64_argmap_open_volume(prot, file) \
+ ((prot), efi64_zero_upper(file))
+
/*
* The macros below handle the plumbing for the argument mapping. To add a
* mapping for a specific EFI method, simply define a macro
@@ -344,31 +354,27 @@ static inline u32 efi64_convert_status(efi_status_t status)
#define __efi_eat(...)
#define __efi_eval(...) __VA_ARGS__
-/* The three macros below handle dispatching via the thunk if needed */
-
-#define efi_call_proto(inst, func, ...) \
- (efi_is_native() \
- ? inst->func(inst, ##__VA_ARGS__) \
- : __efi64_thunk_map(inst, func, inst, ##__VA_ARGS__))
+static inline efi_status_t __efi64_widen_efi_status(u64 status)
+{
+ /* use rotate to move the value of bit #31 into position #63 */
+ return ror64(rol32(status, 1), 1);
+}
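
__efi64_widen_efi_status() moves the 32-bit status error flag (bit 31) into the 64-bit error position (bit 63) with a rotate pair, leaving the low status bits untouched. A self-contained version with local rol32/ror64 helpers, exercised on a couple of sample values:

#include <stdio.h>
#include <stdint.h>

static uint32_t rol32(uint32_t v, unsigned int s) { return (v << s) | (v >> (32 - s)); }
static uint64_t ror64(uint64_t v, unsigned int s) { return (v >> s) | (v << (64 - s)); }

static uint64_t widen_efi_status(uint64_t status)
{
        /* bit 31 -> bit 0 (rol32), then bit 0 -> bit 63 (ror64) */
        return ror64(rol32(status, 1), 1);
}

int main(void)
{
        printf("%#llx\n", (unsigned long long)widen_efi_status(0x80000003u)); /* 0x8000000000000003 */
        printf("%#llx\n", (unsigned long long)widen_efi_status(0x00000005u)); /* 0x5 */
        return 0;
}
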
-#define efi_bs_call(func, ...) \
- (efi_is_native() \
- ? efi_system_table->boottime->func(__VA_ARGS__) \
- : __efi64_thunk_map(efi_table_attr(efi_system_table, \
- boottime), \
- func, __VA_ARGS__))
+/* The macro below handles dispatching via the thunk if needed */
-#define efi_rt_call(func, ...) \
- (efi_is_native() \
- ? efi_system_table->runtime->func(__VA_ARGS__) \
- : __efi64_thunk_map(efi_table_attr(efi_system_table, \
- runtime), \
- func, __VA_ARGS__))
+#define efi_fn_call(inst, func, ...) \
+ (efi_is_native() ? (inst)->func(__VA_ARGS__) \
+ : efi_mixed_call((inst), func, ##__VA_ARGS__))
-#define efi_dxe_call(func, ...) \
- (efi_is_native() \
- ? efi_dxe_table->func(__VA_ARGS__) \
- : __efi64_thunk_map(efi_dxe_table, func, __VA_ARGS__))
+#define efi_mixed_call(inst, func, ...) \
+ _Generic(inst->func(__VA_ARGS__), \
+ efi_status_t: \
+ __efi64_widen_efi_status( \
+ __efi64_thunk_map(inst, func, ##__VA_ARGS__)), \
+ u64: ({ BUILD_BUG(); ULONG_MAX; }), \
+ default: \
+ (__typeof__(inst->func(__VA_ARGS__))) \
+ __efi64_thunk_map(inst, func, ##__VA_ARGS__))
#else /* CONFIG_EFI_MIXED */
@@ -400,13 +406,52 @@ static inline void efi_reserve_boot_services(void)
#ifdef CONFIG_EFI_FAKE_MEMMAP
extern void __init efi_fake_memmap_early(void);
+extern void __init efi_fake_memmap(void);
#else
static inline void efi_fake_memmap_early(void)
{
}
+
+static inline void efi_fake_memmap(void)
+{
+}
#endif
+extern int __init efi_memmap_alloc(unsigned int num_entries,
+ struct efi_memory_map_data *data);
+extern void __efi_memmap_free(u64 phys, unsigned long size,
+ unsigned long flags);
+#define __efi_memmap_free __efi_memmap_free
+
+extern int __init efi_memmap_install(struct efi_memory_map_data *data);
+extern int __init efi_memmap_split_count(efi_memory_desc_t *md,
+ struct range *range);
+extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap,
+ void *buf, struct efi_mem_range *mem);
+
#define arch_ima_efi_boot_mode \
({ extern struct boot_params boot_params; boot_params.secure_boot; })
+#ifdef CONFIG_EFI_RUNTIME_MAP
+int efi_get_runtime_map_size(void);
+int efi_get_runtime_map_desc_size(void);
+int efi_runtime_map_copy(void *buf, size_t bufsz);
+#else
+static inline int efi_get_runtime_map_size(void)
+{
+ return 0;
+}
+
+static inline int efi_get_runtime_map_desc_size(void)
+{
+ return 0;
+}
+
+static inline int efi_runtime_map_copy(void *buf, size_t bufsz)
+{
+ return 0;
+}
+
+#endif
+
#endif /* _ASM_X86_EFI_H */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index cb0ff1055ab1..18fd06f7936a 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -152,10 +152,6 @@ do { \
(elf_check_arch_ia32(x) || \
(IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))
-#if __USER32_DS != __USER_DS
-# error "The following code assumes __USER32_DS == __USER_DS"
-#endif
-
static inline void elf_common_init(struct thread_struct *t,
struct pt_regs *regs, const u16 ds)
{
@@ -226,7 +222,6 @@ do { \
/* I'm not sure if we can use '-' here */
#define ELF_PLATFORM ("x86_64")
extern void set_personality_64bit(void);
-extern unsigned int sysctl_vsyscall32;
extern int force_personality32;
#endif /* !CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index 674ed46d3ced..117903881fe4 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -24,8 +24,8 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs)
/*
* For !SMAP hardware we patch out CLAC on entry.
*/
- if (boot_cpu_has(X86_FEATURE_SMAP) ||
- (IS_ENABLED(CONFIG_64BIT) && boot_cpu_has(X86_FEATURE_XENPV)))
+ if (cpu_feature_enabled(X86_FEATURE_SMAP) ||
+ cpu_feature_enabled(X86_FEATURE_XENPV))
mask |= X86_EFLAGS_AC;
WARN_ON_ONCE(flags & mask);
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
index e1c9df9102a5..611fa41711af 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -13,16 +13,9 @@
#ifdef CONFIG_X86_64
# include <uapi/asm/sigcontext.h>
# include <asm/user32.h>
-struct ksignal;
-int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
- compat_sigset_t *set, struct pt_regs *regs);
-int ia32_setup_frame(int sig, struct ksignal *ksig,
- compat_sigset_t *set, struct pt_regs *regs);
#else
# define user_i387_ia32_struct user_i387_struct
# define user32_fxsr_struct user_fxsr_struct
-# define ia32_setup_frame __setup_frame
-# define ia32_setup_rt_frame __setup_rt_frame
#endif
extern void convert_from_fxsr(struct user_i387_ia32_struct *env,
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 908d99b127d3..5061ac98ffa1 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -34,19 +34,6 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
return addr;
}
-/*
- * When a ftrace registered caller is tracing a function that is
- * also set by a register_ftrace_direct() call, it needs to be
- * differentiated in the ftrace_caller trampoline. To do this, we
- * place the direct caller in the ORIG_AX part of pt_regs. This
- * tells the ftrace_caller that there's a direct caller.
- */
-static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr)
-{
- /* Emulate a call */
- regs->orig_ax = addr;
-}
-
#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
struct ftrace_regs {
struct pt_regs regs;
@@ -61,9 +48,25 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs)
return &fregs->regs;
}
-#define ftrace_instruction_pointer_set(fregs, _ip) \
+#define ftrace_regs_set_instruction_pointer(fregs, _ip) \
do { (fregs)->regs.ip = (_ip); } while (0)
+#define ftrace_regs_get_instruction_pointer(fregs) \
+ ((fregs)->regs.ip)
+
+#define ftrace_regs_get_argument(fregs, n) \
+ regs_get_kernel_argument(&(fregs)->regs, n)
+#define ftrace_regs_get_stack_pointer(fregs) \
+ kernel_stack_pointer(&(fregs)->regs)
+#define ftrace_regs_return_value(fregs) \
+ regs_return_value(&(fregs)->regs)
+#define ftrace_regs_set_return_value(fregs, ret) \
+ regs_set_return_value(&(fregs)->regs, ret)
+#define ftrace_override_function_with_return(fregs) \
+ override_function_with_return(&(fregs)->regs)
+#define ftrace_regs_query_register_offset(name) \
+ regs_query_register_offset(name)
+
struct ftrace_ops;
#define ftrace_graph_func ftrace_graph_func
void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
@@ -72,6 +75,24 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR
#endif
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+/*
+ * When a ftrace registered caller is tracing a function that is
+ * also set by a register_ftrace_direct() call, it needs to be
+ * differentiated in the ftrace_caller trampoline. To do this, we
+ * place the direct caller in the ORIG_AX part of pt_regs. This
+ * tells the ftrace_caller that there's a direct caller.
+ */
+static inline void
+__arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr)
+{
+ /* Emulate a call */
+ regs->orig_ax = addr;
+}
+#define arch_ftrace_set_direct_caller(fregs, addr) \
+ __arch_ftrace_set_direct_caller(&(fregs)->regs, addr)
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
#ifdef CONFIG_DYNAMIC_FTRACE
struct dyn_arch_ftrace {
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 275e7fd20310..66837b8c67f1 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -3,9 +3,9 @@
#define _ASM_X86_HARDIRQ_H
#include <linux/threads.h>
+#include <asm/current.h>
typedef struct {
- u16 __softirq_pending;
#if IS_ENABLED(CONFIG_KVM_INTEL)
u8 kvm_cpu_l1tf_flush_l1d;
#endif
@@ -60,6 +60,7 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu);
extern u64 arch_irq_stat(void);
#define arch_irq_stat arch_irq_stat
+#define local_softirq_pending_ref pcpu_hot.softirq_pending
#if IS_ENABLED(CONFIG_KVM_INTEL)
static inline void kvm_set_cpu_l1tf_flush_l1d(void)
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 617332dd64ac..1e2fe398b66d 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -379,11 +379,20 @@ struct hv_nested_enlightenments_control {
struct hv_vp_assist_page {
__u32 apic_assist;
__u32 reserved1;
- __u64 vtl_control[3];
+ __u32 vtl_entry_reason;
+ __u32 vtl_reserved;
+ __u64 vtl_ret_x64rax;
+ __u64 vtl_ret_x64rcx;
struct hv_nested_enlightenments_control nested_control;
__u8 enlighten_vmentry;
__u8 reserved2[7];
__u64 current_nested_vmcs;
+ __u8 synthetic_time_unhalted_timer_expired;
+ __u8 reserved3[7];
+ __u8 virtualization_fault_information[40];
+ __u8 reserved4[8];
+ __u8 intercept_message[256];
+ __u8 vtl_ret_actions[256];
} __packed;
struct hv_enlightened_vmcs {
diff --git a/arch/x86/include/asm/hyperv_timer.h b/arch/x86/include/asm/hyperv_timer.h
new file mode 100644
index 000000000000..388fa81b8f38
--- /dev/null
+++ b/arch/x86/include/asm/hyperv_timer.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_HYPERV_TIMER_H
+#define _ASM_X86_HYPERV_TIMER_H
+
+#include <asm/msr.h>
+
+#define hv_get_raw_timer() rdtsc_ordered()
+
+#endif
diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
index f07faa61c7f3..54368a43abf6 100644
--- a/arch/x86/include/asm/insn-eval.h
+++ b/arch/x86/include/asm/insn-eval.h
@@ -32,16 +32,16 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs,
bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE], int buf_size);
-enum mmio_type {
- MMIO_DECODE_FAILED,
- MMIO_WRITE,
- MMIO_WRITE_IMM,
- MMIO_READ,
- MMIO_READ_ZERO_EXTEND,
- MMIO_READ_SIGN_EXTEND,
- MMIO_MOVS,
+enum insn_mmio_type {
+ INSN_MMIO_DECODE_FAILED,
+ INSN_MMIO_WRITE,
+ INSN_MMIO_WRITE_IMM,
+ INSN_MMIO_READ,
+ INSN_MMIO_READ_ZERO_EXTEND,
+ INSN_MMIO_READ_SIGN_EXTEND,
+ INSN_MMIO_MOVS,
};
-enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes);
+enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes);
#endif /* _ASM_X86_INSN_EVAL_H */
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 7cc49432187f..7a2ed154a5e1 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -44,10 +44,6 @@ extern int irq_remapping_reenable(int);
extern int irq_remap_enable_fault_handling(void);
extern void panic_if_irq_remap(const char *msg);
-/* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */
-extern struct irq_domain *
-arch_create_remap_msi_irq_domain(struct irq_domain *par, const char *n, int id);
-
/* Get parent irqdomain for interrupt remapping irqdomain */
static inline struct irq_domain *arch_get_ir_parent_domain(void)
{
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
index 147cb8fdda92..798183867d78 100644
--- a/arch/x86/include/asm/irq_stack.h
+++ b/arch/x86/include/asm/irq_stack.h
@@ -116,7 +116,7 @@
ASM_CALL_ARG2
#define call_on_irqstack(func, asm_call, argconstr...) \
- call_on_stack(__this_cpu_read(hardirq_stack_ptr), \
+ call_on_stack(__this_cpu_read(pcpu_hot.hardirq_stack_ptr), \
func, asm_call, argconstr)
/* Macros to assert type correctness for run_*_on_irqstack macros */
@@ -135,7 +135,7 @@
* User mode entry and interrupt on the irq stack do not \
* switch stacks. If from user mode the task stack is empty. \
*/ \
- if (user_mode(regs) || __this_cpu_read(hardirq_stack_inuse)) { \
+ if (user_mode(regs) || __this_cpu_read(pcpu_hot.hardirq_stack_inuse)) { \
irq_enter_rcu(); \
func(c_args); \
irq_exit_rcu(); \
@@ -146,9 +146,9 @@
* places. Invoke the stack switch macro with the call \
* sequence which matches the above direct invocation. \
*/ \
- __this_cpu_write(hardirq_stack_inuse, true); \
+ __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \
call_on_irqstack(func, asm_call, constr); \
- __this_cpu_write(hardirq_stack_inuse, false); \
+ __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \
} \
}
@@ -212,9 +212,9 @@
*/
#define do_softirq_own_stack() \
{ \
- __this_cpu_write(hardirq_stack_inuse, true); \
+ __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \
call_on_irqstack(__do_softirq, ASM_CALL_ARG0); \
- __this_cpu_write(hardirq_stack_inuse, false); \
+ __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \
}
#endif
diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h
index 125c23b7bad3..30c325c235c0 100644
--- a/arch/x86/include/asm/irqdomain.h
+++ b/arch/x86/include/asm/irqdomain.h
@@ -7,9 +7,7 @@
#ifdef CONFIG_X86_LOCAL_APIC
enum {
- /* Allocate contiguous CPU vectors */
- X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1,
- X86_IRQ_ALLOC_LEGACY = 0x2,
+ X86_IRQ_ALLOC_LEGACY = 0x1,
};
extern int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec);
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
index 13e70da38bed..de75306b932e 100644
--- a/arch/x86/include/asm/kasan.h
+++ b/arch/x86/include/asm/kasan.h
@@ -28,9 +28,12 @@
#ifdef CONFIG_KASAN
void __init kasan_early_init(void);
void __init kasan_init(void);
+void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid);
#else
static inline void kasan_early_init(void) { }
static inline void kasan_init(void) { }
+static inline void kasan_populate_shadow_for_vaddr(void *va, size_t size,
+ int nid) { }
#endif
#endif
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 84f43caef9b7..8dc345cc6318 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -14,6 +14,7 @@ BUILD_BUG_ON(1)
* to make a definition optional, but in this case the default will
* be __static_call_return0.
*/
+KVM_X86_OP(check_processor_compatibility)
KVM_X86_OP(hardware_enable)
KVM_X86_OP(hardware_disable)
KVM_X86_OP(hardware_unsetup)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7ca854714ccd..4d2bc08794e4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1123,6 +1123,7 @@ struct msr_bitmap_range {
/* Xen emulation context */
struct kvm_xen {
+ struct mutex xen_lock;
u32 xen_version;
bool long_mode;
bool runstate_update_flag;
@@ -1535,6 +1536,8 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
struct kvm_x86_ops {
const char *name;
+ int (*check_processor_compatibility)(void);
+
int (*hardware_enable)(void);
void (*hardware_disable)(void);
void (*hardware_unsetup)(void);
@@ -1748,9 +1751,6 @@ struct kvm_x86_nested_ops {
};
struct kvm_x86_init_ops {
- int (*cpu_has_kvm_support)(void);
- int (*disabled_by_bios)(void);
- int (*check_processor_compatibility)(void);
int (*hardware_setup)(void);
unsigned int (*handle_intel_pt_intr)(void);
@@ -1777,6 +1777,9 @@ extern struct kvm_x86_ops kvm_x86_ops;
#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops);
+void kvm_x86_vendor_exit(void);
+
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index f484d656d34e..dd9b8118f784 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -12,13 +12,26 @@
#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
#endif /* CONFIG_X86_32 */
-#ifdef __ASSEMBLY__
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16)
-#define __ALIGN .p2align 4, 0x90
+#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x90;
#define __ALIGN_STR __stringify(__ALIGN)
+
+#if defined(CONFIG_CALL_PADDING) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
+#define FUNCTION_PADDING .skip CONFIG_FUNCTION_ALIGNMENT, 0x90;
+#else
+#define FUNCTION_PADDING
+#endif
+
+#if (CONFIG_FUNCTION_ALIGNMENT > 8) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
+# define __FUNC_ALIGN __ALIGN; FUNCTION_PADDING
+#else
+# define __FUNC_ALIGN __ALIGN
#endif
+#define ASM_FUNC_ALIGN __stringify(__FUNC_ALIGN)
+#define SYM_F_ALIGN __FUNC_ALIGN
+
+#ifdef __ASSEMBLY__
+
#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
#define RET jmp __x86_return_thunk
#else /* CONFIG_RETPOLINE */
@@ -43,11 +56,45 @@
#endif /* __ASSEMBLY__ */
+/*
+ * Depending on -fpatchable-function-entry=N,N usage (CONFIG_CALL_PADDING) the
+ * CFI symbol layout changes.
+ *
+ * Without CALL_THUNKS:
+ *
+ * .align FUNCTION_ALIGNMENT
+ * __cfi_##name:
+ * .skip FUNCTION_PADDING, 0x90
+ * .byte 0xb8
+ * .long __kcfi_typeid_##name
+ * name:
+ *
+ * With CALL_THUNKS:
+ *
+ * .align FUNCTION_ALIGNMENT
+ * __cfi_##name:
+ * .byte 0xb8
+ * .long __kcfi_typeid_##name
+ * .skip FUNCTION_PADDING, 0x90
+ * name:
+ *
+ * In both cases the whole thing is FUNCTION_ALIGNMENT aligned and sized.
+ */
+
+#ifdef CONFIG_CALL_PADDING
+#define CFI_PRE_PADDING
+#define CFI_POST_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90;
+#else
+#define CFI_PRE_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90;
+#define CFI_POST_PADDING
+#endif
+
#define __CFI_TYPE(name) \
SYM_START(__cfi_##name, SYM_L_LOCAL, SYM_A_NONE) \
- .fill 11, 1, 0x90 ASM_NL \
+ CFI_PRE_PADDING \
.byte 0xb8 ASM_NL \
.long __kcfi_typeid_##name ASM_NL \
+ CFI_POST_PADDING \
SYM_FUNC_END(__cfi_##name)
/* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */
@@ -57,7 +104,7 @@
/* SYM_FUNC_START -- use for global functions */
#define SYM_FUNC_START(name) \
- SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
+ SYM_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \
ENDBR
/* SYM_FUNC_START_NOALIGN -- use for global functions, w/o alignment */
@@ -67,7 +114,7 @@
/* SYM_FUNC_START_LOCAL -- use for local functions */
#define SYM_FUNC_START_LOCAL(name) \
- SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) \
+ SYM_START(name, SYM_L_LOCAL, SYM_F_ALIGN) \
ENDBR
/* SYM_FUNC_START_LOCAL_NOALIGN -- use for local functions, w/o alignment */
@@ -77,7 +124,7 @@
/* SYM_FUNC_START_WEAK -- use for weak functions */
#define SYM_FUNC_START_WEAK(name) \
- SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN) \
+ SYM_START(name, SYM_L_WEAK, SYM_F_ALIGN) \
ENDBR
/* SYM_FUNC_START_WEAK_NOALIGN -- use for weak functions, w/o alignment */
diff --git a/arch/x86/include/asm/memtype.h b/arch/x86/include/asm/memtype.h
index 9ca760e430b9..113b2fa51849 100644
--- a/arch/x86/include/asm/memtype.h
+++ b/arch/x86/include/asm/memtype.h
@@ -6,9 +6,8 @@
#include <asm/pgtable_types.h>
extern bool pat_enabled(void);
-extern void pat_disable(const char *reason);
-extern void pat_init(void);
-extern void init_cache_modes(void);
+extern void pat_bp_init(void);
+extern void pat_cpu_init(void);
extern int memtype_reserve(u64 start, u64 end,
enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm);
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 74ecc2bd6cd0..d5a58bde091c 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -33,8 +33,7 @@ enum ucode_state {
};
struct microcode_ops {
- enum ucode_state (*request_microcode_fw) (int cpu, struct device *,
- bool refresh_fw);
+ enum ucode_state (*request_microcode_fw) (int cpu, struct device *);
void (*microcode_fini_cpu) (int cpu);
@@ -50,7 +49,6 @@ struct microcode_ops {
struct ucode_cpu_info {
struct cpu_signature cpu_sig;
- int valid;
void *mc;
};
extern struct ucode_cpu_info ucode_cpu_info[];
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index 4c92cea7e4b5..f1fa979e05bf 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -14,7 +14,8 @@ struct microcode_header_intel {
unsigned int pf;
unsigned int datasize;
unsigned int totalsize;
- unsigned int reserved[3];
+ unsigned int metasize;
+ unsigned int reserved[2];
};
struct microcode_intel {
@@ -41,6 +42,8 @@ struct extended_sigtable {
#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
+#define MC_HEADER_TYPE_MICROCODE 1
+#define MC_HEADER_TYPE_IFS 2
#define get_totalsize(mc) \
(((struct microcode_intel *)mc)->hdr.datasize ? \
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 61f0c206bff0..6d502f3efb0f 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -19,8 +19,6 @@ typedef int (*hyperv_fill_flush_list_func)(
struct hv_guest_mapping_flush_list *flush,
void *data);
-#define hv_get_raw_timer() rdtsc_ordered()
-
void hyperv_vector_handler(struct pt_regs *regs);
#if IS_ENABLED(CONFIG_HYPERV)
diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h
index d71c7e8b738d..935c6d470341 100644
--- a/arch/x86/include/asm/msi.h
+++ b/arch/x86/include/asm/msi.h
@@ -62,4 +62,10 @@ typedef struct x86_msi_addr_hi {
struct msi_msg;
u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid);
+#define X86_VECTOR_MSI_FLAGS_SUPPORTED \
+ (MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX | MSI_FLAG_PCI_MSIX_ALLOC_DYN)
+
+#define X86_VECTOR_MSI_FLAGS_REQUIRED \
+ (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS)
+
#endif /* _ASM_X86_MSI_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 10ac52705892..37ff47552bcb 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -4,12 +4,7 @@
#include <linux/bits.h>
-/*
- * CPU model specific register (MSR) numbers.
- *
- * Do not add new entries to this file unless the definitions are shared
- * between multiple compilation units.
- */
+/* CPU model specific register (MSR) numbers. */
/* x86-64 specific MSRs */
#define MSR_EFER 0xc0000080 /* extended feature register */
@@ -535,6 +530,11 @@
#define MSR_AMD64_CPUID_FN_1 0xc0011004
#define MSR_AMD64_LS_CFG 0xc0011020
#define MSR_AMD64_DC_CFG 0xc0011022
+
+#define MSR_AMD64_DE_CFG 0xc0011029
+#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1
+#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE BIT_ULL(MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT)
+
#define MSR_AMD64_BU_CFG2 0xc001102a
#define MSR_AMD64_IBSFETCHCTL 0xc0011030
#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
@@ -640,9 +640,6 @@
#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL
#define FAM10H_MMIO_CONF_BASE_SHIFT 20
#define MSR_FAM10H_NODE_ID 0xc001100c
-#define MSR_F10H_DECFG 0xc0011029
-#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1
-#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT)
/* K8 MSRs */
#define MSR_K8_TOP_MEM1 0xc001001a
@@ -796,6 +793,7 @@
#define ENERGY_PERF_BIAS_PERFORMANCE 0
#define ENERGY_PERF_BIAS_BALANCE_PERFORMANCE 4
#define ENERGY_PERF_BIAS_NORMAL 6
+#define ENERGY_PERF_BIAS_NORMAL_POWERSAVE 7
#define ENERGY_PERF_BIAS_BALANCE_POWERSAVE 8
#define ENERGY_PERF_BIAS_POWERSAVE 15
@@ -1050,6 +1048,20 @@
#define VMX_BASIC_MEM_TYPE_WB 6LLU
#define VMX_BASIC_INOUT 0x0040000000000000LLU
+/* Resctrl MSRs: */
+/* - Intel: */
+#define MSR_IA32_L3_QOS_CFG 0xc81
+#define MSR_IA32_L2_QOS_CFG 0xc82
+#define MSR_IA32_QM_EVTSEL 0xc8d
+#define MSR_IA32_QM_CTR 0xc8e
+#define MSR_IA32_PQR_ASSOC 0xc8f
+#define MSR_IA32_L3_CBM_BASE 0xc90
+#define MSR_IA32_L2_CBM_BASE 0xd10
+#define MSR_IA32_MBA_THRTL_BASE 0xd50
+
+/* - AMD: */
+#define MSR_IA32_MBA_BW_BASE 0xc0000200
+
/* MSR_IA32_VMX_MISC bits */
#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14)
#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index 76d726074c16..f0eeaf6e5f5f 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -25,13 +25,12 @@
#include <uapi/asm/mtrr.h>
-void mtrr_bp_init(void);
-
/*
* The following functions are for use by other drivers that cannot use
* arch_phys_wc_add and arch_phys_wc_del.
*/
# ifdef CONFIG_MTRR
+void mtrr_bp_init(void);
extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform);
extern void mtrr_save_fixed_ranges(void *);
extern void mtrr_save_state(void);
@@ -42,12 +41,12 @@ extern int mtrr_add_page(unsigned long base, unsigned long size,
extern int mtrr_del(int reg, unsigned long base, unsigned long size);
extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
-extern void mtrr_ap_init(void);
-extern void set_mtrr_aps_delayed_init(void);
-extern void mtrr_aps_init(void);
extern void mtrr_bp_restore(void);
extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
extern int amd_special_default_mtrr(void);
+void mtrr_disable(void);
+void mtrr_enable(void);
+void mtrr_generic_set_state(void);
# else
static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform)
{
@@ -83,10 +82,11 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
{
}
-#define mtrr_ap_init() do {} while (0)
-#define set_mtrr_aps_delayed_init() do {} while (0)
-#define mtrr_aps_init() do {} while (0)
+#define mtrr_bp_init() do {} while (0)
#define mtrr_bp_restore() do {} while (0)
+#define mtrr_disable() do {} while (0)
+#define mtrr_enable() do {} while (0)
+#define mtrr_generic_set_state() do {} while (0)
# endif
#ifdef CONFIG_COMPAT
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index c936ce9f0c47..771b0a2b7a34 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -12,8 +12,104 @@
#include <asm/msr-index.h>
#include <asm/unwind_hints.h>
#include <asm/percpu.h>
+#include <asm/current.h>
-#define RETPOLINE_THUNK_SIZE 32
+/*
+ * Call depth tracking for Intel SKL CPUs to address the RSB underflow
+ * issue in software.
+ *
+ * The tracking does not use a counter. It uses arithmetic shift
+ * right on call entry and logical shift left on return.
+ *
+ * The depth tracking variable is initialized to 0x8000.... when the call
+ * depth is zero. The arithmetic shift right sign extends the MSB and
+ * saturates after the 12th call. The shift count is 5 for both directions
+ * so the tracking covers 12 nested calls.
+ *
+ * Call
+ * 0: 0x8000000000000000 0x0000000000000000
+ * 1: 0xfc00000000000000 0xf000000000000000
+ * ...
+ * 11: 0xfffffffffffffff8 0xfffffffffffffc00
+ * 12: 0xffffffffffffffff 0xffffffffffffffe0
+ *
+ * After a return buffer fill the depth is credited 12 calls before the
+ * next stuffing has to take place.
+ *
+ * There is an inaccuracy for situations like this:
+ *
+ * 10 calls
+ * 5 returns
+ * 3 calls
+ * 4 returns
+ * 3 calls
+ * ....
+ *
+ * The shift count might cause this to be off by one in either direction,
+ * but there is still a cushion vs. the RSB depth. The algorithm does not
+ * claim to be perfect and it can be speculated around by the CPU, but it
+ * is considered that it obfuscates the problem enough to make exploitation
+ * extremely difficult.
+ */
+#define RET_DEPTH_SHIFT 5
+#define RSB_RET_STUFF_LOOPS 16
+#define RET_DEPTH_INIT 0x8000000000000000ULL
+#define RET_DEPTH_INIT_FROM_CALL 0xfc00000000000000ULL
+#define RET_DEPTH_CREDIT 0xffffffffffffffffULL
+
+#ifdef CONFIG_CALL_THUNKS_DEBUG
+# define CALL_THUNKS_DEBUG_INC_CALLS \
+ incq %gs:__x86_call_count;
+# define CALL_THUNKS_DEBUG_INC_RETS \
+ incq %gs:__x86_ret_count;
+# define CALL_THUNKS_DEBUG_INC_STUFFS \
+ incq %gs:__x86_stuffs_count;
+# define CALL_THUNKS_DEBUG_INC_CTXSW \
+ incq %gs:__x86_ctxsw_count;
+#else
+# define CALL_THUNKS_DEBUG_INC_CALLS
+# define CALL_THUNKS_DEBUG_INC_RETS
+# define CALL_THUNKS_DEBUG_INC_STUFFS
+# define CALL_THUNKS_DEBUG_INC_CTXSW
+#endif
+
+#if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS)
+
+#include <asm/asm-offsets.h>
+
+#define CREDIT_CALL_DEPTH \
+ movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);
+
+#define ASM_CREDIT_CALL_DEPTH \
+ movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);
+
+#define RESET_CALL_DEPTH \
+ mov $0x80, %rax; \
+ shl $56, %rax; \
+ movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth);
+
+#define RESET_CALL_DEPTH_FROM_CALL \
+ mov $0xfc, %rax; \
+ shl $56, %rax; \
+ movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); \
+ CALL_THUNKS_DEBUG_INC_CALLS
+
+#define INCREMENT_CALL_DEPTH \
+ sarq $5, %gs:pcpu_hot + X86_call_depth; \
+ CALL_THUNKS_DEBUG_INC_CALLS
+
+#define ASM_INCREMENT_CALL_DEPTH \
+ sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \
+ CALL_THUNKS_DEBUG_INC_CALLS
+
+#else
+#define CREDIT_CALL_DEPTH
+#define ASM_CREDIT_CALL_DEPTH
+#define RESET_CALL_DEPTH
+#define INCREMENT_CALL_DEPTH
+#define ASM_INCREMENT_CALL_DEPTH
+#define RESET_CALL_DEPTH_FROM_CALL
+#endif
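A minimal user-space sketch of the saturating shift arithmetic described in the comment above (illustrative only, not kernel code; it assumes GCC/Clang's arithmetic right shift on signed 64-bit values):

#include <stdio.h>
#include <stdint.h>

#define RET_DEPTH_SHIFT	5

int main(void)
{
	int64_t depth = INT64_MIN;	/* 0x8000000000000000 == depth 0 */

	for (int call = 1; call <= 14; call++) {
		depth >>= RET_DEPTH_SHIFT;	/* sarq $5 on call entry */
		printf("call %2d: 0x%016llx\n", call, (unsigned long long)depth);
	}
	/* The sign bit keeps replicating, so after roughly a dozen nested
	 * calls the value saturates at 0xffffffffffffffff (RET_DEPTH_CREDIT). */

	depth = (int64_t)((uint64_t)depth << RET_DEPTH_SHIFT);	/* shlq $5 on return */
	printf("after one return: 0x%016llx\n", (unsigned long long)depth);

	return 0;
}

The demo only models the two shift operations; deciding when to actually stuff the return stack buffer is done by the return thunk, which is not part of this header.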
/*
* Fill the CPU return stack buffer.
@@ -32,6 +128,7 @@
* from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
*/
+#define RETPOLINE_THUNK_SIZE 32
#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
/*
@@ -60,7 +157,9 @@
dec reg; \
jnz 771b; \
/* barrier for jnz misprediction */ \
- lfence;
+ lfence; \
+ ASM_CREDIT_CALL_DEPTH \
+ CALL_THUNKS_DEBUG_INC_CTXSW
#else
/*
* i386 doesn't unconditionally have LFENCE, as such it can't
@@ -185,11 +284,32 @@
* where we have a stack but before any RET instruction.
*/
.macro UNTRAIN_RET
-#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY)
+#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
+ defined(CONFIG_CALL_DEPTH_TRACKING)
ANNOTATE_UNRET_END
- ALTERNATIVE_2 "", \
- CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \
- "call entry_ibpb", X86_FEATURE_ENTRY_IBPB
+ ALTERNATIVE_3 "", \
+ CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \
+ "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \
+ __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
+#endif
+.endm
+
+.macro UNTRAIN_RET_FROM_CALL
+#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
+ defined(CONFIG_CALL_DEPTH_TRACKING)
+ ANNOTATE_UNRET_END
+ ALTERNATIVE_3 "", \
+ CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \
+ "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \
+ __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH
+#endif
+.endm
+
+
+.macro CALL_DEPTH_ACCOUNT
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+ ALTERNATIVE "", \
+ __stringify(ASM_INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
#endif
.endm
@@ -203,11 +323,45 @@
typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
extern retpoline_thunk_t __x86_indirect_thunk_array[];
+extern retpoline_thunk_t __x86_indirect_call_thunk_array[];
+extern retpoline_thunk_t __x86_indirect_jump_thunk_array[];
extern void __x86_return_thunk(void);
extern void zen_untrain_ret(void);
extern void entry_ibpb(void);
+#ifdef CONFIG_CALL_THUNKS
+extern void (*x86_return_thunk)(void);
+#else
+#define x86_return_thunk (&__x86_return_thunk)
+#endif
+
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+extern void __x86_return_skl(void);
+
+static inline void x86_set_skl_return_thunk(void)
+{
+ x86_return_thunk = &__x86_return_skl;
+}
+
+#define CALL_DEPTH_ACCOUNT \
+ ALTERNATIVE("", \
+ __stringify(INCREMENT_CALL_DEPTH), \
+ X86_FEATURE_CALL_DEPTH)
+
+#ifdef CONFIG_CALL_THUNKS_DEBUG
+DECLARE_PER_CPU(u64, __x86_call_count);
+DECLARE_PER_CPU(u64, __x86_ret_count);
+DECLARE_PER_CPU(u64, __x86_stuffs_count);
+DECLARE_PER_CPU(u64, __x86_ctxsw_count);
+#endif
+#else
+static inline void x86_set_skl_return_thunk(void) {}
+
+#define CALL_DEPTH_ACCOUNT ""
+
+#endif
+
#ifdef CONFIG_RETPOLINE
#define GEN(reg) \
@@ -215,6 +369,16 @@ extern void entry_ibpb(void);
#include <asm/GEN-for-each-reg.h>
#undef GEN
+#define GEN(reg) \
+ extern retpoline_thunk_t __x86_indirect_call_thunk_ ## reg;
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+#define GEN(reg) \
+ extern retpoline_thunk_t __x86_indirect_jump_thunk_ ## reg;
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
#ifdef CONFIG_X86_64
/*
@@ -321,7 +485,7 @@ static inline void indirect_branch_prediction_barrier(void)
/* The Intel SPEC CTRL MSR base value cache */
extern u64 x86_spec_ctrl_base;
DECLARE_PER_CPU(u64, x86_spec_ctrl_current);
-extern void write_spec_ctrl_current(u64 val, bool force);
+extern void update_spec_ctrl_cond(u64 val);
extern u64 spec_ctrl_current(void);
/*
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index a506a411474d..86bd4311daf8 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -11,20 +11,14 @@
#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
-#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
-#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
-
-#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
-#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
-
#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
-/* Cast *PAGE_MASK to a signed type so that it is sign-extended if
+/* Cast P*D_MASK to a signed type so that it is sign-extended if
virtual addresses are 32-bits but physical addresses are larger
(ie, 32-bit PAE). */
#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
-#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_PAGE_MASK) & __PHYSICAL_MASK)
-#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_PAGE_MASK) & __PHYSICAL_MASK)
+#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_MASK) & __PHYSICAL_MASK)
+#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_MASK) & __PHYSICAL_MASK)
#define HPAGE_SHIFT PMD_SHIFT
#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 2a0b8dd4ec33..73e9522db7c1 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -4,13 +4,13 @@
/* Various instructions on x86 need to be replaced for
* para-virtualization: those hooks are defined here. */
+#include <asm/paravirt_types.h>
+
#ifdef CONFIG_PARAVIRT
#include <asm/pgtable_types.h>
#include <asm/asm.h>
#include <asm/nospec-branch.h>
-#include <asm/paravirt_types.h>
-
#ifndef __ASSEMBLY__
#include <linux/bug.h>
#include <linux/types.h>
@@ -665,6 +665,7 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
asm(".pushsection " section ", \"ax\";" \
".globl " PV_THUNK_NAME(func) ";" \
".type " PV_THUNK_NAME(func) ", @function;" \
+ ASM_FUNC_ALIGN \
PV_THUNK_NAME(func) ":" \
ASM_ENDBR \
FRAME_BEGIN \
@@ -730,6 +731,18 @@ static __always_inline unsigned long arch_local_irq_save(void)
#undef PVOP_VCALL4
#undef PVOP_CALL4
+#define DEFINE_PARAVIRT_ASM(func, instr, sec) \
+ asm (".pushsection " #sec ", \"ax\"\n" \
+ ".global " #func "\n\t" \
+ ".type " #func ", @function\n\t" \
+ ASM_FUNC_ALIGN "\n" \
+ #func ":\n\t" \
+ ASM_ENDBR \
+ instr "\n\t" \
+ ASM_RET \
+ ".size " #func ", . - " #func "\n\t" \
+ ".popsection")
+
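For orientation, the first user of the new macro appears further down in this same diff: the 64-bit paravirt qspinlock unlock stub is now emitted as

DEFINE_PARAVIRT_ASM(__raw_callee_save___pv_queued_spin_unlock,
		    PV_UNLOCK_ASM, .spinlock.text);

which replaces the open-coded ".pushsection/.globl/.type" boilerplate with this common helper.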
extern void default_banner(void);
#else /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index f3d601574730..8c1da419260f 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -2,36 +2,23 @@
#ifndef _ASM_X86_PARAVIRT_TYPES_H
#define _ASM_X86_PARAVIRT_TYPES_H
-/* Bitmask of what can be clobbered: usually at least eax. */
-#define CLBR_EAX (1 << 0)
-#define CLBR_ECX (1 << 1)
-#define CLBR_EDX (1 << 2)
-#define CLBR_EDI (1 << 3)
-
-#ifdef CONFIG_X86_32
-/* CLBR_ANY should match all regs platform has. For i386, that's just it */
-#define CLBR_ANY ((1 << 4) - 1)
-
-#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX)
-#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX)
-#else
-#define CLBR_RAX CLBR_EAX
-#define CLBR_RCX CLBR_ECX
-#define CLBR_RDX CLBR_EDX
-#define CLBR_RDI CLBR_EDI
-#define CLBR_RSI (1 << 4)
-#define CLBR_R8 (1 << 5)
-#define CLBR_R9 (1 << 6)
-#define CLBR_R10 (1 << 7)
-#define CLBR_R11 (1 << 8)
-
-#define CLBR_ANY ((1 << 9) - 1)
+#ifndef __ASSEMBLY__
+/* These all sit in the .parainstructions section to tell us what to patch. */
+struct paravirt_patch_site {
+ u8 *instr; /* original instructions */
+ u8 type; /* type of this instruction */
+ u8 len; /* length of original instruction */
+};
-#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \
- CLBR_RCX | CLBR_R8 | CLBR_R9)
-#define CLBR_RET_REG (CLBR_RAX)
+/* Lazy mode for batching updates / context switch */
+enum paravirt_lazy_mode {
+ PARAVIRT_LAZY_NONE,
+ PARAVIRT_LAZY_MMU,
+ PARAVIRT_LAZY_CPU,
+};
+#endif
-#endif /* X86_64 */
+#ifdef CONFIG_PARAVIRT
#ifndef __ASSEMBLY__
@@ -279,27 +266,23 @@ extern struct paravirt_patch_template pv_ops;
#define paravirt_type(op) \
[paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
[paravirt_opptr] "m" (pv_ops.op)
-#define paravirt_clobber(clobber) \
- [paravirt_clobber] "i" (clobber)
-
/*
* Generate some code, and mark it as patchable by the
* apply_paravirt() alternate instruction patcher.
*/
-#define _paravirt_alt(insn_string, type, clobber) \
+#define _paravirt_alt(insn_string, type) \
"771:\n\t" insn_string "\n" "772:\n" \
".pushsection .parainstructions,\"a\"\n" \
_ASM_ALIGN "\n" \
_ASM_PTR " 771b\n" \
" .byte " type "\n" \
" .byte 772b-771b\n" \
- " .short " clobber "\n" \
_ASM_ALIGN "\n" \
".popsection\n"
/* Generate patchable code, with the default asm parameters. */
#define paravirt_alt(insn_string) \
- _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
+ _paravirt_alt(insn_string, "%c[paravirt_typenum]")
/* Simple instruction patching code. */
#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
@@ -451,20 +434,19 @@ int paravirt_disable_iospace(void);
})
-#define ____PVOP_CALL(ret, op, clbr, call_clbr, extra_clbr, ...) \
+#define ____PVOP_CALL(ret, op, call_clbr, extra_clbr, ...) \
({ \
PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
asm volatile(paravirt_alt(PARAVIRT_CALL) \
: call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
- paravirt_clobber(clbr), \
##__VA_ARGS__ \
: "memory", "cc" extra_clbr); \
ret; \
})
-#define ____PVOP_ALT_CALL(ret, op, alt, cond, clbr, call_clbr, \
+#define ____PVOP_ALT_CALL(ret, op, alt, cond, call_clbr, \
extra_clbr, ...) \
({ \
PVOP_CALL_ARGS; \
@@ -473,45 +455,44 @@ int paravirt_disable_iospace(void);
alt, cond) \
: call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
- paravirt_clobber(clbr), \
##__VA_ARGS__ \
: "memory", "cc" extra_clbr); \
ret; \
})
#define __PVOP_CALL(rettype, op, ...) \
- ____PVOP_CALL(PVOP_RETVAL(rettype), op, CLBR_ANY, \
+ ____PVOP_CALL(PVOP_RETVAL(rettype), op, \
PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, ##__VA_ARGS__)
#define __PVOP_ALT_CALL(rettype, op, alt, cond, ...) \
- ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, CLBR_ANY,\
+ ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, \
PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, \
##__VA_ARGS__)
#define __PVOP_CALLEESAVE(rettype, op, ...) \
- ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, CLBR_RET_REG, \
+ ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, \
PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__)
#define __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, ...) \
____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op.func, alt, cond, \
- CLBR_RET_REG, PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__)
+ PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__)
#define __PVOP_VCALL(op, ...) \
- (void)____PVOP_CALL(, op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \
+ (void)____PVOP_CALL(, op, PVOP_VCALL_CLOBBERS, \
VEXTRA_CLOBBERS, ##__VA_ARGS__)
#define __PVOP_ALT_VCALL(op, alt, cond, ...) \
- (void)____PVOP_ALT_CALL(, op, alt, cond, CLBR_ANY, \
+ (void)____PVOP_ALT_CALL(, op, alt, cond, \
PVOP_VCALL_CLOBBERS, VEXTRA_CLOBBERS, \
##__VA_ARGS__)
#define __PVOP_VCALLEESAVE(op, ...) \
- (void)____PVOP_CALL(, op.func, CLBR_RET_REG, \
+ (void)____PVOP_CALL(, op.func, \
PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__)
#define __PVOP_ALT_VCALLEESAVE(op, alt, cond, ...) \
- (void)____PVOP_ALT_CALL(, op.func, alt, cond, CLBR_RET_REG, \
+ (void)____PVOP_ALT_CALL(, op.func, alt, cond, \
PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__)
@@ -571,13 +552,6 @@ int paravirt_disable_iospace(void);
__PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
-/* Lazy mode for batching updates / context switch */
-enum paravirt_lazy_mode {
- PARAVIRT_LAZY_NONE,
- PARAVIRT_LAZY_MMU,
- PARAVIRT_LAZY_CPU,
-};
-
enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
void paravirt_start_context_switch(struct task_struct *prev);
void paravirt_end_context_switch(struct task_struct *next);
@@ -593,16 +567,9 @@ unsigned long paravirt_ret0(void);
#define paravirt_nop ((void *)_paravirt_nop)
-/* These all sit in the .parainstructions section to tell us what to patch. */
-struct paravirt_patch_site {
- u8 *instr; /* original instructions */
- u8 type; /* type of this instruction */
- u8 len; /* length of original instruction */
-};
-
extern struct paravirt_patch_site __parainstructions[],
__parainstructions_end[];
#endif /* __ASSEMBLY__ */
-
+#endif /* CONFIG_PARAVIRT */
#endif /* _ASM_X86_PARAVIRT_TYPES_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 736793d65bcb..b40c462b4af3 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -21,7 +21,7 @@ struct pci_sysdata {
#ifdef CONFIG_X86_64
void *iommu; /* IOMMU private data */
#endif
-#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
+#ifdef CONFIG_PCI_MSI
void *fwnode; /* IRQ domain for MSI assignment */
#endif
#if IS_ENABLED(CONFIG_VMD)
@@ -52,7 +52,7 @@ static inline int pci_proc_domain(struct pci_bus *bus)
}
#endif
-#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
+#ifdef CONFIG_PCI_MSI
static inline void *_pci_root_bus_fwnode(struct pci_bus *bus)
{
return to_pci_sysdata(bus)->fwnode;
@@ -92,6 +92,7 @@ void pcibios_scan_root(int bus);
struct irq_routing_table *pcibios_get_irq_routing_table(void);
int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
+bool pci_dev_has_default_msi_parent_domain(struct pci_dev *dev);
#define HAVE_PCI_MMAP
#define arch_can_pci_mmap_wc() pat_enabled()
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 28421a887209..967b135fa2c0 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -2,8 +2,6 @@
#ifndef _ASM_X86_PGTABLE_3LEVEL_H
#define _ASM_X86_PGTABLE_3LEVEL_H
-#include <asm/atomic64_32.h>
-
/*
* Intel Physical Address Extension (PAE) Mode - three-level page
* tables on PPro+ CPUs.
@@ -21,7 +19,15 @@
pr_err("%s:%d: bad pgd %p(%016Lx)\n", \
__FILE__, __LINE__, &(e), pgd_val(e))
-/* Rules for using set_pte: the pte being assigned *must* be
+#define pxx_xchg64(_pxx, _ptr, _val) ({ \
+ _pxx##val_t *_p = (_pxx##val_t *)_ptr; \
+ _pxx##val_t _o = *_p; \
+ do { } while (!try_cmpxchg64(_p, &_o, (_val))); \
+ native_make_##_pxx(_o); \
+})
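For illustration, the compare-exchange retry loop that this helper open-codes looks like the following stand-alone C sketch (the function name and the use of the GCC/Clang __atomic builtin are assumptions for the example; the kernel's try_cmpxchg64() likewise updates the expected value on failure so the loop simply retries, and the macro additionally re-wraps the result in the proper pte/pmd/pud type):

#include <stdint.h>

static inline uint64_t xchg64_sketch(uint64_t *p, uint64_t val)
{
	uint64_t old = *p;

	/* On failure the builtin refreshes 'old' with the currently
	 * stored value, so the loop retries until the full 64-bit
	 * entry is swapped atomically (cmpxchg8b on 32-bit PAE). */
	while (!__atomic_compare_exchange_n(p, &old, val, 0,
					    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
		;

	return old;	/* previous entry, e.g. for *_get_and_clear() */
}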
+
+/*
+ * Rules for using set_pte: the pte being assigned *must* be
* either not present or in a state where the hardware will
* not attempt to update the pte. In places where this is
* not possible, use pte_get_and_clear to obtain the old pte
@@ -29,75 +35,19 @@
*/
static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
- ptep->pte_high = pte.pte_high;
+ WRITE_ONCE(ptep->pte_high, pte.pte_high);
smp_wmb();
- ptep->pte_low = pte.pte_low;
-}
-
-#define pmd_read_atomic pmd_read_atomic
-/*
- * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with
- * a "*pmdp" dereference done by GCC. Problem is, in certain places
- * where pte_offset_map_lock() is called, concurrent page faults are
- * allowed, if the mmap_lock is hold for reading. An example is mincore
- * vs page faults vs MADV_DONTNEED. On the page fault side
- * pmd_populate() rightfully does a set_64bit(), but if we're reading the
- * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
- * because GCC will not read the 64-bit value of the pmd atomically.
- *
- * To fix this all places running pte_offset_map_lock() while holding the
- * mmap_lock in read mode, shall read the pmdp pointer using this
- * function to know if the pmd is null or not, and in turn to know if
- * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd
- * operations.
- *
- * Without THP if the mmap_lock is held for reading, the pmd can only
- * transition from null to not null while pmd_read_atomic() runs. So
- * we can always return atomic pmd values with this function.
- *
- * With THP if the mmap_lock is held for reading, the pmd can become
- * trans_huge or none or point to a pte (and in turn become "stable")
- * at any time under pmd_read_atomic(). We could read it truly
- * atomically here with an atomic64_read() for the THP enabled case (and
- * it would be a whole lot simpler), but to avoid using cmpxchg8b we
- * only return an atomic pmdval if the low part of the pmdval is later
- * found to be stable (i.e. pointing to a pte). We are also returning a
- * 'none' (zero) pmdval if the low part of the pmd is zero.
- *
- * In some cases the high and low part of the pmdval returned may not be
- * consistent if THP is enabled (the low part may point to previously
- * mapped hugepage, while the high part may point to a more recently
- * mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only
- * needs the low part of the pmd to be read atomically to decide if the
- * pmd is unstable or not, with the only exception when the low part
- * of the pmd is zero, in which case we return a 'none' pmd.
- */
-static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
-{
- pmdval_t ret;
- u32 *tmp = (u32 *)pmdp;
-
- ret = (pmdval_t) (*tmp);
- if (ret) {
- /*
- * If the low part is null, we must not read the high part
- * or we can end up with a partial pmd.
- */
- smp_rmb();
- ret |= ((pmdval_t)*(tmp + 1)) << 32;
- }
-
- return (pmd_t) { ret };
+ WRITE_ONCE(ptep->pte_low, pte.pte_low);
}
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
- set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
+ pxx_xchg64(pte, ptep, native_pte_val(pte));
}
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
- set_64bit((unsigned long long *)(pmdp), native_pmd_val(pmd));
+ pxx_xchg64(pmd, pmdp, native_pmd_val(pmd));
}
static inline void native_set_pud(pud_t *pudp, pud_t pud)
@@ -105,7 +55,7 @@ static inline void native_set_pud(pud_t *pudp, pud_t pud)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd);
#endif
- set_64bit((unsigned long long *)(pudp), native_pud_val(pud));
+ pxx_xchg64(pud, pudp, native_pud_val(pud));
}
/*
@@ -116,17 +66,16 @@ static inline void native_set_pud(pud_t *pudp, pud_t pud)
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
- ptep->pte_low = 0;
+ WRITE_ONCE(ptep->pte_low, 0);
smp_wmb();
- ptep->pte_high = 0;
+ WRITE_ONCE(ptep->pte_high, 0);
}
-static inline void native_pmd_clear(pmd_t *pmd)
+static inline void native_pmd_clear(pmd_t *pmdp)
{
- u32 *tmp = (u32 *)pmd;
- *tmp = 0;
+ WRITE_ONCE(pmdp->pmd_low, 0);
smp_wmb();
- *(tmp + 1) = 0;
+ WRITE_ONCE(pmdp->pmd_high, 0);
}
static inline void native_pud_clear(pud_t *pudp)
@@ -149,41 +98,26 @@ static inline void pud_clear(pud_t *pudp)
*/
}
+
#ifdef CONFIG_SMP
static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
{
- pte_t res;
-
- res.pte = (pteval_t)arch_atomic64_xchg((atomic64_t *)ptep, 0);
-
- return res;
+ return pxx_xchg64(pte, ptep, 0ULL);
}
-#else
-#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
-#endif
-union split_pmd {
- struct {
- u32 pmd_low;
- u32 pmd_high;
- };
- pmd_t pmd;
-};
-
-#ifdef CONFIG_SMP
static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
{
- union split_pmd res, *orig = (union split_pmd *)pmdp;
-
- /* xchg acts as a barrier before setting of the high bits */
- res.pmd_low = xchg(&orig->pmd_low, 0);
- res.pmd_high = orig->pmd_high;
- orig->pmd_high = 0;
+ return pxx_xchg64(pmd, pmdp, 0ULL);
+}
- return res.pmd;
+static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
+{
+ return pxx_xchg64(pud, pudp, 0ULL);
}
#else
+#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
+#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
#endif
#ifndef pmdp_establish
@@ -199,53 +133,16 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
* anybody.
*/
if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
- union split_pmd old, new, *ptr;
-
- ptr = (union split_pmd *)pmdp;
-
- new.pmd = pmd;
-
/* xchg acts as a barrier before setting of the high bits */
- old.pmd_low = xchg(&ptr->pmd_low, new.pmd_low);
- old.pmd_high = ptr->pmd_high;
- ptr->pmd_high = new.pmd_high;
- return old.pmd;
- }
-
- do {
- old = *pmdp;
- } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd);
-
- return old;
-}
-#endif
-
-#ifdef CONFIG_SMP
-union split_pud {
- struct {
- u32 pud_low;
- u32 pud_high;
- };
- pud_t pud;
-};
-
-static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
-{
- union split_pud res, *orig = (union split_pud *)pudp;
+ old.pmd_low = xchg(&pmdp->pmd_low, pmd.pmd_low);
+ old.pmd_high = READ_ONCE(pmdp->pmd_high);
+ WRITE_ONCE(pmdp->pmd_high, pmd.pmd_high);
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
- pti_set_user_pgtbl(&pudp->p4d.pgd, __pgd(0));
-#endif
-
- /* xchg acts as a barrier before setting of the high bits */
- res.pud_low = xchg(&orig->pud_low, 0);
- res.pud_high = orig->pud_high;
- orig->pud_high = 0;
+ return old;
+ }
- return res.pud;
+ return pxx_xchg64(pmd, pmdp, pmd.pmd);
}
-#else
-#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
#endif
/* Encode and de-code a swap entry */
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 56baf43befb4..80911349519e 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -18,6 +18,13 @@ typedef union {
};
pteval_t pte;
} pte_t;
+
+typedef union {
+ struct {
+ unsigned long pmd_low, pmd_high;
+ };
+ pmdval_t pmd;
+} pmd_t;
#endif /* !__ASSEMBLY__ */
#define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI))
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5059799bebe3..0564edd24ffb 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -139,6 +139,7 @@ static inline int pmd_dirty(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_DIRTY;
}
+#define pmd_young pmd_young
static inline int pmd_young(pmd_t pmd)
{
return pmd_flags(pmd) & _PAGE_ACCESSED;
@@ -291,7 +292,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
static inline int pte_uffd_wp(pte_t pte)
{
- return pte_flags(pte) & _PAGE_UFFD_WP;
+ bool wp = pte_flags(pte) & _PAGE_UFFD_WP;
+
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * Having write bit for wr-protect-marked present ptes is fatal,
+ * because it means the uffd-wp bit will be ignored and write will
+ * just go through.
+ *
+ * Use any chance of pgtable walking to verify this (e.g., when
+ * page swapped out or being migrated for all purposes). It means
+ * something is already wrong. Tell the admin even before the
+ * process crashes. We also nail it with wrong pgtable setup.
+ */
+ WARN_ON_ONCE(wp && pte_write(pte));
+#endif
+
+ return wp;
}
static inline pte_t pte_mkuffd_wp(pte_t pte)
@@ -1438,6 +1455,14 @@ static inline bool arch_has_hw_pte_young(void)
return true;
}
+#ifdef CONFIG_XEN_PV
+#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
+{
+ return !cpu_feature_enabled(X86_FEATURE_XENPV);
+}
+#endif
+
#ifdef CONFIG_PAGE_TABLE_CHECK
static inline bool pte_user_accessible_page(pte_t pte)
{
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 7c9c968a42ef..7d4ad8907297 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -48,15 +48,6 @@ do { \
#endif /* !__ASSEMBLY__ */
/*
- * kern_addr_valid() is (1) for FLATMEM and (0) for SPARSEMEM
- */
-#ifdef CONFIG_FLATMEM
-#define kern_addr_valid(addr) (1)
-#else
-#define kern_addr_valid(kaddr) (0)
-#endif
-
-/*
* This is used to calculate the .brk reservation for initial pagetables.
* Enough space is reserved to allocate pagetables sufficient to cover all
* of LOWMEM_PAGES, which is an upper bound on the size of the direct map of
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index e479491da8d5..7929327abe00 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -240,7 +240,6 @@ static inline void native_pgd_clear(pgd_t *pgd)
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
#define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val })
-extern int kern_addr_valid(unsigned long addr);
extern void cleanup_highmap(void);
#define HAVE_ARCH_UNMAPPED_AREA
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 04f36063ad54..38bf837e3554 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -19,6 +19,7 @@ typedef unsigned long pgdval_t;
typedef unsigned long pgprotval_t;
typedef struct { pteval_t pte; } pte_t;
+typedef struct { pmdval_t pmd; } pmd_t;
#ifdef CONFIG_X86_5LEVEL
extern unsigned int __pgtable_l5_enabled;
diff --git a/arch/x86/include/asm/pgtable_areas.h b/arch/x86/include/asm/pgtable_areas.h
index d34cce1b995c..4f056fb88174 100644
--- a/arch/x86/include/asm/pgtable_areas.h
+++ b/arch/x86/include/asm/pgtable_areas.h
@@ -11,6 +11,12 @@
#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT)
-#define CPU_ENTRY_AREA_MAP_SIZE (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_ARRAY_SIZE - CPU_ENTRY_AREA_BASE)
+#ifdef CONFIG_X86_32
+#define CPU_ENTRY_AREA_MAP_SIZE (CPU_ENTRY_AREA_PER_CPU + \
+ (CPU_ENTRY_AREA_SIZE * NR_CPUS) - \
+ CPU_ENTRY_AREA_BASE)
+#else
+#define CPU_ENTRY_AREA_MAP_SIZE P4D_SIZE
+#endif
#endif /* _ASM_X86_PGTABLE_AREAS_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index aa174fed3a71..447d4bee25c4 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -361,11 +361,9 @@ static inline pudval_t native_pud_val(pud_t pud)
#endif
#if CONFIG_PGTABLE_LEVELS > 2
-typedef struct { pmdval_t pmd; } pmd_t;
-
static inline pmd_t native_make_pmd(pmdval_t val)
{
- return (pmd_t) { val };
+ return (pmd_t) { .pmd = val };
}
static inline pmdval_t native_pmd_val(pmd_t pmd)
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 5f6daea1ee24..2d13f25b1bd8 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -4,11 +4,11 @@
#include <asm/rmwcc.h>
#include <asm/percpu.h>
+#include <asm/current.h>
+
#include <linux/thread_info.h>
#include <linux/static_call_types.h>
-DECLARE_PER_CPU(int, __preempt_count);
-
/* We use the MSB mostly because its available */
#define PREEMPT_NEED_RESCHED 0x80000000
@@ -24,7 +24,7 @@ DECLARE_PER_CPU(int, __preempt_count);
*/
static __always_inline int preempt_count(void)
{
- return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
+ return raw_cpu_read_4(pcpu_hot.preempt_count) & ~PREEMPT_NEED_RESCHED;
}
static __always_inline void preempt_count_set(int pc)
@@ -32,10 +32,10 @@ static __always_inline void preempt_count_set(int pc)
int old, new;
do {
- old = raw_cpu_read_4(__preempt_count);
+ old = raw_cpu_read_4(pcpu_hot.preempt_count);
new = (old & PREEMPT_NEED_RESCHED) |
(pc & ~PREEMPT_NEED_RESCHED);
- } while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old);
+ } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
}
/*
@@ -44,7 +44,7 @@ static __always_inline void preempt_count_set(int pc)
#define init_task_preempt_count(p) do { } while (0)
#define init_idle_preempt_count(p, cpu) do { \
- per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \
+ per_cpu(pcpu_hot.preempt_count, (cpu)) = PREEMPT_DISABLED; \
} while (0)
/*
@@ -58,17 +58,17 @@ static __always_inline void preempt_count_set(int pc)
static __always_inline void set_preempt_need_resched(void)
{
- raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
+ raw_cpu_and_4(pcpu_hot.preempt_count, ~PREEMPT_NEED_RESCHED);
}
static __always_inline void clear_preempt_need_resched(void)
{
- raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
+ raw_cpu_or_4(pcpu_hot.preempt_count, PREEMPT_NEED_RESCHED);
}
static __always_inline bool test_preempt_need_resched(void)
{
- return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED);
+ return !(raw_cpu_read_4(pcpu_hot.preempt_count) & PREEMPT_NEED_RESCHED);
}
/*
@@ -77,12 +77,12 @@ static __always_inline bool test_preempt_need_resched(void)
static __always_inline void __preempt_count_add(int val)
{
- raw_cpu_add_4(__preempt_count, val);
+ raw_cpu_add_4(pcpu_hot.preempt_count, val);
}
static __always_inline void __preempt_count_sub(int val)
{
- raw_cpu_add_4(__preempt_count, -val);
+ raw_cpu_add_4(pcpu_hot.preempt_count, -val);
}
/*
@@ -92,7 +92,8 @@ static __always_inline void __preempt_count_sub(int val)
*/
static __always_inline bool __preempt_count_dec_and_test(void)
{
- return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var]));
+ return GEN_UNARY_RMWcc("decl", pcpu_hot.preempt_count, e,
+ __percpu_arg([var]));
}
/*
@@ -100,7 +101,7 @@ static __always_inline bool __preempt_count_dec_and_test(void)
*/
static __always_inline bool should_resched(int preempt_offset)
{
- return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
+ return unlikely(raw_cpu_read_4(pcpu_hot.preempt_count) == preempt_offset);
}
#ifdef CONFIG_PREEMPTION
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 02c2cbda4a74..a7f3d9100adb 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -35,7 +35,7 @@
*/
#ifdef CONFIG_X86_64
/* Mask off the address space ID and SME encryption bits. */
-#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
+#define CR3_ADDR_MASK __sme_clr(PHYSICAL_PAGE_MASK)
#define CR3_PCID_MASK 0xFFFull
#define CR3_NOFLUSH BIT_ULL(63)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 67c9d73b31fa..4e35c66edeb7 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -16,6 +16,7 @@ struct vm86;
#include <uapi/asm/sigcontext.h>
#include <asm/current.h>
#include <asm/cpufeatures.h>
+#include <asm/cpuid.h>
#include <asm/page.h>
#include <asm/pgtable_types.h>
#include <asm/percpu.h>
@@ -146,17 +147,6 @@ struct cpuinfo_x86 {
unsigned initialized : 1;
} __randomize_layout;
-struct cpuid_regs {
- u32 eax, ebx, ecx, edx;
-};
-
-enum cpuid_regs_idx {
- CPUID_EAX = 0,
- CPUID_EBX,
- CPUID_ECX,
- CPUID_EDX,
-};
-
#define X86_VENDOR_INTEL 0
#define X86_VENDOR_CYRIX 1
#define X86_VENDOR_AMD 2
@@ -205,45 +195,6 @@ extern void identify_secondary_cpu(struct cpuinfo_x86 *);
extern void print_cpu_info(struct cpuinfo_x86 *);
void print_cpu_msr(struct cpuinfo_x86 *);
-#ifdef CONFIG_X86_32
-extern int have_cpuid_p(void);
-#else
-static inline int have_cpuid_p(void)
-{
- return 1;
-}
-#endif
-static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- /* ecx is often an input as well as an output. */
- asm volatile("cpuid"
- : "=a" (*eax),
- "=b" (*ebx),
- "=c" (*ecx),
- "=d" (*edx)
- : "0" (*eax), "2" (*ecx)
- : "memory");
-}
-
-#define native_cpuid_reg(reg) \
-static inline unsigned int native_cpuid_##reg(unsigned int op) \
-{ \
- unsigned int eax = op, ebx, ecx = 0, edx; \
- \
- native_cpuid(&eax, &ebx, &ecx, &edx); \
- \
- return reg; \
-}
-
-/*
- * Native CPUID functions returning a single datum.
- */
-native_cpuid_reg(eax)
-native_cpuid_reg(ebx)
-native_cpuid_reg(ecx)
-native_cpuid_reg(edx)
-
/*
* Friendlier CR3 helpers.
*/
@@ -426,8 +377,6 @@ struct irq_stack {
char stack[IRQ_STACK_SIZE];
} __aligned(IRQ_STACK_SIZE);
-DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
-
#ifdef CONFIG_X86_64
struct fixed_percpu_data {
/*
@@ -450,8 +399,6 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu)
return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu);
}
-DECLARE_PER_CPU(void *, hardirq_stack_ptr);
-DECLARE_PER_CPU(bool, hardirq_stack_inuse);
extern asmlinkage void ignore_sysret(void);
/* Save actual FS/GS selectors and bases to current->thread */
@@ -460,8 +407,6 @@ void current_save_fsgs(void);
#ifdef CONFIG_STACKPROTECTOR
DECLARE_PER_CPU(unsigned long, __stack_chk_guard);
#endif
-DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
-DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
#endif /* !X86_64 */
struct perf_event;
@@ -566,7 +511,7 @@ static __always_inline unsigned long current_top_of_stack(void)
* and around vm86 mode and sp0 on x86_64 is special because of the
* entry trampoline.
*/
- return this_cpu_read_stable(cpu_current_top_of_stack);
+ return this_cpu_read_stable(pcpu_hot.top_of_stack);
}
static __always_inline bool on_thread_stack(void)
@@ -578,7 +523,6 @@ static __always_inline bool on_thread_stack(void)
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
-#define __cpuid native_cpuid
static inline void load_sp0(unsigned long sp0)
{
@@ -589,69 +533,6 @@ static inline void load_sp0(unsigned long sp0)
unsigned long __get_wchan(struct task_struct *p);
-/*
- * Generic CPUID function
- * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
- * resulting in stale register contents being returned.
- */
-static inline void cpuid(unsigned int op,
- unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- *eax = op;
- *ecx = 0;
- __cpuid(eax, ebx, ecx, edx);
-}
-
-/* Some CPUID calls want 'count' to be placed in ecx */
-static inline void cpuid_count(unsigned int op, int count,
- unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- *eax = op;
- *ecx = count;
- __cpuid(eax, ebx, ecx, edx);
-}
-
-/*
- * CPUID functions returning a single datum
- */
-static inline unsigned int cpuid_eax(unsigned int op)
-{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
-
- return eax;
-}
-
-static inline unsigned int cpuid_ebx(unsigned int op)
-{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
-
- return ebx;
-}
-
-static inline unsigned int cpuid_ecx(unsigned int op)
-{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
-
- return ecx;
-}
-
-static inline unsigned int cpuid_edx(unsigned int op)
-{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
-
- return edx;
-}
-
extern void select_idle_routine(const struct cpuinfo_x86 *c);
extern void amd_e400_c1e_apic_setup(void);
@@ -667,10 +548,9 @@ extern int sysenter_setup(void);
/* Defined in head.S */
extern struct desc_ptr early_gdt_descr;
-extern void switch_to_new_gdt(int);
+extern void switch_gdt_and_percpu_base(int);
extern void load_direct_gdt(int);
extern void load_fixmap_gdt(int);
-extern void load_percpu_segment(int);
extern void cpu_init(void);
extern void cpu_init_secondary(void);
extern void cpu_init_exception_handling(void);
@@ -805,24 +685,6 @@ static inline u32 amd_get_nodes_per_socket(void) { return 0; }
static inline u32 amd_get_highest_perf(void) { return 0; }
#endif
-#define for_each_possible_hypervisor_cpuid_base(function) \
- for (function = 0x40000000; function < 0x40010000; function += 0x100)
-
-static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
-{
- uint32_t base, eax, signature[3];
-
- for_each_possible_hypervisor_cpuid_base(base) {
- cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);
-
- if (!memcmp(sig, signature, 12) &&
- (leaves == 0 || ((eax - base) >= leaves)))
- return base;
- }
-
- return 0;
-}
-
extern unsigned long arch_align_stack(unsigned long sp);
void free_init_pages(const char *what, unsigned long begin, unsigned long end);
extern void free_kernel_image_pages(const char *what, void *begin, void *end);
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index 60ece592b220..42b17cf10b10 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -14,8 +14,6 @@
__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text");
#define __pv_queued_spin_unlock __pv_queued_spin_unlock
-#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock"
-#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath"
/*
* Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock
@@ -37,32 +35,27 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text");
* rsi = lockval (second argument)
* rdx = internal variable (set to 0)
*/
-asm (".pushsection .spinlock.text;"
- ".globl " PV_UNLOCK ";"
- ".type " PV_UNLOCK ", @function;"
- ".align 4,0x90;"
- PV_UNLOCK ": "
- ASM_ENDBR
- FRAME_BEGIN
- "push %rdx;"
- "mov $0x1,%eax;"
- "xor %edx,%edx;"
- LOCK_PREFIX "cmpxchg %dl,(%rdi);"
- "cmp $0x1,%al;"
- "jne .slowpath;"
- "pop %rdx;"
+#define PV_UNLOCK_ASM \
+ FRAME_BEGIN \
+ "push %rdx\n\t" \
+ "mov $0x1,%eax\n\t" \
+ "xor %edx,%edx\n\t" \
+ LOCK_PREFIX "cmpxchg %dl,(%rdi)\n\t" \
+ "cmp $0x1,%al\n\t" \
+ "jne .slowpath\n\t" \
+ "pop %rdx\n\t" \
+ FRAME_END \
+ ASM_RET \
+ ".slowpath:\n\t" \
+ "push %rsi\n\t" \
+ "movzbl %al,%esi\n\t" \
+ "call __raw_callee_save___pv_queued_spin_unlock_slowpath\n\t" \
+ "pop %rsi\n\t" \
+ "pop %rdx\n\t" \
FRAME_END
- ASM_RET
- ".slowpath: "
- "push %rsi;"
- "movzbl %al,%esi;"
- "call " PV_UNLOCK_SLOWPATH ";"
- "pop %rsi;"
- "pop %rdx;"
- FRAME_END
- ASM_RET
- ".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
- ".popsection");
+
+DEFINE_PARAVIRT_ASM(__raw_callee_save___pv_queued_spin_unlock,
+ PV_UNLOCK_ASM, .spinlock.text);
#else /* CONFIG_64BIT */
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index fd6f6e5b755a..a336feef0af1 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -91,6 +91,7 @@ static inline void set_real_mode_mem(phys_addr_t mem)
void reserve_real_mode(void);
void load_trampoline_pgtable(void);
+void init_real_mode(void);
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index d24b04ebf950..52788f79786f 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -7,8 +7,6 @@
#include <linux/sched.h>
#include <linux/jump_label.h>
-#define IA32_PQR_ASSOC 0x0c8f
-
/**
* struct resctrl_pqr_state - State cache for the PQR MSR
* @cur_rmid: The cached Resource Monitoring ID
@@ -16,8 +14,8 @@
* @default_rmid: The user assigned Resource Monitoring ID
* @default_closid: The user assigned cached Class Of Service ID
*
- * The upper 32 bits of IA32_PQR_ASSOC contain closid and the
- * lower 10 bits rmid. The update to IA32_PQR_ASSOC always
+ * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
* contains both parts, so we need to cache them. This also
* stores the user configured per cpu CLOSID and RMID.
*
@@ -77,7 +75,7 @@ static void __resctrl_sched_in(void)
if (closid != state->cur_closid || rmid != state->cur_rmid) {
state->cur_closid = closid;
state->cur_rmid = rmid;
- wrmsr(IA32_PQR_ASSOC, rmid, closid);
+ wrmsr(MSR_IA32_PQR_ASSOC, rmid, closid);
}
}
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 2e7890dd58a4..c390a672d560 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -135,6 +135,7 @@
#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
+#define __USER32_CS __USER_CS
#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8)
/* segment for calling fn: */
@@ -210,7 +211,6 @@
#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3)
#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
-#define __USER32_DS __USER_DS
#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
#define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3)
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index b45c4d27fd46..a5e89641bd2d 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -6,6 +6,9 @@
#include <asm/page.h>
#include <asm-generic/set_memory.h>
+#define set_memory_rox set_memory_rox
+int set_memory_rox(unsigned long addr, int numpages);
+
/*
* The set_memory_* API can be used to change various attributes of a virtual
* address range. The attributes include:
diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index eae20fa52b93..6a0069761508 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -115,17 +115,36 @@ enum sgx_miscselect {
* %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to
* sign cryptographic tokens that can be passed to
* EINIT as an authorization to run an enclave.
+ * %SGX_ATTR_ASYNC_EXIT_NOTIFY: Allow enclaves to be notified after an
+ * asynchronous exit has occurred.
*/
enum sgx_attribute {
- SGX_ATTR_INIT = BIT(0),
- SGX_ATTR_DEBUG = BIT(1),
- SGX_ATTR_MODE64BIT = BIT(2),
- SGX_ATTR_PROVISIONKEY = BIT(4),
- SGX_ATTR_EINITTOKENKEY = BIT(5),
- SGX_ATTR_KSS = BIT(7),
+ SGX_ATTR_INIT = BIT(0),
+ SGX_ATTR_DEBUG = BIT(1),
+ SGX_ATTR_MODE64BIT = BIT(2),
+ /* BIT(3) is reserved */
+ SGX_ATTR_PROVISIONKEY = BIT(4),
+ SGX_ATTR_EINITTOKENKEY = BIT(5),
+ /* BIT(6) is for CET */
+ SGX_ATTR_KSS = BIT(7),
+ /* BIT(8) is reserved */
+ /* BIT(9) is reserved */
+ SGX_ATTR_ASYNC_EXIT_NOTIFY = BIT(10),
};
-#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8))
+#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | \
+ BIT_ULL(6) | \
+ BIT_ULL(8) | \
+ BIT_ULL(9) | \
+ GENMASK_ULL(63, 11))
+
+#define SGX_ATTR_UNPRIV_MASK (SGX_ATTR_DEBUG | \
+ SGX_ATTR_MODE64BIT | \
+ SGX_ATTR_KSS | \
+ SGX_ATTR_ASYNC_EXIT_NOTIFY)
+
+#define SGX_ATTR_PRIV_MASK (SGX_ATTR_PROVISIONKEY | \
+ SGX_ATTR_EINITTOKENKEY)
/**
* struct sgx_secs - SGX Enclave Control Structure (SECS)
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
index 65e667279e0f..e770c4fc47f4 100644
--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -15,4 +15,13 @@
void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
+void __user *
+get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
+ void __user **fpstate);
+
+int ia32_setup_frame(struct ksignal *ksig, struct pt_regs *regs);
+int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
+int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
+int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
+
#endif /* _ASM_X86_SIGHANDLING_H */
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 2dfb5fea13af..4a4043ca6493 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -28,11 +28,6 @@ typedef struct {
#define SA_IA32_ABI 0x02000000u
#define SA_X32_ABI 0x01000000u
-#ifndef CONFIG_COMPAT
-#define compat_sigset_t compat_sigset_t
-typedef sigset_t compat_sigset_t;
-#endif
-
#endif /* __ASSEMBLY__ */
#include <uapi/asm/signal.h>
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index a73bced40e24..b4dbb20dab1a 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -3,10 +3,10 @@
#define _ASM_X86_SMP_H
#ifndef __ASSEMBLY__
#include <linux/cpumask.h>
-#include <asm/percpu.h>
-#include <asm/thread_info.h>
#include <asm/cpumask.h>
+#include <asm/current.h>
+#include <asm/thread_info.h>
extern int smp_num_siblings;
extern unsigned int num_processors;
@@ -19,7 +19,6 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id);
-DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
@@ -150,11 +149,10 @@ __visible void smp_call_function_single_interrupt(struct pt_regs *r);
/*
* This function is needed by all SMP systems. It must _always_ be valid
- * from the initial startup. We map APIC_BASE very early in page_setup(),
- * so this is correct in the x86 case.
+ * from the initial startup.
*/
-#define raw_smp_processor_id() this_cpu_read(cpu_number)
-#define __smp_processor_id() __this_cpu_read(cpu_number)
+#define raw_smp_processor_id() this_cpu_read(pcpu_hot.cpu_number)
+#define __smp_processor_id() __this_cpu_read(pcpu_hot.cpu_number)
#ifdef CONFIG_X86_32
extern int safe_smp_processor_id(void);
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 24a8d6c4fb18..00473a650f51 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -34,7 +34,6 @@
#include <asm/percpu.h>
#include <asm/desc.h>
-#include <linux/random.h>
#include <linux/sched.h>
/*
@@ -50,22 +49,11 @@
*/
static __always_inline void boot_init_stack_canary(void)
{
- u64 canary;
- u64 tsc;
+ unsigned long canary = get_random_canary();
#ifdef CONFIG_X86_64
BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40);
#endif
- /*
- * We both use the random pool and the current TSC as a source
- * of randomness. The TSC only matters for very early init,
- * there it already has some randomness on most systems. Later
- * on during the bootup the random pool has true entropy too.
- */
- get_random_bytes(&canary, sizeof(canary));
- tsc = rdtsc();
- canary += tsc + (tsc << 32UL);
- canary &= CANARY_MASK;
current->stack_canary = canary;
#ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index c08eb0fdd11f..5c91305d09d2 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -66,13 +66,10 @@ static inline void update_task_stack(struct task_struct *task)
{
/* sp0 always points to the entry trampoline stack, which is constant: */
#ifdef CONFIG_X86_32
- if (static_cpu_has(X86_FEATURE_XENPV))
- load_sp0(task->thread.sp0);
- else
- this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
+ this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
#else
/* Xen PV enters the kernel on the thread stack. */
- if (static_cpu_has(X86_FEATURE_XENPV))
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
#endif
}
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 020c81a7c729..28d889c9aa16 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -67,6 +67,8 @@ void tdx_safe_halt(void);
bool tdx_early_handle_ve(struct pt_regs *regs);
+int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport);
+
#else
static inline void tdx_early_init(void) { };
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 1cc15528ce29..f4b87f08f5c5 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -45,6 +45,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void text_poke_sync(void);
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
extern void *text_poke_copy(void *addr, const void *opcode, size_t len);
+extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok);
extern void *text_poke_set(void *addr, int c, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index e9170457697e..c1c8c581759d 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -285,6 +285,8 @@ struct x86_hyper_runtime {
* possible in x86_early_init_platform_quirks() by
* only using the current x86_hardware_subarch
* semantics.
+ * @realmode_reserve: reserve memory for realmode trampoline
+ * @realmode_init: initialize realmode trampoline
* @hyper: x86 hypervisor specific runtime callbacks
*/
struct x86_platform_ops {
@@ -301,6 +303,8 @@ struct x86_platform_ops {
void (*apic_post_init)(void);
struct x86_legacy_features legacy;
void (*set_legacy_features)(void);
+ void (*realmode_reserve)(void);
+ void (*realmode_init)(void);
struct x86_hyper_runtime hyper;
struct x86_guest guest;
};
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f901658d9f7c..96d51bbc2bd4 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -44,7 +44,7 @@ obj-y += head_$(BITS).o
obj-y += head$(BITS).o
obj-y += ebda.o
obj-y += platform-quirks.o
-obj-y += process_$(BITS).o signal.o
+obj-y += process_$(BITS).o signal.o signal_$(BITS).o
obj-$(CONFIG_COMPAT) += signal_compat.o
obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o dumpstack.o nmi.o
@@ -54,7 +54,7 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-y += probe_roms.o
obj-$(CONFIG_X86_32) += sys_ia32.o
-obj-$(CONFIG_IA32_EMULATION) += sys_ia32.o
+obj-$(CONFIG_IA32_EMULATION) += sys_ia32.o signal_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o
obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
@@ -143,6 +143,8 @@ obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o
obj-$(CONFIG_CFI_CLANG) += cfi.o
+obj-$(CONFIG_CALL_THUNKS) += callthunks.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 7945eae5b315..401808b47af3 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -52,17 +52,25 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
if (c->x86_vendor == X86_VENDOR_INTEL &&
(c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f)))
flags->bm_control = 0;
- /*
- * For all recent Centaur CPUs, the ucode will make sure that each
- * core can keep cache coherence with each other while entering C3
- * type state. So, set bm_check to 1 to indicate that the kernel
- * doesn't need to execute a cache flush operation (WBINVD) when
- * entering C3 type state.
- */
+
if (c->x86_vendor == X86_VENDOR_CENTAUR) {
if (c->x86 > 6 || (c->x86 == 6 && c->x86_model == 0x0f &&
- c->x86_stepping >= 0x0e))
+ c->x86_stepping >= 0x0e)) {
+ /*
+ * For all recent Centaur CPUs, the ucode will make sure that each
+ * core can keep cache coherence with each other while entering C3
+ * type state. So, set bm_check to 1 to indicate that the kernel
+ * doesn't need to execute a cache flush operation (WBINVD) when
+ * entering C3 type state.
+ */
flags->bm_check = 1;
+ /*
+ * For all recent Centaur platforms, ARB_DISABLE is a nop.
+ * Set bm_control to zero to indicate that ARB_DISABLE is
+ * not required while entering C3 type state.
+ */
+ flags->bm_control = 0;
+ }
}
if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 5cadcea035e0..7d8c3cbde368 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -116,6 +116,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern s32 __return_sites[], __return_sites_end[];
+extern s32 __cfi_sites[], __cfi_sites_end[];
extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
@@ -377,6 +378,56 @@ static int emit_indirect(int op, int reg, u8 *bytes)
return i;
}
+static inline bool is_jcc32(struct insn *insn)
+{
+ /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
+ return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
+}
+
+static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ u8 op = insn->opcode.bytes[0];
+ int i = 0;
+
+ /*
+ * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
+ * tail-calls. Deal with them.
+ */
+ if (is_jcc32(insn)) {
+ bytes[i++] = op;
+ op = insn->opcode.bytes[1];
+ goto clang_jcc;
+ }
+
+ if (insn->length == 6)
+ bytes[i++] = 0x2e; /* CS-prefix */
+
+ switch (op) {
+ case CALL_INSN_OPCODE:
+ __text_gen_insn(bytes+i, op, addr+i,
+ __x86_indirect_call_thunk_array[reg],
+ CALL_INSN_SIZE);
+ i += CALL_INSN_SIZE;
+ break;
+
+ case JMP32_INSN_OPCODE:
+clang_jcc:
+ __text_gen_insn(bytes+i, op, addr+i,
+ __x86_indirect_jump_thunk_array[reg],
+ JMP32_INSN_SIZE);
+ i += JMP32_INSN_SIZE;
+ break;
+
+ default:
+ WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
+ return -1;
+ }
+
+ WARN_ON_ONCE(i != insn->length);
+
+ return i;
+}
+
/*
* Rewrite the compiler generated retpoline thunk calls.
*
@@ -409,8 +460,12 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
BUG_ON(reg == 4);
if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
- !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE))
+ !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
+ if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
+ return emit_call_track_retpoline(addr, insn, reg, bytes);
+
return -1;
+ }
op = insn->opcode.bytes[0];
@@ -427,8 +482,7 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
* [ NOP ]
* 1:
*/
- /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
- if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) {
+ if (is_jcc32(insn)) {
cc = insn->opcode.bytes[1] & 0xf;
cc ^= 1; /* invert condition */
@@ -518,6 +572,11 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
}
#ifdef CONFIG_RETHUNK
+
+#ifdef CONFIG_CALL_THUNKS
+void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk;
+#endif
+
/*
* Rewrite the compiler generated return thunk tail-calls.
*
@@ -533,14 +592,18 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes)
{
int i = 0;
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
- return -1;
+ if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
+ if (x86_return_thunk == __x86_return_thunk)
+ return -1;
- bytes[i++] = RET_INSN_OPCODE;
+ i = JMP32_INSN_SIZE;
+ __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
+ } else {
+ bytes[i++] = RET_INSN_OPCODE;
+ }
for (; i < insn->length;)
bytes[i++] = INT3_INSN_OPCODE;
-
return i;
}
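As a concrete sketch of the byte patterns this produces (assuming the usual 5-byte compiler-emitted return site; the "xx" bytes stand for the relative displacement):

	c3 cc cc cc cc    ret; int3 fill          - X86_FEATURE_RETHUNK not set
	e9 xx xx xx xx    jmp x86_return_thunk    - a non-default return thunk is installed

When the installed thunk is still __x86_return_thunk itself, the function returns -1 and the site is left as the compiler-generated jump.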
@@ -594,6 +657,28 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
#ifdef CONFIG_X86_KERNEL_IBT
+static void poison_endbr(void *addr, bool warn)
+{
+ u32 endbr, poison = gen_endbr_poison();
+
+ if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
+ return;
+
+ if (!is_endbr(endbr)) {
+ WARN_ON_ONCE(warn);
+ return;
+ }
+
+ DPRINTK("ENDBR at: %pS (%px)", addr, addr);
+
+ /*
+ * When we have IBT, the lack of ENDBR will trigger #CP
+ */
+ DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
+ DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
+ text_poke_early(addr, &poison, 4);
+}
+
/*
* Generated by: objtool --ibt
*/
@@ -602,31 +687,391 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
s32 *s;
for (s = start; s < end; s++) {
- u32 endbr, poison = gen_endbr_poison();
void *addr = (void *)s + *s;
- if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
+ poison_endbr(addr, true);
+ if (IS_ENABLED(CONFIG_FINEIBT))
+ poison_endbr(addr - 16, false);
+ }
+}
+
+#else
+
+void __init_or_module apply_ibt_endbr(s32 *start, s32 *end) { }
+
+#endif /* CONFIG_X86_KERNEL_IBT */
+
+#ifdef CONFIG_FINEIBT
+
+enum cfi_mode {
+ CFI_DEFAULT,
+ CFI_OFF,
+ CFI_KCFI,
+ CFI_FINEIBT,
+};
+
+static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT;
+static bool cfi_rand __ro_after_init = true;
+static u32 cfi_seed __ro_after_init;
+
+/*
+ * Re-hash the CFI hash with a boot-time seed while making sure the result is
+ * not a valid ENDBR instruction.
+ */
+static u32 cfi_rehash(u32 hash)
+{
+ hash ^= cfi_seed;
+ while (unlikely(is_endbr(hash) || is_endbr(-hash))) {
+ bool lsb = hash & 1;
+ hash >>= 1;
+ if (lsb)
+ hash ^= 0x80200003;
+ }
+ return hash;
+}
+
+static __init int cfi_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ while (str) {
+ char *next = strchr(str, ',');
+ if (next) {
+ *next = 0;
+ next++;
+ }
+
+ if (!strcmp(str, "auto")) {
+ cfi_mode = CFI_DEFAULT;
+ } else if (!strcmp(str, "off")) {
+ cfi_mode = CFI_OFF;
+ cfi_rand = false;
+ } else if (!strcmp(str, "kcfi")) {
+ cfi_mode = CFI_KCFI;
+ } else if (!strcmp(str, "fineibt")) {
+ cfi_mode = CFI_FINEIBT;
+ } else if (!strcmp(str, "norand")) {
+ cfi_rand = false;
+ } else {
+ pr_err("Ignoring unknown cfi option (%s).", str);
+ }
+
+ str = next;
+ }
+
+ return 0;
+}
+early_param("cfi", cfi_parse_cmdline);
+
+/*
+ * kCFI FineIBT
+ *
+ * __cfi_\func: __cfi_\func:
+ * movl $0x12345678,%eax // 5 endbr64 // 4
+ * nop subl $0x12345678,%r10d // 7
+ * nop jz 1f // 2
+ * nop ud2 // 2
+ * nop 1: nop // 1
+ * nop
+ * nop
+ * nop
+ * nop
+ * nop
+ * nop
+ * nop
+ *
+ *
+ * caller: caller:
+ * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6
+ * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4
+ * je 1f // 2 nop4 // 4
+ * ud2 // 2
+ * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5
+ *
+ */
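To make the layout above concrete, here is a minimal user-space C model of the two checks; this is an illustration only, not kernel code, and 0x12345678 is just the placeholder hash used throughout this file:

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	/*
	 * kCFI: the caller holds the negated hash, the callee preamble stores
	 * the positive hash; adding the two must yield zero or the caller traps.
	 */
	static void kcfi_caller_check(uint32_t caller_neg_hash, uint32_t preamble_hash)
	{
		if (caller_neg_hash + preamble_hash != 0)
			abort();		/* models the ud2 */
	}

	/*
	 * FineIBT: the caller loads the positive hash into %r10d and the callee
	 * preamble subtracts its expected hash; a non-zero result traps.
	 */
	static void fineibt_callee_check(uint32_t r10d, uint32_t expected_hash)
	{
		if (r10d - expected_hash != 0)
			abort();		/* models the ud2 */
	}

	int main(void)
	{
		kcfi_caller_check(-0x12345678u, 0x12345678u);
		fineibt_callee_check(0x12345678u, 0x12345678u);
		puts("both checks pass for matching hashes");
		return 0;
	}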
+
+asm( ".pushsection .rodata \n"
+ "fineibt_preamble_start: \n"
+ " endbr64 \n"
+ " subl $0x12345678, %r10d \n"
+ " je fineibt_preamble_end \n"
+ " ud2 \n"
+ " nop \n"
+ "fineibt_preamble_end: \n"
+ ".popsection\n"
+);
+
+extern u8 fineibt_preamble_start[];
+extern u8 fineibt_preamble_end[];
+
+#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
+#define fineibt_preamble_hash 7
+
+asm( ".pushsection .rodata \n"
+ "fineibt_caller_start: \n"
+ " movl $0x12345678, %r10d \n"
+ " sub $16, %r11 \n"
+ ASM_NOP4
+ "fineibt_caller_end: \n"
+ ".popsection \n"
+);
+
+extern u8 fineibt_caller_start[];
+extern u8 fineibt_caller_end[];
+
+#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
+#define fineibt_caller_hash 2
+
+#define fineibt_caller_jmp (fineibt_caller_size - 2)
+
+static u32 decode_preamble_hash(void *addr)
+{
+ u8 *p = addr;
+
+ /* b8 78 56 34 12 mov $0x12345678,%eax */
+ if (p[0] == 0xb8)
+ return *(u32 *)(addr + 1);
+
+ return 0; /* invalid hash value */
+}
+
+static u32 decode_caller_hash(void *addr)
+{
+ u8 *p = addr;
+
+ /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */
+ if (p[0] == 0x41 && p[1] == 0xba)
+ return -*(u32 *)(addr + 2);
+
+	/* eb 0c 78 56 34 12	   jmp.d8  +12 */
+ if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
+ return -*(u32 *)(addr + 2);
+
+ return 0; /* invalid hash value */
+}
+
+/* .retpoline_sites */
+static int cfi_disable_callers(s32 *start, s32 *end)
+{
+ /*
+	 * Disable kCFI by patching in a JMP.d8; this leaves the hash immediate
+	 * intact for later usage. Also see decode_caller_hash() and
+ * cfi_rewrite_callers().
+ */
+ const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (!hash) /* nocfi callers */
continue;
- if (WARN_ON_ONCE(!is_endbr(endbr)))
+ text_poke_early(addr, jmp, 2);
+ }
+
+ return 0;
+}
+
+static int cfi_enable_callers(s32 *start, s32 *end)
+{
+ /*
+ * Re-enable kCFI, undo what cfi_disable_callers() did.
+ */
+ const u8 mov[] = { 0x41, 0xba };
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (!hash) /* nocfi callers */
continue;
- DPRINTK("ENDBR at: %pS (%px)", addr, addr);
+ text_poke_early(addr, mov, 2);
+ }
- /*
- * When we have IBT, the lack of ENDBR will trigger #CP
- */
- DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
- DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
- text_poke_early(addr, &poison, 4);
+ return 0;
+}
+
+/* .cfi_sites */
+static int cfi_rand_preamble(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ hash = decode_preamble_hash(addr);
+ if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
+ addr, addr, 5, addr))
+ return -EINVAL;
+
+ hash = cfi_rehash(hash);
+ text_poke_early(addr + 1, &hash, 4);
+ }
+
+ return 0;
+}
+
+static int cfi_rewrite_preamble(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ hash = decode_preamble_hash(addr);
+ if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
+ addr, addr, 5, addr))
+ return -EINVAL;
+
+ text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
+ WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
+ text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
+ }
+
+ return 0;
+}
+
+/* .retpoline_sites */
+static int cfi_rand_callers(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (hash) {
+ hash = -cfi_rehash(hash);
+ text_poke_early(addr + 2, &hash, 4);
+ }
+ }
+
+ return 0;
+}
+
+static int cfi_rewrite_callers(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (hash) {
+ text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
+ WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
+ text_poke_early(addr + fineibt_caller_hash, &hash, 4);
+ }
+ /* rely on apply_retpolines() */
}
+
+ return 0;
+}
+
+static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+ s32 *start_cfi, s32 *end_cfi, bool builtin)
+{
+ int ret;
+
+ if (WARN_ONCE(fineibt_preamble_size != 16,
+ "FineIBT preamble wrong size: %ld", fineibt_preamble_size))
+ return;
+
+ if (cfi_mode == CFI_DEFAULT) {
+ cfi_mode = CFI_KCFI;
+ if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
+ cfi_mode = CFI_FINEIBT;
+ }
+
+ /*
+ * Rewrite the callers to not use the __cfi_ stubs, such that we might
+ * rewrite them. This disables all CFI. If this succeeds but any of the
+ * later stages fails, we're without CFI.
+ */
+ ret = cfi_disable_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+
+ if (cfi_rand) {
+ if (builtin)
+ cfi_seed = get_random_u32();
+
+ ret = cfi_rand_preamble(start_cfi, end_cfi);
+ if (ret)
+ goto err;
+
+ ret = cfi_rand_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+ }
+
+ switch (cfi_mode) {
+ case CFI_OFF:
+ if (builtin)
+ pr_info("Disabling CFI\n");
+ return;
+
+ case CFI_KCFI:
+ ret = cfi_enable_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+
+ if (builtin)
+ pr_info("Using kCFI\n");
+ return;
+
+ case CFI_FINEIBT:
+ ret = cfi_rewrite_preamble(start_cfi, end_cfi);
+ if (ret)
+ goto err;
+
+ ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+
+ if (builtin)
+ pr_info("Using FineIBT CFI\n");
+ return;
+
+ default:
+ break;
+ }
+
+err:
+ pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
}
#else
-void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { }
+static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+ s32 *start_cfi, s32 *end_cfi, bool builtin)
+{
+}
+
+#endif
-#endif /* CONFIG_X86_KERNEL_IBT */
+void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+ s32 *start_cfi, s32 *end_cfi)
+{
+ return __apply_fineibt(start_retpoline, end_retpoline,
+ start_cfi, end_cfi,
+ /* .builtin = */ false);
+}
#ifdef CONFIG_SMP
static void alternatives_smp_lock(const s32 *start, const s32 *end,
@@ -934,6 +1379,9 @@ void __init alternative_instructions(void)
*/
apply_paravirt(__parainstructions, __parainstructions_end);
+ __apply_fineibt(__retpoline_sites, __retpoline_sites_end,
+ __cfi_sites, __cfi_sites_end, true);
+
/*
* Rewrite the retpolines, must be done before alternatives since
* those can rewrite the retpoline thunks.
@@ -947,6 +1395,12 @@ void __init alternative_instructions(void)
*/
apply_alternatives(__alt_instructions, __alt_instructions_end);
+ /*
+ * Now all calls are established. Apply the call thunks if
+ * required.
+ */
+ callthunks_patch_builtin_calls();
+
apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
#ifdef CONFIG_SMP
@@ -1236,27 +1690,15 @@ void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
return __text_poke(text_poke_memcpy, addr, opcode, len);
}
-/**
- * text_poke_copy - Copy instructions into (an unused part of) RX memory
- * @addr: address to modify
- * @opcode: source of the copy
- * @len: length to copy, could be more than 2x PAGE_SIZE
- *
- * Not safe against concurrent execution; useful for JITs to dump
- * new code blocks into unused regions of RX memory. Can be used in
- * conjunction with synchronize_rcu_tasks() to wait for existing
- * execution to quiesce after having made sure no existing functions
- * pointers are live.
- */
-void *text_poke_copy(void *addr, const void *opcode, size_t len)
+void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
+ bool core_ok)
{
unsigned long start = (unsigned long)addr;
size_t patched = 0;
- if (WARN_ON_ONCE(core_kernel_text(start)))
+ if (WARN_ON_ONCE(!core_ok && core_kernel_text(start)))
return NULL;
- mutex_lock(&text_mutex);
while (patched < len) {
unsigned long ptr = start + patched;
size_t s;
@@ -1266,6 +1708,25 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len)
__text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
patched += s;
}
+ return addr;
+}
+
+/**
+ * text_poke_copy - Copy instructions into (an unused part of) RX memory
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy, could be more than 2x PAGE_SIZE
+ *
+ * Not safe against concurrent execution; useful for JITs to dump
+ * new code blocks into unused regions of RX memory. Can be used in
+ * conjunction with synchronize_rcu_tasks() to wait for existing
+ * execution to quiesce after having made sure no existing function
+ * pointers are live.
+ */
+void *text_poke_copy(void *addr, const void *opcode, size_t len)
+{
+ mutex_lock(&text_mutex);
+ addr = text_poke_copy_locked(addr, opcode, len, false);
mutex_unlock(&text_mutex);
return addr;
}
@@ -1608,7 +2069,7 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
default:
BUG_ON(len != insn.length);
- };
+ }
switch (tp->opcode) {
@@ -1681,11 +2142,6 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi
{
struct text_poke_loc *tp;
- if (unlikely(system_state == SYSTEM_BOOTING)) {
- text_poke_early(addr, opcode, len);
- return;
- }
-
text_poke_flush(addr);
tp = &tp_vec[tp_vec_nr++];
@@ -1707,11 +2163,6 @@ void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *
{
struct text_poke_loc tp;
- if (unlikely(system_state == SYSTEM_BOOTING)) {
- text_poke_early(addr, opcode, len);
- return;
- }
-
text_poke_loc_init(&tp, addr, opcode, len, emulate);
text_poke_bp_batch(&tp, 1);
}
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 19a0207e529f..56a917df410d 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -504,7 +504,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
}
a = aper + iommu_size;
- iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
+ iommu_size -= round_up(a, PMD_SIZE) - a;
if (iommu_size < 64*1024*1024) {
pr_warn("PCI-DMA: Warning: Small IOMMU %luMB."
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index c6876d3ea4b1..20d9a604da7c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1931,16 +1931,19 @@ void __init check_x2apic(void)
}
}
#else /* CONFIG_X86_X2APIC */
-static int __init validate_x2apic(void)
+void __init check_x2apic(void)
{
if (!apic_is_x2apic_enabled())
- return 0;
+ return;
/*
- * Checkme: Can we simply turn off x2apic here instead of panic?
+ * Checkme: Can we simply turn off x2APIC here instead of disabling the APIC?
*/
- panic("BIOS has enabled x2apic but kernel doesn't support x2apic, please disable x2apic in BIOS.\n");
+ pr_err("Kernel does not support x2APIC, please recompile with CONFIG_X86_X2APIC.\n");
+ pr_err("Disabling APIC, expect reduced performance and functionality.\n");
+
+ disable_apic = 1;
+ setup_clear_cpu_cap(X86_FEATURE_APIC);
}
-early_initcall(validate_x2apic);
static inline void try_to_enable_x2apic(int remap_mode) { }
static inline void __x2apic_enable(void) { }
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 7517eb05bdc1..35d5b8fb18ef 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -142,70 +142,139 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
return ret;
}
-/*
- * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
- * which implement the MSI or MSI-X Capability Structure.
+/**
+ * pci_dev_has_default_msi_parent_domain - Check whether the device has the default
+ * MSI parent domain associated
+ * @dev: Pointer to the PCI device
*/
-static struct irq_chip pci_msi_controller = {
- .name = "PCI-MSI",
- .irq_unmask = pci_msi_unmask_irq,
- .irq_mask = pci_msi_mask_irq,
- .irq_ack = irq_chip_ack_parent,
- .irq_retrigger = irq_chip_retrigger_hierarchy,
- .irq_set_affinity = msi_set_affinity,
- .flags = IRQCHIP_SKIP_SET_WAKE |
- IRQCHIP_AFFINITY_PRE_STARTUP,
-};
+bool pci_dev_has_default_msi_parent_domain(struct pci_dev *dev)
+{
+ struct irq_domain *domain = dev_get_msi_domain(&dev->dev);
-int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
- msi_alloc_info_t *arg)
+ if (!domain)
+ domain = dev_get_msi_domain(&dev->bus->dev);
+ if (!domain)
+ return false;
+
+ return domain == x86_vector_domain;
+}
+
+/**
+ * x86_msi_prepare - Setup of msi_alloc_info_t for allocations
+ * @domain: The domain for which this setup happens
+ * @dev: The device for which interrupts are allocated
+ * @nvec: The number of vectors to allocate
+ * @alloc: The allocation info structure to initialize
+ *
+ * This function is to be used for all types of MSI domains above the x86
+ * vector domain and any intermediates. It is always invoked from the
+ * top level interrupt domain. The domain specific allocation
+ * vector domain and any intermediates. It is always invoked from the
+ * top level interrupt domain. The domain specific allocation
+ * functionality is determined via the @domain's bus token, which allows
+ * mapping the X86-specific allocation type.
+ */
+static int x86_msi_prepare(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *alloc)
{
- init_irq_alloc_info(arg, NULL);
- if (to_pci_dev(dev)->msix_enabled) {
- arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
- } else {
- arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
- arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+ struct msi_domain_info *info = domain->host_data;
+
+ init_irq_alloc_info(alloc, NULL);
+
+ switch (info->bus_token) {
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
+ return 0;
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
+ case DOMAIN_BUS_PCI_DEVICE_IMS:
+ alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
+ return 0;
+ default:
+ return -EINVAL;
}
-
- return 0;
}
-EXPORT_SYMBOL_GPL(pci_msi_prepare);
-static struct msi_domain_ops pci_msi_domain_ops = {
- .msi_prepare = pci_msi_prepare,
-};
+/**
+ * x86_init_dev_msi_info - Domain info setup for MSI domains
+ * @dev: The device for which the domain should be created
+ * @domain: The (root) domain providing this callback
+ * @real_parent: The real parent domain of the domain to initialize
+ * @info: The domain info for the domain to initialize
+ *
+ * This function is to be used for all types of MSI domains above the x86
+ * vector domain and any intermediates. The domain specific functionality
+ * is determined via the @real_parent.
+ */
+static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *real_parent, struct msi_domain_info *info)
+{
+ const struct msi_parent_ops *pops = real_parent->msi_parent_ops;
+
+ /* MSI parent domain specific settings */
+ switch (real_parent->bus_token) {
+ case DOMAIN_BUS_ANY:
+ /* Only the vector domain can have the ANY token */
+ if (WARN_ON_ONCE(domain != real_parent))
+ return false;
+ info->chip->irq_set_affinity = msi_set_affinity;
+ /* See msi_set_affinity() for the gory details */
+ info->flags |= MSI_FLAG_NOMASK_QUIRK;
+ break;
+ case DOMAIN_BUS_DMAR:
+ case DOMAIN_BUS_AMDVI:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /* Is the target supported? */
+ switch(info->bus_token) {
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
+ break;
+ case DOMAIN_BUS_PCI_DEVICE_IMS:
+ if (!(pops->supported_flags & MSI_FLAG_PCI_IMS))
+ return false;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /*
+ * Mask out the domain specific MSI feature flags which are not
+ * supported by the real parent.
+ */
+ info->flags &= pops->supported_flags;
+ /* Enforce the required flags */
+ info->flags |= X86_VECTOR_MSI_FLAGS_REQUIRED;
+
+ /* This is always invoked from the top level MSI domain! */
+ info->ops->msi_prepare = x86_msi_prepare;
+
+ info->chip->irq_ack = irq_chip_ack_parent;
+ info->chip->irq_retrigger = irq_chip_retrigger_hierarchy;
+ info->chip->flags |= IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP;
-static struct msi_domain_info pci_msi_domain_info = {
- .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
- MSI_FLAG_PCI_MSIX,
- .ops = &pci_msi_domain_ops,
- .chip = &pci_msi_controller,
- .handler = handle_edge_irq,
- .handler_name = "edge",
+ info->handler = handle_edge_irq;
+ info->handler_name = "edge";
+
+ return true;
+}
+
+static const struct msi_parent_ops x86_vector_msi_parent_ops = {
+ .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED,
+ .init_dev_msi_info = x86_init_dev_msi_info,
};
struct irq_domain * __init native_create_pci_msi_domain(void)
{
- struct fwnode_handle *fn;
- struct irq_domain *d;
-
if (disable_apic)
return NULL;
- fn = irq_domain_alloc_named_fwnode("PCI-MSI");
- if (!fn)
- return NULL;
-
- d = pci_msi_create_irq_domain(fn, &pci_msi_domain_info,
- x86_vector_domain);
- if (!d) {
- irq_domain_free_fwnode(fn);
- pr_warn("Failed to initialize PCI-MSI irqdomain.\n");
- } else {
- d->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK;
- }
- return d;
+ x86_vector_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT;
+ x86_vector_domain->msi_parent_ops = &x86_vector_msi_parent_ops;
+ return x86_vector_domain;
}
void __init x86_create_pci_msi_domain(void)
@@ -213,41 +282,19 @@ void __init x86_create_pci_msi_domain(void)
x86_pci_msi_default_domain = x86_init.irqs.create_pci_msi_domain();
}
-#ifdef CONFIG_IRQ_REMAP
-static struct irq_chip pci_msi_ir_controller = {
- .name = "IR-PCI-MSI",
- .irq_unmask = pci_msi_unmask_irq,
- .irq_mask = pci_msi_mask_irq,
- .irq_ack = irq_chip_ack_parent,
- .irq_retrigger = irq_chip_retrigger_hierarchy,
- .flags = IRQCHIP_SKIP_SET_WAKE |
- IRQCHIP_AFFINITY_PRE_STARTUP,
-};
-
-static struct msi_domain_info pci_msi_ir_domain_info = {
- .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
- MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX,
- .ops = &pci_msi_domain_ops,
- .chip = &pci_msi_ir_controller,
- .handler = handle_edge_irq,
- .handler_name = "edge",
-};
-
-struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent,
- const char *name, int id)
+/* Keep around for hyperV */
+int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+ msi_alloc_info_t *arg)
{
- struct fwnode_handle *fn;
- struct irq_domain *d;
+ init_irq_alloc_info(arg, NULL);
- fn = irq_domain_alloc_named_id_fwnode(name, id);
- if (!fn)
- return NULL;
- d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent);
- if (!d)
- irq_domain_free_fwnode(fn);
- return d;
+ if (to_pci_dev(dev)->msix_enabled)
+ arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
+ else
+ arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
+ return 0;
}
-#endif
+EXPORT_SYMBOL_GPL(pci_msi_prepare);
#ifdef CONFIG_DMAR_TABLE
/*
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 3e6f6b448f6a..c1efebd27e6c 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -539,10 +539,6 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
if (disable_apic)
return -ENXIO;
- /* Currently vector allocator can't guarantee contiguous allocations */
- if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1)
- return -ENOSYS;
-
/*
* Catch any attempt to touch the cascade interrupt on a PIC
* equipped system.
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 437308004ef2..82c783da16a8 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -107,4 +107,9 @@ static void __used common(void)
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
+ OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack);
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+ OFFSET(X86_call_depth, pcpu_hot, call_depth);
+#endif
+
}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 9b698215d261..bb65371ea9df 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -57,7 +57,7 @@ int main(void)
BLANK();
#ifdef CONFIG_STACKPROTECTOR
- DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary));
+ OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary);
BLANK();
#endif
return 0;
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
new file mode 100644
index 000000000000..ffea98f9064b
--- /dev/null
+++ b/arch/x86/kernel/callthunks.c
@@ -0,0 +1,388 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "callthunks: " fmt
+
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/memory.h>
+#include <linux/moduleloader.h>
+#include <linux/static_call.h>
+
+#include <asm/alternative.h>
+#include <asm/asm-offsets.h>
+#include <asm/cpu.h>
+#include <asm/ftrace.h>
+#include <asm/insn.h>
+#include <asm/kexec.h>
+#include <asm/nospec-branch.h>
+#include <asm/paravirt.h>
+#include <asm/sections.h>
+#include <asm/switch_to.h>
+#include <asm/sync_core.h>
+#include <asm/text-patching.h>
+#include <asm/xen/hypercall.h>
+
+static int __initdata_or_module debug_callthunks;
+
+#define prdbg(fmt, args...) \
+do { \
+ if (debug_callthunks) \
+ printk(KERN_DEBUG pr_fmt(fmt), ##args); \
+} while(0)
+
+static int __init debug_thunks(char *str)
+{
+ debug_callthunks = 1;
+ return 1;
+}
+__setup("debug-callthunks", debug_thunks);
+
+#ifdef CONFIG_CALL_THUNKS_DEBUG
+DEFINE_PER_CPU(u64, __x86_call_count);
+DEFINE_PER_CPU(u64, __x86_ret_count);
+DEFINE_PER_CPU(u64, __x86_stuffs_count);
+DEFINE_PER_CPU(u64, __x86_ctxsw_count);
+EXPORT_SYMBOL_GPL(__x86_ctxsw_count);
+EXPORT_SYMBOL_GPL(__x86_call_count);
+#endif
+
+extern s32 __call_sites[], __call_sites_end[];
+
+struct thunk_desc {
+ void *template;
+ unsigned int template_size;
+};
+
+struct core_text {
+ unsigned long base;
+ unsigned long end;
+ const char *name;
+};
+
+static bool thunks_initialized __ro_after_init;
+
+static const struct core_text builtin_coretext = {
+ .base = (unsigned long)_text,
+ .end = (unsigned long)_etext,
+ .name = "builtin",
+};
+
+asm (
+ ".pushsection .rodata \n"
+ ".global skl_call_thunk_template \n"
+ "skl_call_thunk_template: \n"
+ __stringify(INCREMENT_CALL_DEPTH)" \n"
+ ".global skl_call_thunk_tail \n"
+ "skl_call_thunk_tail: \n"
+ ".popsection \n"
+);
+
+extern u8 skl_call_thunk_template[];
+extern u8 skl_call_thunk_tail[];
+
+#define SKL_TMPL_SIZE \
+ ((unsigned int)(skl_call_thunk_tail - skl_call_thunk_template))
+
+extern void error_entry(void);
+extern void xen_error_entry(void);
+extern void paranoid_entry(void);
+
+static inline bool within_coretext(const struct core_text *ct, void *addr)
+{
+ unsigned long p = (unsigned long)addr;
+
+ return ct->base <= p && p < ct->end;
+}
+
+static inline bool within_module_coretext(void *addr)
+{
+ bool ret = false;
+
+#ifdef CONFIG_MODULES
+ struct module *mod;
+
+ preempt_disable();
+ mod = __module_address((unsigned long)addr);
+ if (mod && within_module_core((unsigned long)addr, mod))
+ ret = true;
+ preempt_enable();
+#endif
+ return ret;
+}
+
+static bool is_coretext(const struct core_text *ct, void *addr)
+{
+ if (ct && within_coretext(ct, addr))
+ return true;
+ if (within_coretext(&builtin_coretext, addr))
+ return true;
+ return within_module_coretext(addr);
+}
+
+static bool skip_addr(void *dest)
+{
+ if (dest == error_entry)
+ return true;
+ if (dest == paranoid_entry)
+ return true;
+ if (dest == xen_error_entry)
+ return true;
+ /* Does FILL_RSB... */
+ if (dest == __switch_to_asm)
+ return true;
+ /* Accounts directly */
+ if (dest == ret_from_fork)
+ return true;
+#ifdef CONFIG_HOTPLUG_CPU
+ if (dest == start_cpu0)
+ return true;
+#endif
+#ifdef CONFIG_FUNCTION_TRACER
+ if (dest == __fentry__)
+ return true;
+#endif
+#ifdef CONFIG_KEXEC_CORE
+ if (dest >= (void *)relocate_kernel &&
+ dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE)
+ return true;
+#endif
+#ifdef CONFIG_XEN
+ if (dest >= (void *)hypercall_page &&
+ dest < (void*)hypercall_page + PAGE_SIZE)
+ return true;
+#endif
+ return false;
+}
+
+static __init_or_module void *call_get_dest(void *addr)
+{
+ struct insn insn;
+ void *dest;
+ int ret;
+
+ ret = insn_decode_kernel(&insn, addr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ /* Patched out call? */
+ if (insn.opcode.bytes[0] != CALL_INSN_OPCODE)
+ return NULL;
+
+ dest = addr + insn.length + insn.immediate.value;
+ if (skip_addr(dest))
+ return NULL;
+ return dest;
+}
+
+static const u8 nops[] = {
+ 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
+ 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
+ 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
+ 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
+};
+
+static void *patch_dest(void *dest, bool direct)
+{
+ unsigned int tsize = SKL_TMPL_SIZE;
+ u8 *pad = dest - tsize;
+
+ /* Already patched? */
+ if (!bcmp(pad, skl_call_thunk_template, tsize))
+ return pad;
+
+ /* Ensure there are nops */
+ if (bcmp(pad, nops, tsize)) {
+ pr_warn_once("Invalid padding area for %pS\n", dest);
+ return NULL;
+ }
+
+ if (direct)
+ memcpy(pad, skl_call_thunk_template, tsize);
+ else
+ text_poke_copy_locked(pad, skl_call_thunk_template, tsize, true);
+ return pad;
+}
+
+static __init_or_module void patch_call(void *addr, const struct core_text *ct)
+{
+ void *pad, *dest;
+ u8 bytes[8];
+
+ if (!within_coretext(ct, addr))
+ return;
+
+ dest = call_get_dest(addr);
+ if (!dest || WARN_ON_ONCE(IS_ERR(dest)))
+ return;
+
+ if (!is_coretext(ct, dest))
+ return;
+
+ pad = patch_dest(dest, within_coretext(ct, dest));
+ if (!pad)
+ return;
+
+ prdbg("Patch call at: %pS %px to %pS %px -> %px \n", addr, addr,
+ dest, dest, pad);
+ __text_gen_insn(bytes, CALL_INSN_OPCODE, addr, pad, CALL_INSN_SIZE);
+ text_poke_early(addr, bytes, CALL_INSN_SIZE);
+}
+
+static __init_or_module void
+patch_call_sites(s32 *start, s32 *end, const struct core_text *ct)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++)
+ patch_call((void *)s + *s, ct);
+}
+
+static __init_or_module void
+patch_paravirt_call_sites(struct paravirt_patch_site *start,
+ struct paravirt_patch_site *end,
+ const struct core_text *ct)
+{
+ struct paravirt_patch_site *p;
+
+ for (p = start; p < end; p++)
+ patch_call(p->instr, ct);
+}
+
+static __init_or_module void
+callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct)
+{
+ prdbg("Patching call sites %s\n", ct->name);
+ patch_call_sites(cs->call_start, cs->call_end, ct);
+ patch_paravirt_call_sites(cs->pv_start, cs->pv_end, ct);
+ prdbg("Patching call sites done%s\n", ct->name);
+}
+
+void __init callthunks_patch_builtin_calls(void)
+{
+ struct callthunk_sites cs = {
+ .call_start = __call_sites,
+ .call_end = __call_sites_end,
+ .pv_start = __parainstructions,
+ .pv_end = __parainstructions_end
+ };
+
+ if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
+ return;
+
+ pr_info("Setting up call depth tracking\n");
+ mutex_lock(&text_mutex);
+ callthunks_setup(&cs, &builtin_coretext);
+ static_call_force_reinit();
+ thunks_initialized = true;
+ mutex_unlock(&text_mutex);
+}
+
+void *callthunks_translate_call_dest(void *dest)
+{
+ void *target;
+
+ lockdep_assert_held(&text_mutex);
+
+ if (!thunks_initialized || skip_addr(dest))
+ return dest;
+
+ if (!is_coretext(NULL, dest))
+ return dest;
+
+ target = patch_dest(dest, false);
+ return target ? : dest;
+}
+
+bool is_callthunk(void *addr)
+{
+ unsigned int tmpl_size = SKL_TMPL_SIZE;
+ void *tmpl = skl_call_thunk_template;
+ unsigned long dest;
+
+ dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT);
+ if (!thunks_initialized || skip_addr((void *)dest))
+ return false;
+
+ return !bcmp((void *)(dest - tmpl_size), tmpl, tmpl_size);
+}
+
+#ifdef CONFIG_BPF_JIT
+int x86_call_depth_emit_accounting(u8 **pprog, void *func)
+{
+ unsigned int tmpl_size = SKL_TMPL_SIZE;
+ void *tmpl = skl_call_thunk_template;
+
+ if (!thunks_initialized)
+ return 0;
+
+ /* Is function call target a thunk? */
+ if (func && is_callthunk(func))
+ return 0;
+
+ memcpy(*pprog, tmpl, tmpl_size);
+ *pprog += tmpl_size;
+ return tmpl_size;
+}
+#endif
+
+#ifdef CONFIG_MODULES
+void noinline callthunks_patch_module_calls(struct callthunk_sites *cs,
+ struct module *mod)
+{
+ struct core_text ct = {
+ .base = (unsigned long)mod->core_layout.base,
+ .end = (unsigned long)mod->core_layout.base + mod->core_layout.size,
+ .name = mod->name,
+ };
+
+ if (!thunks_initialized)
+ return;
+
+ mutex_lock(&text_mutex);
+ callthunks_setup(cs, &ct);
+ mutex_unlock(&text_mutex);
+}
+#endif /* CONFIG_MODULES */
+
+#if defined(CONFIG_CALL_THUNKS_DEBUG) && defined(CONFIG_DEBUG_FS)
+static int callthunks_debug_show(struct seq_file *m, void *p)
+{
+ unsigned long cpu = (unsigned long)m->private;
+
+	seq_printf(m, "C: %16llu R: %16llu S: %16llu X: %16llu\n",
+ per_cpu(__x86_call_count, cpu),
+ per_cpu(__x86_ret_count, cpu),
+ per_cpu(__x86_stuffs_count, cpu),
+ per_cpu(__x86_ctxsw_count, cpu));
+ return 0;
+}
+
+static int callthunks_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, callthunks_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_ops = {
+ .open = callthunks_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init callthunks_debugfs_init(void)
+{
+ struct dentry *dir;
+ unsigned long cpu;
+
+ dir = debugfs_create_dir("callthunks", NULL);
+ for_each_possible_cpu(cpu) {
+ void *arg = (void *)cpu;
+		char name[10];
+
+ sprintf(name, "cpu%lu", cpu);
+ debugfs_create_file(name, 0644, dir, arg, &dfs_ops);
+ }
+ return 0;
+}
+__initcall(callthunks_debugfs_init);
+#endif
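A brief usage note on the debug hooks above: booting with "debug-callthunks" enables the prdbg() output, and with CONFIG_CALL_THUNKS_DEBUG set and debugfs mounted at the usual /sys/kernel/debug, per-CPU counters appear as callthunks/cpu<N> files whose single line reports the call (C:), return (R:), stuff (S:) and context-switch (X:) counts collected in the per-CPU variables defined at the top of the file.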
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index f10a921ee756..d7e3ceaf75c1 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -17,9 +17,6 @@ KMSAN_SANITIZE_common.o := n
# As above, instrumenting secondary CPU boot code causes boot hangs.
KCSAN_SANITIZE_common.o := n
-# Make sure load_percpu_segment has no stackprotector
-CFLAGS_common.o := -fno-stack-protector
-
obj-y := cacheinfo.o scattered.o topology.o
obj-y += common.o
obj-y += rdrand.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 860b60273df3..f769d6d08b43 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -770,8 +770,6 @@ static void init_amd_gh(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
}
-#define MSR_AMD64_DE_CFG 0xC0011029
-
static void init_amd_ln(struct cpuinfo_x86 *c)
{
/*
@@ -965,8 +963,8 @@ static void init_amd(struct cpuinfo_x86 *c)
* msr_set_bit() uses the safe accessors, too, even if the MSR
* is not present.
*/
- msr_set_bit(MSR_F10H_DECFG,
- MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
+ msr_set_bit(MSR_AMD64_DE_CFG,
+ MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT);
/* A serializing LFENCE stops RDTSC speculation */
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
@@ -985,7 +983,7 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
/* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */
- if (!cpu_has(c, X86_FEATURE_XENPV))
+ if (!cpu_feature_enabled(X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
/*
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 3e3230cccaa7..bca0bd8f4846 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -60,11 +60,18 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_current);
static DEFINE_MUTEX(spec_ctrl_mutex);
+/* Update SPEC_CTRL MSR and its cached copy unconditionally */
+static void update_spec_ctrl(u64 val)
+{
+ this_cpu_write(x86_spec_ctrl_current, val);
+ wrmsrl(MSR_IA32_SPEC_CTRL, val);
+}
+
/*
* Keep track of the SPEC_CTRL MSR value for the current task, which may differ
* from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update().
*/
-void write_spec_ctrl_current(u64 val, bool force)
+void update_spec_ctrl_cond(u64 val)
{
if (this_cpu_read(x86_spec_ctrl_current) == val)
return;
@@ -75,7 +82,7 @@ void write_spec_ctrl_current(u64 val, bool force)
* When KERNEL_IBRS this MSR is written on return-to-user, unless
* forced the update can be delayed until that time.
*/
- if (force || !cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
+ if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
wrmsrl(MSR_IA32_SPEC_CTRL, val);
}
@@ -780,6 +787,7 @@ enum retbleed_mitigation {
RETBLEED_MITIGATION_IBPB,
RETBLEED_MITIGATION_IBRS,
RETBLEED_MITIGATION_EIBRS,
+ RETBLEED_MITIGATION_STUFF,
};
enum retbleed_mitigation_cmd {
@@ -787,6 +795,7 @@ enum retbleed_mitigation_cmd {
RETBLEED_CMD_AUTO,
RETBLEED_CMD_UNRET,
RETBLEED_CMD_IBPB,
+ RETBLEED_CMD_STUFF,
};
static const char * const retbleed_strings[] = {
@@ -795,6 +804,7 @@ static const char * const retbleed_strings[] = {
[RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB",
[RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS",
[RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS",
+ [RETBLEED_MITIGATION_STUFF] = "Mitigation: Stuffing",
};
static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
@@ -824,8 +834,12 @@ static int __init retbleed_parse_cmdline(char *str)
retbleed_cmd = RETBLEED_CMD_UNRET;
} else if (!strcmp(str, "ibpb")) {
retbleed_cmd = RETBLEED_CMD_IBPB;
+ } else if (!strcmp(str, "stuff")) {
+ retbleed_cmd = RETBLEED_CMD_STUFF;
} else if (!strcmp(str, "nosmt")) {
retbleed_nosmt = true;
+ } else if (!strcmp(str, "force")) {
+ setup_force_cpu_bug(X86_BUG_RETBLEED);
} else {
pr_err("Ignoring unknown retbleed option (%s).", str);
}
@@ -872,6 +886,21 @@ static void __init retbleed_select_mitigation(void)
}
break;
+ case RETBLEED_CMD_STUFF:
+ if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING) &&
+ spectre_v2_enabled == SPECTRE_V2_RETPOLINE) {
+ retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
+
+ } else {
+ if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING))
+ pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n");
+ else
+ pr_err("WARNING: kernel not compiled with CALL_DEPTH_TRACKING.\n");
+
+ goto do_cmd_auto;
+ }
+ break;
+
do_cmd_auto:
case RETBLEED_CMD_AUTO:
default:
@@ -909,6 +938,12 @@ do_cmd_auto:
mitigate_smt = true;
break;
+ case RETBLEED_MITIGATION_STUFF:
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
+ x86_set_skl_return_thunk();
+ break;
+
default:
break;
}
@@ -919,7 +954,7 @@ do_cmd_auto:
/*
* Let IBRS trump all on Intel without affecting the effects of the
- * retbleed= cmdline option.
+ * retbleed= cmdline option except for call depth based stuffing
*/
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
switch (spectre_v2_enabled) {
@@ -932,7 +967,8 @@ do_cmd_auto:
retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
break;
default:
- pr_err(RETBLEED_INTEL_MSG);
+ if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
+ pr_err(RETBLEED_INTEL_MSG);
}
}
@@ -1295,7 +1331,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return SPECTRE_V2_CMD_AUTO;
}
- if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) {
+ if (cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) {
pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n",
mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
@@ -1328,7 +1364,7 @@ static void __init spec_ctrl_disable_kernel_rrsba(void)
if (ia32_cap & ARCH_CAP_RRSBA) {
x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
- write_spec_ctrl_current(x86_spec_ctrl_base, true);
+ update_spec_ctrl(x86_spec_ctrl_base);
}
}
@@ -1406,6 +1442,7 @@ static void __init spectre_v2_select_mitigation(void)
if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) &&
boot_cpu_has_bug(X86_BUG_RETBLEED) &&
retbleed_cmd != RETBLEED_CMD_OFF &&
+ retbleed_cmd != RETBLEED_CMD_STUFF &&
boot_cpu_has(X86_FEATURE_IBRS) &&
boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
mode = SPECTRE_V2_IBRS;
@@ -1450,7 +1487,7 @@ static void __init spectre_v2_select_mitigation(void)
if (spectre_v2_in_ibrs_mode(mode)) {
x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
- write_spec_ctrl_current(x86_spec_ctrl_base, true);
+ update_spec_ctrl(x86_spec_ctrl_base);
}
switch (mode) {
@@ -1564,7 +1601,7 @@ static void __init spectre_v2_select_mitigation(void)
static void update_stibp_msr(void * __unused)
{
u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP);
- write_spec_ctrl_current(val, true);
+ update_spec_ctrl(val);
}
/* Update x86_spec_ctrl_base in case SMT state changed. */
@@ -1797,7 +1834,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
x86_amd_ssb_disable();
} else {
x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
- write_spec_ctrl_current(x86_spec_ctrl_base, true);
+ update_spec_ctrl(x86_spec_ctrl_base);
}
}
@@ -1944,6 +1981,8 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
if (ctrl == PR_SPEC_FORCE_DISABLE)
task_set_spec_ib_force_disable(task);
task_update_spec_tif(task);
+ if (task == current)
+ indirect_branch_prediction_barrier();
break;
default:
return -ERANGE;
@@ -2048,7 +2087,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
void x86_spec_ctrl_setup_ap(void)
{
if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
- write_spec_ctrl_current(x86_spec_ctrl_base, true);
+ update_spec_ctrl(x86_spec_ctrl_base);
if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
x86_amd_ssb_disable();
@@ -2199,74 +2238,74 @@ static const char * const l1tf_vmx_states[] = {
static ssize_t l1tf_show_state(char *buf)
{
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO)
- return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
+ return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG);
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED ||
(l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER &&
sched_smt_active())) {
- return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
- l1tf_vmx_states[l1tf_vmx_mitigation]);
+ return sysfs_emit(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation]);
}
- return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
- l1tf_vmx_states[l1tf_vmx_mitigation],
- sched_smt_active() ? "vulnerable" : "disabled");
+ return sysfs_emit(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
}
static ssize_t itlb_multihit_show_state(char *buf)
{
if (!boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
!boot_cpu_has(X86_FEATURE_VMX))
- return sprintf(buf, "KVM: Mitigation: VMX unsupported\n");
+ return sysfs_emit(buf, "KVM: Mitigation: VMX unsupported\n");
else if (!(cr4_read_shadow() & X86_CR4_VMXE))
- return sprintf(buf, "KVM: Mitigation: VMX disabled\n");
+ return sysfs_emit(buf, "KVM: Mitigation: VMX disabled\n");
else if (itlb_multihit_kvm_mitigation)
- return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
+ return sysfs_emit(buf, "KVM: Mitigation: Split huge pages\n");
else
- return sprintf(buf, "KVM: Vulnerable\n");
+ return sysfs_emit(buf, "KVM: Vulnerable\n");
}
#else
static ssize_t l1tf_show_state(char *buf)
{
- return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
+ return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG);
}
static ssize_t itlb_multihit_show_state(char *buf)
{
- return sprintf(buf, "Processor vulnerable\n");
+ return sysfs_emit(buf, "Processor vulnerable\n");
}
#endif
static ssize_t mds_show_state(char *buf)
{
if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
- return sprintf(buf, "%s; SMT Host state unknown\n",
- mds_strings[mds_mitigation]);
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ mds_strings[mds_mitigation]);
}
if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) {
- return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
- (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" :
- sched_smt_active() ? "mitigated" : "disabled"));
+ return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+ (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" :
+ sched_smt_active() ? "mitigated" : "disabled"));
}
- return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
- sched_smt_active() ? "vulnerable" : "disabled");
+ return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
}
static ssize_t tsx_async_abort_show_state(char *buf)
{
if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) ||
(taa_mitigation == TAA_MITIGATION_OFF))
- return sprintf(buf, "%s\n", taa_strings[taa_mitigation]);
+ return sysfs_emit(buf, "%s\n", taa_strings[taa_mitigation]);
if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
- return sprintf(buf, "%s; SMT Host state unknown\n",
- taa_strings[taa_mitigation]);
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ taa_strings[taa_mitigation]);
}
- return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
- sched_smt_active() ? "vulnerable" : "disabled");
+ return sysfs_emit(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
}
static ssize_t mmio_stale_data_show_state(char *buf)
@@ -2334,73 +2373,72 @@ static char *pbrsb_eibrs_state(void)
static ssize_t spectre_v2_show_state(char *buf)
{
if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
- return sprintf(buf, "Vulnerable: LFENCE\n");
+ return sysfs_emit(buf, "Vulnerable: LFENCE\n");
if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
- return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+ return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
if (sched_smt_active() && unprivileged_ebpf_enabled() &&
spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
- return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
+ return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
- return sprintf(buf, "%s%s%s%s%s%s%s\n",
- spectre_v2_strings[spectre_v2_enabled],
- ibpb_state(),
- boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
- stibp_state(),
- boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
- pbrsb_eibrs_state(),
- spectre_v2_module_string());
+ return sysfs_emit(buf, "%s%s%s%s%s%s%s\n",
+ spectre_v2_strings[spectre_v2_enabled],
+ ibpb_state(),
+ boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+ stibp_state(),
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
+ pbrsb_eibrs_state(),
+ spectre_v2_module_string());
}
static ssize_t srbds_show_state(char *buf)
{
- return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]);
+ return sysfs_emit(buf, "%s\n", srbds_strings[srbds_mitigation]);
}
static ssize_t retbleed_show_state(char *buf)
{
if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET ||
retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
- boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
- return sprintf(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n");
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return sysfs_emit(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n");
- return sprintf(buf, "%s; SMT %s\n",
- retbleed_strings[retbleed_mitigation],
- !sched_smt_active() ? "disabled" :
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ?
- "enabled with STIBP protection" : "vulnerable");
+ return sysfs_emit(buf, "%s; SMT %s\n", retbleed_strings[retbleed_mitigation],
+ !sched_smt_active() ? "disabled" :
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ?
+ "enabled with STIBP protection" : "vulnerable");
}
- return sprintf(buf, "%s\n", retbleed_strings[retbleed_mitigation]);
+ return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]);
}
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
if (!boot_cpu_has_bug(bug))
- return sprintf(buf, "Not affected\n");
+ return sysfs_emit(buf, "Not affected\n");
switch (bug) {
case X86_BUG_CPU_MELTDOWN:
if (boot_cpu_has(X86_FEATURE_PTI))
- return sprintf(buf, "Mitigation: PTI\n");
+ return sysfs_emit(buf, "Mitigation: PTI\n");
if (hypervisor_is_type(X86_HYPER_XEN_PV))
- return sprintf(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n");
+ return sysfs_emit(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n");
break;
case X86_BUG_SPECTRE_V1:
- return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]);
+ return sysfs_emit(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]);
case X86_BUG_SPECTRE_V2:
return spectre_v2_show_state(buf);
case X86_BUG_SPEC_STORE_BYPASS:
- return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
+ return sysfs_emit(buf, "%s\n", ssb_strings[ssb_mode]);
case X86_BUG_L1TF:
if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
@@ -2430,7 +2468,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
break;
}
- return sprintf(buf, "Vulnerable\n");
+ return sysfs_emit(buf, "Vulnerable\n");
}
ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
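
The hunks above convert the vulnerability reporting helpers from sprintf() to
sysfs_emit(), which bounds output to PAGE_SIZE and warns when the destination
is not a page-aligned sysfs buffer. A minimal sketch of the same pattern for a
new show helper follows; foo_mitigation and foo_strings[] are hypothetical
stand-ins for a bug's mitigation state and its string table, not part of this
patch.

static ssize_t foo_show_state(char *buf)
{
	/* Hypothetical example of the sysfs_emit() conversion pattern. */
	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
		return sysfs_emit(buf, "%s; SMT Host state unknown\n",
				  foo_strings[foo_mitigation]);

	return sysfs_emit(buf, "%s; SMT %s\n", foo_strings[foo_mitigation],
			  sched_smt_active() ? "vulnerable" : "disabled");
}
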
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 66556833d7af..f4e5aa27eec6 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -11,15 +11,19 @@
#include <linux/slab.h>
#include <linux/cacheinfo.h>
#include <linux/cpu.h>
+#include <linux/cpuhotplug.h>
#include <linux/sched.h>
#include <linux/capability.h>
#include <linux/sysfs.h>
#include <linux/pci.h>
+#include <linux/stop_machine.h>
#include <asm/cpufeature.h>
#include <asm/cacheinfo.h>
#include <asm/amd_nb.h>
#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
#include "cpu.h"
@@ -35,6 +39,9 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
/* Shared L2 cache maps */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
+/* Kernel controls MTRR and/or PAT MSRs. */
+unsigned int memory_caching_control __ro_after_init;
+
struct _cache_table {
unsigned char descriptor;
char cache_type;
@@ -1040,3 +1047,175 @@ int populate_cache_leaves(unsigned int cpu)
return 0;
}
+
+/*
+ * Disable and enable caches. Needed for changing MTRRs and the PAT MSR.
+ *
+ * Since we are disabling the cache don't allow any interrupts,
+ * they would run extremely slow and would only increase the pain.
+ *
+ * The caller must ensure that local interrupts are disabled and
+ * are reenabled after cache_enable() has been called.
+ */
+static unsigned long saved_cr4;
+static DEFINE_RAW_SPINLOCK(cache_disable_lock);
+
+void cache_disable(void) __acquires(cache_disable_lock)
+{
+ unsigned long cr0;
+
+ /*
+ * Note that this is not ideal
+ * since the cache is only flushed/disabled for this CPU while the
+ * MTRRs are changed, but changing this requires more invasive
+ * changes to the way the kernel boots
+ */
+
+ raw_spin_lock(&cache_disable_lock);
+
+ /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
+ cr0 = read_cr0() | X86_CR0_CD;
+ write_cr0(cr0);
+
+ /*
+ * Cache flushing is the most time-consuming step when programming
+ * the MTRRs. Fortunately, as per the Intel Software Development
+ * Manual, we can skip it if the processor supports cache self-
+ * snooping.
+ */
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ wbinvd();
+
+ /* Save value of CR4 and clear Page Global Enable (bit 7) */
+ if (cpu_feature_enabled(X86_FEATURE_PGE)) {
+ saved_cr4 = __read_cr4();
+ __write_cr4(saved_cr4 & ~X86_CR4_PGE);
+ }
+
+ /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ flush_tlb_local();
+
+ if (cpu_feature_enabled(X86_FEATURE_MTRR))
+ mtrr_disable();
+
+ /* Again, only flush caches if we have to. */
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ wbinvd();
+}
+
+void cache_enable(void) __releases(cache_disable_lock)
+{
+ /* Flush TLBs (no need to flush caches - they are disabled) */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ flush_tlb_local();
+
+ if (cpu_feature_enabled(X86_FEATURE_MTRR))
+ mtrr_enable();
+
+ /* Enable caches */
+ write_cr0(read_cr0() & ~X86_CR0_CD);
+
+ /* Restore value of CR4 */
+ if (cpu_feature_enabled(X86_FEATURE_PGE))
+ __write_cr4(saved_cr4);
+
+ raw_spin_unlock(&cache_disable_lock);
+}
+
+static void cache_cpu_init(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ cache_disable();
+
+ if (memory_caching_control & CACHE_MTRR)
+ mtrr_generic_set_state();
+
+ if (memory_caching_control & CACHE_PAT)
+ pat_cpu_init();
+
+ cache_enable();
+ local_irq_restore(flags);
+}
+
+static bool cache_aps_delayed_init = true;
+
+void set_cache_aps_delayed_init(bool val)
+{
+ cache_aps_delayed_init = val;
+}
+
+bool get_cache_aps_delayed_init(void)
+{
+ return cache_aps_delayed_init;
+}
+
+static int cache_rendezvous_handler(void *unused)
+{
+ if (get_cache_aps_delayed_init() || !cpu_online(smp_processor_id()))
+ cache_cpu_init();
+
+ return 0;
+}
+
+void __init cache_bp_init(void)
+{
+ mtrr_bp_init();
+ pat_bp_init();
+
+ if (memory_caching_control)
+ cache_cpu_init();
+}
+
+void cache_bp_restore(void)
+{
+ if (memory_caching_control)
+ cache_cpu_init();
+}
+
+static int cache_ap_init(unsigned int cpu)
+{
+ if (!memory_caching_control || get_cache_aps_delayed_init())
+ return 0;
+
+ /*
+ * Ideally we should hold mtrr_mutex here to avoid MTRR entries
+ * changed, but this routine will be called in CPU boot time,
+ * holding the lock breaks it.
+ *
+ * This routine is called in two cases:
+ *
+ * 1. very early time of software resume, when there absolutely
+ * isn't MTRR entry changes;
+ *
+ * 2. CPU hotadd time. We let mtrr_add/del_page hold cpuhotplug
+ * lock to prevent MTRR entry changes
+ */
+ stop_machine_from_inactive_cpu(cache_rendezvous_handler, NULL,
+ cpu_callout_mask);
+
+ return 0;
+}
+
+/*
+ * Delayed cache initialization for all AP's
+ */
+void cache_aps_init(void)
+{
+ if (!memory_caching_control || !get_cache_aps_delayed_init())
+ return;
+
+ stop_machine(cache_rendezvous_handler, NULL, cpu_online_mask);
+ set_cache_aps_delayed_init(false);
+}
+
+static int __init cache_ap_register(void)
+{
+ cpuhp_setup_state_nocalls(CPUHP_AP_CACHECTRL_STARTING,
+ "x86/cachectrl:starting",
+ cache_ap_init, NULL);
+ return 0;
+}
+core_initcall(cache_ap_register);
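
cache_disable()/cache_enable() now own the no-fill (CR0.CD=1) sequence that the
MTRR code previously implemented in prepare_set()/post_set(); callers must
disable local interrupts around the pair, as cache_cpu_init() above does. A
minimal caller sketch, where reprogram_ranges() is a hypothetical routine that
rewrites the MTRR/PAT MSRs:

static void reprogram_memory_types(void)
{
	unsigned long flags;

	local_irq_save(flags);		/* required before cache_disable() */
	cache_disable();		/* CD=1, flush caches/TLBs, MTRRs off */

	reprogram_ranges();		/* hypothetical MSR programming step */

	cache_enable();			/* MTRRs back on, CD=0, CR4.PGE restored */
	local_irq_restore(flags);
}
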
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3e508f239098..9cfca3d7d0e2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -22,9 +22,9 @@
#include <linux/io.h>
#include <linux/syscore_ops.h>
#include <linux/pgtable.h>
+#include <linux/stackprotector.h>
#include <asm/cmdline.h>
-#include <asm/stackprotector.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/doublefault.h>
@@ -52,6 +52,7 @@
#include <asm/cpu.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include <asm/cacheinfo.h>
#include <asm/memtype.h>
#include <asm/microcode.h>
#include <asm/microcode_intel.h>
@@ -609,6 +610,7 @@ static __always_inline void setup_cet(struct cpuinfo_x86 *c)
if (!ibt_selftest()) {
pr_err("IBT selftest: Failed!\n");
+ wrmsrl(MSR_IA32_S_CET, 0);
setup_clear_cpu_cap(X86_FEATURE_IBT);
return;
}
@@ -701,16 +703,6 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
-void load_percpu_segment(int cpu)
-{
-#ifdef CONFIG_X86_32
- loadsegment(fs, __KERNEL_PERCPU);
-#else
- __loadsegment_simple(gs, 0);
- wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
-#endif
-}
-
#ifdef CONFIG_X86_32
/* The 32-bit entry code needs to find cpu_entry_area. */
DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
@@ -738,16 +730,45 @@ void load_fixmap_gdt(int cpu)
}
EXPORT_SYMBOL_GPL(load_fixmap_gdt);
-/*
- * Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one.
+/**
+ * switch_gdt_and_percpu_base - Switch to direct GDT and runtime per CPU base
+ * @cpu: The CPU number for which this is invoked
+ *
+ * Invoked during early boot to switch from early GDT and early per CPU to
+ * the direct GDT and the runtime per CPU area. On 32-bit the percpu base
+ * switch is implicit by loading the direct GDT. On 64bit this requires
+ * to update GSBASE.
*/
-void switch_to_new_gdt(int cpu)
+void __init switch_gdt_and_percpu_base(int cpu)
{
- /* Load the original GDT */
load_direct_gdt(cpu);
- /* Reload the per-cpu base */
- load_percpu_segment(cpu);
+
+#ifdef CONFIG_X86_64
+ /*
+ * No need to load %gs. It is already correct.
+ *
+ * Writing %gs on 64bit would zero GSBASE which would make any per
+ * CPU operation up to the point of the wrmsrl() fault.
+ *
+ * Set GSBASE to the new offset. Until the wrmsrl() happens the
+ * early mapping is still valid. That means the GSBASE update will
+ * lose any prior per CPU data which was not copied over in
+ * setup_per_cpu_areas().
+ *
+ * This works even with stackprotector enabled because the
+ * per CPU stack canary is 0 in both per CPU areas.
+ */
+ wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
+#else
+ /*
+ * %fs is already set to __KERNEL_PERCPU, but after switching GDT
+ * it is required to load FS again so that the 'hidden' part is
+ * updated from the new GDT. Up to this point the early per CPU
+ * translation is active. Any content of the early per CPU data
+ * which was not copied over in setup_per_cpu_areas() is lost.
+ */
+ loadsegment(fs, __KERNEL_PERCPU);
+#endif
}
static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
@@ -1948,7 +1969,6 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_32
enable_sep_cpu();
#endif
- mtrr_ap_init();
validate_apic_and_package_id(c);
x86_spec_ctrl_setup_ap();
update_srbds_msr();
@@ -1993,27 +2013,18 @@ static __init int setup_clearcpuid(char *arg)
}
__setup("clearcpuid=", setup_clearcpuid);
+DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = {
+ .current_task = &init_task,
+ .preempt_count = INIT_PREEMPT_COUNT,
+ .top_of_stack = TOP_OF_INIT_STACK,
+};
+EXPORT_PER_CPU_SYMBOL(pcpu_hot);
+
#ifdef CONFIG_X86_64
DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
fixed_percpu_data) __aligned(PAGE_SIZE) __visible;
EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data);
-/*
- * The following percpu variables are hot. Align current_task to
- * cacheline size such that they fall in the same cacheline.
- */
-DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
- &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
-
-DEFINE_PER_CPU(void *, hardirq_stack_ptr);
-DEFINE_PER_CPU(bool, hardirq_stack_inuse);
-
-DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
-EXPORT_PER_CPU_SYMBOL(__preempt_count);
-
-DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
-
static void wrmsrl_cstar(unsigned long val)
{
/*
@@ -2064,20 +2075,6 @@ void syscall_init(void)
#else /* CONFIG_X86_64 */
-DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
-DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
-EXPORT_PER_CPU_SYMBOL(__preempt_count);
-
-/*
- * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
- * the top of the kernel stack. Use an extra percpu variable to track the
- * top of the kernel stack directly.
- */
-DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
- (unsigned long)&init_thread_union + THREAD_SIZE;
-EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
-
#ifdef CONFIG_STACKPROTECTOR
DEFINE_PER_CPU(unsigned long, __stack_chk_guard);
EXPORT_PER_CPU_SYMBOL(__stack_chk_guard);
@@ -2248,12 +2245,6 @@ void cpu_init(void)
boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE))
cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
- /*
- * Initialize the per-CPU GDT with the boot GDT,
- * and set up the GDT descriptor:
- */
- switch_to_new_gdt(cpu);
-
if (IS_ENABLED(CONFIG_X86_64)) {
loadsegment(fs, 0);
memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
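
With current_task, __preempt_count and cpu_current_top_of_stack folded into the
single cache-aligned struct pcpu_hot above, the hot per-CPU fields now share one
cacheline. A minimal sketch of reading those fields through the usual
this_cpu_read() accessors (illustration only, not part of this patch):

static struct task_struct *sketch_get_current(void)
{
	return this_cpu_read(pcpu_hot.current_task);
}

static unsigned long sketch_top_of_stack(void)
{
	return this_cpu_read(pcpu_hot.top_of_stack);
}
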
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index c881bcafba7d..d95221117129 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -75,6 +75,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_SGX_LC, X86_FEATURE_SGX },
{ X86_FEATURE_SGX1, X86_FEATURE_SGX },
{ X86_FEATURE_SGX2, X86_FEATURE_SGX1 },
+ { X86_FEATURE_SGX_EDECCSSA, X86_FEATURE_SGX1 },
{ X86_FEATURE_XFD, X86_FEATURE_XSAVES },
{ X86_FEATURE_XFD, X86_FEATURE_XGETBV1 },
{ X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 21fd425088fe..5a2962c492d3 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -326,8 +326,8 @@ static void init_hygon(struct cpuinfo_x86 *c)
* msr_set_bit() uses the safe accessors, too, even if the MSR
* is not present.
*/
- msr_set_bit(MSR_F10H_DECFG,
- MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
+ msr_set_bit(MSR_AMD64_DE_CFG,
+ MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT);
/* A serializing LFENCE stops RDTSC speculation */
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
@@ -339,7 +339,7 @@ static void init_hygon(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_ARAT);
/* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */
- if (!cpu_has(c, X86_FEATURE_XENPV))
+ if (!cpu_feature_enabled(X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
check_null_seg_clears_base(c);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 2d7ea5480ec3..291d4167fab8 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -210,12 +210,154 @@ int intel_cpu_collect_info(struct ucode_cpu_info *uci)
csig.rev = intel_get_microcode_revision();
uci->cpu_sig = csig;
- uci->valid = 1;
return 0;
}
EXPORT_SYMBOL_GPL(intel_cpu_collect_info);
+/*
+ * Returns 1 if update has been found, 0 otherwise.
+ */
+int intel_find_matching_signature(void *mc, unsigned int csig, int cpf)
+{
+ struct microcode_header_intel *mc_hdr = mc;
+ struct extended_sigtable *ext_hdr;
+ struct extended_signature *ext_sig;
+ int i;
+
+ if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
+ return 1;
+
+ /* Look for ext. headers: */
+ if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
+ return 0;
+
+ ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
+
+ for (i = 0; i < ext_hdr->count; i++) {
+ if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
+ return 1;
+ ext_sig++;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(intel_find_matching_signature);
+
+/**
+ * intel_microcode_sanity_check() - Sanity check microcode file.
+ * @mc: Pointer to the microcode file contents.
+ * @print_err: Display failure reason if true, silent if false.
+ * @hdr_type: Type of file, i.e. normal microcode file or In Field Scan file.
+ * Validate if the microcode header type matches with the type
+ * specified here.
+ *
+ * Validate certain header fields and verify if computed checksum matches
+ * with the one specified in the header.
+ *
+ * Return: 0 if the file passes all the checks, -EINVAL if any of the checks
+ * fail.
+ */
+int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type)
+{
+ unsigned long total_size, data_size, ext_table_size;
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ u32 sum, orig_sum, ext_sigcount = 0, i;
+ struct extended_signature *ext_sig;
+
+ total_size = get_totalsize(mc_header);
+ data_size = get_datasize(mc_header);
+
+ if (data_size + MC_HEADER_SIZE > total_size) {
+ if (print_err)
+ pr_err("Error: bad microcode data file size.\n");
+ return -EINVAL;
+ }
+
+ if (mc_header->ldrver != 1 || mc_header->hdrver != hdr_type) {
+ if (print_err)
+ pr_err("Error: invalid/unknown microcode update format. Header type %d\n",
+ mc_header->hdrver);
+ return -EINVAL;
+ }
+
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if (ext_table_size) {
+ u32 ext_table_sum = 0;
+ u32 *ext_tablep;
+
+ if (ext_table_size < EXT_HEADER_SIZE ||
+ ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ if (print_err)
+ pr_err("Error: truncated extended signature table.\n");
+ return -EINVAL;
+ }
+
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if (ext_table_size != exttable_size(ext_header)) {
+ if (print_err)
+ pr_err("Error: extended signature table size mismatch.\n");
+ return -EFAULT;
+ }
+
+ ext_sigcount = ext_header->count;
+
+ /*
+ * Check extended table checksum: the sum of all dwords that
+ * comprise a valid table must be 0.
+ */
+ ext_tablep = (u32 *)ext_header;
+
+ i = ext_table_size / sizeof(u32);
+ while (i--)
+ ext_table_sum += ext_tablep[i];
+
+ if (ext_table_sum) {
+ if (print_err)
+ pr_warn("Bad extended signature table checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Calculate the checksum of update data and header. The checksum of
+ * valid update data and header including the extended signature table
+ * must be 0.
+ */
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
+ while (i--)
+ orig_sum += ((u32 *)mc)[i];
+
+ if (orig_sum) {
+ if (print_err)
+ pr_err("Bad microcode data checksum, aborting.\n");
+ return -EINVAL;
+ }
+
+ if (!ext_table_size)
+ return 0;
+
+ /*
+ * Check extended signature checksum: 0 => valid.
+ */
+ for (i = 0; i < ext_sigcount; i++) {
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+ EXT_SIGNATURE_SIZE * i;
+
+ sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if (sum) {
+ if (print_err)
+ pr_err("Bad extended signature checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(intel_microcode_sanity_check);
+
static void early_init_intel(struct cpuinfo_x86 *c)
{
u64 misc_enable;
@@ -1034,8 +1176,32 @@ static const struct {
static struct ratelimit_state bld_ratelimit;
+static unsigned int sysctl_sld_mitigate = 1;
static DEFINE_SEMAPHORE(buslock_sem);
+#ifdef CONFIG_PROC_SYSCTL
+static struct ctl_table sld_sysctls[] = {
+ {
+ .procname = "split_lock_mitigate",
+ .data = &sysctl_sld_mitigate,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static int __init sld_mitigate_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sld_sysctls);
+ return 0;
+}
+
+late_initcall(sld_mitigate_sysctl_init);
+#endif
+
static inline bool match_option(const char *arg, int arglen, const char *opt)
{
int len = strlen(opt), ratelimit;
@@ -1146,12 +1312,20 @@ static void split_lock_init(void)
split_lock_verify_msr(sld_state != sld_off);
}
-static void __split_lock_reenable(struct work_struct *work)
+static void __split_lock_reenable_unlock(struct work_struct *work)
{
sld_update_msr(true);
up(&buslock_sem);
}
+static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock);
+
+static void __split_lock_reenable(struct work_struct *work)
+{
+ sld_update_msr(true);
+}
+static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable);
+
/*
* If a CPU goes offline with pending delayed work to re-enable split lock
* detection then the delayed work will be executed on some other CPU. That
@@ -1169,10 +1343,9 @@ static int splitlock_cpu_offline(unsigned int cpu)
return 0;
}
-static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable);
-
static void split_lock_warn(unsigned long ip)
{
+ struct delayed_work *work;
int cpu;
if (!current->reported_split_lock)
@@ -1180,14 +1353,26 @@ static void split_lock_warn(unsigned long ip)
current->comm, current->pid, ip);
current->reported_split_lock = 1;
- /* misery factor #1, sleep 10ms before trying to execute split lock */
- if (msleep_interruptible(10) > 0)
- return;
- /* Misery factor #2, only allow one buslocked disabled core at a time */
- if (down_interruptible(&buslock_sem) == -EINTR)
- return;
+ if (sysctl_sld_mitigate) {
+ /*
+ * misery factor #1:
+ * sleep 10ms before trying to execute split lock.
+ */
+ if (msleep_interruptible(10) > 0)
+ return;
+ /*
+ * Misery factor #2:
+ * only allow one buslocked disabled core at a time.
+ */
+ if (down_interruptible(&buslock_sem) == -EINTR)
+ return;
+ work = &sl_reenable_unlock;
+ } else {
+ work = &sl_reenable;
+ }
+
cpu = get_cpu();
- schedule_delayed_work_on(cpu, &split_lock_reenable, 2);
+ schedule_delayed_work_on(cpu, work, 2);
/* Disable split lock detection on this CPU to make progress */
sld_update_msr(false);
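
intel_find_matching_signature() and intel_microcode_sanity_check() are now
exported from cpu/intel.c so that other users (such as the microcode loader
below) can validate a blob before applying it. A minimal usage sketch, assuming
mc points at a complete microcode image in memory and uci is the CPU's
ucode_cpu_info:

static bool sketch_mc_usable(void *mc, struct ucode_cpu_info *uci)
{
	/* Reject malformed images, then check the signature targets this CPU. */
	if (intel_microcode_sanity_check(mc, true, MC_HEADER_TYPE_MICROCODE) < 0)
		return false;

	return intel_find_matching_signature(mc, uci->cpu_sig.sig,
					     uci->cpu_sig.pf);
}
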
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index fbaf12e43f41..3b8476158236 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -204,7 +204,12 @@ static int intel_epb_offline(unsigned int cpu)
}
static const struct x86_cpu_id intel_epb_normal[] = {
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 7),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
{}
};
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 1c87501e0fa3..10fb5b5c9efa 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -788,6 +788,24 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
return status & MCI_STATUS_DEFERRED;
}
+static bool _log_error_deferred(unsigned int bank, u32 misc)
+{
+ if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
+ mca_msr_reg(bank, MCA_ADDR), misc))
+ return false;
+
+ /*
+ * Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers.
+ * Return true here to avoid accessing these registers.
+ */
+ if (!mce_flags.smca)
+ return true;
+
+ /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */
+ wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+ return true;
+}
+
/*
* We have three scenarios for checking for Deferred errors:
*
@@ -799,19 +817,8 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
*/
static void log_error_deferred(unsigned int bank)
{
- bool defrd;
-
- defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
- mca_msr_reg(bank, MCA_ADDR), 0);
-
- if (!mce_flags.smca)
- return;
-
- /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */
- if (defrd) {
- wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+ if (_log_error_deferred(bank, 0))
return;
- }
/*
* Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
@@ -832,7 +839,7 @@ static void amd_deferred_error_interrupt(void)
static void log_error_thresholding(unsigned int bank, u64 misc)
{
- _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), mca_msr_reg(bank, MCA_ADDR), misc);
+ _log_error_deferred(bank, misc);
}
static void log_and_reset_block(struct threshold_block *block)
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 00483d1c27e4..c4477162c07d 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -203,6 +203,11 @@ static struct severity {
BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
),
MCESEV(
+ PANIC, "Uncorrected in kernel",
+ BITSET(MCI_STATUS_UC),
+ KERNEL
+ ),
+ MCESEV(
UC, "Uncorrected",
BITSET(MCI_STATUS_UC)
),
@@ -391,9 +396,6 @@ static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char
*msg = s->msg;
s->covered = 1;
- if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL)
- return MCE_PANIC_SEVERITY;
-
return s->sev;
}
}
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 3a35dec3ec55..56471f750762 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -901,8 +901,7 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
*
* These might be larger than 2K.
*/
-static enum ucode_state request_microcode_amd(int cpu, struct device *device,
- bool refresh_fw)
+static enum ucode_state request_microcode_amd(int cpu, struct device *device)
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -911,7 +910,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
const struct firmware *fw;
/* reload ucode container only on the boot cpu */
- if (!refresh_fw || !bsp)
+ if (!bsp)
return UCODE_OK;
if (c->x86 >= 0x15)
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 6a41cee242f6..712aafff96e0 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -319,60 +319,6 @@ void reload_early_microcode(void)
}
}
-static void collect_cpu_info_local(void *arg)
-{
- struct cpu_info_ctx *ctx = arg;
-
- ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
- ctx->cpu_sig);
-}
-
-static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
-{
- struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
- int ret;
-
- ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
- if (!ret)
- ret = ctx.err;
-
- return ret;
-}
-
-static int collect_cpu_info(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- int ret;
-
- memset(uci, 0, sizeof(*uci));
-
- ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
- if (!ret)
- uci->valid = 1;
-
- return ret;
-}
-
-static void apply_microcode_local(void *arg)
-{
- enum ucode_state *err = arg;
-
- *err = microcode_ops->apply_microcode(smp_processor_id());
-}
-
-static int apply_microcode_on_target(int cpu)
-{
- enum ucode_state err;
- int ret;
-
- ret = smp_call_function_single(cpu, apply_microcode_local, &err, 1);
- if (!ret) {
- if (err == UCODE_ERROR)
- ret = 1;
- }
- return ret;
-}
-
/* fake device for request_firmware */
static struct platform_device *microcode_pdev;
@@ -458,7 +404,7 @@ static int __reload_late(void *info)
* below.
*/
if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu)
- apply_microcode_local(&err);
+ err = microcode_ops->apply_microcode(cpu);
else
goto wait_for_siblings;
@@ -480,7 +426,7 @@ wait_for_siblings:
* revision.
*/
if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu)
- apply_microcode_local(&err);
+ err = microcode_ops->apply_microcode(cpu);
return ret;
}
@@ -531,7 +477,7 @@ static ssize_t reload_store(struct device *dev,
if (ret)
goto put;
- tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
+ tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev);
if (tmp_ret != UCODE_NEW)
goto put;
@@ -589,91 +535,17 @@ static void microcode_fini_cpu(int cpu)
microcode_ops->microcode_fini_cpu(cpu);
}
-static enum ucode_state microcode_resume_cpu(int cpu)
+static enum ucode_state microcode_init_cpu(int cpu)
{
- if (apply_microcode_on_target(cpu))
- return UCODE_ERROR;
-
- pr_debug("CPU%d updated upon resume\n", cpu);
-
- return UCODE_OK;
-}
-
-static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
-{
- enum ucode_state ustate;
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- if (uci->valid)
- return UCODE_OK;
-
- if (collect_cpu_info(cpu))
- return UCODE_ERROR;
-
- /* --dimm. Trigger a delayed update? */
- if (system_state != SYSTEM_RUNNING)
- return UCODE_NFOUND;
-
- ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, refresh_fw);
- if (ustate == UCODE_NEW) {
- pr_debug("CPU%d updated upon init\n", cpu);
- apply_microcode_on_target(cpu);
- }
-
- return ustate;
-}
-
-static enum ucode_state microcode_update_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- /* Refresh CPU microcode revision after resume. */
- collect_cpu_info(cpu);
-
- if (uci->valid)
- return microcode_resume_cpu(cpu);
-
- return microcode_init_cpu(cpu, false);
-}
-
-static int mc_device_add(struct device *dev, struct subsys_interface *sif)
-{
- int err, cpu = dev->id;
-
- if (!cpu_online(cpu))
- return 0;
-
- pr_debug("CPU%d added\n", cpu);
-
- err = sysfs_create_group(&dev->kobj, &mc_attr_group);
- if (err)
- return err;
+ memset(uci, 0, sizeof(*uci));
- if (microcode_init_cpu(cpu, true) == UCODE_ERROR)
- return -EINVAL;
+ microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig);
- return err;
+ return microcode_ops->apply_microcode(cpu);
}
-static void mc_device_remove(struct device *dev, struct subsys_interface *sif)
-{
- int cpu = dev->id;
-
- if (!cpu_online(cpu))
- return;
-
- pr_debug("CPU%d removed\n", cpu);
- microcode_fini_cpu(cpu);
- sysfs_remove_group(&dev->kobj, &mc_attr_group);
-}
-
-static struct subsys_interface mc_cpu_interface = {
- .name = "microcode",
- .subsys = &cpu_subsys,
- .add_dev = mc_device_add,
- .remove_dev = mc_device_remove,
-};
-
/**
* microcode_bsp_resume - Update boot CPU microcode during resume.
*/
@@ -682,21 +554,23 @@ void microcode_bsp_resume(void)
int cpu = smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- if (uci->valid && uci->mc)
+ if (uci->mc)
microcode_ops->apply_microcode(cpu);
- else if (!uci->mc)
+ else
reload_early_microcode();
}
static struct syscore_ops mc_syscore_ops = {
- .resume = microcode_bsp_resume,
+ .resume = microcode_bsp_resume,
};
static int mc_cpu_starting(unsigned int cpu)
{
- microcode_update_cpu(cpu);
- pr_debug("CPU%d added\n", cpu);
- return 0;
+ enum ucode_state err = microcode_ops->apply_microcode(cpu);
+
+ pr_debug("%s: CPU%d, err: %d\n", __func__, cpu, err);
+
+ return err == UCODE_ERROR;
}
static int mc_cpu_online(unsigned int cpu)
@@ -713,13 +587,30 @@ static int mc_cpu_down_prep(unsigned int cpu)
struct device *dev;
dev = get_cpu_device(cpu);
+
+ microcode_fini_cpu(cpu);
+
/* Suspend is in progress, only remove the interface */
sysfs_remove_group(&dev->kobj, &mc_attr_group);
- pr_debug("CPU%d removed\n", cpu);
+ pr_debug("%s: CPU%d\n", __func__, cpu);
return 0;
}
+static void setup_online_cpu(struct work_struct *work)
+{
+ int cpu = smp_processor_id();
+ enum ucode_state err;
+
+ err = microcode_init_cpu(cpu);
+ if (err == UCODE_ERROR) {
+ pr_err("Error applying microcode on CPU%d\n", cpu);
+ return;
+ }
+
+ mc_cpu_online(cpu);
+}
+
static struct attribute *cpu_root_microcode_attrs[] = {
#ifdef CONFIG_MICROCODE_LATE_LOADING
&dev_attr_reload.attr,
@@ -750,28 +641,19 @@ static int __init microcode_init(void)
if (!microcode_ops)
return -ENODEV;
- microcode_pdev = platform_device_register_simple("microcode", -1,
- NULL, 0);
+ microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0);
if (IS_ERR(microcode_pdev))
return PTR_ERR(microcode_pdev);
- cpus_read_lock();
- mutex_lock(&microcode_mutex);
- error = subsys_interface_register(&mc_cpu_interface);
- mutex_unlock(&microcode_mutex);
- cpus_read_unlock();
-
- if (error)
- goto out_pdev;
-
- error = sysfs_create_group(&cpu_subsys.dev_root->kobj,
- &cpu_root_microcode_group);
-
+ error = sysfs_create_group(&cpu_subsys.dev_root->kobj, &cpu_root_microcode_group);
if (error) {
pr_err("Error creating microcode group!\n");
- goto out_driver;
+ goto out_pdev;
}
+ /* Do per-CPU setup */
+ schedule_on_each_cpu(setup_online_cpu);
+
register_syscore_ops(&mc_syscore_ops);
cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting",
mc_cpu_starting, NULL);
@@ -782,15 +664,6 @@ static int __init microcode_init(void)
return 0;
- out_driver:
- cpus_read_lock();
- mutex_lock(&microcode_mutex);
-
- subsys_interface_unregister(&mc_cpu_interface);
-
- mutex_unlock(&microcode_mutex);
- cpus_read_unlock();
-
out_pdev:
platform_device_unregister(microcode_pdev);
return error;
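
The core loader refactor above drops the subsys_interface plumbing and the
smp_call_function_single() wrappers: boot-time setup now runs setup_online_cpu()
on every CPU via schedule_on_each_cpu(), and hotplugged CPUs get their microcode
applied from the CPUHP_AP_MICROCODE_LOADER "starting" callback. A simplified
sketch of the resulting lifecycle, drawn only from the hunks above with error
handling elided:

/*
 * boot:    microcode_init()
 *            -> schedule_on_each_cpu(setup_online_cpu)
 *            -> microcode_init_cpu(): collect_cpu_info() + apply_microcode()
 * hotplug: CPUHP_AP_MICROCODE_LOADER -> mc_cpu_starting()
 *            -> microcode_ops->apply_microcode(cpu)
 * offline: mc_cpu_down_prep() -> microcode_fini_cpu() + sysfs group removal
 */
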
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 1fcbd671f1df..cac2bdb57f0b 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -48,34 +48,6 @@ static int llc_size_per_core;
/*
* Returns 1 if update has been found, 0 otherwise.
*/
-static int find_matching_signature(void *mc, unsigned int csig, int cpf)
-{
- struct microcode_header_intel *mc_hdr = mc;
- struct extended_sigtable *ext_hdr;
- struct extended_signature *ext_sig;
- int i;
-
- if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
- return 1;
-
- /* Look for ext. headers: */
- if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
- return 0;
-
- ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
- ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
-
- for (i = 0; i < ext_hdr->count; i++) {
- if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
- return 1;
- ext_sig++;
- }
- return 0;
-}
-
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
{
struct microcode_header_intel *mc_hdr = mc;
@@ -83,7 +55,7 @@ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev
if (mc_hdr->rev <= new_rev)
return 0;
- return find_matching_signature(mc, csig, cpf);
+ return intel_find_matching_signature(mc, csig, cpf);
}
static struct ucode_patch *memdup_patch(void *data, unsigned int size)
@@ -117,7 +89,7 @@ static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigne
sig = mc_saved_hdr->sig;
pf = mc_saved_hdr->pf;
- if (find_matching_signature(data, sig, pf)) {
+ if (intel_find_matching_signature(data, sig, pf)) {
prev_found = true;
if (mc_hdr->rev <= mc_saved_hdr->rev)
@@ -149,7 +121,7 @@ static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigne
if (!p)
return;
- if (!find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf))
+ if (!intel_find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf))
return;
/*
@@ -163,104 +135,6 @@ static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigne
intel_ucode_patch = p->data;
}
-static int microcode_sanity_check(void *mc, int print_err)
-{
- unsigned long total_size, data_size, ext_table_size;
- struct microcode_header_intel *mc_header = mc;
- struct extended_sigtable *ext_header = NULL;
- u32 sum, orig_sum, ext_sigcount = 0, i;
- struct extended_signature *ext_sig;
-
- total_size = get_totalsize(mc_header);
- data_size = get_datasize(mc_header);
-
- if (data_size + MC_HEADER_SIZE > total_size) {
- if (print_err)
- pr_err("Error: bad microcode data file size.\n");
- return -EINVAL;
- }
-
- if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
- if (print_err)
- pr_err("Error: invalid/unknown microcode update format.\n");
- return -EINVAL;
- }
-
- ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
- if (ext_table_size) {
- u32 ext_table_sum = 0;
- u32 *ext_tablep;
-
- if ((ext_table_size < EXT_HEADER_SIZE)
- || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
- if (print_err)
- pr_err("Error: truncated extended signature table.\n");
- return -EINVAL;
- }
-
- ext_header = mc + MC_HEADER_SIZE + data_size;
- if (ext_table_size != exttable_size(ext_header)) {
- if (print_err)
- pr_err("Error: extended signature table size mismatch.\n");
- return -EFAULT;
- }
-
- ext_sigcount = ext_header->count;
-
- /*
- * Check extended table checksum: the sum of all dwords that
- * comprise a valid table must be 0.
- */
- ext_tablep = (u32 *)ext_header;
-
- i = ext_table_size / sizeof(u32);
- while (i--)
- ext_table_sum += ext_tablep[i];
-
- if (ext_table_sum) {
- if (print_err)
- pr_warn("Bad extended signature table checksum, aborting.\n");
- return -EINVAL;
- }
- }
-
- /*
- * Calculate the checksum of update data and header. The checksum of
- * valid update data and header including the extended signature table
- * must be 0.
- */
- orig_sum = 0;
- i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
- while (i--)
- orig_sum += ((u32 *)mc)[i];
-
- if (orig_sum) {
- if (print_err)
- pr_err("Bad microcode data checksum, aborting.\n");
- return -EINVAL;
- }
-
- if (!ext_table_size)
- return 0;
-
- /*
- * Check extended signature checksum: 0 => valid.
- */
- for (i = 0; i < ext_sigcount; i++) {
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
- EXT_SIGNATURE_SIZE * i;
-
- sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
- (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
- if (sum) {
- if (print_err)
- pr_err("Bad extended signature checksum, aborting.\n");
- return -EINVAL;
- }
- }
- return 0;
-}
-
/*
* Get microcode matching with BSP's model. Only CPUs with the same model as
* BSP can stay in the platform.
@@ -281,13 +155,13 @@ scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save)
mc_size = get_totalsize(mc_header);
if (!mc_size ||
mc_size > size ||
- microcode_sanity_check(data, 0) < 0)
+ intel_microcode_sanity_check(data, false, MC_HEADER_TYPE_MICROCODE) < 0)
break;
size -= mc_size;
- if (!find_matching_signature(data, uci->cpu_sig.sig,
- uci->cpu_sig.pf)) {
+ if (!intel_find_matching_signature(data, uci->cpu_sig.sig,
+ uci->cpu_sig.pf)) {
data += mc_size;
continue;
}
@@ -621,7 +495,6 @@ void load_ucode_intel_ap(void)
else
iup = &intel_ucode_patch;
-reget:
if (!*iup) {
patch = __load_ucode_intel(&uci);
if (!patch)
@@ -632,12 +505,7 @@ reget:
uci.mc = *iup;
- if (apply_microcode_early(&uci, true)) {
- /* Mixed-silicon system? Try to refetch the proper patch: */
- *iup = NULL;
-
- goto reget;
- }
+ apply_microcode_early(&uci, true);
}
static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)
@@ -652,9 +520,9 @@ static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)
if (phdr->rev <= uci->cpu_sig.rev)
continue;
- if (!find_matching_signature(phdr,
- uci->cpu_sig.sig,
- uci->cpu_sig.pf))
+ if (!intel_find_matching_signature(phdr,
+ uci->cpu_sig.sig,
+ uci->cpu_sig.pf))
continue;
return iter->data;
@@ -680,7 +548,6 @@ void reload_ucode_intel(void)
static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
{
- static struct cpu_signature prev;
struct cpuinfo_x86 *c = &cpu_data(cpu_num);
unsigned int val[2];
@@ -696,13 +563,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
csig->rev = c->microcode;
- /* No extra locking on prev, races are harmless. */
- if (csig->sig != prev.sig || csig->pf != prev.pf || csig->rev != prev.rev) {
- pr_info("sig=0x%x, pf=0x%x, revision=0x%x\n",
- csig->sig, csig->pf, csig->rev);
- prev = *csig;
- }
-
return 0;
}
@@ -820,7 +680,7 @@ static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter)
memcpy(mc, &mc_header, sizeof(mc_header));
data = mc + sizeof(mc_header);
if (!copy_from_iter_full(data, data_size, iter) ||
- microcode_sanity_check(mc, 1) < 0) {
+ intel_microcode_sanity_check(mc, true, MC_HEADER_TYPE_MICROCODE) < 0) {
break;
}
@@ -885,8 +745,7 @@ static bool is_blacklisted(unsigned int cpu)
return false;
}
-static enum ucode_state request_microcode_fw(int cpu, struct device *device,
- bool refresh_fw)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
const struct firmware *firmware;
@@ -908,7 +767,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device,
kvec.iov_base = (void *)firmware->data;
kvec.iov_len = firmware->size;
- iov_iter_kvec(&iter, WRITE, &kvec, 1, firmware->size);
+ iov_iter_kvec(&iter, ITER_SOURCE, &kvec, 1, firmware->size);
ret = generic_load_microcode(cpu, &iter);
release_firmware(firmware);
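
request_microcode_fw() now tags the kvec iterator with ITER_SOURCE (the blob is
read from) rather than the old WRITE direction constant. A minimal sketch of
that pattern, assuming data/size describe a firmware image already resident in
memory and the caller owns both the kvec and the iterator:

static void sketch_wrap_fw(const void *data, size_t size,
			   struct kvec *kvec, struct iov_iter *iter)
{
	kvec->iov_base = (void *)data;
	kvec->iov_len  = size;
	/* ITER_SOURCE: the iterator supplies data to its consumer. */
	iov_iter_kvec(iter, ITER_SOURCE, kvec, 1, size);
}
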
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index e402923800d7..bfbdc072017d 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -475,6 +475,12 @@ static bool __init ms_hyperv_x2apic_available(void)
* (logically) generates MSIs directly to the system APIC irq domain.
* There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the
* pci-hyperv host bridge.
+ *
+ * Note: for a Hyper-V root partition, this will always return false.
+ * The hypervisor doesn't expose these HYPERV_CPUID_VIRT_STACK_* cpuids by
+ * default, they are implemented as intercepts by the Windows Hyper-V stack.
+ * Even a nested root partition (L2 root) will not get them because the
+ * nested (L1) hypervisor filters them out.
*/
static bool __init ms_hyperv_msi_ext_dest_id(void)
{
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index a65a0272096d..eff6ac62c0ff 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -109,7 +109,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
return 0;
}
-static const struct mtrr_ops amd_mtrr_ops = {
+const struct mtrr_ops amd_mtrr_ops = {
.vendor = X86_VENDOR_AMD,
.set = amd_set_mtrr,
.get = amd_get_mtrr,
@@ -117,9 +117,3 @@ static const struct mtrr_ops amd_mtrr_ops = {
.validate_add_page = amd_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init amd_init_mtrr(void)
-{
- set_mtrr_ops(&amd_mtrr_ops);
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index f27177816569..b8a74eddde83 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -111,7 +111,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
return 0;
}
-static const struct mtrr_ops centaur_mtrr_ops = {
+const struct mtrr_ops centaur_mtrr_ops = {
.vendor = X86_VENDOR_CENTAUR,
.set = centaur_set_mcr,
.get = centaur_get_mcr,
@@ -119,9 +119,3 @@ static const struct mtrr_ops centaur_mtrr_ops = {
.validate_add_page = centaur_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init centaur_init_mtrr(void)
-{
- set_mtrr_ops(&centaur_mtrr_ops);
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index ca670919b561..173b9e01e623 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -234,51 +234,11 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
post_set();
}
-typedef struct {
- unsigned long base;
- unsigned long size;
- mtrr_type type;
-} arr_state_t;
-
-static arr_state_t arr_state[8] = {
- {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL},
- {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}
-};
-
-static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 };
-
-static void cyrix_set_all(void)
-{
- int i;
-
- prepare_set();
-
- /* the CCRs are not contiguous */
- for (i = 0; i < 4; i++)
- setCx86(CX86_CCR0 + i, ccr_state[i]);
- for (; i < 7; i++)
- setCx86(CX86_CCR4 + i, ccr_state[i]);
-
- for (i = 0; i < 8; i++) {
- cyrix_set_arr(i, arr_state[i].base,
- arr_state[i].size, arr_state[i].type);
- }
-
- post_set();
-}
-
-static const struct mtrr_ops cyrix_mtrr_ops = {
+const struct mtrr_ops cyrix_mtrr_ops = {
.vendor = X86_VENDOR_CYRIX,
- .set_all = cyrix_set_all,
.set = cyrix_set_arr,
.get = cyrix_get_arr,
.get_free_region = cyrix_get_free_region,
.validate_add_page = generic_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init cyrix_init_mtrr(void)
-{
- set_mtrr_ops(&cyrix_mtrr_ops);
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 558108296f3c..ee09d359e08f 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -10,6 +10,7 @@
#include <linux/mm.h>
#include <asm/processor-flags.h>
+#include <asm/cacheinfo.h>
#include <asm/cpufeature.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
@@ -396,9 +397,6 @@ print_fixed(unsigned base, unsigned step, const mtrr_type *types)
}
}
-static void prepare_set(void);
-static void post_set(void);
-
static void __init print_mtrr_state(void)
{
unsigned int i;
@@ -444,20 +442,6 @@ static void __init print_mtrr_state(void)
pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
}
-/* PAT setup for BP. We need to go through sync steps here */
-void __init mtrr_bp_pat_init(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- prepare_set();
-
- pat_init();
-
- post_set();
- local_irq_restore(flags);
-}
-
/* Grab all of the MTRR state for this CPU into *state */
bool __init get_mtrr_state(void)
{
@@ -684,7 +668,10 @@ static u32 deftype_lo, deftype_hi;
/**
* set_mtrr_state - Set the MTRR state for this CPU.
*
- * NOTE: The CPU must already be in a safe state for MTRR changes.
+ * NOTE: The CPU must already be in a safe state for MTRR changes, including
+ * measures that only a single CPU can be active in set_mtrr_state() in
+ * order to not be subject to races for usage of deftype_lo. This is
+ * accomplished by taking cache_disable_lock.
* RETURNS: 0 if no changes made, else a mask indicating what was changed.
*/
static unsigned long set_mtrr_state(void)
@@ -715,106 +702,34 @@ static unsigned long set_mtrr_state(void)
return change_mask;
}
-
-static unsigned long cr4;
-static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
-
-/*
- * Since we are disabling the cache don't allow any interrupts,
- * they would run extremely slow and would only increase the pain.
- *
- * The caller must ensure that local interrupts are disabled and
- * are reenabled after post_set() has been called.
- */
-static void prepare_set(void) __acquires(set_atomicity_lock)
+void mtrr_disable(void)
{
- unsigned long cr0;
-
- /*
- * Note that this is not ideal
- * since the cache is only flushed/disabled for this CPU while the
- * MTRRs are changed, but changing this requires more invasive
- * changes to the way the kernel boots
- */
-
- raw_spin_lock(&set_atomicity_lock);
-
- /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
- cr0 = read_cr0() | X86_CR0_CD;
- write_cr0(cr0);
-
- /*
- * Cache flushing is the most time-consuming step when programming
- * the MTRRs. Fortunately, as per the Intel Software Development
- * Manual, we can skip it if the processor supports cache self-
- * snooping.
- */
- if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
- wbinvd();
-
- /* Save value of CR4 and clear Page Global Enable (bit 7) */
- if (boot_cpu_has(X86_FEATURE_PGE)) {
- cr4 = __read_cr4();
- __write_cr4(cr4 & ~X86_CR4_PGE);
- }
-
- /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- flush_tlb_local();
-
/* Save MTRR state */
rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
/* Disable MTRRs, and set the default type to uncached */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
-
- /* Again, only flush caches if we have to. */
- if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
- wbinvd();
}
-static void post_set(void) __releases(set_atomicity_lock)
+void mtrr_enable(void)
{
- /* Flush TLBs (no need to flush caches - they are disabled) */
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- flush_tlb_local();
-
/* Intel (P6) standard MTRRs */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
-
- /* Enable caches */
- write_cr0(read_cr0() & ~X86_CR0_CD);
-
- /* Restore value of CR4 */
- if (boot_cpu_has(X86_FEATURE_PGE))
- __write_cr4(cr4);
- raw_spin_unlock(&set_atomicity_lock);
}
-static void generic_set_all(void)
+void mtrr_generic_set_state(void)
{
unsigned long mask, count;
- unsigned long flags;
-
- local_irq_save(flags);
- prepare_set();
/* Actually set the state */
mask = set_mtrr_state();
- /* also set PAT */
- pat_init();
-
- post_set();
- local_irq_restore(flags);
-
/* Use the atomic bitops to update the global mask */
for (count = 0; count < sizeof(mask) * 8; ++count) {
if (mask & 0x01)
set_bit(count, &smp_changes_mask);
mask >>= 1;
}
-
}
/**
@@ -836,7 +751,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
vr = &mtrr_state.var_ranges[reg];
local_irq_save(flags);
- prepare_set();
+ cache_disable();
if (size == 0) {
/*
@@ -855,7 +770,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi);
}
- post_set();
+ cache_enable();
local_irq_restore(flags);
}
@@ -914,8 +829,6 @@ int positive_have_wrcomb(void)
* Generic structure...
*/
const struct mtrr_ops generic_mtrr_ops = {
- .use_intel_if = 1,
- .set_all = generic_set_all,
.get = generic_get_mtrr,
.get_free_region = generic_get_free_region,
.set = generic_set_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 2746cac9d8a9..783f3210d582 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -46,6 +46,7 @@
#include <linux/syscore_ops.h>
#include <linux/rcupdate.h>
+#include <asm/cacheinfo.h>
#include <asm/cpufeature.h>
#include <asm/e820/api.h>
#include <asm/mtrr.h>
@@ -58,32 +59,18 @@
#define MTRR_TO_PHYS_WC_OFFSET 1000
u32 num_var_ranges;
-static bool __mtrr_enabled;
-
static bool mtrr_enabled(void)
{
- return __mtrr_enabled;
+ return !!mtrr_if;
}
unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
static DEFINE_MUTEX(mtrr_mutex);
u64 size_or_mask, size_and_mask;
-static bool mtrr_aps_delayed_init;
-
-static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init;
const struct mtrr_ops *mtrr_if;
-static void set_mtrr(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type);
-
-void __init set_mtrr_ops(const struct mtrr_ops *ops)
-{
- if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
- mtrr_ops[ops->vendor] = ops;
-}
-
/* Returns non-zero if we have the write-combining memory type */
static int have_wrcomb(void)
{
@@ -119,11 +106,11 @@ static int have_wrcomb(void)
}
/* This function returns the number of variable MTRRs */
-static void __init set_num_var_ranges(void)
+static void __init set_num_var_ranges(bool use_generic)
{
unsigned long config = 0, dummy;
- if (use_intel())
+ if (use_generic)
rdmsr(MSR_MTRRcap, config, dummy);
else if (is_cpu(AMD) || is_cpu(HYGON))
config = 2;
@@ -160,25 +147,8 @@ static int mtrr_rendezvous_handler(void *info)
{
struct set_mtrr_data *data = info;
- /*
- * We use this same function to initialize the mtrrs during boot,
- * resume, runtime cpu online and on an explicit request to set a
- * specific MTRR.
- *
- * During boot or suspend, the state of the boot cpu's mtrrs has been
- * saved, and we want to replicate that across all the cpus that come
- * online (either at the end of boot or resume or during a runtime cpu
- * online). If we're doing that, @reg is set to something special and on
- * all the cpu's we do mtrr_if->set_all() (On the logical cpu that
- * started the boot/resume sequence, this might be a duplicate
- * set_all()).
- */
- if (data->smp_reg != ~0U) {
- mtrr_if->set(data->smp_reg, data->smp_base,
- data->smp_size, data->smp_type);
- } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
- mtrr_if->set_all();
- }
+ mtrr_if->set(data->smp_reg, data->smp_base,
+ data->smp_size, data->smp_type);
return 0;
}
@@ -248,19 +218,6 @@ static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base,
stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask);
}
-static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type)
-{
- struct set_mtrr_data data = { .smp_reg = reg,
- .smp_base = base,
- .smp_size = size,
- .smp_type = type
- };
-
- stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
- cpu_callout_mask);
-}
-
/**
* mtrr_add_page - Add a memory type region
* @base: Physical base address of region in pages (in units of 4 kB!)
@@ -617,20 +574,6 @@ int arch_phys_wc_index(int handle)
}
EXPORT_SYMBOL_GPL(arch_phys_wc_index);
-/*
- * HACK ALERT!
- * These should be called implicitly, but we can't yet until all the initcall
- * stuff is done...
- */
-static void __init init_ifs(void)
-{
-#ifndef CONFIG_X86_64
- amd_init_mtrr();
- cyrix_init_mtrr();
- centaur_init_mtrr();
-#endif
-}
-
/* The suspend/resume methods are only for CPU without MTRR. CPU using generic
* MTRR driver doesn't require this
*/
@@ -686,10 +629,9 @@ int __initdata changed_by_mtrr_cleanup;
*/
void __init mtrr_bp_init(void)
{
+ const char *why = "(not available)";
u32 phys_addr;
- init_ifs();
-
phys_addr = 32;
if (boot_cpu_has(X86_FEATURE_MTRR)) {
@@ -730,21 +672,21 @@ void __init mtrr_bp_init(void)
case X86_VENDOR_AMD:
if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) {
/* Pre-Athlon (K6) AMD CPU MTRRs */
- mtrr_if = mtrr_ops[X86_VENDOR_AMD];
+ mtrr_if = &amd_mtrr_ops;
size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
break;
case X86_VENDOR_CENTAUR:
if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) {
- mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
+ mtrr_if = &centaur_mtrr_ops;
size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
break;
case X86_VENDOR_CYRIX:
if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) {
- mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
+ mtrr_if = &cyrix_mtrr_ops;
size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
@@ -754,58 +696,23 @@ void __init mtrr_bp_init(void)
}
}
- if (mtrr_if) {
- __mtrr_enabled = true;
- set_num_var_ranges();
+ if (mtrr_enabled()) {
+ set_num_var_ranges(mtrr_if == &generic_mtrr_ops);
init_table();
- if (use_intel()) {
+ if (mtrr_if == &generic_mtrr_ops) {
/* BIOS may override */
- __mtrr_enabled = get_mtrr_state();
-
- if (mtrr_enabled())
- mtrr_bp_pat_init();
-
- if (mtrr_cleanup(phys_addr)) {
- changed_by_mtrr_cleanup = 1;
- mtrr_if->set_all();
+ if (get_mtrr_state()) {
+ memory_caching_control |= CACHE_MTRR;
+ changed_by_mtrr_cleanup = mtrr_cleanup(phys_addr);
+ } else {
+ mtrr_if = NULL;
+ why = "by BIOS";
}
}
}
- if (!mtrr_enabled()) {
- pr_info("Disabled\n");
-
- /*
- * PAT initialization relies on MTRR's rendezvous handler.
- * Skip PAT init until the handler can initialize both
- * features independently.
- */
- pat_disable("MTRRs disabled, skipping PAT initialization too.");
- }
-}
-
-void mtrr_ap_init(void)
-{
if (!mtrr_enabled())
- return;
-
- if (!use_intel() || mtrr_aps_delayed_init)
- return;
-
- /*
- * Ideally we should hold mtrr_mutex here to avoid mtrr entries
- * changed, but this routine will be called in cpu boot time,
- * holding the lock breaks it.
- *
- * This routine is called in two cases:
- *
- * 1. very early time of software resume, when there absolutely
- * isn't mtrr entry changes;
- *
- * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
- * lock to prevent mtrr entry changes
- */
- set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
+ pr_info("MTRRs disabled %s\n", why);
}
/**
@@ -823,50 +730,12 @@ void mtrr_save_state(void)
smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
}
-void set_mtrr_aps_delayed_init(void)
-{
- if (!mtrr_enabled())
- return;
- if (!use_intel())
- return;
-
- mtrr_aps_delayed_init = true;
-}
-
-/*
- * Delayed MTRR initialization for all AP's
- */
-void mtrr_aps_init(void)
-{
- if (!use_intel() || !mtrr_enabled())
- return;
-
- /*
- * Check if someone has requested the delay of AP MTRR initialization,
- * by doing set_mtrr_aps_delayed_init(), prior to this point. If not,
- * then we are done.
- */
- if (!mtrr_aps_delayed_init)
- return;
-
- set_mtrr(~0U, 0, 0, 0);
- mtrr_aps_delayed_init = false;
-}
-
-void mtrr_bp_restore(void)
-{
- if (!use_intel() || !mtrr_enabled())
- return;
-
- mtrr_if->set_all();
-}
-
static int __init mtrr_init_finialize(void)
{
if (!mtrr_enabled())
return 0;
- if (use_intel()) {
+ if (memory_caching_control & CACHE_MTRR) {
if (!changed_by_mtrr_cleanup)
mtrr_state_warn();
return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2ac99e561181..02eb5871492d 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -14,11 +14,8 @@ extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
struct mtrr_ops {
u32 vendor;
- u32 use_intel_if;
void (*set)(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type);
- void (*set_all)(void);
-
void (*get)(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type *type);
int (*get_free_region)(unsigned long base, unsigned long size,
@@ -53,15 +50,11 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
bool get_mtrr_state(void);
-void mtrr_bp_pat_init(void);
-
-extern void __init set_mtrr_ops(const struct mtrr_ops *ops);
extern u64 size_or_mask, size_and_mask;
extern const struct mtrr_ops *mtrr_if;
#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
-#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
extern unsigned int num_var_ranges;
extern u64 mtrr_tom2;
@@ -71,10 +64,10 @@ void mtrr_state_warn(void);
const char *mtrr_attrib_to_str(int x);
void mtrr_wrmsr(unsigned, unsigned, unsigned);
-/* CPU specific mtrr init functions */
-int amd_init_mtrr(void);
-int cyrix_init_mtrr(void);
-int centaur_init_mtrr(void);
+/* CPU specific mtrr_ops vectors. */
+extern const struct mtrr_ops amd_mtrr_ops;
+extern const struct mtrr_ops cyrix_mtrr_ops;
+extern const struct mtrr_ops centaur_mtrr_ops;
extern int changed_by_mtrr_cleanup;
extern int mtrr_cleanup(unsigned address_bits);
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 3266ea36667c..c98e52ff5f20 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -575,7 +575,7 @@ static void clear_closid_rmid(int cpu)
state->default_rmid = 0;
state->cur_closid = 0;
state->cur_rmid = 0;
- wrmsr(IA32_PQR_ASSOC, 0, 0);
+ wrmsr(MSR_IA32_PQR_ASSOC, 0, 0);
}
static int resctrl_online_cpu(unsigned int cpu)
@@ -828,7 +828,6 @@ static __init void rdt_init_res_defs_intel(void)
if (r->rid == RDT_RESOURCE_L3 ||
r->rid == RDT_RESOURCE_L2) {
r->cache.arch_has_sparse_bitmaps = false;
- r->cache.arch_has_empty_bitmaps = false;
r->cache.arch_has_per_cpu_cfg = false;
r->cache.min_cbm_bits = 1;
} else if (r->rid == RDT_RESOURCE_MBA) {
@@ -849,7 +848,6 @@ static __init void rdt_init_res_defs_amd(void)
if (r->rid == RDT_RESOURCE_L3 ||
r->rid == RDT_RESOURCE_L2) {
r->cache.arch_has_sparse_bitmaps = true;
- r->cache.arch_has_empty_bitmaps = true;
r->cache.arch_has_per_cpu_cfg = true;
r->cache.min_cbm_bits = 0;
} else if (r->rid == RDT_RESOURCE_MBA) {
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 1dafbdc5ac31..1df0e3262bca 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -105,8 +105,7 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
return false;
}
- if ((!r->cache.arch_has_empty_bitmaps && val == 0) ||
- val > r->default_ctrl) {
+ if ((r->cache.min_cbm_bits > 0 && val == 0) || val > r->default_ctrl) {
rdt_last_cmd_puts("Mask out of range\n");
return false;
}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 5f7128686cfd..5ebd28e6aa0c 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -8,16 +8,6 @@
#include <linux/fs_context.h>
#include <linux/jump_label.h>
-#define MSR_IA32_L3_QOS_CFG 0xc81
-#define MSR_IA32_L2_QOS_CFG 0xc82
-#define MSR_IA32_L3_CBM_BASE 0xc90
-#define MSR_IA32_L2_CBM_BASE 0xd10
-#define MSR_IA32_MBA_THRTL_BASE 0xd50
-#define MSR_IA32_MBA_BW_BASE 0xc0000200
-
-#define MSR_IA32_QM_CTR 0x0c8e
-#define MSR_IA32_QM_EVTSEL 0x0c8d
-
#define L3_QOS_CDP_ENABLE 0x01ULL
#define L2_QOS_CDP_ENABLE 0x01ULL
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index d961ae3ed96e..524f8ff3e69c 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -477,7 +477,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
* pseudo-locked followed by reading of kernel memory to load it
* into the cache.
*/
- __wrmsr(IA32_PQR_ASSOC, rmid_p, rdtgrp->closid);
+ __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, rdtgrp->closid);
/*
* Cache was flushed earlier. Now access kernel memory to read it
* into cache region associated with just activated plr->closid.
@@ -513,7 +513,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
* Critical section end: restore closid with capacity bitmask that
* does not overlap with pseudo-locked region.
*/
- __wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p);
+ __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);
/* Re-enable the hardware prefetcher(s) */
wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr);
@@ -1560,9 +1560,9 @@ static const struct file_operations pseudo_lock_dev_fops = {
.mmap = pseudo_lock_dev_mmap,
};
-static char *pseudo_lock_devnode(struct device *dev, umode_t *mode)
+static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode)
{
- struct rdtgroup *rdtgrp;
+ const struct rdtgroup *rdtgrp;
rdtgrp = dev_get_drvdata(dev);
if (mode)
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index fc01f81f6e2a..f53944fb8f7f 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -40,6 +40,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
{ X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
{ X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
+ { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 },
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 1ec20807de1e..2a0e90fe2abc 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -160,8 +160,8 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
return ret;
pginfo.addr = encl_page->desc & PAGE_MASK;
- pginfo.contents = (unsigned long)kmap_atomic(b.contents);
- pcmd_page = kmap_atomic(b.pcmd);
+ pginfo.contents = (unsigned long)kmap_local_page(b.contents);
+ pcmd_page = kmap_local_page(b.pcmd);
pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
if (secs_page)
@@ -187,8 +187,8 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
*/
pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
- kunmap_atomic(pcmd_page);
- kunmap_atomic((void *)(unsigned long)pginfo.contents);
+ kunmap_local(pcmd_page);
+ kunmap_local((void *)(unsigned long)pginfo.contents);
get_page(b.pcmd);
sgx_encl_put_backing(&b);
@@ -197,10 +197,10 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
- pcmd_page = kmap_atomic(b.pcmd);
+ pcmd_page = kmap_local_page(b.pcmd);
if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
pr_warn("PCMD page not empty after truncate.\n");
- kunmap_atomic(pcmd_page);
+ kunmap_local(pcmd_page);
}
put_page(b.pcmd);
@@ -268,7 +268,7 @@ static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
unsigned long addr,
unsigned long vm_flags)
{
- unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+ unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
struct sgx_encl_page *entry;
entry = xa_load(&encl->page_array, PFN_DOWN(addr));
@@ -502,7 +502,7 @@ static void sgx_vma_open(struct vm_area_struct *vma)
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
unsigned long end, unsigned long vm_flags)
{
- unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+ unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
struct sgx_encl_page *page;
unsigned long count = 0;
int ret = 0;
@@ -680,11 +680,15 @@ const struct vm_operations_struct sgx_vm_ops = {
void sgx_encl_release(struct kref *ref)
{
struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
+ unsigned long max_page_index = PFN_DOWN(encl->base + encl->size - 1);
struct sgx_va_page *va_page;
struct sgx_encl_page *entry;
- unsigned long index;
+ unsigned long count = 0;
+
+ XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base));
- xa_for_each(&encl->page_array, index, entry) {
+ xas_lock(&xas);
+ xas_for_each(&xas, entry, max_page_index) {
if (entry->epc_page) {
/*
* The page and its radix tree entry cannot be freed
@@ -699,9 +703,20 @@ void sgx_encl_release(struct kref *ref)
}
kfree(entry);
- /* Invoke scheduler to prevent soft lockups. */
- cond_resched();
+ /*
+ * Invoke scheduler on every XA_CHECK_SCHED iteration
+ * to prevent soft lockups.
+ */
+ if (!(++count % XA_CHECK_SCHED)) {
+ xas_pause(&xas);
+ xas_unlock(&xas);
+
+ cond_resched();
+
+ xas_lock(&xas);
+ }
}
+ xas_unlock(&xas);
xa_destroy(&encl->page_array);
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index ebe79d60619f..21ca0a831b70 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -111,7 +111,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
encl->base = secs->base;
encl->size = secs->size;
encl->attributes = secs->attributes;
- encl->attributes_mask = SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | SGX_ATTR_KSS;
+ encl->attributes_mask = SGX_ATTR_UNPRIV_MASK;
/* Set only after completion, as encl->lock has not been taken. */
set_bit(SGX_ENCL_CREATED, &encl->flags);
@@ -221,11 +221,11 @@ static int __sgx_encl_add_page(struct sgx_encl *encl,
pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
pginfo.addr = encl_page->desc & PAGE_MASK;
pginfo.metadata = (unsigned long)secinfo;
- pginfo.contents = (unsigned long)kmap_atomic(src_page);
+ pginfo.contents = (unsigned long)kmap_local_page(src_page);
ret = __eadd(&pginfo, sgx_get_epc_virt_addr(epc_page));
- kunmap_atomic((void *)pginfo.contents);
+ kunmap_local((void *)pginfo.contents);
put_page(src_page);
return ret ? -EIO : 0;
@@ -356,6 +356,9 @@ static int sgx_validate_offset_length(struct sgx_encl *encl,
if (!length || !IS_ALIGNED(length, PAGE_SIZE))
return -EINVAL;
+ if (offset + length < offset)
+ return -EINVAL;
+
if (offset + length - PAGE_SIZE >= encl->size)
return -EINVAL;
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 0aad028f04d4..e5a37b6e9aa5 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -165,17 +165,17 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
pginfo.addr = 0;
pginfo.secs = 0;
- pginfo.contents = (unsigned long)kmap_atomic(backing->contents);
- pginfo.metadata = (unsigned long)kmap_atomic(backing->pcmd) +
+ pginfo.contents = (unsigned long)kmap_local_page(backing->contents);
+ pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) +
backing->pcmd_offset;
ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
set_page_dirty(backing->pcmd);
set_page_dirty(backing->contents);
- kunmap_atomic((void *)(unsigned long)(pginfo.metadata -
+ kunmap_local((void *)(unsigned long)(pginfo.metadata -
backing->pcmd_offset));
- kunmap_atomic((void *)(unsigned long)pginfo.contents);
+ kunmap_local((void *)(unsigned long)pginfo.contents);
return ret;
}
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index ec7bbac3a9f2..8009c8346d8f 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -58,24 +58,6 @@ static void tsx_enable(void)
wrmsrl(MSR_IA32_TSX_CTRL, tsx);
}
-static bool tsx_ctrl_is_supported(void)
-{
- u64 ia32_cap = x86_read_arch_cap_msr();
-
- /*
- * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
- * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
- *
- * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
- * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
- * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
- * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
- * tsx= cmdline requests will do nothing on CPUs without
- * MSR_IA32_TSX_CTRL support.
- */
- return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR);
-}
-
static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
{
if (boot_cpu_has_bug(X86_BUG_TAA))
@@ -135,7 +117,7 @@ static void tsx_clear_cpuid(void)
rdmsrl(MSR_TSX_FORCE_ABORT, msr);
msr |= MSR_TFA_TSX_CPUID_CLEAR;
wrmsrl(MSR_TSX_FORCE_ABORT, msr);
- } else if (tsx_ctrl_is_supported()) {
+ } else if (cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL)) {
rdmsrl(MSR_IA32_TSX_CTRL, msr);
msr |= TSX_CTRL_CPUID_CLEAR;
wrmsrl(MSR_IA32_TSX_CTRL, msr);
@@ -158,7 +140,8 @@ static void tsx_dev_mode_disable(void)
u64 mcu_opt_ctrl;
/* Check if RTM_ALLOW exists */
- if (!boot_cpu_has_bug(X86_BUG_TAA) || !tsx_ctrl_is_supported() ||
+ if (!boot_cpu_has_bug(X86_BUG_TAA) ||
+ !cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL) ||
!cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL))
return;
@@ -191,7 +174,20 @@ void __init tsx_init(void)
return;
}
- if (!tsx_ctrl_is_supported()) {
+ /*
+ * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
+ * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
+ *
+ * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
+ * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
+ * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
+ * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
+ * tsx= cmdline requests will do nothing on CPUs without
+ * MSR_IA32_TSX_CTRL support.
+ */
+ if (x86_read_arch_cap_msr() & ARCH_CAP_TSX_CTRL_MSR) {
+ setup_force_cpu_cap(X86_FEATURE_MSR_TSX_CTRL);
+ } else {
tsx_ctrl_state = TSX_CTRL_NOT_SUPPORTED;
return;
}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 6f7b8cc1bc9f..621ba9c0f17a 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -139,7 +139,7 @@ static int cpuid_device_destroy(unsigned int cpu)
return 0;
}
-static char *cpuid_devnode(struct device *dev, umode_t *mode)
+static char *cpuid_devnode(const struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 9730c88530fc..305514431f26 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -401,10 +401,8 @@ int crash_load_segments(struct kimage *image)
kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
ret = kexec_add_buffer(&kbuf);
- if (ret) {
- vfree((void *)image->elf_headers);
+ if (ret)
return ret;
- }
image->elf_load_addr = kbuf.mem;
pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index e75bc2f217ff..32d710f7eb84 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -57,7 +57,7 @@ ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
struct kvec kvec = { .iov_base = buf, .iov_len = count };
struct iov_iter iter;
- iov_iter_kvec(&iter, READ, &kvec, 1, count);
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
return read_from_oldmem(&iter, count, ppos,
cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT));
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 5cd51f25f446..28da5dd83fc0 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -31,11 +31,6 @@ char __initdata cmd_line[COMMAND_LINE_SIZE];
int __initdata of_ioapic;
-void __init early_init_dt_add_memory_arch(u64 base, u64 size)
-{
- BUG();
-}
-
void __init add_dtb(u64 data)
{
initial_dtb = data + offsetof(struct setup_data, data);
@@ -167,7 +162,14 @@ static void __init dtb_lapic_setup(void)
return;
}
smp_found_config = 1;
- pic_mode = 1;
+ if (of_property_read_bool(dn, "intel,virtual-wire-mode")) {
+ pr_info("Virtual Wire compatibility mode.\n");
+ pic_mode = 0;
+ } else {
+ pr_info("IMCR and PIC compatibility mode.\n");
+ pic_mode = 1;
+ }
+
register_lapic_address(lapic_addr);
}
@@ -248,7 +250,7 @@ static void __init dtb_add_ioapic(struct device_node *dn)
ret = of_address_to_resource(dn, 0, &r);
if (ret) {
- printk(KERN_ERR "Can't obtain address from device node %pOF.\n", dn);
+ pr_err("Can't obtain address from device node %pOF.\n", dn);
return;
}
mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg);
@@ -265,7 +267,7 @@ static void __init dtb_ioapic_setup(void)
of_ioapic = 1;
return;
}
- printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
+ pr_err("Error: No information about IO-APIC in OF.\n");
}
#else
static void __init dtb_ioapic_setup(void) {}
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 722fd712e1cf..b4905d5173fd 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -37,7 +37,7 @@ const char *stack_type_name(enum stack_type type)
static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
+ unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr);
unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
/*
@@ -62,7 +62,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr);
+ unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.softirq_stack_ptr);
unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
/*
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 6c5defd6569a..f05339fee778 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -134,7 +134,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac
static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
+ unsigned long *end = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr);
unsigned long *begin;
/*
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 9417d5aa7305..16f9814c9be0 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -94,17 +94,7 @@ static inline unsigned long espfix_base_addr(unsigned int cpu)
static void init_espfix_random(void)
{
- unsigned long rand;
-
- /*
- * This is run before the entropy pools are initialized,
- * but this is hopefully better than nothing.
- */
- if (!arch_get_random_longs(&rand, 1)) {
- /* The constant is an arbitrary large prime */
- rand = rdtsc();
- rand *= 0xc345c6b72fd16123UL;
- }
+ unsigned long rand = get_random_long();
slot_random = rand % ESPFIX_STACKS_PER_PAGE;
page_random = (rand / ESPFIX_STACKS_PER_PAGE)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 3b28c5b25e12..9baa89a8877d 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -391,8 +391,6 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
{
struct fpstate *kstate = gfpu->fpstate;
const union fpregs_state *ustate = buf;
- struct pkru_state *xpkru;
- int ret;
if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) {
if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE)
@@ -406,16 +404,15 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
if (ustate->xsave.header.xfeatures & ~xcr0)
return -EINVAL;
- ret = copy_uabi_from_kernel_to_xstate(kstate, ustate);
- if (ret)
- return ret;
+ /*
+ * Nullify @vpkru to preserve its current value if PKRU's bit isn't set
+ * in the header. KVM's odd ABI is to leave PKRU untouched in this
+ * case (all other components are eventually re-initialized).
+ */
+ if (!(ustate->xsave.header.xfeatures & XFEATURE_MASK_PKRU))
+ vpkru = NULL;
- /* Retrieve PKRU if not in init state */
- if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) {
- xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU);
- *vpkru = xpkru->pkru;
- }
- return 0;
+ return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru);
}
EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
#endif /* CONFIG_KVM */
@@ -605,9 +602,9 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal)
if (test_thread_flag(TIF_NEED_FPU_LOAD))
fpregs_restore_userregs();
save_fpregs_to_fpstate(dst_fpu);
+ fpregs_unlock();
if (!(clone_flags & CLONE_THREAD))
fpu_inherit_perms(dst_fpu);
- fpregs_unlock();
/*
* Children never inherit PASID state.
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 8946f89761cc..851eb13edc01 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -133,9 +133,6 @@ static void __init fpu__init_system_generic(void)
fpu__init_system_mxcsr();
}
-/* Get alignment of the TYPE. */
-#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
-
/*
* Enforce that 'MEMBER' is the last field of 'TYPE'.
*
@@ -143,8 +140,8 @@ static void __init fpu__init_system_generic(void)
* because that's how C aligns structs.
*/
#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
- BUILD_BUG_ON(sizeof(TYPE) != ALIGN(offsetofend(TYPE, MEMBER), \
- TYPE_ALIGN(TYPE)))
+ BUILD_BUG_ON(sizeof(TYPE) != \
+ ALIGN(offsetofend(TYPE, MEMBER), _Alignof(TYPE)))
/*
* We append the 'struct fpu' to the task_struct:
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 75ffaef8c299..6d056b68f4ed 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -167,7 +167,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
}
fpu_force_restore(fpu);
- ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf);
+ ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf, &target->thread.pkru);
out:
vfree(tmpbuf);
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 91d4b6de58ab..558076dbde5b 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -396,7 +396,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
fpregs = &fpu->fpstate->regs;
if (use_xsave() && !fx_only) {
- if (copy_sigframe_from_user_to_xstate(fpu->fpstate, buf_fx))
+ if (copy_sigframe_from_user_to_xstate(tsk, buf_fx))
return false;
} else {
if (__copy_from_user(&fpregs->fxsave, buf_fx,
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 59e543b95a3c..714166cc25f2 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -440,8 +440,8 @@ static void __init __xstate_dump_leaves(void)
}
}
-#define XSTATE_WARN_ON(x) do { \
- if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) { \
+#define XSTATE_WARN_ON(x, fmt, ...) do { \
+ if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \
__xstate_dump_leaves(); \
} \
} while (0)
@@ -554,8 +554,7 @@ static bool __init check_xstate_against_struct(int nr)
(nr >= XFEATURE_MAX) ||
(nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) ||
((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) {
- WARN_ONCE(1, "no structure for xstate: %d\n", nr);
- XSTATE_WARN_ON(1);
+ XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
return false;
}
return true;
@@ -598,12 +597,13 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
* XSAVES.
*/
if (!xsaves && xfeature_is_supervisor(i)) {
- XSTATE_WARN_ON(1);
+ XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
return false;
}
}
size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
- XSTATE_WARN_ON(size != kernel_size);
+ XSTATE_WARN_ON(size != kernel_size,
+ "size %u != kernel_size %u\n", size, kernel_size);
return size == kernel_size;
}
@@ -1200,8 +1200,36 @@ static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
}
+/**
+ * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
+ * @fpstate: The fpstate buffer to copy to
+ * @kbuf: The UABI format buffer, if it comes from the kernel
+ * @ubuf: The UABI format buffer, if it comes from userspace
+ * @pkru: The location to write the PKRU value to
+ *
+ * Converts from the UABI format into the kernel internal hardware
+ * dependent format.
+ *
+ * This function ultimately has three different callers with distinct PKRU
+ * behavior.
+ * 1. When called from sigreturn the PKRU register will be restored from
+ * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to
+ * @fpstate is sufficient to cover this case, but the caller will also
+ * pass a pointer to the thread_struct's pkru field in @pkru and updating
+ * it is harmless.
+ * 2. When called from ptrace the PKRU register will be restored from the
+ * thread_struct's pkru field. A pointer to that is passed in @pkru.
+ * The kernel will restore it manually, so the XRSTOR behavior that resets
+ * the PKRU register to the hardware init value (0) if the corresponding
+ * xfeatures bit is not set is emulated here.
+ * 3. When called from KVM the PKRU register will be restored from the vcpu's
+ * pkru field. A pointer to that is passed in @pkru. KVM hasn't used
+ * XRSTOR and hasn't had the PKRU resetting behavior described above. To
+ * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
+ * bit is not set.
+ */
static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
- const void __user *ubuf)
+ const void __user *ubuf, u32 *pkru)
{
struct xregs_state *xsave = &fpstate->regs.xsave;
unsigned int offset, size;
@@ -1250,6 +1278,20 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
}
}
+ if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
+ struct pkru_state *xpkru;
+
+ xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
+ *pkru = xpkru->pkru;
+ } else {
+ /*
+ * KVM may pass NULL here to indicate that it does not need
+ * PKRU updated.
+ */
+ if (pkru)
+ *pkru = 0;
+ }
+
/*
* The state that came in from userspace was user-state only.
* Mask all the user states out of 'xfeatures':
@@ -1268,9 +1310,9 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
* Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
* format and copy to the target thread. Used by ptrace and KVM.
*/
-int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf)
+int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
{
- return copy_uabi_to_xstate(fpstate, kbuf, NULL);
+ return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
}
/*
@@ -1278,10 +1320,10 @@ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf)
* XSAVE[S] format and copy to the target thread. This is called from the
* sigreturn() and rt_sigreturn() system calls.
*/
-int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate,
+int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
const void __user *ubuf)
{
- return copy_uabi_to_xstate(fpstate, NULL, ubuf);
+ return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru);
}
static bool validate_independent_components(u64 mask)
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index 5ad47031383b..a4ecb04d8d64 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -46,8 +46,8 @@ extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
u32 pkru_val, enum xstate_copy_mode copy_mode);
extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
enum xstate_copy_mode mode);
-extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf);
-extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf);
+extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru);
+extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf);
extern void fpu__init_cpu_xstate(void);
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index bd165004776d..5e7ead52cfdb 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -24,10 +24,10 @@
#include <linux/module.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
+#include <linux/set_memory.h>
#include <trace/syscall.h>
-#include <asm/set_memory.h>
#include <asm/kprobes.h>
#include <asm/ftrace.h>
#include <asm/nops.h>
@@ -69,6 +69,10 @@ static const char *ftrace_nop_replace(void)
static const char *ftrace_call_replace(unsigned long ip, unsigned long addr)
{
+ /*
+ * No need to translate into a callthunk. The trampoline does
+ * the depth accounting itself.
+ */
return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr);
}
@@ -217,7 +221,9 @@ void ftrace_replace_code(int enable)
ret = ftrace_verify_code(rec->ip, old);
if (ret) {
+ ftrace_expected = old;
ftrace_bug(ret, rec);
+ ftrace_expected = NULL;
return;
}
}
@@ -317,7 +323,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
unsigned long size;
unsigned long *ptr;
void *trampoline;
- void *ip;
+ void *ip, *dest;
/* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */
unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 };
unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE };
@@ -359,7 +365,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
ip = trampoline + size;
if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
- __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, &__x86_return_thunk, JMP32_INSN_SIZE);
+ __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE);
else
memcpy(ip, retq, sizeof(retq));
@@ -404,20 +410,20 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* put in the call to the function */
mutex_lock(&text_mutex);
call_offset -= start_offset;
+ /*
+ * No need to translate into a callthunk. The trampoline does
+ * the depth accounting before the call already.
+ */
+ dest = ftrace_ops_get_func(ops);
memcpy(trampoline + call_offset,
- text_gen_insn(CALL_INSN_OPCODE,
- trampoline + call_offset,
- ftrace_ops_get_func(ops)), CALL_INSN_SIZE);
+ text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest),
+ CALL_INSN_SIZE);
mutex_unlock(&text_mutex);
/* ALLOC_TRAMP flags lets us know we created it */
ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
- set_vm_flush_reset_perms(trampoline);
-
- if (likely(system_state != SYSTEM_BOOTING))
- set_memory_ro((unsigned long)trampoline, npages);
- set_memory_x((unsigned long)trampoline, npages);
+ set_memory_rox((unsigned long)trampoline, npages);
return (unsigned long)trampoline;
fail:
tramp_free(trampoline);
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 2a4be92fd144..1265ad519249 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -3,8 +3,9 @@
* Copyright (C) 2014 Steven Rostedt, Red Hat Inc
*/
-#include <linux/linkage.h>
#include <linux/cfi_types.h>
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
#include <asm/ptrace.h>
#include <asm/ftrace.h>
#include <asm/export.h>
@@ -131,16 +132,19 @@
.endm
SYM_TYPED_FUNC_START(ftrace_stub)
+ CALL_DEPTH_ACCOUNT
RET
SYM_FUNC_END(ftrace_stub)
SYM_TYPED_FUNC_START(ftrace_stub_graph)
+ CALL_DEPTH_ACCOUNT
RET
SYM_FUNC_END(ftrace_stub_graph)
#ifdef CONFIG_DYNAMIC_FTRACE
SYM_FUNC_START(__fentry__)
+ CALL_DEPTH_ACCOUNT
RET
SYM_FUNC_END(__fentry__)
EXPORT_SYMBOL(__fentry__)
@@ -149,6 +153,8 @@ SYM_FUNC_START(ftrace_caller)
/* save_mcount_regs fills in first two parameters */
save_mcount_regs
+ CALL_DEPTH_ACCOUNT
+
/* Stack - skipping return address of ftrace_caller */
leaq MCOUNT_REG_SIZE+8(%rsp), %rcx
movq %rcx, RSP(%rsp)
@@ -164,6 +170,9 @@ SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL)
/* Only ops with REGS flag set should have CS register set */
movq $0, CS(%rsp)
+ /* Account for the function call below */
+ CALL_DEPTH_ACCOUNT
+
SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
call ftrace_stub
@@ -193,6 +202,8 @@ SYM_FUNC_START(ftrace_regs_caller)
save_mcount_regs 8
/* save_mcount_regs fills in first two parameters */
+ CALL_DEPTH_ACCOUNT
+
SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
/* Load the ftrace_ops into the 3rd parameter */
@@ -223,6 +234,9 @@ SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL)
/* regs go into 4th parameter */
leaq (%rsp), %rcx
+ /* Account for the function call below */
+ CALL_DEPTH_ACCOUNT
+
SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
call ftrace_stub
@@ -275,7 +289,20 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
/* Restore flags */
popfq
UNWIND_HINT_FUNC
- RET
+
+ /*
+ * The above left an extra return value on the stack; effectively
+ * doing a tail-call without using a register. This PUSH;RET
+	 * pattern unbalances the RSB; inject a pointless CALL to rebalance.
+ */
+ ANNOTATE_INTRA_FUNCTION_CALL
+ CALL .Ldo_rebalance
+ int3
+.Ldo_rebalance:
+ add $8, %rsp
+ ALTERNATIVE __stringify(RET), \
+ __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \
+ X86_FEATURE_CALL_DEPTH
SYM_FUNC_END(ftrace_regs_caller)
STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller)
@@ -284,6 +311,8 @@ STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
SYM_FUNC_START(__fentry__)
+ CALL_DEPTH_ACCOUNT
+
cmpq $ftrace_stub, ftrace_trace_function
jnz trace
RET
@@ -337,6 +366,8 @@ SYM_CODE_START(return_to_handler)
int3
.Ldo_rop:
mov %rdi, (%rsp)
- RET
+ ALTERNATIVE __stringify(RET), \
+ __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \
+ X86_FEATURE_CALL_DEPTH
SYM_CODE_END(return_to_handler)
#endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 6a3cfaf6b72a..387e4b12e823 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -203,7 +203,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
/* Is the address not 2M aligned? */
- if (load_delta & ~PMD_PAGE_MASK)
+ if (load_delta & ~PMD_MASK)
for (;;);
/* Include the SME encryption mask in the fixup value */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 9b7acc9c7874..67c8ed99144b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -261,16 +261,6 @@ SYM_FUNC_START(startup_32_smp)
addl $__PAGE_OFFSET, %esp
/*
- * start system 32-bit setup. We need to re-do some of the things done
- * in 16-bit mode for the "real" operations.
- */
- movl setup_once_ref,%eax
- andl %eax,%eax
- jz 1f # Did we do this already?
- call *%eax
-1:
-
-/*
* Check if it is 486
*/
movb $4,X86 # at least 486
@@ -331,18 +321,7 @@ SYM_FUNC_END(startup_32_smp)
#include "verify_cpu.S"
-/*
- * setup_once
- *
- * The setup work we only want to run on the BSP.
- *
- * Warning: %esi is live across this function.
- */
__INIT
-setup_once:
- andl $0,setup_once_ref /* Once is enough, thanks */
- RET
-
SYM_FUNC_START(early_idt_handler_array)
# 36(%esp) %eflags
# 32(%esp) %cs
@@ -458,7 +437,6 @@ SYM_DATA(early_recursion_flag, .long 0)
__REFDATA
.align 4
SYM_DATA(initial_code, .long i386_start_kernel)
-SYM_DATA(setup_once_ref, .long setup_once)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
#define PGD_ALIGN (2 * PAGE_SIZE)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index d860d437631b..222efd4a09bc 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -370,6 +370,7 @@ SYM_CODE_END(secondary_startup_64)
* start_secondary() via .Ljump_to_C_code.
*/
SYM_CODE_START(start_cpu0)
+ ANNOTATE_NOENDBR
UNWIND_HINT_EMPTY
movq initial_stack(%rip), %rsp
jmp .Ljump_to_C_code
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 668a4a6533d9..bbb0f737aab1 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -266,7 +266,7 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
/* CPU entry area is always used for CPU entry */
if (within_area(addr, end, CPU_ENTRY_AREA_BASE,
- CPU_ENTRY_AREA_TOTAL_SIZE))
+ CPU_ENTRY_AREA_MAP_SIZE))
return true;
/*
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 15aefa3f3e18..3aa5304200c5 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -407,7 +407,7 @@ struct legacy_pic null_legacy_pic = {
.make_irq = legacy_pic_uint_noop,
};
-struct legacy_pic default_legacy_pic = {
+static struct legacy_pic default_legacy_pic = {
.nr_legacy_irqs = NR_IRQS_LEGACY,
.chip = &i8259A_chip,
.mask = mask_8259A_irq,
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 01833ebf5e8e..dc1049c01f9b 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -52,9 +52,6 @@ static inline int check_stack_overflow(void) { return 0; }
static inline void print_stack_overflow(void) { }
#endif
-DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
-DEFINE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
-
static void call_on_stack(void *func, void *stack)
{
asm volatile("xchgl %%ebx,%%esp \n"
@@ -77,7 +74,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
u32 *isp, *prev_esp, arg1;
curstk = (struct irq_stack *) current_stack();
- irqstk = __this_cpu_read(hardirq_stack_ptr);
+ irqstk = __this_cpu_read(pcpu_hot.hardirq_stack_ptr);
/*
* this is where we switch to the IRQ stack. However, if we are
@@ -115,7 +112,7 @@ int irq_init_percpu_irqstack(unsigned int cpu)
int node = cpu_to_node(cpu);
struct page *ph, *ps;
- if (per_cpu(hardirq_stack_ptr, cpu))
+ if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu))
return 0;
ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
@@ -127,8 +124,8 @@ int irq_init_percpu_irqstack(unsigned int cpu)
return -ENOMEM;
}
- per_cpu(hardirq_stack_ptr, cpu) = page_address(ph);
- per_cpu(softirq_stack_ptr, cpu) = page_address(ps);
+ per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = page_address(ph);
+ per_cpu(pcpu_hot.softirq_stack_ptr, cpu) = page_address(ps);
return 0;
}
@@ -138,7 +135,7 @@ void do_softirq_own_stack(void)
struct irq_stack *irqstk;
u32 *isp, *prev_esp;
- irqstk = __this_cpu_read(softirq_stack_ptr);
+ irqstk = __this_cpu_read(pcpu_hot.softirq_stack_ptr);
/* build the stack frame on the softirq stack */
isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 1c0fb96b9e39..fe0c859873d1 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -50,7 +50,7 @@ static int map_irq_stack(unsigned int cpu)
return -ENOMEM;
/* Store actual TOS to avoid adjustment in the hotpath */
- per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
+ per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
return 0;
}
#else
@@ -63,14 +63,14 @@ static int map_irq_stack(unsigned int cpu)
void *va = per_cpu_ptr(&irq_stack_backing_store, cpu);
/* Store actual TOS to avoid adjustment in the hotpath */
- per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
+ per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
return 0;
}
#endif
int irq_init_percpu_irqstack(unsigned int cpu)
{
- if (per_cpu(hardirq_stack_ptr, cpu))
+ if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu))
return 0;
return map_irq_stack(cpu);
}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index eb8bc82846b9..b36f3c367cb2 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -37,12 +37,14 @@
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
+#include <linux/kgdb.h>
#include <linux/ftrace.h>
#include <linux/kasan.h>
#include <linux/moduleloader.h>
#include <linux/objtool.h>
#include <linux/vmalloc.h>
#include <linux/pgtable.h>
+#include <linux/set_memory.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -51,7 +53,6 @@
#include <asm/alternative.h>
#include <asm/insn.h>
#include <asm/debugreg.h>
-#include <asm/set_memory.h>
#include <asm/ibt.h>
#include "common.h"
@@ -281,12 +282,15 @@ static int can_probe(unsigned long paddr)
if (ret < 0)
return 0;
+#ifdef CONFIG_KGDB
/*
- * Another debugging subsystem might insert this breakpoint.
- * In that case, we can't recover it.
+ * If there is a dynamically installed kgdb sw breakpoint,
+ * this function should not be probed.
*/
- if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE &&
+ kgdb_has_hit_break(addr))
return 0;
+#endif
addr += insn.length;
}
@@ -414,18 +418,11 @@ void *alloc_insn_page(void)
if (!page)
return NULL;
- set_vm_flush_reset_perms(page);
- /*
- * First make the page read-only, and only then make it executable to
- * prevent it from being W+X in between.
- */
- set_memory_ro((unsigned long)page, 1);
-
/*
* TODO: Once additional kernel code protection mechanisms are set, ensure
* that the page was not maliciously altered and it is still zeroed.
*/
- set_memory_x((unsigned long)page, 1);
+ set_memory_rox((unsigned long)page, 1);
return page;
}
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index e6b8c5362b94..e57e07b0edb6 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -15,6 +15,7 @@
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
+#include <linux/kgdb.h>
#include <linux/ftrace.h>
#include <linux/objtool.h>
#include <linux/pgtable.h>
@@ -279,19 +280,6 @@ static int insn_is_indirect_jump(struct insn *insn)
return ret;
}
-static bool is_padding_int3(unsigned long addr, unsigned long eaddr)
-{
- unsigned char ops;
-
- for (; addr < eaddr; addr++) {
- if (get_kernel_nofault(ops, (void *)addr) < 0 ||
- ops != INT3_INSN_OPCODE)
- return false;
- }
-
- return true;
-}
-
/* Decode whole function to ensure any instructions don't jump into target */
static int can_optimize(unsigned long paddr)
{
@@ -334,15 +322,15 @@ static int can_optimize(unsigned long paddr)
ret = insn_decode_kernel(&insn, (void *)recovered_insn);
if (ret < 0)
return 0;
-
+#ifdef CONFIG_KGDB
/*
- * In the case of detecting unknown breakpoint, this could be
- * a padding INT3 between functions. Let's check that all the
- * rest of the bytes are also INT3.
+ * If there is a dynamically installed kgdb sw breakpoint,
+ * this function should not be probed.
*/
- if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
- return is_padding_int3(addr, paddr - offset + size) ? 1 : 0;
-
+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE &&
+ kgdb_has_hit_break(addr))
+ return 0;
+#endif
/* Recover address */
insn.kaddr = (void *)addr;
insn.next_byte = (void *)(addr + insn.length);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index cf886f86038a..1cceac5984da 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -798,19 +798,13 @@ extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
* Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
* restoring to/from the stack.
*/
-asm(
-".pushsection .text;"
-".global __raw_callee_save___kvm_vcpu_is_preempted;"
-".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
-"__raw_callee_save___kvm_vcpu_is_preempted:"
-ASM_ENDBR
-"movq __per_cpu_offset(,%rdi,8), %rax;"
-"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
-"setne %al;"
-ASM_RET
-".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
-".popsection");
+#define PV_VCPU_PREEMPTED_ASM \
+ "movq __per_cpu_offset(,%rdi,8), %rax\n\t" \
+ "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \
+ "setne %al\n\t"
+DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted,
+ PV_VCPU_PREEMPTED_ASM, .text);
#endif
static void __init kvm_guest_init(void)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index c032edcd3d95..705fb2a41d7d 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -53,7 +53,7 @@ static unsigned long int get_module_load_offset(void)
*/
if (module_load_offset == 0)
module_load_offset =
- (prandom_u32_max(1024) + 1) * PAGE_SIZE;
+ get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
mutex_unlock(&module_kaslr_mutex);
}
return module_load_offset;
@@ -74,10 +74,11 @@ void *module_alloc(unsigned long size)
return NULL;
p = __vmalloc_node_range(size, MODULE_ALIGN,
- MODULES_VADDR + get_module_load_offset(),
- MODULES_END, gfp_mask,
- PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
- __builtin_return_address(0));
+ MODULES_VADDR + get_module_load_offset(),
+ MODULES_END, gfp_mask, PAGE_KERNEL,
+ VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK,
+ NUMA_NO_NODE, __builtin_return_address(0));
+
if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
vfree(p);
return NULL;
@@ -251,14 +252,13 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
- const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
+ const Elf_Shdr *s, *alt = NULL, *locks = NULL,
*para = NULL, *orc = NULL, *orc_ip = NULL,
- *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL;
+ *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL,
+ *calls = NULL, *cfi = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
- if (!strcmp(".text", secstrings + s->sh_name))
- text = s;
if (!strcmp(".altinstructions", secstrings + s->sh_name))
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
@@ -273,6 +273,10 @@ int module_finalize(const Elf_Ehdr *hdr,
retpolines = s;
if (!strcmp(".return_sites", secstrings + s->sh_name))
returns = s;
+ if (!strcmp(".call_sites", secstrings + s->sh_name))
+ calls = s;
+ if (!strcmp(".cfi_sites", secstrings + s->sh_name))
+ cfi = s;
if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name))
ibt_endbr = s;
}
@@ -285,6 +289,22 @@ int module_finalize(const Elf_Ehdr *hdr,
void *pseg = (void *)para->sh_addr;
apply_paravirt(pseg, pseg + para->sh_size);
}
+ if (retpolines || cfi) {
+ void *rseg = NULL, *cseg = NULL;
+ unsigned int rsize = 0, csize = 0;
+
+ if (retpolines) {
+ rseg = (void *)retpolines->sh_addr;
+ rsize = retpolines->sh_size;
+ }
+
+ if (cfi) {
+ cseg = (void *)cfi->sh_addr;
+ csize = cfi->sh_size;
+ }
+
+ apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize);
+ }
if (retpolines) {
void *rseg = (void *)retpolines->sh_addr;
apply_retpolines(rseg, rseg + retpolines->sh_size);
@@ -298,16 +318,32 @@ int module_finalize(const Elf_Ehdr *hdr,
void *aseg = (void *)alt->sh_addr;
apply_alternatives(aseg, aseg + alt->sh_size);
}
+ if (calls || para) {
+ struct callthunk_sites cs = {};
+
+ if (calls) {
+ cs.call_start = (void *)calls->sh_addr;
+ cs.call_end = (void *)calls->sh_addr + calls->sh_size;
+ }
+
+ if (para) {
+ cs.pv_start = (void *)para->sh_addr;
+ cs.pv_end = (void *)para->sh_addr + para->sh_size;
+ }
+
+ callthunks_patch_module_calls(&cs, me);
+ }
if (ibt_endbr) {
void *iseg = (void *)ibt_endbr->sh_addr;
apply_ibt_endbr(iseg, iseg + ibt_endbr->sh_size);
}
- if (locks && text) {
+ if (locks) {
void *lseg = (void *)locks->sh_addr;
- void *tseg = (void *)text->sh_addr;
+ void *text = me->core_layout.base;
+ void *text_end = text + me->core_layout.text_size;
alternatives_smp_module_add(me, me->name,
lseg, lseg + locks->sh_size,
- tseg, tseg + text->sh_size);
+ text, text_end);
}
if (orc && orc_ip)
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index ed8ac6bcbafb..708751311786 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -250,7 +250,7 @@ static int msr_device_destroy(unsigned int cpu)
return 0;
}
-static char *msr_devnode(struct device *dev, umode_t *mode)
+static char *msr_devnode(const struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 7ca2d46c08cc..327757afb027 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -37,27 +37,10 @@
* nop stub, which must not clobber anything *including the stack* to
* avoid confusing the entry prologues.
*/
-extern void _paravirt_nop(void);
-asm (".pushsection .entry.text, \"ax\"\n"
- ".global _paravirt_nop\n"
- "_paravirt_nop:\n\t"
- ASM_ENDBR
- ASM_RET
- ".size _paravirt_nop, . - _paravirt_nop\n\t"
- ".type _paravirt_nop, @function\n\t"
- ".popsection");
+DEFINE_PARAVIRT_ASM(_paravirt_nop, "", .entry.text);
/* stub always returning 0. */
-asm (".pushsection .entry.text, \"ax\"\n"
- ".global paravirt_ret0\n"
- "paravirt_ret0:\n\t"
- ASM_ENDBR
- "xor %" _ASM_AX ", %" _ASM_AX ";\n\t"
- ASM_RET
- ".size paravirt_ret0, . - paravirt_ret0\n\t"
- ".type paravirt_ret0, @function\n\t"
- ".popsection");
-
+DEFINE_PARAVIRT_ASM(paravirt_ret0, "xor %eax,%eax", .entry.text);
void __init default_banner(void)
{
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c21b7347a26d..40d156a31676 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -600,7 +600,7 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp,
}
if (updmsr)
- write_spec_ctrl_current(msr, false);
+ update_spec_ctrl_cond(msr);
}
static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
@@ -965,7 +965,7 @@ early_param("idle", idle_setup);
unsigned long arch_align_stack(unsigned long sp)
{
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
- sp -= prandom_u32_max(8192);
+ sp -= get_random_u32_below(8192);
return sp & ~0xf;
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 2f314b170c9f..470c128759ea 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -191,13 +191,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
arch_end_context_switch(next_p);
/*
- * Reload esp0 and cpu_current_top_of_stack. This changes
+ * Reload esp0 and pcpu_hot.top_of_stack. This changes
* current_thread_info(). Refresh the SYSENTER configuration in
* case prev or next is vm86.
*/
update_task_stack(next_p);
refresh_sysenter_cs(next);
- this_cpu_write(cpu_current_top_of_stack,
+ this_cpu_write(pcpu_hot.top_of_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE);
@@ -207,7 +207,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (prev->gs | next->gs)
loadsegment(gs, next->gs);
- this_cpu_write(current_task, next_p);
+ raw_cpu_write(pcpu_hot.current_task, next_p);
switch_fpu_finish();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6b3418bff326..4e34b3b68ebd 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -165,7 +165,7 @@ static noinstr unsigned long __rdgsbase_inactive(void)
lockdep_assert_irqs_disabled();
- if (!static_cpu_has(X86_FEATURE_XENPV)) {
+ if (!cpu_feature_enabled(X86_FEATURE_XENPV)) {
native_swapgs();
gsbase = rdgsbase();
native_swapgs();
@@ -190,7 +190,7 @@ static noinstr void __wrgsbase_inactive(unsigned long gsbase)
{
lockdep_assert_irqs_disabled();
- if (!static_cpu_has(X86_FEATURE_XENPV)) {
+ if (!cpu_feature_enabled(X86_FEATURE_XENPV)) {
native_swapgs();
wrgsbase(gsbase);
native_swapgs();
@@ -563,7 +563,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
int cpu = smp_processor_id();
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
- this_cpu_read(hardirq_stack_inuse));
+ this_cpu_read(pcpu_hot.hardirq_stack_inuse));
if (!test_thread_flag(TIF_NEED_FPU_LOAD))
switch_fpu_prepare(prev_fpu, cpu);
@@ -617,8 +617,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch the PDA and FPU contexts.
*/
- this_cpu_write(current_task, next_p);
- this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
+ raw_cpu_write(pcpu_hot.current_task, next_p);
+ raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p));
switch_fpu_finish();
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 37c12fb92906..dfaa270a7cc9 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -44,16 +44,35 @@
#include "tls.h"
-enum x86_regset {
- REGSET_GENERAL,
- REGSET_FP,
- REGSET_XFP,
- REGSET_IOPERM64 = REGSET_XFP,
- REGSET_XSTATE,
- REGSET_TLS,
- REGSET_IOPERM32,
+enum x86_regset_32 {
+ REGSET32_GENERAL,
+ REGSET32_FP,
+ REGSET32_XFP,
+ REGSET32_XSTATE,
+ REGSET32_TLS,
+ REGSET32_IOPERM,
};
+enum x86_regset_64 {
+ REGSET64_GENERAL,
+ REGSET64_FP,
+ REGSET64_IOPERM,
+ REGSET64_XSTATE,
+};
+
+#define REGSET_GENERAL \
+({ \
+ BUILD_BUG_ON((int)REGSET32_GENERAL != (int)REGSET64_GENERAL); \
+ REGSET32_GENERAL; \
+})
+
+#define REGSET_FP \
+({ \
+ BUILD_BUG_ON((int)REGSET32_FP != (int)REGSET64_FP); \
+ REGSET32_FP; \
+})
+
+
struct pt_regs_offset {
const char *name;
int offset;
@@ -788,13 +807,13 @@ long arch_ptrace(struct task_struct *child, long request,
#ifdef CONFIG_X86_32
case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
return copy_regset_to_user(child, &user_x86_32_view,
- REGSET_XFP,
+ REGSET32_XFP,
0, sizeof(struct user_fxsr_struct),
datap) ? -EIO : 0;
case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
return copy_regset_from_user(child, &user_x86_32_view,
- REGSET_XFP,
+ REGSET32_XFP,
0, sizeof(struct user_fxsr_struct),
datap) ? -EIO : 0;
#endif
@@ -1086,13 +1105,13 @@ static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request,
case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
return copy_regset_to_user(child, &user_x86_32_view,
- REGSET_XFP, 0,
+ REGSET32_XFP, 0,
sizeof(struct user32_fxsr_struct),
datap);
case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
return copy_regset_from_user(child, &user_x86_32_view,
- REGSET_XFP, 0,
+ REGSET32_XFP, 0,
sizeof(struct user32_fxsr_struct),
datap);
@@ -1215,29 +1234,38 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
#ifdef CONFIG_X86_64
static struct user_regset x86_64_regsets[] __ro_after_init = {
- [REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
- .n = sizeof(struct user_regs_struct) / sizeof(long),
- .size = sizeof(long), .align = sizeof(long),
- .regset_get = genregs_get, .set = genregs_set
+ [REGSET64_GENERAL] = {
+ .core_note_type = NT_PRSTATUS,
+ .n = sizeof(struct user_regs_struct) / sizeof(long),
+ .size = sizeof(long),
+ .align = sizeof(long),
+ .regset_get = genregs_get,
+ .set = genregs_set
},
- [REGSET_FP] = {
- .core_note_type = NT_PRFPREG,
- .n = sizeof(struct fxregs_state) / sizeof(long),
- .size = sizeof(long), .align = sizeof(long),
- .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set
+ [REGSET64_FP] = {
+ .core_note_type = NT_PRFPREG,
+ .n = sizeof(struct fxregs_state) / sizeof(long),
+ .size = sizeof(long),
+ .align = sizeof(long),
+ .active = regset_xregset_fpregs_active,
+ .regset_get = xfpregs_get,
+ .set = xfpregs_set
},
- [REGSET_XSTATE] = {
- .core_note_type = NT_X86_XSTATE,
- .size = sizeof(u64), .align = sizeof(u64),
- .active = xstateregs_active, .regset_get = xstateregs_get,
- .set = xstateregs_set
+ [REGSET64_XSTATE] = {
+ .core_note_type = NT_X86_XSTATE,
+ .size = sizeof(u64),
+ .align = sizeof(u64),
+ .active = xstateregs_active,
+ .regset_get = xstateregs_get,
+ .set = xstateregs_set
},
- [REGSET_IOPERM64] = {
- .core_note_type = NT_386_IOPERM,
- .n = IO_BITMAP_LONGS,
- .size = sizeof(long), .align = sizeof(long),
- .active = ioperm_active, .regset_get = ioperm_get
+ [REGSET64_IOPERM] = {
+ .core_note_type = NT_386_IOPERM,
+ .n = IO_BITMAP_LONGS,
+ .size = sizeof(long),
+ .align = sizeof(long),
+ .active = ioperm_active,
+ .regset_get = ioperm_get
},
};
@@ -1256,43 +1284,57 @@ static const struct user_regset_view user_x86_64_view = {
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
static struct user_regset x86_32_regsets[] __ro_after_init = {
- [REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
- .n = sizeof(struct user_regs_struct32) / sizeof(u32),
- .size = sizeof(u32), .align = sizeof(u32),
- .regset_get = genregs32_get, .set = genregs32_set
+ [REGSET32_GENERAL] = {
+ .core_note_type = NT_PRSTATUS,
+ .n = sizeof(struct user_regs_struct32) / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .regset_get = genregs32_get,
+ .set = genregs32_set
},
- [REGSET_FP] = {
- .core_note_type = NT_PRFPREG,
- .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32),
- .size = sizeof(u32), .align = sizeof(u32),
- .active = regset_fpregs_active, .regset_get = fpregs_get, .set = fpregs_set
+ [REGSET32_FP] = {
+ .core_note_type = NT_PRFPREG,
+ .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .active = regset_fpregs_active,
+ .regset_get = fpregs_get,
+ .set = fpregs_set
},
- [REGSET_XFP] = {
- .core_note_type = NT_PRXFPREG,
- .n = sizeof(struct fxregs_state) / sizeof(u32),
- .size = sizeof(u32), .align = sizeof(u32),
- .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set
+ [REGSET32_XFP] = {
+ .core_note_type = NT_PRXFPREG,
+ .n = sizeof(struct fxregs_state) / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .active = regset_xregset_fpregs_active,
+ .regset_get = xfpregs_get,
+ .set = xfpregs_set
},
- [REGSET_XSTATE] = {
- .core_note_type = NT_X86_XSTATE,
- .size = sizeof(u64), .align = sizeof(u64),
- .active = xstateregs_active, .regset_get = xstateregs_get,
- .set = xstateregs_set
+ [REGSET32_XSTATE] = {
+ .core_note_type = NT_X86_XSTATE,
+ .size = sizeof(u64),
+ .align = sizeof(u64),
+ .active = xstateregs_active,
+ .regset_get = xstateregs_get,
+ .set = xstateregs_set
},
- [REGSET_TLS] = {
- .core_note_type = NT_386_TLS,
- .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
- .size = sizeof(struct user_desc),
- .align = sizeof(struct user_desc),
- .active = regset_tls_active,
- .regset_get = regset_tls_get, .set = regset_tls_set
+ [REGSET32_TLS] = {
+ .core_note_type = NT_386_TLS,
+ .n = GDT_ENTRY_TLS_ENTRIES,
+ .bias = GDT_ENTRY_TLS_MIN,
+ .size = sizeof(struct user_desc),
+ .align = sizeof(struct user_desc),
+ .active = regset_tls_active,
+ .regset_get = regset_tls_get,
+ .set = regset_tls_set
},
- [REGSET_IOPERM32] = {
- .core_note_type = NT_386_IOPERM,
- .n = IO_BITMAP_BYTES / sizeof(u32),
- .size = sizeof(u32), .align = sizeof(u32),
- .active = ioperm_active, .regset_get = ioperm_get
+ [REGSET32_IOPERM] = {
+ .core_note_type = NT_386_IOPERM,
+ .n = IO_BITMAP_BYTES / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .active = ioperm_active,
+ .regset_get = ioperm_get
},
};
@@ -1311,10 +1353,10 @@ u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask)
{
#ifdef CONFIG_X86_64
- x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
+ x86_64_regsets[REGSET64_XSTATE].n = size / sizeof(u64);
#endif
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
- x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64);
+ x86_32_regsets[REGSET32_XSTATE].n = size / sizeof(u64);
#endif
xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
}
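
The REGSET_GENERAL and REGSET_FP wrappers above use a GNU statement expression so that every use of the shared index also re-checks, at compile time, that the 32-bit and 64-bit enumerators still agree. A minimal userspace sketch of the same pattern follows; BUILD_BUG_ON is re-created here with the negative-array-size trick rather than the kernel's <linux/build_bug.h> helper, and the enum values are invented:

#include <stdio.h>

/* Simplified stand-in for the kernel's BUILD_BUG_ON(). */
#define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

/* Invented enums standing in for x86_regset_32 / x86_regset_64. */
enum regset_32 { REGSET32_GENERAL, REGSET32_FP };
enum regset_64 { REGSET64_GENERAL, REGSET64_FP };

/*
 * GNU statement expression: evaluates to REGSET32_GENERAL, and every use
 * re-runs the compile-time check that the two enums still line up.
 */
#define REGSET_GENERAL							\
({									\
	BUILD_BUG_ON((int)REGSET32_GENERAL != (int)REGSET64_GENERAL);	\
	REGSET32_GENERAL;						\
})

int main(void)
{
	const char *names[] = { "general", "fp" };

	/* Compiles only while the two enums agree on the index. */
	printf("regset %d is %s\n", REGSET_GENERAL, names[REGSET_GENERAL]);
	return 0;
}

If either enumerator is ever moved, every user of REGSET_GENERAL turns into a build failure, which is exactly the property the split enums need.
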
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 4809c0dc4eb0..4a73351f87f8 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -41,6 +41,7 @@
.text
.align PAGE_SIZE
.code64
+SYM_CODE_START_NOALIGN(relocate_range)
SYM_CODE_START_NOALIGN(relocate_kernel)
UNWIND_HINT_EMPTY
ANNOTATE_NOENDBR
@@ -312,5 +313,5 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
int3
SYM_CODE_END(swap_pages)
- .globl kexec_control_code_size
-.set kexec_control_code_size, . - relocate_kernel
+ .skip KEXEC_CONTROL_CODE_MAX_SIZE - (. - relocate_kernel), 0xcc
+SYM_CODE_END(relocate_range);
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
index bba1abd05bfe..79bc8a97a083 100644
--- a/arch/x86/kernel/resource.c
+++ b/arch/x86/kernel/resource.c
@@ -42,8 +42,16 @@ static void remove_e820_regions(struct resource *avail)
resource_clip(avail, e820_start, e820_end);
if (orig.start != avail->start || orig.end != avail->end) {
- pr_info("clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n",
- &orig, avail, e820_start, e820_end);
+ pr_info("resource: avoiding allocation from e820 entry [mem %#010Lx-%#010Lx]\n",
+ e820_start, e820_end);
+ if (avail->end > avail->start)
+ /*
+ * Use %pa instead of %pR because "avail"
+ * is typically IORESOURCE_UNSET, so %pR
+ * shows the size instead of addresses.
+ */
+ pr_info("resource: remaining [mem %pa-%pa] available\n",
+ &avail->start, &avail->end);
orig = *avail;
}
}
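
The hunk above only reworks the log messages, but the surrounding logic is easier to follow with the clipping step in mind: remove_e820_regions() shrinks each available window until it no longer overlaps an e820 entry, and the new avail->end > avail->start test skips the second message when nothing usable remains. A rough userspace model of such clipping; keeping the larger leftover piece is an assumption made for this sketch, not a statement about the exact policy of the kernel's resource_clip():

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start, end; };	/* inclusive, like struct resource */

/* Shrink *avail so it no longer overlaps [start, end]. */
static void clip_range(struct range *avail, uint64_t start, uint64_t end)
{
	uint64_t below, above;

	if (avail->end < start || avail->start > end)
		return;				/* no overlap */

	below = avail->start < start ? start - avail->start : 0;
	above = avail->end > end ? avail->end - end : 0;

	if (below > above)
		avail->end = start - 1;		/* keep the part below */
	else
		avail->start = end + 1;		/* keep the part above */
}

int main(void)
{
	struct range avail = { 0x100000, 0x3fffffff };

	clip_range(&avail, 0x20000000, 0x2fffffff);
	printf("remaining [mem %#010llx-%#010llx]\n",
	       (unsigned long long)avail.start,
	       (unsigned long long)avail.end);
	return 0;
}

When the reserved window covers the whole range, start ends up past end, which is the empty case the caller's avail->end > avail->start check filters out before printing.
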
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 216fee7144ee..88188549647c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -31,9 +31,11 @@
#include <xen/xen.h>
#include <asm/apic.h>
+#include <asm/efi.h>
#include <asm/numa.h>
#include <asm/bios_ebda.h>
#include <asm/bugs.h>
+#include <asm/cacheinfo.h>
#include <asm/cpu.h>
#include <asm/efi.h>
#include <asm/gart.h>
@@ -1074,24 +1076,13 @@ void __init setup_arch(char **cmdline_p)
max_pfn = e820__end_of_ram_pfn();
/* update e820 for memory not covered by WB MTRRs */
- if (IS_ENABLED(CONFIG_MTRR))
- mtrr_bp_init();
- else
- pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel.");
-
+ cache_bp_init();
if (mtrr_trim_uncached_memory(max_pfn))
max_pfn = e820__end_of_ram_pfn();
max_possible_pfn = max_pfn;
/*
- * This call is required when the CPU does not support PAT. If
- * mtrr_bp_init() invoked it already via pat_init() the call has no
- * effect.
- */
- init_cache_modes();
-
- /*
* Define random base addresses for memory sections after max_pfn is
* defined and before each memory section base is used.
*/
@@ -1175,7 +1166,7 @@ void __init setup_arch(char **cmdline_p)
* Moreover, on machines with SandyBridge graphics or in setups that use
* crashkernel the entire 1M is reserved anyway.
*/
- reserve_real_mode();
+ x86_platform.realmode_reserve();
init_mem_mapping();
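
Replacing the direct reserve_real_mode() call with x86_platform.realmode_reserve() (the default assignment appears in the x86_init.c hunk further down) follows the usual x86 platform-ops pattern: generic boot code calls through a function pointer that a platform such as a Xen PV guest can swap out before setup_arch() reaches this point. A self-contained sketch of that pattern, with invented names throughout:

#include <stdio.h>

static void default_realmode_reserve(void)
{
	printf("reserving real-mode trampoline memory\n");
}

static void noop_realmode_reserve(void)
{
	/* e.g. a paravirtualized guest with no real mode to set up */
}

/* Hypothetical ops structure, loosely modelled on x86_platform_ops. */
struct platform_ops {
	void (*realmode_reserve)(void);
};

static struct platform_ops platform = {
	.realmode_reserve = default_realmode_reserve,
};

static void guest_early_init(void)
{
	/* A platform overrides the hook before generic setup runs. */
	platform.realmode_reserve = noop_realmode_reserve;
}

int main(void)
{
	guest_early_init();		/* comment out to keep the default */
	platform.realmode_reserve();	/* what setup_arch() now calls */
	return 0;
}

The call site stays unconditional; all the platform-specific knowledge lives with whoever rewrites the pointer.
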
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 49325caa7307..c242dc47e9cb 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -11,6 +11,7 @@
#include <linux/smp.h>
#include <linux/topology.h>
#include <linux/pfn.h>
+#include <linux/stackprotector.h>
#include <asm/sections.h>
#include <asm/processor.h>
#include <asm/desc.h>
@@ -21,10 +22,6 @@
#include <asm/proto.h>
#include <asm/cpumask.h>
#include <asm/cpu.h>
-#include <asm/stackprotector.h>
-
-DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
-EXPORT_PER_CPU_SYMBOL(cpu_number);
#ifdef CONFIG_X86_64
#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
@@ -172,7 +169,7 @@ void __init setup_per_cpu_areas(void)
for_each_possible_cpu(cpu) {
per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
- per_cpu(cpu_number, cpu) = cpu;
+ per_cpu(pcpu_hot.cpu_number, cpu) = cpu;
setup_percpu_segment(cpu);
/*
* Copy data used in early init routines from the
@@ -211,7 +208,7 @@ void __init setup_per_cpu_areas(void)
* area. Reload any changed state for the boot CPU.
*/
if (!cpu)
- switch_to_new_gdt(cpu);
+ switch_gdt_and_percpu_base(cpu);
}
/* indicate the early static arrays will soon be gone */
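
With this change cpu_number stops being a standalone per-CPU variable and becomes a field of pcpu_hot, the per-CPU structure that groups the most frequently touched fields (current task, top of stack, CPU number) so they share a cache line. A stripped-down model of the layout idea; the field list and sizes here are illustrative, not the kernel's exact struct pcpu_hot:

#include <stdio.h>

#define NR_CPUS		4
#define CACHELINE	64

/* Illustrative "hot" per-CPU data; the real struct pcpu_hot differs. */
struct pcpu_hot {
	void		*current_task;
	int		cpu_number;
	unsigned long	top_of_stack;
} __attribute__((aligned(CACHELINE)));

static struct pcpu_hot pcpu_hot[NR_CPUS];

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		pcpu_hot[cpu].cpu_number = cpu;	/* mirrors the hunk above */

	printf("sizeof(struct pcpu_hot) = %zu (one cache line)\n",
	       sizeof(struct pcpu_hot));
	printf("cpu 2 sees cpu_number = %d\n", pcpu_hot[2].cpu_number);
	return 0;
}
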
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index a428c62330d3..679026a640ef 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -1536,32 +1536,32 @@ static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
{
struct insn *insn = &ctxt->insn;
+ enum insn_mmio_type mmio;
unsigned int bytes = 0;
- enum mmio_type mmio;
enum es_result ret;
u8 sign_byte;
long *reg_data;
mmio = insn_decode_mmio(insn, &bytes);
- if (mmio == MMIO_DECODE_FAILED)
+ if (mmio == INSN_MMIO_DECODE_FAILED)
return ES_DECODE_FAILED;
- if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) {
+ if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs);
if (!reg_data)
return ES_DECODE_FAILED;
}
switch (mmio) {
- case MMIO_WRITE:
+ case INSN_MMIO_WRITE:
memcpy(ghcb->shared_buffer, reg_data, bytes);
ret = vc_do_mmio(ghcb, ctxt, bytes, false);
break;
- case MMIO_WRITE_IMM:
+ case INSN_MMIO_WRITE_IMM:
memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
ret = vc_do_mmio(ghcb, ctxt, bytes, false);
break;
- case MMIO_READ:
+ case INSN_MMIO_READ:
ret = vc_do_mmio(ghcb, ctxt, bytes, true);
if (ret)
break;
@@ -1572,7 +1572,7 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
memcpy(reg_data, ghcb->shared_buffer, bytes);
break;
- case MMIO_READ_ZERO_EXTEND:
+ case INSN_MMIO_READ_ZERO_EXTEND:
ret = vc_do_mmio(ghcb, ctxt, bytes, true);
if (ret)
break;
@@ -1581,7 +1581,7 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
memset(reg_data, 0, insn->opnd_bytes);
memcpy(reg_data, ghcb->shared_buffer, bytes);
break;
- case MMIO_READ_SIGN_EXTEND:
+ case INSN_MMIO_READ_SIGN_EXTEND:
ret = vc_do_mmio(ghcb, ctxt, bytes, true);
if (ret)
break;
@@ -1600,7 +1600,7 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
memset(reg_data, sign_byte, insn->opnd_bytes);
memcpy(reg_data, ghcb->shared_buffer, bytes);
break;
- case MMIO_MOVS:
+ case INSN_MMIO_MOVS:
ret = vc_handle_mmio_movs(ctxt, bytes);
break;
default:
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 9c7265b524c7..1504eb8d25aa 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -37,180 +37,27 @@
#include <asm/sighandling.h>
#include <asm/vm86.h>
-#ifdef CONFIG_X86_64
-#include <linux/compat.h>
-#include <asm/proto.h>
-#include <asm/ia32_unistd.h>
-#include <asm/fpu/xstate.h>
-#endif /* CONFIG_X86_64 */
-
#include <asm/syscall.h>
#include <asm/sigframe.h>
#include <asm/signal.h>
-#ifdef CONFIG_X86_64
-/*
- * If regs->ss will cause an IRET fault, change it. Otherwise leave it
- * alone. Using this generally makes no sense unless
- * user_64bit_mode(regs) would return true.
- */
-static void force_valid_ss(struct pt_regs *regs)
+static inline int is_ia32_compat_frame(struct ksignal *ksig)
{
- u32 ar;
- asm volatile ("lar %[old_ss], %[ar]\n\t"
- "jz 1f\n\t" /* If invalid: */
- "xorl %[ar], %[ar]\n\t" /* set ar = 0 */
- "1:"
- : [ar] "=r" (ar)
- : [old_ss] "rm" ((u16)regs->ss));
-
- /*
- * For a valid 64-bit user context, we need DPL 3, type
- * read-write data or read-write exp-down data, and S and P
- * set. We can't use VERW because VERW doesn't check the
- * P bit.
- */
- ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK;
- if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) &&
- ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN))
- regs->ss = __USER_DS;
+ return IS_ENABLED(CONFIG_IA32_EMULATION) &&
+ ksig->ka.sa.sa_flags & SA_IA32_ABI;
}
-# define CONTEXT_COPY_SIZE offsetof(struct sigcontext, reserved1)
-#else
-# define CONTEXT_COPY_SIZE sizeof(struct sigcontext)
-#endif
-static bool restore_sigcontext(struct pt_regs *regs,
- struct sigcontext __user *usc,
- unsigned long uc_flags)
+static inline int is_ia32_frame(struct ksignal *ksig)
{
- struct sigcontext sc;
-
- /* Always make any pending restarted system calls return -EINTR */
- current->restart_block.fn = do_no_restart_syscall;
-
- if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE))
- return false;
-
-#ifdef CONFIG_X86_32
- loadsegment(gs, sc.gs);
- regs->fs = sc.fs;
- regs->es = sc.es;
- regs->ds = sc.ds;
-#endif /* CONFIG_X86_32 */
-
- regs->bx = sc.bx;
- regs->cx = sc.cx;
- regs->dx = sc.dx;
- regs->si = sc.si;
- regs->di = sc.di;
- regs->bp = sc.bp;
- regs->ax = sc.ax;
- regs->sp = sc.sp;
- regs->ip = sc.ip;
-
-#ifdef CONFIG_X86_64
- regs->r8 = sc.r8;
- regs->r9 = sc.r9;
- regs->r10 = sc.r10;
- regs->r11 = sc.r11;
- regs->r12 = sc.r12;
- regs->r13 = sc.r13;
- regs->r14 = sc.r14;
- regs->r15 = sc.r15;
-#endif /* CONFIG_X86_64 */
-
- /* Get CS/SS and force CPL3 */
- regs->cs = sc.cs | 0x03;
- regs->ss = sc.ss | 0x03;
-
- regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS);
- /* disable syscall checks */
- regs->orig_ax = -1;
-
-#ifdef CONFIG_X86_64
- /*
- * Fix up SS if needed for the benefit of old DOSEMU and
- * CRIU.
- */
- if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs)))
- force_valid_ss(regs);
-#endif
-
- return fpu__restore_sig((void __user *)sc.fpstate,
- IS_ENABLED(CONFIG_X86_32));
+ return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig);
}
-static __always_inline int
-__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
- struct pt_regs *regs, unsigned long mask)
+static inline int is_x32_frame(struct ksignal *ksig)
{
-#ifdef CONFIG_X86_32
- unsigned int gs;
- savesegment(gs, gs);
-
- unsafe_put_user(gs, (unsigned int __user *)&sc->gs, Efault);
- unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault);
- unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault);
- unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault);
-#endif /* CONFIG_X86_32 */
-
- unsafe_put_user(regs->di, &sc->di, Efault);
- unsafe_put_user(regs->si, &sc->si, Efault);
- unsafe_put_user(regs->bp, &sc->bp, Efault);
- unsafe_put_user(regs->sp, &sc->sp, Efault);
- unsafe_put_user(regs->bx, &sc->bx, Efault);
- unsafe_put_user(regs->dx, &sc->dx, Efault);
- unsafe_put_user(regs->cx, &sc->cx, Efault);
- unsafe_put_user(regs->ax, &sc->ax, Efault);
-#ifdef CONFIG_X86_64
- unsafe_put_user(regs->r8, &sc->r8, Efault);
- unsafe_put_user(regs->r9, &sc->r9, Efault);
- unsafe_put_user(regs->r10, &sc->r10, Efault);
- unsafe_put_user(regs->r11, &sc->r11, Efault);
- unsafe_put_user(regs->r12, &sc->r12, Efault);
- unsafe_put_user(regs->r13, &sc->r13, Efault);
- unsafe_put_user(regs->r14, &sc->r14, Efault);
- unsafe_put_user(regs->r15, &sc->r15, Efault);
-#endif /* CONFIG_X86_64 */
-
- unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault);
- unsafe_put_user(current->thread.error_code, &sc->err, Efault);
- unsafe_put_user(regs->ip, &sc->ip, Efault);
-#ifdef CONFIG_X86_32
- unsafe_put_user(regs->cs, (unsigned int __user *)&sc->cs, Efault);
- unsafe_put_user(regs->flags, &sc->flags, Efault);
- unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault);
- unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault);
-#else /* !CONFIG_X86_32 */
- unsafe_put_user(regs->flags, &sc->flags, Efault);
- unsafe_put_user(regs->cs, &sc->cs, Efault);
- unsafe_put_user(0, &sc->gs, Efault);
- unsafe_put_user(0, &sc->fs, Efault);
- unsafe_put_user(regs->ss, &sc->ss, Efault);
-#endif /* CONFIG_X86_32 */
-
- unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault);
-
- /* non-iBCS2 extensions.. */
- unsafe_put_user(mask, &sc->oldmask, Efault);
- unsafe_put_user(current->thread.cr2, &sc->cr2, Efault);
- return 0;
-Efault:
- return -EFAULT;
+ return IS_ENABLED(CONFIG_X86_X32_ABI) &&
+ ksig->ka.sa.sa_flags & SA_X32_ABI;
}
-#define unsafe_put_sigcontext(sc, fp, regs, set, label) \
-do { \
- if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \
- goto label; \
-} while(0);
-
-#define unsafe_put_sigmask(set, frame, label) \
- unsafe_put_user(*(__u64 *)(set), \
- (__u64 __user *)&(frame)->uc.uc_sigmask, \
- label)
-
/*
* Set up a signal frame.
*/
@@ -223,24 +70,12 @@ do { \
/*
* Determine which stack to use..
*/
-static unsigned long align_sigframe(unsigned long sp)
-{
-#ifdef CONFIG_X86_32
- /*
- * Align the stack pointer according to the i386 ABI,
- * i.e. so that on function entry ((sp + 4) & 15) == 0.
- */
- sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
-#else /* !CONFIG_X86_32 */
- sp = round_down(sp, FRAME_ALIGNMENT) - 8;
-#endif
- return sp;
-}
-
-static void __user *
-get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
+void __user *
+get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
void __user **fpstate)
{
+ struct k_sigaction *ka = &ksig->ka;
+ int ia32_frame = is_ia32_frame(ksig);
/* Default to using normal stack */
bool nested_altstack = on_sig_stack(regs->sp);
bool entering_altstack = false;
@@ -249,7 +84,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
unsigned long buf_fx = 0;
/* redzone */
- if (IS_ENABLED(CONFIG_X86_64))
+ if (!ia32_frame)
sp -= 128;
/* This is the X/Open sanctioned signal stack switching. */
@@ -263,7 +98,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
sp = current->sas_ss_sp + current->sas_ss_size;
entering_altstack = true;
}
- } else if (IS_ENABLED(CONFIG_X86_32) &&
+ } else if (ia32_frame &&
!nested_altstack &&
regs->ss != __USER_DS &&
!(ka->sa.sa_flags & SA_RESTORER) &&
@@ -273,11 +108,19 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
entering_altstack = true;
}
- sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
- &buf_fx, &math_size);
+ sp = fpu__alloc_mathframe(sp, ia32_frame, &buf_fx, &math_size);
*fpstate = (void __user *)sp;
- sp = align_sigframe(sp - frame_size);
+ sp -= frame_size;
+
+ if (ia32_frame)
+ /*
+ * Align the stack pointer according to the i386 ABI,
+ * i.e. so that on function entry ((sp + 4) & 15) == 0.
+ */
+ sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
+ else
+ sp = round_down(sp, FRAME_ALIGNMENT) - 8;
/*
* If we are on the alternate signal stack and would overflow it, don't.
@@ -300,391 +143,6 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
return (void __user *)sp;
}
-#ifdef CONFIG_X86_32
-static const struct {
- u16 poplmovl;
- u32 val;
- u16 int80;
-} __attribute__((packed)) retcode = {
- 0xb858, /* popl %eax; movl $..., %eax */
- __NR_sigreturn,
- 0x80cd, /* int $0x80 */
-};
-
-static const struct {
- u8 movl;
- u32 val;
- u16 int80;
- u8 pad;
-} __attribute__((packed)) rt_retcode = {
- 0xb8, /* movl $..., %eax */
- __NR_rt_sigreturn,
- 0x80cd, /* int $0x80 */
- 0
-};
-
-static int
-__setup_frame(int sig, struct ksignal *ksig, sigset_t *set,
- struct pt_regs *regs)
-{
- struct sigframe __user *frame;
- void __user *restorer;
- void __user *fp = NULL;
-
- frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp);
-
- if (!user_access_begin(frame, sizeof(*frame)))
- return -EFAULT;
-
- unsafe_put_user(sig, &frame->sig, Efault);
- unsafe_put_sigcontext(&frame->sc, fp, regs, set, Efault);
- unsafe_put_user(set->sig[1], &frame->extramask[0], Efault);
- if (current->mm->context.vdso)
- restorer = current->mm->context.vdso +
- vdso_image_32.sym___kernel_sigreturn;
- else
- restorer = &frame->retcode;
- if (ksig->ka.sa.sa_flags & SA_RESTORER)
- restorer = ksig->ka.sa.sa_restorer;
-
- /* Set up to return from userspace. */
- unsafe_put_user(restorer, &frame->pretcode, Efault);
-
- /*
- * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80
- *
- * WE DO NOT USE IT ANY MORE! It's only left here for historical
- * reasons and because gdb uses it as a signature to notice
- * signal handler stack frames.
- */
- unsafe_put_user(*((u64 *)&retcode), (u64 *)frame->retcode, Efault);
- user_access_end();
-
- /* Set up registers for signal handler */
- regs->sp = (unsigned long)frame;
- regs->ip = (unsigned long)ksig->ka.sa.sa_handler;
- regs->ax = (unsigned long)sig;
- regs->dx = 0;
- regs->cx = 0;
-
- regs->ds = __USER_DS;
- regs->es = __USER_DS;
- regs->ss = __USER_DS;
- regs->cs = __USER_CS;
-
- return 0;
-
-Efault:
- user_access_end();
- return -EFAULT;
-}
-
-static int __setup_rt_frame(int sig, struct ksignal *ksig,
- sigset_t *set, struct pt_regs *regs)
-{
- struct rt_sigframe __user *frame;
- void __user *restorer;
- void __user *fp = NULL;
-
- frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp);
-
- if (!user_access_begin(frame, sizeof(*frame)))
- return -EFAULT;
-
- unsafe_put_user(sig, &frame->sig, Efault);
- unsafe_put_user(&frame->info, &frame->pinfo, Efault);
- unsafe_put_user(&frame->uc, &frame->puc, Efault);
-
- /* Create the ucontext. */
- if (static_cpu_has(X86_FEATURE_XSAVE))
- unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault);
- else
- unsafe_put_user(0, &frame->uc.uc_flags, Efault);
- unsafe_put_user(0, &frame->uc.uc_link, Efault);
- unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
-
- /* Set up to return from userspace. */
- restorer = current->mm->context.vdso +
- vdso_image_32.sym___kernel_rt_sigreturn;
- if (ksig->ka.sa.sa_flags & SA_RESTORER)
- restorer = ksig->ka.sa.sa_restorer;
- unsafe_put_user(restorer, &frame->pretcode, Efault);
-
- /*
- * This is movl $__NR_rt_sigreturn, %ax ; int $0x80
- *
- * WE DO NOT USE IT ANY MORE! It's only left here for historical
- * reasons and because gdb uses it as a signature to notice
- * signal handler stack frames.
- */
- unsafe_put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode, Efault);
- unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
- unsafe_put_sigmask(set, frame, Efault);
- user_access_end();
-
- if (copy_siginfo_to_user(&frame->info, &ksig->info))
- return -EFAULT;
-
- /* Set up registers for signal handler */
- regs->sp = (unsigned long)frame;
- regs->ip = (unsigned long)ksig->ka.sa.sa_handler;
- regs->ax = (unsigned long)sig;
- regs->dx = (unsigned long)&frame->info;
- regs->cx = (unsigned long)&frame->uc;
-
- regs->ds = __USER_DS;
- regs->es = __USER_DS;
- regs->ss = __USER_DS;
- regs->cs = __USER_CS;
-
- return 0;
-Efault:
- user_access_end();
- return -EFAULT;
-}
-#else /* !CONFIG_X86_32 */
-static unsigned long frame_uc_flags(struct pt_regs *regs)
-{
- unsigned long flags;
-
- if (boot_cpu_has(X86_FEATURE_XSAVE))
- flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS;
- else
- flags = UC_SIGCONTEXT_SS;
-
- if (likely(user_64bit_mode(regs)))
- flags |= UC_STRICT_RESTORE_SS;
-
- return flags;
-}
-
-static int __setup_rt_frame(int sig, struct ksignal *ksig,
- sigset_t *set, struct pt_regs *regs)
-{
- struct rt_sigframe __user *frame;
- void __user *fp = NULL;
- unsigned long uc_flags;
-
- /* x86-64 should always use SA_RESTORER. */
- if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
- return -EFAULT;
-
- frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp);
- uc_flags = frame_uc_flags(regs);
-
- if (!user_access_begin(frame, sizeof(*frame)))
- return -EFAULT;
-
- /* Create the ucontext. */
- unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
- unsafe_put_user(0, &frame->uc.uc_link, Efault);
- unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
-
- /* Set up to return from userspace. If provided, use a stub
- already in userspace. */
- unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault);
- unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
- unsafe_put_sigmask(set, frame, Efault);
- user_access_end();
-
- if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
- if (copy_siginfo_to_user(&frame->info, &ksig->info))
- return -EFAULT;
- }
-
- /* Set up registers for signal handler */
- regs->di = sig;
- /* In case the signal handler was declared without prototypes */
- regs->ax = 0;
-
- /* This also works for non SA_SIGINFO handlers because they expect the
- next argument after the signal number on the stack. */
- regs->si = (unsigned long)&frame->info;
- regs->dx = (unsigned long)&frame->uc;
- regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
-
- regs->sp = (unsigned long)frame;
-
- /*
- * Set up the CS and SS registers to run signal handlers in
- * 64-bit mode, even if the handler happens to be interrupting
- * 32-bit or 16-bit code.
- *
- * SS is subtle. In 64-bit mode, we don't need any particular
- * SS descriptor, but we do need SS to be valid. It's possible
- * that the old SS is entirely bogus -- this can happen if the
- * signal we're trying to deliver is #GP or #SS caused by a bad
- * SS value. We also have a compatibility issue here: DOSEMU
- * relies on the contents of the SS register indicating the
- * SS value at the time of the signal, even though that code in
- * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU
- * avoids relying on sigreturn to restore SS; instead it uses
- * a trampoline.) So we do our best: if the old SS was valid,
- * we keep it. Otherwise we replace it.
- */
- regs->cs = __USER_CS;
-
- if (unlikely(regs->ss != __USER_DS))
- force_valid_ss(regs);
-
- return 0;
-
-Efault:
- user_access_end();
- return -EFAULT;
-}
-#endif /* CONFIG_X86_32 */
-
-#ifdef CONFIG_X86_X32_ABI
-static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to,
- const struct kernel_siginfo *from)
-{
- struct compat_siginfo new;
-
- copy_siginfo_to_external32(&new, from);
- if (from->si_signo == SIGCHLD) {
- new._sifields._sigchld_x32._utime = from->si_utime;
- new._sifields._sigchld_x32._stime = from->si_stime;
- }
- if (copy_to_user(to, &new, sizeof(struct compat_siginfo)))
- return -EFAULT;
- return 0;
-}
-
-int copy_siginfo_to_user32(struct compat_siginfo __user *to,
- const struct kernel_siginfo *from)
-{
- if (in_x32_syscall())
- return x32_copy_siginfo_to_user(to, from);
- return __copy_siginfo_to_user32(to, from);
-}
-#endif /* CONFIG_X86_X32_ABI */
-
-static int x32_setup_rt_frame(struct ksignal *ksig,
- compat_sigset_t *set,
- struct pt_regs *regs)
-{
-#ifdef CONFIG_X86_X32_ABI
- struct rt_sigframe_x32 __user *frame;
- unsigned long uc_flags;
- void __user *restorer;
- void __user *fp = NULL;
-
- if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
- return -EFAULT;
-
- frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp);
-
- uc_flags = frame_uc_flags(regs);
-
- if (!user_access_begin(frame, sizeof(*frame)))
- return -EFAULT;
-
- /* Create the ucontext. */
- unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
- unsafe_put_user(0, &frame->uc.uc_link, Efault);
- unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
- unsafe_put_user(0, &frame->uc.uc__pad0, Efault);
- restorer = ksig->ka.sa.sa_restorer;
- unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault);
- unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
- unsafe_put_sigmask(set, frame, Efault);
- user_access_end();
-
- if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
- if (x32_copy_siginfo_to_user(&frame->info, &ksig->info))
- return -EFAULT;
- }
-
- /* Set up registers for signal handler */
- regs->sp = (unsigned long) frame;
- regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
-
- /* We use the x32 calling convention here... */
- regs->di = ksig->sig;
- regs->si = (unsigned long) &frame->info;
- regs->dx = (unsigned long) &frame->uc;
-
- loadsegment(ds, __USER_DS);
- loadsegment(es, __USER_DS);
-
- regs->cs = __USER_CS;
- regs->ss = __USER_DS;
-#endif /* CONFIG_X86_X32_ABI */
-
- return 0;
-#ifdef CONFIG_X86_X32_ABI
-Efault:
- user_access_end();
- return -EFAULT;
-#endif
-}
-
-/*
- * Do a signal return; undo the signal stack.
- */
-#ifdef CONFIG_X86_32
-SYSCALL_DEFINE0(sigreturn)
-{
- struct pt_regs *regs = current_pt_regs();
- struct sigframe __user *frame;
- sigset_t set;
-
- frame = (struct sigframe __user *)(regs->sp - 8);
-
- if (!access_ok(frame, sizeof(*frame)))
- goto badframe;
- if (__get_user(set.sig[0], &frame->sc.oldmask) ||
- __get_user(set.sig[1], &frame->extramask[0]))
- goto badframe;
-
- set_current_blocked(&set);
-
- /*
- * x86_32 has no uc_flags bits relevant to restore_sigcontext.
- * Save a few cycles by skipping the __get_user.
- */
- if (!restore_sigcontext(regs, &frame->sc, 0))
- goto badframe;
- return regs->ax;
-
-badframe:
- signal_fault(regs, frame, "sigreturn");
-
- return 0;
-}
-#endif /* CONFIG_X86_32 */
-
-SYSCALL_DEFINE0(rt_sigreturn)
-{
- struct pt_regs *regs = current_pt_regs();
- struct rt_sigframe __user *frame;
- sigset_t set;
- unsigned long uc_flags;
-
- frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
- if (!access_ok(frame, sizeof(*frame)))
- goto badframe;
- if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask))
- goto badframe;
- if (__get_user(uc_flags, &frame->uc.uc_flags))
- goto badframe;
-
- set_current_blocked(&set);
-
- if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
- goto badframe;
-
- if (restore_altstack(&frame->uc.uc_stack))
- goto badframe;
-
- return regs->ax;
-
-badframe:
- signal_fault(regs, frame, "rt_sigreturn");
- return 0;
-}
-
/*
* There are four different struct types for signal frame: sigframe_ia32,
* rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case
@@ -743,43 +201,22 @@ unsigned long get_sigframe_size(void)
return max_frame_size;
}
-static inline int is_ia32_compat_frame(struct ksignal *ksig)
-{
- return IS_ENABLED(CONFIG_IA32_EMULATION) &&
- ksig->ka.sa.sa_flags & SA_IA32_ABI;
-}
-
-static inline int is_ia32_frame(struct ksignal *ksig)
-{
- return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig);
-}
-
-static inline int is_x32_frame(struct ksignal *ksig)
-{
- return IS_ENABLED(CONFIG_X86_X32_ABI) &&
- ksig->ka.sa.sa_flags & SA_X32_ABI;
-}
-
static int
setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
{
- int usig = ksig->sig;
- sigset_t *set = sigmask_to_save();
- compat_sigset_t *cset = (compat_sigset_t *) set;
-
/* Perform fixup for the pre-signal frame. */
rseq_signal_deliver(ksig, regs);
/* Set up the stack frame */
if (is_ia32_frame(ksig)) {
if (ksig->ka.sa.sa_flags & SA_SIGINFO)
- return ia32_setup_rt_frame(usig, ksig, cset, regs);
+ return ia32_setup_rt_frame(ksig, regs);
else
- return ia32_setup_frame(usig, ksig, cset, regs);
+ return ia32_setup_frame(ksig, regs);
} else if (is_x32_frame(ksig)) {
- return x32_setup_rt_frame(ksig, cset, regs);
+ return x32_setup_rt_frame(ksig, regs);
} else {
- return __setup_rt_frame(ksig->sig, ksig, set, regs);
+ return x64_setup_rt_frame(ksig, regs);
}
}
@@ -969,36 +406,3 @@ bool sigaltstack_size_valid(size_t ss_size)
return true;
}
#endif /* CONFIG_DYNAMIC_SIGFRAME */
-
-#ifdef CONFIG_X86_X32_ABI
-COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
-{
- struct pt_regs *regs = current_pt_regs();
- struct rt_sigframe_x32 __user *frame;
- sigset_t set;
- unsigned long uc_flags;
-
- frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
-
- if (!access_ok(frame, sizeof(*frame)))
- goto badframe;
- if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask))
- goto badframe;
- if (__get_user(uc_flags, &frame->uc.uc_flags))
- goto badframe;
-
- set_current_blocked(&set);
-
- if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
- goto badframe;
-
- if (compat_restore_altstack(&frame->uc.uc_stack))
- goto badframe;
-
- return regs->ax;
-
-badframe:
- signal_fault(regs, frame, "x32 rt_sigreturn");
- return 0;
-}
-#endif
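
After the refactor, get_sigframe() chooses the stack rules per frame type rather than per kernel configuration: only 64-bit frames skip the 128-byte red zone, ia32 frames are aligned so the handler sees ((sp + 4) & 15) == 0 on entry, and 64-bit frames end up with sp equal to 8 modulo 16. A quick userspace check of that arithmetic, assuming FRAME_ALIGNMENT is 16 and using made-up frame sizes:

#include <stdio.h>
#include <assert.h>

#define FRAME_ALIGNMENT	16UL

static unsigned long pick_sigframe_sp(unsigned long sp, int ia32_frame,
				      unsigned long frame_size)
{
	/* red zone: only 64-bit frames have to skip it */
	if (!ia32_frame)
		sp -= 128;

	sp -= frame_size;

	if (ia32_frame)
		/* i386 ABI: on handler entry ((sp + 4) & 15) == 0 */
		sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
	else
		/* x86-64 ABI: handler entry with sp % 16 == 8 */
		sp = (sp & -FRAME_ALIGNMENT) - 8;

	return sp;
}

int main(void)
{
	unsigned long sp = 0x7ffd1234abc0UL;	/* arbitrary user stack */

	unsigned long sp32 = pick_sigframe_sp(sp, 1, 0x2e0);
	unsigned long sp64 = pick_sigframe_sp(sp, 0, 0x4c0);

	assert(((sp32 + 4) % 16) == 0);
	assert((sp64 % 16) == 8);
	printf("ia32 frame sp=%#lx, 64-bit frame sp=%#lx\n", sp32, sp64);
	return 0;
}
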
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/kernel/signal_32.c
index c9c3859322fa..2553136cf39b 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/kernel/signal_32.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * linux/arch/x86_64/ia32/ia32_signal.c
- *
* Copyright (C) 1991, 1992 Linus Torvalds
*
* 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
@@ -26,7 +24,6 @@
#include <linux/uaccess.h>
#include <asm/fpu/signal.h>
#include <asm/ptrace.h>
-#include <asm/ia32_unistd.h>
#include <asm/user32.h>
#include <uapi/asm/sigcontext.h>
#include <asm/proto.h>
@@ -35,6 +32,9 @@
#include <asm/sighandling.h>
#include <asm/smap.h>
+#ifdef CONFIG_IA32_EMULATION
+#include <asm/ia32_unistd.h>
+
static inline void reload_segments(struct sigcontext_32 *sc)
{
unsigned int cur;
@@ -53,6 +53,21 @@ static inline void reload_segments(struct sigcontext_32 *sc)
loadsegment(es, sc->es | 0x03);
}
+#define sigset32_t compat_sigset_t
+#define restore_altstack32 compat_restore_altstack
+#define unsafe_save_altstack32 unsafe_compat_save_altstack
+
+#else
+
+#define sigset32_t sigset_t
+#define __NR_ia32_sigreturn __NR_sigreturn
+#define __NR_ia32_rt_sigreturn __NR_rt_sigreturn
+#define restore_altstack32 restore_altstack
+#define unsafe_save_altstack32 unsafe_save_altstack
+#define __copy_siginfo_to_user32 copy_siginfo_to_user
+
+#endif
+
/*
* Do a signal return; undo the signal stack.
*/
@@ -86,6 +101,7 @@ static bool ia32_restore_sigcontext(struct pt_regs *regs,
/* disable syscall checks */
regs->orig_ax = -1;
+#ifdef CONFIG_IA32_EMULATION
/*
* Reload fs and gs if they have changed in the signal
* handler. This does not handle long fs/gs base changes in
@@ -93,10 +109,17 @@ static bool ia32_restore_sigcontext(struct pt_regs *regs,
* normal case.
*/
reload_segments(&sc);
+#else
+ loadsegment(gs, sc.gs);
+ regs->fs = sc.fs;
+ regs->es = sc.es;
+ regs->ds = sc.ds;
+#endif
+
return fpu__restore_sig(compat_ptr(sc.fpstate), 1);
}
-COMPAT_SYSCALL_DEFINE0(sigreturn)
+SYSCALL32_DEFINE0(sigreturn)
{
struct pt_regs *regs = current_pt_regs();
struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
@@ -119,7 +142,7 @@ badframe:
return 0;
}
-COMPAT_SYSCALL_DEFINE0(rt_sigreturn)
+SYSCALL32_DEFINE0(rt_sigreturn)
{
struct pt_regs *regs = current_pt_regs();
struct rt_sigframe_ia32 __user *frame;
@@ -129,7 +152,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn)
if (!access_ok(frame, sizeof(*frame)))
goto badframe;
- if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask))
+ if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask))
goto badframe;
set_current_blocked(&set);
@@ -137,7 +160,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn)
if (!ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext))
goto badframe;
- if (compat_restore_altstack(&frame->uc.uc_stack))
+ if (restore_altstack32(&frame->uc.uc_stack))
goto badframe;
return regs->ax;
@@ -159,9 +182,15 @@ __unsafe_setup_sigcontext32(struct sigcontext_32 __user *sc,
struct pt_regs *regs, unsigned int mask)
{
unsafe_put_user(get_user_seg(gs), (unsigned int __user *)&sc->gs, Efault);
+#ifdef CONFIG_IA32_EMULATION
unsafe_put_user(get_user_seg(fs), (unsigned int __user *)&sc->fs, Efault);
unsafe_put_user(get_user_seg(ds), (unsigned int __user *)&sc->ds, Efault);
unsafe_put_user(get_user_seg(es), (unsigned int __user *)&sc->es, Efault);
+#else
+ unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault);
+ unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault);
+ unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault);
+#endif
unsafe_put_user(regs->di, &sc->di, Efault);
unsafe_put_user(regs->si, &sc->si, Efault);
@@ -196,43 +225,9 @@ do { \
goto label; \
} while(0)
-/*
- * Determine which stack to use..
- */
-static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
- size_t frame_size,
- void __user **fpstate)
-{
- unsigned long sp, fx_aligned, math_size;
-
- /* Default to using normal stack */
- sp = regs->sp;
-
- /* This is the X/Open sanctioned signal stack switching. */
- if (ksig->ka.sa.sa_flags & SA_ONSTACK)
- sp = sigsp(sp, ksig);
- /* This is the legacy signal stack switching. */
- else if (regs->ss != __USER32_DS &&
- !(ksig->ka.sa.sa_flags & SA_RESTORER) &&
- ksig->ka.sa.sa_restorer)
- sp = (unsigned long) ksig->ka.sa.sa_restorer;
-
- sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
- *fpstate = (struct _fpstate_32 __user *) sp;
- if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned,
- math_size))
- return (void __user *) -1L;
-
- sp -= frame_size;
- /* Align the stack pointer according to the i386 ABI,
- * i.e. so that on function entry ((sp + 4) & 15) == 0. */
- sp = ((sp + 4) & -16ul) - 4;
- return (void __user *) sp;
-}
-
-int ia32_setup_frame(int sig, struct ksignal *ksig,
- compat_sigset_t *set, struct pt_regs *regs)
+int ia32_setup_frame(struct ksignal *ksig, struct pt_regs *regs)
{
+ sigset32_t *set = (sigset32_t *) sigmask_to_save();
struct sigframe_ia32 __user *frame;
void __user *restorer;
void __user *fp = NULL;
@@ -264,7 +259,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
if (!user_access_begin(frame, sizeof(*frame)))
return -EFAULT;
- unsafe_put_user(sig, &frame->sig, Efault);
+ unsafe_put_user(ksig->sig, &frame->sig, Efault);
unsafe_put_sigcontext32(&frame->sc, fp, regs, set, Efault);
unsafe_put_user(set->sig[1], &frame->extramask[0], Efault);
unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault);
@@ -280,15 +275,20 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
/* Make -mregparm=3 work */
- regs->ax = sig;
+ regs->ax = ksig->sig;
regs->dx = 0;
regs->cx = 0;
- loadsegment(ds, __USER32_DS);
- loadsegment(es, __USER32_DS);
+#ifdef CONFIG_IA32_EMULATION
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#else
+ regs->ds = __USER_DS;
+ regs->es = __USER_DS;
+#endif
regs->cs = __USER32_CS;
- regs->ss = __USER32_DS;
+ regs->ss = __USER_DS;
return 0;
Efault:
@@ -296,9 +296,9 @@ Efault:
return -EFAULT;
}
-int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
- compat_sigset_t *set, struct pt_regs *regs)
+int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
{
+ sigset32_t *set = (sigset32_t *) sigmask_to_save();
struct rt_sigframe_ia32 __user *frame;
void __user *restorer;
void __user *fp = NULL;
@@ -321,7 +321,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
if (!user_access_begin(frame, sizeof(*frame)))
return -EFAULT;
- unsafe_put_user(sig, &frame->sig, Efault);
+ unsafe_put_user(ksig->sig, &frame->sig, Efault);
unsafe_put_user(ptr_to_compat(&frame->info), &frame->pinfo, Efault);
unsafe_put_user(ptr_to_compat(&frame->uc), &frame->puc, Efault);
@@ -331,7 +331,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
else
unsafe_put_user(0, &frame->uc.uc_flags, Efault);
unsafe_put_user(0, &frame->uc.uc_link, Efault);
- unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
+ unsafe_save_altstack32(&frame->uc.uc_stack, regs->sp, Efault);
if (ksig->ka.sa.sa_flags & SA_RESTORER)
restorer = ksig->ka.sa.sa_restorer;
@@ -357,15 +357,20 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
/* Make -mregparm=3 work */
- regs->ax = sig;
+ regs->ax = ksig->sig;
regs->dx = (unsigned long) &frame->info;
regs->cx = (unsigned long) &frame->uc;
- loadsegment(ds, __USER32_DS);
- loadsegment(es, __USER32_DS);
+#ifdef CONFIG_IA32_EMULATION
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#else
+ regs->ds = __USER_DS;
+ regs->es = __USER_DS;
+#endif
regs->cs = __USER32_CS;
- regs->ss = __USER32_DS;
+ regs->ss = __USER_DS;
return 0;
Efault:
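
The #define block near the top of the renamed file (sigset32_t, restore_altstack32, unsafe_save_altstack32, ...) is what lets one source file build both as the native 32-bit signal code and as the IA32-emulation compat code: under CONFIG_IA32_EMULATION the aliases resolve to the compat_* helpers, while a native build resolves them to the plain ones. The same trick in miniature, with invented names and a -DCOMPAT_BUILD switch standing in for the Kconfig option:

#include <stdio.h>

#ifdef COMPAT_BUILD
static void compat_restore_stack(unsigned long sp)
{
	printf("compat helper, sp=%#lx\n", sp);
}
#define restore_stack32		compat_restore_stack
#else
static void restore_stack(unsigned long sp)
{
	printf("native helper, sp=%#lx\n", sp);
}
#define restore_stack32		restore_stack
#endif

/* One copy of the 32-bit sigreturn path serves both builds. */
static void sigreturn32_common(unsigned long sp)
{
	restore_stack32(sp);
}

int main(void)
{
	sigreturn32_common(0x1000);
	return 0;
}

Build it once plain and once with -DCOMPAT_BUILD: the shared function never changes, only the names behind the aliases do.
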
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
new file mode 100644
index 000000000000..ff9c55064223
--- /dev/null
+++ b/arch/x86/kernel/signal_64.c
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+
+#include <asm/ucontext.h>
+#include <asm/fpu/signal.h>
+#include <asm/sighandling.h>
+
+#include <asm/syscall.h>
+#include <asm/sigframe.h>
+#include <asm/signal.h>
+
+/*
+ * If regs->ss will cause an IRET fault, change it. Otherwise leave it
+ * alone. Using this generally makes no sense unless
+ * user_64bit_mode(regs) would return true.
+ */
+static void force_valid_ss(struct pt_regs *regs)
+{
+ u32 ar;
+ asm volatile ("lar %[old_ss], %[ar]\n\t"
+ "jz 1f\n\t" /* If invalid: */
+ "xorl %[ar], %[ar]\n\t" /* set ar = 0 */
+ "1:"
+ : [ar] "=r" (ar)
+ : [old_ss] "rm" ((u16)regs->ss));
+
+ /*
+ * For a valid 64-bit user context, we need DPL 3, type
+ * read-write data or read-write exp-down data, and S and P
+ * set. We can't use VERW because VERW doesn't check the
+ * P bit.
+ */
+ ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK;
+ if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) &&
+ ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN))
+ regs->ss = __USER_DS;
+}
+
+static bool restore_sigcontext(struct pt_regs *regs,
+ struct sigcontext __user *usc,
+ unsigned long uc_flags)
+{
+ struct sigcontext sc;
+
+ /* Always make any pending restarted system calls return -EINTR */
+ current->restart_block.fn = do_no_restart_syscall;
+
+ if (copy_from_user(&sc, usc, offsetof(struct sigcontext, reserved1)))
+ return false;
+
+ regs->bx = sc.bx;
+ regs->cx = sc.cx;
+ regs->dx = sc.dx;
+ regs->si = sc.si;
+ regs->di = sc.di;
+ regs->bp = sc.bp;
+ regs->ax = sc.ax;
+ regs->sp = sc.sp;
+ regs->ip = sc.ip;
+ regs->r8 = sc.r8;
+ regs->r9 = sc.r9;
+ regs->r10 = sc.r10;
+ regs->r11 = sc.r11;
+ regs->r12 = sc.r12;
+ regs->r13 = sc.r13;
+ regs->r14 = sc.r14;
+ regs->r15 = sc.r15;
+
+ /* Get CS/SS and force CPL3 */
+ regs->cs = sc.cs | 0x03;
+ regs->ss = sc.ss | 0x03;
+
+ regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS);
+ /* disable syscall checks */
+ regs->orig_ax = -1;
+
+ /*
+ * Fix up SS if needed for the benefit of old DOSEMU and
+ * CRIU.
+ */
+ if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs)))
+ force_valid_ss(regs);
+
+ return fpu__restore_sig((void __user *)sc.fpstate, 0);
+}
+
+static __always_inline int
+__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
+ struct pt_regs *regs, unsigned long mask)
+{
+ unsafe_put_user(regs->di, &sc->di, Efault);
+ unsafe_put_user(regs->si, &sc->si, Efault);
+ unsafe_put_user(regs->bp, &sc->bp, Efault);
+ unsafe_put_user(regs->sp, &sc->sp, Efault);
+ unsafe_put_user(regs->bx, &sc->bx, Efault);
+ unsafe_put_user(regs->dx, &sc->dx, Efault);
+ unsafe_put_user(regs->cx, &sc->cx, Efault);
+ unsafe_put_user(regs->ax, &sc->ax, Efault);
+ unsafe_put_user(regs->r8, &sc->r8, Efault);
+ unsafe_put_user(regs->r9, &sc->r9, Efault);
+ unsafe_put_user(regs->r10, &sc->r10, Efault);
+ unsafe_put_user(regs->r11, &sc->r11, Efault);
+ unsafe_put_user(regs->r12, &sc->r12, Efault);
+ unsafe_put_user(regs->r13, &sc->r13, Efault);
+ unsafe_put_user(regs->r14, &sc->r14, Efault);
+ unsafe_put_user(regs->r15, &sc->r15, Efault);
+
+ unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault);
+ unsafe_put_user(current->thread.error_code, &sc->err, Efault);
+ unsafe_put_user(regs->ip, &sc->ip, Efault);
+ unsafe_put_user(regs->flags, &sc->flags, Efault);
+ unsafe_put_user(regs->cs, &sc->cs, Efault);
+ unsafe_put_user(0, &sc->gs, Efault);
+ unsafe_put_user(0, &sc->fs, Efault);
+ unsafe_put_user(regs->ss, &sc->ss, Efault);
+
+ unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault);
+
+ /* non-iBCS2 extensions.. */
+ unsafe_put_user(mask, &sc->oldmask, Efault);
+ unsafe_put_user(current->thread.cr2, &sc->cr2, Efault);
+ return 0;
+Efault:
+ return -EFAULT;
+}
+
+#define unsafe_put_sigcontext(sc, fp, regs, set, label) \
+do { \
+ if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \
+ goto label; \
+} while(0);
+
+#define unsafe_put_sigmask(set, frame, label) \
+ unsafe_put_user(*(__u64 *)(set), \
+ (__u64 __user *)&(frame)->uc.uc_sigmask, \
+ label)
+
+static unsigned long frame_uc_flags(struct pt_regs *regs)
+{
+ unsigned long flags;
+
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
+ flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS;
+ else
+ flags = UC_SIGCONTEXT_SS;
+
+ if (likely(user_64bit_mode(regs)))
+ flags |= UC_STRICT_RESTORE_SS;
+
+ return flags;
+}
+
+int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
+{
+ sigset_t *set = sigmask_to_save();
+ struct rt_sigframe __user *frame;
+ void __user *fp = NULL;
+ unsigned long uc_flags;
+
+ /* x86-64 should always use SA_RESTORER. */
+ if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
+ return -EFAULT;
+
+ frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp);
+ uc_flags = frame_uc_flags(regs);
+
+ if (!user_access_begin(frame, sizeof(*frame)))
+ return -EFAULT;
+
+ /* Create the ucontext. */
+ unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
+ unsafe_put_user(0, &frame->uc.uc_link, Efault);
+ unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
+
+ /* Set up to return from userspace. If provided, use a stub
+ already in userspace. */
+ unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault);
+ unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
+ unsafe_put_sigmask(set, frame, Efault);
+ user_access_end();
+
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ if (copy_siginfo_to_user(&frame->info, &ksig->info))
+ return -EFAULT;
+ }
+
+ /* Set up registers for signal handler */
+ regs->di = ksig->sig;
+ /* In case the signal handler was declared without prototypes */
+ regs->ax = 0;
+
+ /* This also works for non SA_SIGINFO handlers because they expect the
+ next argument after the signal number on the stack. */
+ regs->si = (unsigned long)&frame->info;
+ regs->dx = (unsigned long)&frame->uc;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ regs->sp = (unsigned long)frame;
+
+ /*
+ * Set up the CS and SS registers to run signal handlers in
+ * 64-bit mode, even if the handler happens to be interrupting
+ * 32-bit or 16-bit code.
+ *
+ * SS is subtle. In 64-bit mode, we don't need any particular
+ * SS descriptor, but we do need SS to be valid. It's possible
+ * that the old SS is entirely bogus -- this can happen if the
+ * signal we're trying to deliver is #GP or #SS caused by a bad
+ * SS value. We also have a compatibility issue here: DOSEMU
+ * relies on the contents of the SS register indicating the
+ * SS value at the time of the signal, even though that code in
+ * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU
+ * avoids relying on sigreturn to restore SS; instead it uses
+ * a trampoline.) So we do our best: if the old SS was valid,
+ * we keep it. Otherwise we replace it.
+ */
+ regs->cs = __USER_CS;
+
+ if (unlikely(regs->ss != __USER_DS))
+ force_valid_ss(regs);
+
+ return 0;
+
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+SYSCALL_DEFINE0(rt_sigreturn)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct rt_sigframe __user *frame;
+ sigset_t set;
+ unsigned long uc_flags;
+
+ frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
+ if (!access_ok(frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask))
+ goto badframe;
+ if (__get_user(uc_flags, &frame->uc.uc_flags))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ goto badframe;
+
+ if (restore_altstack(&frame->uc.uc_stack))
+ goto badframe;
+
+ return regs->ax;
+
+badframe:
+ signal_fault(regs, frame, "rt_sigreturn");
+ return 0;
+}
+
+#ifdef CONFIG_X86_X32_ABI
+static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to,
+ const struct kernel_siginfo *from)
+{
+ struct compat_siginfo new;
+
+ copy_siginfo_to_external32(&new, from);
+ if (from->si_signo == SIGCHLD) {
+ new._sifields._sigchld_x32._utime = from->si_utime;
+ new._sifields._sigchld_x32._stime = from->si_stime;
+ }
+ if (copy_to_user(to, &new, sizeof(struct compat_siginfo)))
+ return -EFAULT;
+ return 0;
+}
+
+int copy_siginfo_to_user32(struct compat_siginfo __user *to,
+ const struct kernel_siginfo *from)
+{
+ if (in_x32_syscall())
+ return x32_copy_siginfo_to_user(to, from);
+ return __copy_siginfo_to_user32(to, from);
+}
+
+int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
+{
+ compat_sigset_t *set = (compat_sigset_t *) sigmask_to_save();
+ struct rt_sigframe_x32 __user *frame;
+ unsigned long uc_flags;
+ void __user *restorer;
+ void __user *fp = NULL;
+
+ if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
+ return -EFAULT;
+
+ frame = get_sigframe(ksig, regs, sizeof(*frame), &fp);
+
+ uc_flags = frame_uc_flags(regs);
+
+ if (!user_access_begin(frame, sizeof(*frame)))
+ return -EFAULT;
+
+ /* Create the ucontext. */
+ unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
+ unsafe_put_user(0, &frame->uc.uc_link, Efault);
+ unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
+ unsafe_put_user(0, &frame->uc.uc__pad0, Efault);
+ restorer = ksig->ka.sa.sa_restorer;
+ unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault);
+ unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
+ unsafe_put_sigmask(set, frame, Efault);
+ user_access_end();
+
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ if (x32_copy_siginfo_to_user(&frame->info, &ksig->info))
+ return -EFAULT;
+ }
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long) frame;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ /* We use the x32 calling convention here... */
+ regs->di = ksig->sig;
+ regs->si = (unsigned long) &frame->info;
+ regs->dx = (unsigned long) &frame->uc;
+
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+
+ regs->cs = __USER_CS;
+ regs->ss = __USER_DS;
+
+ return 0;
+
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
+COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct rt_sigframe_x32 __user *frame;
+ sigset_t set;
+ unsigned long uc_flags;
+
+ frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
+
+ if (!access_ok(frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask))
+ goto badframe;
+ if (__get_user(uc_flags, &frame->uc.uc_flags))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ goto badframe;
+
+ if (compat_restore_altstack(&frame->uc.uc_stack))
+ goto badframe;
+
+ return regs->ax;
+
+badframe:
+ signal_fault(regs, frame, "x32 rt_sigreturn");
+ return 0;
+}
+#endif /* CONFIG_X86_X32_ABI */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3f3ea0287f69..55cad72715d9 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -56,8 +56,10 @@
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
+#include <linux/stackprotector.h>
#include <asm/acpi.h>
+#include <asm/cacheinfo.h>
#include <asm/desc.h>
#include <asm/nmi.h>
#include <asm/irq.h>
@@ -1046,7 +1048,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
/* Just in case we booted with a single CPU. */
alternatives_enable_smp();
- per_cpu(current_task, cpu) = idle;
+ per_cpu(pcpu_hot.current_task, cpu) = idle;
cpu_init_stack_canary(cpu, idle);
/* Initialize the interrupt stack(s) */
@@ -1056,7 +1058,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
- per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
+ per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle);
#else
initial_gs = per_cpu_offset(cpu);
#endif
@@ -1428,8 +1430,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
uv_system_init();
- set_mtrr_aps_delayed_init();
-
smp_quirk_init_udelay();
speculative_store_bypass_ht_init();
@@ -1439,12 +1439,12 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
void arch_thaw_secondary_cpus_begin(void)
{
- set_mtrr_aps_delayed_init();
+ set_cache_aps_delayed_init(true);
}
void arch_thaw_secondary_cpus_end(void)
{
- mtrr_aps_init();
+ cache_aps_init();
}
/*
@@ -1453,7 +1453,11 @@ void arch_thaw_secondary_cpus_end(void)
void __init native_smp_prepare_boot_cpu(void)
{
int me = smp_processor_id();
- switch_to_new_gdt(me);
+
+ /* SMP handles this from setup_per_cpu_areas() */
+ if (!IS_ENABLED(CONFIG_SMP))
+ switch_gdt_and_percpu_base(me);
+
/* already set me in cpu_online_mask in boot_cpu_init() */
cpumask_set_cpu(me, cpu_callout_mask);
cpu_set_state_online(me);
@@ -1487,7 +1491,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
nmi_selftest();
impress_friends();
- mtrr_aps_init();
+ cache_aps_init();
}
static int __initdata setup_possible_cpus = -1;
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index aaaba85d6d7f..2ebc338980bc 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -34,6 +34,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
switch (type) {
case CALL:
+ func = callthunks_translate_call_dest(func);
code = text_gen_insn(CALL_INSN_OPCODE, insn, func);
if (func == &__static_call_return0) {
emulate = code;
@@ -52,7 +53,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
case RET:
if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
- code = text_gen_insn(JMP32_INSN_OPCODE, insn, &__x86_return_thunk);
+ code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk);
else
code = &retinsn;
break;
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 8617d1ed9d31..1b83377274b8 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -106,7 +106,7 @@ int arch_register_cpu(int num)
* Xen PV guests don't support CPU0 hotplug at all.
*/
if (c->x86_vendor != X86_VENDOR_INTEL ||
- boot_cpu_has(X86_FEATURE_XENPV))
+ cpu_feature_enabled(X86_FEATURE_XENPV))
cpu0_hotpluggable = 0;
/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 178015a820f0..d317dc3d06a3 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -15,6 +15,7 @@
#include <linux/context_tracking.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
+#include <linux/kmsan.h>
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
@@ -67,13 +68,13 @@
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
-#include <asm/proto.h>
#else
#include <asm/processor-flags.h>
#include <asm/setup.h>
-#include <asm/proto.h>
#endif
+#include <asm/proto.h>
+
DECLARE_BITMAP(system_vectors, NR_VECTORS);
static inline void cond_local_irq_enable(struct pt_regs *regs)
@@ -301,6 +302,12 @@ static noinstr bool handle_bug(struct pt_regs *regs)
{
bool handled = false;
+ /*
+ * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
+ * is a rare case that uses @regs without passing them to
+ * irqentry_enter().
+ */
+ kmsan_unpoison_entry_regs(regs);
if (!is_valid_bugaddr(regs->ip))
return handled;
@@ -851,7 +858,7 @@ DEFINE_IDTENTRY_RAW(exc_int3)
*/
asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
{
- struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
+ struct pt_regs *regs = (struct pt_regs *)this_cpu_read(pcpu_hot.top_of_stack) - 1;
if (regs != eregs)
*regs = *eregs;
return regs;
@@ -869,7 +876,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
* trust it and switch to the current kernel stack
*/
if (ip_within_syscall_gap(regs)) {
- sp = this_cpu_read(cpu_current_top_of_stack);
+ sp = this_cpu_read(pcpu_hot.top_of_stack);
goto sync;
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cafacb2e58cc..a78e73da4a74 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -51,7 +51,7 @@ int tsc_clocksource_reliable;
static u32 art_to_tsc_numerator;
static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
-struct clocksource *art_related_clocksource;
+static struct clocksource *art_related_clocksource;
struct cyc2ns {
struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index c059820dfaea..cdf6c6060170 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -136,6 +136,21 @@ static struct orc_entry null_orc_entry = {
.type = UNWIND_HINT_TYPE_CALL
};
+#ifdef CONFIG_CALL_THUNKS
+static struct orc_entry *orc_callthunk_find(unsigned long ip)
+{
+ if (!is_callthunk((void *)ip))
+ return NULL;
+
+ return &null_orc_entry;
+}
+#else
+static struct orc_entry *orc_callthunk_find(unsigned long ip)
+{
+ return NULL;
+}
+#endif
+
/* Fake frame pointer entry -- used as a fallback for generated code */
static struct orc_entry orc_fp_entry = {
.type = UNWIND_HINT_TYPE_CALL,
@@ -189,7 +204,11 @@ static struct orc_entry *orc_find(unsigned long ip)
if (orc)
return orc;
- return orc_ftrace_find(ip);
+ orc = orc_ftrace_find(ip);
+ if (orc)
+ return orc;
+
+ return orc_callthunk_find(ip);
}
#ifdef CONFIG_MODULES
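
orc_find() now walks three sources in order (the regular ORC tables, then ftrace trampolines, then call thunks), and the CONFIG_CALL_THUNKS=n stub keeps the caller free of #ifdefs by simply returning NULL. A tiny model of that chained-fallback shape, with invented names and a -DHAVE_THUNKS switch in place of the Kconfig option:

#include <stdio.h>

struct entry { const char *source; };

static struct entry text_entry = { "text tables" };

#ifdef HAVE_THUNKS
static struct entry thunk_entry = { "call thunks" };
static struct entry *thunk_find(unsigned long ip)
{
	return ip >= 0x2000 ? &thunk_entry : NULL;
}
#else
/* Stub keeps the caller unconditional when the feature is off. */
static struct entry *thunk_find(unsigned long ip)
{
	return NULL;
}
#endif

static struct entry *text_find(unsigned long ip)
{
	return ip < 0x1000 ? &text_entry : NULL;
}

/* Mirrors orc_find(): try each source in turn, first hit wins. */
static struct entry *find(unsigned long ip)
{
	struct entry *e = text_find(ip);

	if (e)
		return e;
	return thunk_find(ip);
}

int main(void)
{
	struct entry *e = find(0x2345);

	printf("%s\n", e ? e->source : "no unwind info");
	return 0;
}
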
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index b63cf8f7745e..6c07f6daaa22 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -722,8 +722,9 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
switch (opc1) {
case 0xeb: /* jmp 8 */
case 0xe9: /* jmp 32 */
- case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */
break;
+ case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */
+ goto setup;
case 0xe8: /* call relative */
branch_clear_offset(auprobe, insn);
@@ -753,6 +754,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
return -ENOTSUPP;
}
+setup:
auprobe->branch.opc1 = opc1;
auprobe->branch.ilen = insn->length;
auprobe->branch.offs = insn->immediate.value;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 15f29053cec4..2e0ee14229bf 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -132,18 +132,19 @@ SECTIONS
CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
- ALIGN_ENTRY_TEXT_BEGIN
- ENTRY_TEXT
- ALIGN_ENTRY_TEXT_END
SOFTIRQENTRY_TEXT
- STATIC_CALL_TEXT
- *(.gnu.warning)
-
#ifdef CONFIG_RETPOLINE
__indirect_thunk_start = .;
*(.text.__x86.*)
__indirect_thunk_end = .;
#endif
+ STATIC_CALL_TEXT
+
+ ALIGN_ENTRY_TEXT_BEGIN
+ ENTRY_TEXT
+ ALIGN_ENTRY_TEXT_END
+ *(.gnu.warning)
+
} :text =0xcccc
/* End of text section, which should occupy whole number of pages */
@@ -290,6 +291,13 @@ SECTIONS
*(.return_sites)
__return_sites_end = .;
}
+
+ . = ALIGN(8);
+ .call_sites : AT(ADDR(.call_sites) - LOAD_OFFSET) {
+ __call_sites = .;
+ *(.call_sites)
+ __call_sites_end = .;
+ }
#endif
#ifdef CONFIG_X86_KERNEL_IBT
@@ -301,6 +309,15 @@ SECTIONS
}
#endif
+#ifdef CONFIG_FINEIBT
+ . = ALIGN(8);
+ .cfi_sites : AT(ADDR(.cfi_sites) - LOAD_OFFSET) {
+ __cfi_sites = .;
+ *(.cfi_sites)
+ __cfi_sites_end = .;
+ }
+#endif
+
/*
* struct alt_inst entries. From the header (alternative.h):
* "Alternative instructions for different CPU types or capabilities"
@@ -493,11 +510,3 @@ INIT_PER_CPU(irq_stack_backing_store);
#endif
#endif /* CONFIG_X86_64 */
-
-#ifdef CONFIG_KEXEC_CORE
-#include <asm/kexec.h>
-
-. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
- "kexec control code size is too big");
-#endif
-
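
The linker-script additions above create .call_sites (and, under CONFIG_FINEIBT, .cfi_sites) bounded by start/end symbols, following the pattern already used for .retpoline_sites and .return_sites: objtool emits one entry per interesting location and early patching code walks the array at boot. A consumer sketch, assuming the entries are 32-bit self-relative offsets like the existing *_sites sections (an assumption, not something this hunk states), with an illustrative function name:

#include <linux/types.h>

extern s32 __call_sites[], __call_sites_end[];

static void __init walk_call_sites(void)
{
	s32 *s;

	for (s = __call_sites; s < __call_sites_end; s++) {
		void *addr = (void *)s + *s;	/* decode the relative offset */

		/* ... patch or account for the call at addr ... */
	}
}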
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 57353519bc11..ef80d361b463 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -25,6 +25,7 @@
#include <asm/iommu.h>
#include <asm/mach_traps.h>
#include <asm/irqdomain.h>
+#include <asm/realmode.h>
void x86_init_noop(void) { }
void __init x86_init_uint_noop(unsigned int unused) { }
@@ -145,6 +146,8 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.get_nmi_reason = default_get_nmi_reason,
.save_sched_clock_state = tsc_save_sched_clock_state,
.restore_sched_clock_state = tsc_restore_sched_clock_state,
+ .realmode_reserve = reserve_real_mode,
+ .realmode_init = init_real_mode,
.hyper.pin_vcpu = x86_op_int_noop,
.guest = {
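
Turning reserve_real_mode()/init_real_mode() into x86_platform hooks gives a platform one well-defined override point instead of scattered checks. A hedged illustration of such an override, where example_platform_setup() is hypothetical and x86_init_noop() is the existing no-op helper defined at the top of x86_init.c:

/* hypothetical platform code, not part of this patch */
static void __init example_platform_setup(void)
{
	/* this platform never boots through the real-mode trampoline */
	x86_platform.realmode_reserve = x86_init_noop;
	x86_platform.realmode_init    = x86_init_noop;
}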
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index fbeaa9ddef59..8e578311ca9d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -49,6 +49,7 @@ config KVM
select SRCU
select INTERVAL_TREE
select HAVE_KVM_PM_NOTIFIER if PM
+ select KVM_GENERIC_HARDWARE_ENABLING
help
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c3f084185daf..2a9f1e200dbc 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -8,6 +8,7 @@
* Copyright 2011 Red Hat, Inc. and/or its affiliates.
* Copyright IBM Corporation, 2008
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/export.h>
@@ -676,7 +677,7 @@ void kvm_set_cpu_caps(void)
);
kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX,
- SF(SGX1) | SF(SGX2)
+ SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA)
);
kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
@@ -774,16 +775,22 @@ struct kvm_cpuid_array {
int nent;
};
+static struct kvm_cpuid_entry2 *get_next_cpuid(struct kvm_cpuid_array *array)
+{
+ if (array->nent >= array->maxnent)
+ return NULL;
+
+ return &array->entries[array->nent++];
+}
+
static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
u32 function, u32 index)
{
- struct kvm_cpuid_entry2 *entry;
+ struct kvm_cpuid_entry2 *entry = get_next_cpuid(array);
- if (array->nent >= array->maxnent)
+ if (!entry)
return NULL;
- entry = &array->entries[array->nent++];
-
memset(entry, 0, sizeof(*entry));
entry->function = function;
entry->index = index;
@@ -960,22 +967,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->edx = edx.full;
break;
}
- /*
- * Per Intel's SDM, the 0x1f is a superset of 0xb,
- * thus they can be handled by common code.
- */
case 0x1f:
case 0xb:
/*
- * Populate entries until the level type (ECX[15:8]) of the
- * previous entry is zero. Note, CPUID EAX.{0x1f,0xb}.0 is
- * the starting entry, filled by the primary do_host_cpuid().
+ * No topology; a valid topology is indicated by the presence
+ * of subleaf 1.
*/
- for (i = 1; entry->ecx & 0xff00; ++i) {
- entry = do_host_cpuid(array, function, i);
- if (!entry)
- goto out;
- }
+ entry->eax = entry->ebx = entry->ecx = 0;
break;
case 0xd: {
u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm();
@@ -1062,9 +1060,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
* userspace. ATTRIBUTES.XFRM is not adjusted as userspace is
* expected to derive it from supported XCR0.
*/
- entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
- SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY |
- SGX_ATTR_KSS;
+ entry->eax &= SGX_ATTR_PRIV_MASK | SGX_ATTR_UNPRIV_MASK;
entry->ebx &= 0;
break;
/* Intel PT */
@@ -1208,6 +1204,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->ebx = entry->ecx = entry->edx = 0;
break;
case 0x8000001e:
+ /* Do not return host topology information. */
+ entry->eax = entry->ebx = entry->ecx = 0;
+ entry->edx = 0; /* reserved */
break;
case 0x8000001F:
if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) {
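
With leaves 0xB/0x1F and 0x8000001E now zeroed by default, a guest sees no CPU topology unless userspace explicitly fills those entries in. A guest-side illustration of what that looks like; the helper name is made up and cpuid_count() is the standard kernel wrapper:

#include <asm/processor.h>	/* cpuid_count() */

static bool guest_sees_extended_topology(void)
{
	unsigned int eax, ebx, ecx, edx;

	cpuid_count(0xb, 0, &eax, &ebx, &ecx, &edx);

	/* an all-zero subleaf 0 is how "no topology enumerated" reads */
	return eax || ebx || ecx;
}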
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index c1390357126a..ee8c4c3496ed 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -4,6 +4,8 @@
*
* Copyright 2016 Red Hat, Inc. and/or its affiliates.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kvm_host.h>
#include <linux/debugfs.h>
#include "lapic.h"
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5cc3efa0e21c..c3443045cd93 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -17,6 +17,7 @@
*
* From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 80d082ecd5c5..71aff0edc0ed 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -17,6 +17,7 @@
* Ben-Ami Yassour <benami@il.ibm.com>
* Andrey Smetanin <asmetanin@virtuozzo.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "x86.h"
#include "lapic.h"
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index e0a7a0e7a73c..cd57a517d04a 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -30,7 +30,7 @@
* Based on QEMU and Xen.
*/
-#define pr_fmt(fmt) "pit: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/slab.h>
@@ -351,7 +351,7 @@ static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period)
if (ps->period < min_period) {
pr_info_ratelimited(
- "kvm: requested %lld ns "
+ "requested %lld ns "
"i8254 timer period limited to %lld ns\n",
ps->period, min_period);
ps->period = min_period;
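
The same two-line pattern recurs in nearly every KVM file in this patch: define pr_fmt() before the first include so every pr_*() call picks up the module name automatically, then drop the hand-rolled "kvm: " (or "pit: ", "SVM: ") prefixes from the format strings themselves. A minimal sketch of the mechanism; the function is illustrative and the logged text approximate:

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt	/* must come before the includes */
#include <linux/printk.h>

static void example(void)
{
	pr_info("i8254 timer period limited to %lld ns\n", 200000LL);
	/* built into kvm.ko this logs roughly: "kvm: i8254 timer period limited to 200000 ns" */
}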
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index e1bb6218bb96..4756bcb5724f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -26,6 +26,8 @@
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
* Port from Qemu.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/bitops.h>
@@ -35,7 +37,7 @@
#include "trace.h"
#define pr_pic_unimpl(fmt, ...) \
- pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__)
+ pr_err_ratelimited("pic: " fmt, ## __VA_ARGS__)
static void pic_irq_request(struct kvm *kvm, int level);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 765943d7cfa5..042dee556125 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -26,6 +26,7 @@
* Yaozu (Eddie) Dong <eddie.dong@intel.com>
* Based on Xen 3.1 code.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/kvm.h>
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index a70952eca905..b2c397dd2bc6 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -7,6 +7,7 @@
* Authors:
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/export.h>
#include <linux/kvm_host.h>
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 3742d9adacfc..16d076a1b91a 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -8,6 +8,7 @@
*
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/slab.h>
@@ -56,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
if (irq->dest_mode == APIC_DEST_PHYSICAL &&
irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
- printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+ pr_info("apic: phys broadcast and lowest prio\n");
irq->delivery_mode = APIC_DM_FIXED;
}
@@ -199,7 +200,7 @@ int kvm_request_irq_source_id(struct kvm *kvm)
irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
if (irq_source_id >= BITS_PER_LONG) {
- printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
+ pr_warn("exhausted allocatable IRQ sources!\n");
irq_source_id = -EFAULT;
goto unlock;
}
@@ -221,7 +222,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
mutex_lock(&kvm->irq_lock);
if (irq_source_id < 0 ||
irq_source_id >= BITS_PER_LONG) {
- printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
+ pr_err("IRQ source ID out of range!\n");
goto unlock;
}
clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c
index ee4f696a0782..482d6639ef88 100644
--- a/arch/x86/kvm/kvm_onhyperv.c
+++ b/arch/x86/kvm/kvm_onhyperv.c
@@ -2,6 +2,7 @@
/*
* KVM L1 hypervisor optimizations on Hyper-V.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <asm/mshyperv.h>
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 669ea125b7e2..7cf4eebc9bcc 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -15,6 +15,7 @@
*
* Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/kvm.h>
@@ -1064,8 +1065,7 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
{
if (!kvm->arch.disabled_lapic_found) {
kvm->arch.disabled_lapic_found = true;
- printk(KERN_INFO
- "Disabled LAPIC found during irq injection\n");
+ pr_info("Disabled LAPIC found during irq injection\n");
}
}
@@ -1683,7 +1683,7 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
if (apic->lapic_timer.period < min_period) {
pr_info_ratelimited(
- "kvm: vcpu %i: requested %lld ns "
+ "vcpu %i: requested %lld ns "
"lapic timer period limited to %lld ns\n",
apic->vcpu->vcpu_id,
apic->lapic_timer.period, min_period);
@@ -1968,7 +1968,7 @@ static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg)
deadline = apic->lapic_timer.period;
else if (unlikely(deadline > apic->lapic_timer.period)) {
pr_info_ratelimited(
- "kvm: vcpu %i: requested lapic timer restore with "
+ "vcpu %i: requested lapic timer restore with "
"starting count register %#x=%u (%lld ns) > initial count (%lld ns). "
"Using initial count to start timer.\n",
apic->vcpu->vcpu_id,
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 15d389370f88..aeb240b339f5 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -14,6 +14,7 @@
* Yaniv Kamay <yaniv@qumranet.com>
* Avi Kivity <avi@qumranet.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "irq.h"
#include "ioapic.h"
@@ -2492,6 +2493,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
{
bool list_unstable, zapped_root = false;
+ lockdep_assert_held_write(&kvm->mmu_lock);
trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
@@ -3455,8 +3457,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
}
if (++retry_count > 4) {
- printk_once(KERN_WARNING
- "kvm: Fast #PF retrying more than 4 times.\n");
+ pr_warn_once("Fast #PF retrying more than 4 times.\n");
break;
}
@@ -6166,7 +6167,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
write_lock(&kvm->mmu_lock);
- kvm_mmu_invalidate_begin(kvm, gfn_start, gfn_end);
+ kvm_mmu_invalidate_begin(kvm, 0, -1ul);
flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
@@ -6180,7 +6181,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
gfn_end - gfn_start);
- kvm_mmu_invalidate_end(kvm, gfn_start, gfn_end);
+ kvm_mmu_invalidate_end(kvm, 0, -1ul);
write_unlock(&kvm->mmu_lock);
}
@@ -6646,7 +6647,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
* zap all shadow pages.
*/
if (unlikely(gen == 0)) {
- kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
+ kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n");
kvm_mmu_zap_all_fast(kvm);
}
}
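
The new lockdep_assert_held_write() in __kvm_mmu_prepare_zap_page() turns the implicit locking rule into something CONFIG_LOCKDEP can enforce at runtime and that reads as documentation otherwise. The pattern in isolation, with a hypothetical helper name:

#include <linux/kvm_host.h>
#include <linux/lockdep.h>

/* hypothetical helper: must only run under the MMU write lock */
static void zap_example(struct kvm *kvm)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/* ... mutate shadow-page state that the write lock protects ... */
}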
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 2e09d1b6249f..0a2ac438d647 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -10,6 +10,7 @@
* Author:
* Xiao Guangrong <guangrong.xiao@linux.intel.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/rculist.h>
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index c0fd7e049b4e..fce6f047399f 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -7,7 +7,7 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright 2020 Red Hat, Inc. and/or its affiliates.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include "mmu.h"
@@ -352,7 +352,7 @@ u64 mark_spte_for_access_track(u64 spte)
WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
- "kvm: Access Tracking saved bit locations are not zero\n");
+ "Access Tracking saved bit locations are not zero\n");
spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 6f54dc9409c9..0d8deefee66c 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -435,11 +435,11 @@ static inline void check_spte_writable_invariants(u64 spte)
{
if (spte & shadow_mmu_writable_mask)
WARN_ONCE(!(spte & shadow_host_writable_mask),
- "kvm: MMU-writable SPTE is not Host-writable: %llx",
+ KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx",
spte);
else
WARN_ONCE(is_writable_pte(spte),
- "kvm: Writable SPTE is not MMU-writable: %llx", spte);
+ KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte);
}
static inline bool is_mmu_writable_spte(u64 spte)
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index 39b48e7d7d1a..e26e744df1d1 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "mmu_internal.h"
#include "tdp_iter.h"
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index cc1fb9a65620..bba33aea0fb0 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "mmu.h"
#include "mmu_internal.h"
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index a8502e02f479..9fac1ec03463 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -13,6 +13,7 @@
* Paolo Bonzini <pbonzini@redhat.com>
* Xiao Guangrong <guangrong.xiao@linux.intel.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <asm/mtrr.h>
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index eb594620dd75..d939d3b84e6f 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -9,6 +9,7 @@
* Gleb Natapov <gleb@redhat.com>
* Wei Huang <wei@redhat.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/types.h>
#include <linux/kvm_host.h>
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index 25b9b51abb20..4945456fd646 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -37,6 +37,7 @@ enum kvm_only_cpuid_leafs {
/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */
#define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0)
#define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1)
+#define KVM_X86_FEATURE_SGX_EDECCSSA KVM_X86_FEATURE(CPUID_12_EAX, 11)
/* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */
#define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4)
@@ -102,6 +103,8 @@ static __always_inline u32 __feature_translate(int x86_feature)
return KVM_X86_FEATURE_SGX1;
else if (x86_feature == X86_FEATURE_SGX2)
return KVM_X86_FEATURE_SGX2;
+ else if (x86_feature == X86_FEATURE_SGX_EDECCSSA)
+ return KVM_X86_FEATURE_SGX_EDECCSSA;
else if (x86_feature == X86_FEATURE_CONSTANT_TSC)
return KVM_X86_FEATURE_CONSTANT_TSC;
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index a9c1c2af8d94..cc43638d48a3 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include "x86.h"
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 14677bc31b83..b3928150a37c 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -12,7 +12,7 @@
* Avi Kivity <avi@qumranet.com>
*/
-#define pr_fmt(fmt) "SVM: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_types.h>
#include <linux/hashtable.h>
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 34ac03969f28..700df66d23c7 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -12,7 +12,7 @@
* Avi Kivity <avi@qumranet.com>
*/
-#define pr_fmt(fmt) "SVM: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
@@ -138,15 +138,13 @@ void recalc_intercepts(struct vcpu_svm *svm)
c->intercepts[i] = h->intercepts[i];
if (g->int_ctl & V_INTR_MASKING_MASK) {
- /* We only want the cr8 intercept bits of L1 */
- vmcb_clr_intercept(c, INTERCEPT_CR8_READ);
- vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
-
/*
- * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
- * affect any interrupt we may want to inject; therefore,
- * interrupt window vmexits are irrelevant to L0.
+ * Once running L2 with HF_VINTR_MASK, EFLAGS.IF and CR8
+		 * do not affect any interrupt we may want to inject;
+ * therefore, writes to CR8 are irrelevant to L0, as are
+ * interrupt window vmexits.
*/
+ vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
vmcb_clr_intercept(c, INTERCEPT_VINTR);
}
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 0e313fbae055..1ff068f23841 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -9,6 +9,8 @@
*
* Implementation is based on pmu_intel.c file
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 86d6897f4806..273cba809328 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -6,6 +6,7 @@
*
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index f2453df77727..d13cf53e7390 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1,4 +1,4 @@
-#define pr_fmt(fmt) "SVM: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
@@ -519,21 +519,37 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu)
vcpu->arch.osvw.status |= 1;
}
-static int has_svm(void)
+static bool kvm_is_svm_supported(void)
{
+ int cpu = raw_smp_processor_id();
const char *msg;
+ u64 vm_cr;
if (!cpu_has_svm(&msg)) {
- printk(KERN_INFO "has_svm: %s\n", msg);
- return 0;
+ pr_err("SVM not supported by CPU %d, %s\n", cpu, msg);
+ return false;
}
if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
pr_info("KVM is unsupported when running as an SEV guest\n");
- return 0;
+ return false;
}
- return 1;
+ rdmsrl(MSR_VM_CR, vm_cr);
+ if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) {
+ pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu);
+ return false;
+ }
+
+ return true;
+}
+
+static int svm_check_processor_compat(void)
+{
+ if (!kvm_is_svm_supported())
+ return -EIO;
+
+ return 0;
}
void __svm_write_tsc_multiplier(u64 multiplier)
@@ -572,10 +588,6 @@ static int svm_hardware_enable(void)
if (efer & EFER_SVME)
return -EBUSY;
- if (!has_svm()) {
- pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
- return -EINVAL;
- }
sd = per_cpu_ptr(&svm_data, me);
sd->asid_generation = 1;
sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
@@ -2076,7 +2088,7 @@ static void svm_handle_mce(struct kvm_vcpu *vcpu)
* Erratum 383 triggered. Guest state is corrupt so kill the
* guest.
*/
- pr_err("KVM: Guest triggered AMD Erratum 383\n");
+ pr_err("Guest triggered AMD Erratum 383\n");
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
@@ -2705,9 +2717,9 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr)
msr->data = 0;
switch (msr->index) {
- case MSR_F10H_DECFG:
- if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
- msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
+ case MSR_AMD64_DE_CFG:
+ if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
+ msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
break;
default:
return KVM_MSR_RET_INVALID;
@@ -2806,7 +2818,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = 0x1E;
}
break;
- case MSR_F10H_DECFG:
+ case MSR_AMD64_DE_CFG:
msr_info->data = svm->msr_decfg;
break;
default:
@@ -3035,7 +3047,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_VM_IGNNE:
vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
- case MSR_F10H_DECFG: {
+ case MSR_AMD64_DE_CFG: {
struct kvm_msr_entry msr_entry;
msr_entry.index = msr->index;
@@ -4076,17 +4088,6 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}
-static int is_disabled(void)
-{
- u64 vm_cr;
-
- rdmsrl(MSR_VM_CR, vm_cr);
- if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
- return 1;
-
- return 0;
-}
-
static void
svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
@@ -4098,11 +4099,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xd9;
}
-static int __init svm_check_processor_compat(void)
-{
- return 0;
-}
-
/*
* The kvm parameter can be NULL (module initialization, or invocation before
* VM creation). Be sure to check the kvm parameter before using it.
@@ -4629,7 +4625,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
smap = cr4 & X86_CR4_SMAP;
is_user = svm_get_cpl(vcpu) == 3;
if (smap && (!smep || is_user)) {
- pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
+ pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
/*
* If the fault occurred in userspace, arbitrarily inject #GP
@@ -4701,7 +4697,9 @@ static int svm_vm_init(struct kvm *kvm)
}
static struct kvm_x86_ops svm_x86_ops __initdata = {
- .name = "kvm_amd",
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = svm_check_processor_compat,
.hardware_unsetup = svm_hardware_unsetup,
.hardware_enable = svm_hardware_enable,
@@ -4978,7 +4976,7 @@ static __init int svm_hardware_setup(void)
}
if (nested) {
- printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
+ pr_info("Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
@@ -4996,7 +4994,7 @@ static __init int svm_hardware_setup(void)
/* Force VM NPT level equal to the host's paging level */
kvm_configure_mmu(npt_enabled, get_npt_level(),
get_npt_level(), PG_LEVEL_1G);
- pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
+ pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
/* Setup shadow_me_value and shadow_me_mask */
kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
@@ -5088,10 +5086,7 @@ err:
static struct kvm_x86_init_ops svm_init_ops __initdata = {
- .cpu_has_kvm_support = has_svm,
- .disabled_by_bios = is_disabled,
.hardware_setup = svm_hardware_setup,
- .check_processor_compatibility = svm_check_processor_compat,
.runtime_ops = &svm_x86_ops,
.pmu_ops = &amd_pmu_ops,
@@ -5099,15 +5094,37 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
static int __init svm_init(void)
{
+ int r;
+
__unused_size_checks();
- return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
- __alignof__(struct vcpu_svm), THIS_MODULE);
+ if (!kvm_is_svm_supported())
+ return -EOPNOTSUPP;
+
+ r = kvm_x86_vendor_init(&svm_init_ops);
+ if (r)
+ return r;
+
+ /*
+	 * Common KVM initialization _must_ come last; after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
+ THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
+ return 0;
+
+err_kvm_init:
+ kvm_x86_vendor_exit();
+ return r;
}
static void __exit svm_exit(void)
{
kvm_exit();
+ kvm_x86_vendor_exit();
}
module_init(svm_init)
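
kvm_is_svm_supported() folds the old has_svm() and is_disabled() probes into one helper that is reused both at module load and by the new svm_check_processor_compat() hook. For orientation, a rough sketch of the CPUID check that cpu_has_svm() performs underneath; this is an approximation (the real helper also validates the vendor and the maximum extended leaf and hands back a reason string), and the function name here is made up:

#include <asm/processor.h>	/* cpuid() */

static bool cpu_advertises_svm(void)
{
	unsigned int eax, ebx, ecx, edx;

	cpuid(0x80000001, &eax, &ebx, &ecx, &edx);

	return ecx & (1U << 2);		/* CPUID Fn8000_0001_ECX[2] == SVM */
}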
diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c
index 26a89d0da93e..7af8422d3382 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.c
+++ b/arch/x86/kvm/svm/svm_onhyperv.c
@@ -2,6 +2,7 @@
/*
* KVM L1 hypervisor optimizations on Hyper-V for SVM.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
index 45faf84476ce..6981c1e9a809 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -34,7 +34,7 @@ static inline void svm_hv_hardware_setup(void)
{
if (npt_enabled &&
ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) {
- pr_info("kvm: Hyper-V enlightened NPT TLB flush enabled\n");
+ pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n");
svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
svm_x86_ops.tlb_remote_flush_with_range =
hv_remote_flush_tlb_with_range;
@@ -43,7 +43,7 @@ static inline void svm_hv_hardware_setup(void)
if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) {
int cpu;
- pr_info("kvm: Hyper-V Direct TLB Flush enabled\n");
+ pr_info(KBUILD_MODNAME ": Hyper-V Direct TLB Flush enabled\n");
for_each_online_cpu(cpu) {
struct hv_vp_assist_page *vp_ap =
hv_get_vp_assist_page(cpu);
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 34367dc203f2..8e8295e774f0 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
#include <asm/asm.h>
+#include <asm/asm-offsets.h>
#include <asm/bitsperlong.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/nospec-branch.h>
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index cd2ac9536c99..45162c1bcd8f 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -66,13 +66,13 @@ struct vmcs_config {
u64 misc;
struct nested_vmx_msrs nested;
};
-extern struct vmcs_config vmcs_config;
+extern struct vmcs_config vmcs_config __ro_after_init;
struct vmx_capability {
u32 ept;
u32 vpid;
};
-extern struct vmx_capability vmx_capability;
+extern struct vmx_capability vmx_capability __ro_after_init;
static inline bool cpu_has_vmx_basic_inout(void)
{
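
Tagging vmcs_config and vmx_capability __ro_after_init records that the globals are written once during vendor-module init and never again; the per-CPU compatibility checks added in vmx.c fill local copies instead. A minimal sketch of what the annotation does, using an illustrative variable:

#include <linux/cache.h>	/* __ro_after_init */
#include <linux/init.h>

static int example_knob __ro_after_init;

static int __init example_setup(void)
{
	example_knob = 42;	/* fine: init code may still write it */
	return 0;
}

/*
 * Once init completes the backing page is remapped read-only, so a
 * stray later write faults instead of silently corrupting the value.
 */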
diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c
index f773450fba6e..22daca752797 100644
--- a/arch/x86/kvm/vmx/hyperv.c
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-
-#define pr_fmt(fmt) "kvm/hyper-v: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/errno.h>
#include <linux/smp.h>
@@ -527,7 +526,7 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
} \
while (0)
-__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
+void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
{
evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL);
evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL);
diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
index 883102d567c3..ab08a9b9ab7d 100644
--- a/arch/x86/kvm/vmx/hyperv.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -179,9 +179,7 @@ static __always_inline int get_evmcs_offset(unsigned long field,
{
int offset = evmcs_field_offset(field, clean_field);
- WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n",
- field);
-
+ WARN_ONCE(offset < 0, "accessing unsupported EVMCS field %lx\n", field);
return offset;
}
@@ -273,7 +271,7 @@ static inline void evmcs_load(u64 phys_addr)
vp_ap->enlighten_vmentry = 1;
}
-__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
+void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
#else /* !IS_ENABLED(CONFIG_HYPERV) */
static __always_inline void evmcs_write64(unsigned long field, u64 value) {}
static inline void evmcs_write32(unsigned long field, u32 value) {}
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index d93c715cda6a..557b9c468734 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/objtool.h>
#include <linux/percpu.h>
@@ -203,7 +204,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
/* TODO: not to reset guest simply here. */
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
- pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
+ pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
}
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index e5cec07ca8d9..efce9ad70e4e 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -8,6 +8,8 @@
* Avi Kivity <avi@redhat.com>
* Gleb Natapov <gleb@redhat.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
@@ -762,8 +764,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
return;
warn:
- pr_warn_ratelimited("kvm: vcpu-%d: fail to passthrough LBR.\n",
- vcpu->vcpu_id);
+ pr_warn_ratelimited("vcpu-%d: fail to passthrough LBR.\n", vcpu->vcpu_id);
}
static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 1b56c5e5c9fb..94c38bea60e7 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -1,4 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kvm_host.h>
#include <asm/irq_remapping.h>
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index b12da2a6dec9..aa53c98034bf 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2021 Intel Corporation. */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <asm/sgx.h>
@@ -164,7 +165,7 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
if (!vcpu->kvm->arch.sgx_provisioning_allowed &&
(attributes & SGX_ATTR_PROVISIONKEY)) {
if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY)
- pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n");
+ pr_warn_once("SGX PROVISIONKEY advertised but not allowed\n");
kvm_inject_gp(vcpu, 0);
return 1;
}
@@ -381,7 +382,7 @@ int handle_encls(struct kvm_vcpu *vcpu)
return handle_encls_ecreate(vcpu);
if (leaf == EINIT)
return handle_encls_einit(vcpu);
- WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf);
+ WARN_ONCE(1, "unexpected exit on ENCLS[%u]", leaf);
vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
return 0;
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index 2251b60920f8..106a72c923ca 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "vmcs12.h"
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index ad2ac66ef32e..c788aa382611 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -12,6 +12,7 @@
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/highmem.h>
#include <linux/hrtimer.h>
@@ -444,36 +445,36 @@ void vmread_error(unsigned long field, bool fault)
if (fault)
kvm_spurious_fault();
else
- vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
+ vmx_insn_failed("vmread failed: field=%lx\n", field);
}
noinline void vmwrite_error(unsigned long field, unsigned long value)
{
- vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n",
+ vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
- vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n",
+ vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
- vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n",
+ vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
- vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
+ vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
ext, vpid, gva);
}
noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
{
- vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
+ vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
ext, eptp, gpa);
}
@@ -488,8 +489,8 @@ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
-struct vmcs_config vmcs_config;
-struct vmx_capability vmx_capability;
+struct vmcs_config vmcs_config __ro_after_init;
+struct vmx_capability vmx_capability __ro_after_init;
#define VMX_SEGMENT_FIELD(seg) \
[VCPU_SREG_##seg] = { \
@@ -523,6 +524,8 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
static unsigned long host_idt_base;
#if IS_ENABLED(CONFIG_HYPERV)
+static struct kvm_x86_ops vmx_x86_ops __initdata;
+
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
@@ -551,6 +554,71 @@ static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
return 0;
}
+static __init void hv_init_evmcs(void)
+{
+ int cpu;
+
+ if (!enlightened_vmcs)
+ return;
+
+ /*
+ * Enlightened VMCS usage should be recommended and the host needs
+ * to support eVMCS v1 or above.
+ */
+ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
+ (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
+ KVM_EVMCS_VERSION) {
+
+ /* Check that we have assist pages on all online CPUs */
+ for_each_online_cpu(cpu) {
+ if (!hv_get_vp_assist_page(cpu)) {
+ enlightened_vmcs = false;
+ break;
+ }
+ }
+
+ if (enlightened_vmcs) {
+ pr_info("Using Hyper-V Enlightened VMCS\n");
+ static_branch_enable(&enable_evmcs);
+ }
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
+ vmx_x86_ops.enable_l2_tlb_flush
+ = hv_enable_l2_tlb_flush;
+
+ } else {
+ enlightened_vmcs = false;
+ }
+}
+
+static void hv_reset_evmcs(void)
+{
+ struct hv_vp_assist_page *vp_ap;
+
+ if (!static_branch_unlikely(&enable_evmcs))
+ return;
+
+ /*
+ * KVM should enable eVMCS if and only if all CPUs have a VP assist
+	 * page, and should reject CPU onlining if eVMCS is enabled but the CPU
+ * doesn't have a VP assist page allocated.
+ */
+ vp_ap = hv_get_vp_assist_page(smp_processor_id());
+ if (WARN_ON_ONCE(!vp_ap))
+ return;
+
+ /*
+ * Reset everything to support using non-enlightened VMCS access later
+ * (e.g. when we reload the module with enlightened_vmcs=0)
+ */
+ vp_ap->nested_control.features.directhypercall = 0;
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+}
+
+#else /* IS_ENABLED(CONFIG_HYPERV) */
+static void hv_init_evmcs(void) {}
+static void hv_reset_evmcs(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
/*
@@ -1613,8 +1681,8 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
if (!instr_len)
goto rip_updated;
- WARN(exit_reason.enclave_mode,
- "KVM: skipping instruction after SGX enclave VM-Exit");
+ WARN_ONCE(exit_reason.enclave_mode,
+ "skipping instruction after SGX enclave VM-Exit");
orig_rip = kvm_rip_read(vcpu);
rip = orig_rip + instr_len;
@@ -2448,88 +2516,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
}
}
-static __init int cpu_has_kvm_support(void)
-{
- return cpu_has_vmx();
-}
-
-static __init int vmx_disabled_by_bios(void)
-{
- return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
- !boot_cpu_has(X86_FEATURE_VMX);
-}
-
-static int kvm_cpu_vmxon(u64 vmxon_pointer)
-{
- u64 msr;
-
- cr4_set_bits(X86_CR4_VMXE);
-
- asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
- _ASM_EXTABLE(1b, %l[fault])
- : : [vmxon_pointer] "m"(vmxon_pointer)
- : : fault);
- return 0;
-
-fault:
- WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
- rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
- cr4_clear_bits(X86_CR4_VMXE);
-
- return -EFAULT;
-}
-
-static int vmx_hardware_enable(void)
-{
- int cpu = raw_smp_processor_id();
- u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- int r;
-
- if (cr4_read_shadow() & X86_CR4_VMXE)
- return -EBUSY;
-
- /*
- * This can happen if we hot-added a CPU but failed to allocate
- * VP assist page for it.
- */
- if (static_branch_unlikely(&enable_evmcs) &&
- !hv_get_vp_assist_page(cpu))
- return -EFAULT;
-
- intel_pt_handle_vmx(1);
-
- r = kvm_cpu_vmxon(phys_addr);
- if (r) {
- intel_pt_handle_vmx(0);
- return r;
- }
-
- if (enable_ept)
- ept_sync_global();
-
- return 0;
-}
-
-static void vmclear_local_loaded_vmcss(void)
-{
- int cpu = raw_smp_processor_id();
- struct loaded_vmcs *v, *n;
-
- list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
- __loaded_vmcs_clear(v);
-}
-
-static void vmx_hardware_disable(void)
-{
- vmclear_local_loaded_vmcss();
-
- if (cpu_vmxoff())
- kvm_spurious_fault();
-
- intel_pt_handle_vmx(0);
-}
-
/*
* There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
* directly instead of going through cpu_has(), to ensure KVM is trapping
@@ -2565,8 +2551,7 @@ static bool cpu_has_perf_global_ctrl_bug(void)
return false;
}
-static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
- u32 msr, u32 *result)
+static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
{
u32 vmx_msr_low, vmx_msr_high;
u32 ctl = ctl_min | ctl_opt;
@@ -2584,7 +2569,7 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
return 0;
}
-static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
+static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
{
u64 allowed;
@@ -2593,8 +2578,8 @@ static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
return ctl_opt & allowed;
}
-static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
- struct vmx_capability *vmx_cap)
+static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
+ struct vmx_capability *vmx_cap)
{
u32 vmx_msr_low, vmx_msr_high;
u32 _pin_based_exec_control = 0;
@@ -2760,6 +2745,119 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return 0;
}
+static bool kvm_is_vmx_supported(void)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (!cpu_has_vmx()) {
+ pr_err("VMX not supported by CPU %d\n", cpu);
+ return false;
+ }
+
+ if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !this_cpu_has(X86_FEATURE_VMX)) {
+ pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
+ return false;
+ }
+
+ return true;
+}
+
+static int vmx_check_processor_compat(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct vmcs_config vmcs_conf;
+ struct vmx_capability vmx_cap;
+
+ if (!kvm_is_vmx_supported())
+ return -EIO;
+
+ if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
+ pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ if (nested)
+ nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
+ if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
+ pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int kvm_cpu_vmxon(u64 vmxon_pointer)
+{
+ u64 msr;
+
+ cr4_set_bits(X86_CR4_VMXE);
+
+ asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ : : [vmxon_pointer] "m"(vmxon_pointer)
+ : : fault);
+ return 0;
+
+fault:
+ WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
+ rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+ cr4_clear_bits(X86_CR4_VMXE);
+
+ return -EFAULT;
+}
+
+static int vmx_hardware_enable(void)
+{
+ int cpu = raw_smp_processor_id();
+ u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ int r;
+
+ if (cr4_read_shadow() & X86_CR4_VMXE)
+ return -EBUSY;
+
+ /*
+ * This can happen if we hot-added a CPU but failed to allocate
+ * VP assist page for it.
+ */
+ if (static_branch_unlikely(&enable_evmcs) &&
+ !hv_get_vp_assist_page(cpu))
+ return -EFAULT;
+
+ intel_pt_handle_vmx(1);
+
+ r = kvm_cpu_vmxon(phys_addr);
+ if (r) {
+ intel_pt_handle_vmx(0);
+ return r;
+ }
+
+ if (enable_ept)
+ ept_sync_global();
+
+ return 0;
+}
+
+static void vmclear_local_loaded_vmcss(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v, *n;
+
+ list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ __loaded_vmcs_clear(v);
+}
+
+static void vmx_hardware_disable(void)
+{
+ vmclear_local_loaded_vmcss();
+
+ if (cpu_vmxoff())
+ kvm_spurious_fault();
+
+ hv_reset_evmcs();
+
+ intel_pt_handle_vmx(0);
+}
+
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
{
int node = cpu_to_node(cpu);
@@ -2955,9 +3053,8 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save)
var.type = 0x3;
var.avl = 0;
if (save->base & 0xf)
- printk_once(KERN_WARNING "kvm: segment base is not "
- "paragraph aligned when entering "
- "protected mode (seg=%d)", seg);
+ pr_warn_once("segment base is not paragraph aligned "
+ "when entering protected mode (seg=%d)", seg);
}
vmcs_write16(sf->selector, var.selector);
@@ -2987,8 +3084,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
* vcpu. Warn the user that an update is overdue.
*/
if (!kvm_vmx->tss_addr)
- printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
- "called before entering vcpu\n");
+ pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n");
vmx_segment_cache_clear(vmx);
@@ -6823,7 +6919,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
gate_desc *desc = (gate_desc *)host_idt_base + vector;
if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
- "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
+ "unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
@@ -7421,29 +7517,6 @@ static int vmx_vm_init(struct kvm *kvm)
return 0;
}
-static int __init vmx_check_processor_compat(void)
-{
- struct vmcs_config vmcs_conf;
- struct vmx_capability vmx_cap;
-
- if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
- !this_cpu_has(X86_FEATURE_VMX)) {
- pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id());
- return -EIO;
- }
-
- if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
- return -EIO;
- if (nested)
- nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
- if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
- printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
- smp_processor_id());
- return -EIO;
- }
- return 0;
-}
-
static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
u8 cache;
@@ -8042,7 +8115,9 @@ static void vmx_vm_destroy(struct kvm *kvm)
}
static struct kvm_x86_ops vmx_x86_ops __initdata = {
- .name = "kvm_intel",
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = vmx_check_processor_compat,
.hardware_unsetup = vmx_hardware_unsetup,
@@ -8262,7 +8337,7 @@ static __init int hardware_setup(void)
return -EIO;
if (cpu_has_perf_global_ctrl_bug())
- pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
+ pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
"does not work properly. Using workaround\n");
if (boot_cpu_has(X86_FEATURE_NX))
@@ -8270,7 +8345,7 @@ static __init int hardware_setup(void)
if (boot_cpu_has(X86_FEATURE_MPX)) {
rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
- WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
+ WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
}
if (!cpu_has_vmx_mpx())
@@ -8289,7 +8364,7 @@ static __init int hardware_setup(void)
/* NX support is required for shadow paging. */
if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
- pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n");
+ pr_err_ratelimited("NX (Execute Disable) not supported\n");
return -EOPNOTSUPP;
}
@@ -8441,9 +8516,6 @@ static __init int hardware_setup(void)
}
static struct kvm_x86_init_ops vmx_init_ops __initdata = {
- .cpu_has_kvm_support = cpu_has_kvm_support,
- .disabled_by_bios = vmx_disabled_by_bios,
- .check_processor_compatibility = vmx_check_processor_compat,
.hardware_setup = hardware_setup,
.handle_intel_pt_intr = NULL,
@@ -8461,41 +8533,23 @@ static void vmx_cleanup_l1d_flush(void)
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
-static void vmx_exit(void)
+static void __vmx_exit(void)
{
+ allow_smaller_maxphyaddr = false;
+
#ifdef CONFIG_KEXEC_CORE
RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
synchronize_rcu();
#endif
+ vmx_cleanup_l1d_flush();
+}
+static void vmx_exit(void)
+{
kvm_exit();
+ kvm_x86_vendor_exit();
-#if IS_ENABLED(CONFIG_HYPERV)
- if (static_branch_unlikely(&enable_evmcs)) {
- int cpu;
- struct hv_vp_assist_page *vp_ap;
- /*
- * Reset everything to support using non-enlightened VMCS
- * access later (e.g. when we reload the module with
- * enlightened_vmcs=0)
- */
- for_each_online_cpu(cpu) {
- vp_ap = hv_get_vp_assist_page(cpu);
-
- if (!vp_ap)
- continue;
-
- vp_ap->nested_control.features.directhypercall = 0;
- vp_ap->current_nested_vmcs = 0;
- vp_ap->enlighten_vmentry = 0;
- }
-
- static_branch_disable(&enable_evmcs);
- }
-#endif
- vmx_cleanup_l1d_flush();
-
- allow_smaller_maxphyaddr = false;
+ __vmx_exit();
}
module_exit(vmx_exit);
@@ -8503,56 +8557,29 @@ static int __init vmx_init(void)
{
int r, cpu;
-#if IS_ENABLED(CONFIG_HYPERV)
+ if (!kvm_is_vmx_supported())
+ return -EOPNOTSUPP;
+
/*
- * Enlightened VMCS usage should be recommended and the host needs
- * to support eVMCS v1 or above. We can also disable eVMCS support
- * with module parameter.
+ * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing
+ * to unwind if a later step fails.
*/
- if (enlightened_vmcs &&
- ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
- (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
- KVM_EVMCS_VERSION) {
+ hv_init_evmcs();
- /* Check that we have assist pages on all online CPUs */
- for_each_online_cpu(cpu) {
- if (!hv_get_vp_assist_page(cpu)) {
- enlightened_vmcs = false;
- break;
- }
- }
-
- if (enlightened_vmcs) {
- pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
- static_branch_enable(&enable_evmcs);
- }
-
- if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
- vmx_x86_ops.enable_l2_tlb_flush
- = hv_enable_l2_tlb_flush;
-
- } else {
- enlightened_vmcs = false;
- }
-#endif
-
- r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
+ r = kvm_x86_vendor_init(&vmx_init_ops);
if (r)
return r;
/*
- * Must be called after kvm_init() so enable_ept is properly set
+ * Must be called after common x86 init so enable_ept is properly set
* up. Hand the parameter mitigation value in which was stored in
* the pre module init parser. If no parameter was given, it will
* contain 'auto' which will be turned into the default 'cond'
* mitigation mode.
*/
r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
- if (r) {
- vmx_exit();
- return r;
- }
+ if (r)
+ goto err_l1d_flush;
vmx_setup_fb_clear_ctrl();
@@ -8576,6 +8603,21 @@ static int __init vmx_init(void)
if (!enable_ept)
allow_smaller_maxphyaddr = true;
+ /*
+	 * Common KVM initialization _must_ come last; after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
+ THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
return 0;
+
+err_kvm_init:
+ __vmx_exit();
+err_l1d_flush:
+ kvm_x86_vendor_exit();
+ return r;
}
module_init(vmx_init);
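
hv_init_evmcs()/hv_reset_evmcs() above (like orc_callthunk_find() earlier in this diff) use the usual config-gated stub pattern: the real body lives under the #if, empty stubs take its place otherwise, and callers stay free of #ifdefs. The idiom in isolation, with placeholder names:

#if IS_ENABLED(CONFIG_HYPERV)
static void hv_example_init(void)
{
	/* real work, compiled only when Hyper-V support is enabled */
}
#else
static void hv_example_init(void) { }
#endif

static void example_caller(void)
{
	hv_example_init();	/* no #ifdef needed at the call site */
}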
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 842dc898c972..a5282014616c 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -100,8 +100,8 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
return value;
do_fail:
- WARN_ONCE(1, "kvm: vmread failed: field=%lx\n", field);
- pr_warn_ratelimited("kvm: vmread failed: field=%lx\n", field);
+ WARN_ONCE(1, KBUILD_MODNAME ": vmread failed: field=%lx\n", field);
+ pr_warn_ratelimited(KBUILD_MODNAME ": vmread failed: field=%lx\n", field);
return 0;
do_exception:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5becce5bd45a..508074e47bc0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -15,6 +15,7 @@
* Amit Shah <amit.shah@qumranet.com>
* Ben-Ami Yassour <benami@il.ibm.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include "irq.h"
@@ -128,6 +129,7 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);
static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+static DEFINE_MUTEX(vendor_module_lock);
struct kvm_x86_ops kvm_x86_ops __read_mostly;
#define KVM_X86_OP(func) \
@@ -1557,7 +1559,7 @@ static const u32 msr_based_features_all[] = {
MSR_IA32_VMX_EPT_VPID_CAP,
MSR_IA32_VMX_VMFUNC,
- MSR_F10H_DECFG,
+ MSR_AMD64_DE_CFG,
MSR_IA32_UCODE_REV,
MSR_IA32_ARCH_CAPABILITIES,
MSR_IA32_PERF_CAPABILITIES,
@@ -2086,7 +2088,7 @@ static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
!guest_cpuid_has(vcpu, X86_FEATURE_MWAIT))
return kvm_handle_invalid_op(vcpu);
- pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn);
+ pr_warn_once("%s instruction emulated as NOP!\n", insn);
return kvm_emulate_as_nop(vcpu);
}
int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
@@ -2433,7 +2435,8 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
- pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
+ pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n",
+ user_tsc_khz, thresh_lo, thresh_hi);
use_scaling = 1;
}
return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
@@ -7701,7 +7704,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
emul_write:
- printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
+ pr_warn_once("emulating exchange as write\n");
return emulator_write_emulated(ctxt, addr, new, bytes, exception);
}
@@ -8262,7 +8265,7 @@ static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
if (!ctxt) {
- pr_err("kvm: failed to allocate vcpu's emulator\n");
+ pr_err("failed to allocate vcpu's emulator\n");
return NULL;
}
@@ -9273,35 +9276,66 @@ static struct notifier_block pvclock_gtod_notifier = {
};
#endif
-int kvm_arch_init(void *opaque)
+static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
+{
+ memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+
+#define __KVM_X86_OP(func) \
+ static_call_update(kvm_x86_##func, kvm_x86_ops.func);
+#define KVM_X86_OP(func) \
+ WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
+#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0(func) \
+ static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
+ (void *)__static_call_return0);
+#include <asm/kvm-x86-ops.h>
+#undef __KVM_X86_OP
+
+ kvm_pmu_ops_update(ops->pmu_ops);
+}
+
+static int kvm_x86_check_processor_compatibility(void)
+{
+ int cpu = smp_processor_id();
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /*
+ * Compatibility checks are done when loading KVM and when enabling
+ * hardware, e.g. during CPU hotplug, to ensure all online CPUs are
+ * compatible, i.e. KVM should never perform a compatibility check on
+ * an offline CPU.
+ */
+ WARN_ON(!cpu_online(cpu));
+
+ if (__cr4_reserved_bits(cpu_has, c) !=
+ __cr4_reserved_bits(cpu_has, &boot_cpu_data))
+ return -EIO;
+
+ return static_call(kvm_x86_check_processor_compatibility)();
+}
+
+static void kvm_x86_check_cpu_compat(void *ret)
+{
+ *(int *)ret = kvm_x86_check_processor_compatibility();
+}
+
+static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
{
- struct kvm_x86_init_ops *ops = opaque;
u64 host_pat;
- int r;
+ int r, cpu;
if (kvm_x86_ops.hardware_enable) {
- pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name);
+ pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
return -EEXIST;
}
- if (!ops->cpu_has_kvm_support()) {
- pr_err_ratelimited("kvm: no hardware support for '%s'\n",
- ops->runtime_ops->name);
- return -EOPNOTSUPP;
- }
- if (ops->disabled_by_bios()) {
- pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
- ops->runtime_ops->name);
- return -EOPNOTSUPP;
- }
-
/*
* KVM explicitly assumes that the guest has an FPU and
* FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
* vCPU's FPU state as a fxregs_state struct.
*/
if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
- printk(KERN_ERR "kvm: inadequate fpu\n");
+ pr_err("inadequate fpu\n");
return -EOPNOTSUPP;
}
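
kvm_ops_update(), moved up here so __kvm_x86_vendor_init() can call it, copies the vendor ops table into kvm_x86_ops and retargets every kvm_x86_* static call at the vendor implementation, so hot paths get direct calls rather than retpoline-bound indirect calls. A self-contained sketch of the static_call machinery it drives; all names below are illustrative:

#include <linux/static_call.h>

static int default_hook(void) { return 0; }
DEFINE_STATIC_CALL(example_hook, default_hook);

/* vendor init: retarget the call site, as static_call_update() does above */
static void install_hook(int (*fn)(void))
{
	static_call_update(example_hook, fn);
}

/* hot path: compiles down to a direct call to whatever was installed */
static int run_hook(void)
{
	return static_call(example_hook)();
}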
@@ -9319,19 +9353,19 @@ int kvm_arch_init(void *opaque)
*/
if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
(host_pat & GENMASK(2, 0)) != 6) {
- pr_err("kvm: host PAT[0] is not WB\n");
+ pr_err("host PAT[0] is not WB\n");
return -EIO;
}
x86_emulator_cache = kvm_alloc_emulator_cache();
if (!x86_emulator_cache) {
- pr_err("kvm: failed to allocate cache for x86 emulator\n");
+ pr_err("failed to allocate cache for x86 emulator\n");
return -ENOMEM;
}
user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
if (!user_return_msrs) {
- printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
+ pr_err("failed to allocate percpu kvm_user_return_msrs\n");
r = -ENOMEM;
goto out_free_x86_emulator_cache;
}
@@ -9341,13 +9375,37 @@ int kvm_arch_init(void *opaque)
if (r)
goto out_free_percpu;
- kvm_timer_init();
-
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
}
+ rdmsrl_safe(MSR_EFER, &host_efer);
+
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ rdmsrl(MSR_IA32_XSS, host_xss);
+
+ kvm_init_pmu_capability();
+
+ r = ops->hardware_setup();
+ if (r != 0)
+ goto out_mmu_exit;
+
+ kvm_ops_update(ops);
+
+ for_each_online_cpu(cpu) {
+ smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1);
+ if (r < 0)
+ goto out_unwind_ops;
+ }
+
+ /*
+ * Point of no return! DO NOT add error paths below this point unless
+ * absolutely necessary, as most operations from this point forward
+ * require unwinding.
+ */
+ kvm_timer_init();
+
if (pi_inject_timer == -1)
pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
#ifdef CONFIG_X86_64
@@ -9357,8 +9415,35 @@ int kvm_arch_init(void *opaque)
set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
#endif
+ kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
+ kvm_caps.supported_xss = 0;
+
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
+ cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
+#undef __kvm_cpu_cap_has
+
+ if (kvm_caps.has_tsc_control) {
+ /*
+ * Make sure the user can only configure tsc_khz values that
+ * fit into a signed integer.
+ * A min value is not calculated because it will always
+ * be 1 on all machines.
+ */
+ u64 max = min(0x7fffffffULL,
+ __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
+ kvm_caps.max_guest_tsc_khz = max;
+ }
+ kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
+ kvm_init_msr_list();
return 0;
+out_unwind_ops:
+ kvm_x86_ops.hardware_enable = NULL;
+ static_call(kvm_x86_hardware_unsetup)();
+out_mmu_exit:
+ kvm_mmu_vendor_module_exit();
out_free_percpu:
free_percpu(user_return_msrs);
out_free_x86_emulator_cache:
@@ -9366,8 +9451,22 @@ out_free_x86_emulator_cache:
return r;
}
-void kvm_arch_exit(void)
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
+{
+ int r;
+
+ mutex_lock(&vendor_module_lock);
+ r = __kvm_x86_vendor_init(ops);
+ mutex_unlock(&vendor_module_lock);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
+
+void kvm_x86_vendor_exit(void)
{
+ kvm_unregister_perf_callbacks();
+
#ifdef CONFIG_X86_64
if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
clear_hv_tscchange_cb();
@@ -9384,7 +9483,7 @@ void kvm_arch_exit(void)
irq_work_sync(&pvclock_irq_work);
cancel_work_sync(&pvclock_gtod_work);
#endif
- kvm_x86_ops.hardware_enable = NULL;
+ static_call(kvm_x86_hardware_unsetup)();
kvm_mmu_vendor_module_exit();
free_percpu(user_return_msrs);
kmem_cache_destroy(x86_emulator_cache);
@@ -9392,7 +9491,11 @@ void kvm_arch_exit(void)
static_key_deferred_flush(&kvm_xen_enabled);
WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
#endif
+ mutex_lock(&vendor_module_lock);
+ kvm_x86_ops.hardware_enable = NULL;
+ mutex_unlock(&vendor_module_lock);
}
+EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
{
@@ -11556,7 +11659,7 @@ static int sync_regs(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
if (kvm_check_tsc_unstable() && kvm->created_vcpus)
- pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
+ pr_warn_once("SMP vm created on host with unstable TSC; "
"guest TSC will not be reliable\n");
if (!kvm->arch.max_vcpu_ids)
@@ -11633,7 +11736,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
goto free_wbinvd_dirty_mask;
if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
- pr_err("kvm: failed to allocate vcpu's fpu\n");
+ pr_err("failed to allocate vcpu's fpu\n");
goto free_emulate_ctxt;
}
@@ -11907,6 +12010,11 @@ int kvm_arch_hardware_enable(void)
bool stable, backwards_tsc = false;
kvm_user_return_msr_cpu_online();
+
+ ret = kvm_x86_check_processor_compatibility();
+ if (ret)
+ return ret;
+
ret = static_call(kvm_x86_hardware_enable)();
if (ret != 0)
return ret;
@@ -11993,88 +12101,6 @@ void kvm_arch_hardware_disable(void)
drop_user_return_notifiers();
}
-static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
-{
- memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
-
-#define __KVM_X86_OP(func) \
- static_call_update(kvm_x86_##func, kvm_x86_ops.func);
-#define KVM_X86_OP(func) \
- WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
-#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
-#define KVM_X86_OP_OPTIONAL_RET0(func) \
- static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
- (void *)__static_call_return0);
-#include <asm/kvm-x86-ops.h>
-#undef __KVM_X86_OP
-
- kvm_pmu_ops_update(ops->pmu_ops);
-}
-
-int kvm_arch_hardware_setup(void *opaque)
-{
- struct kvm_x86_init_ops *ops = opaque;
- int r;
-
- rdmsrl_safe(MSR_EFER, &host_efer);
-
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- rdmsrl(MSR_IA32_XSS, host_xss);
-
- kvm_init_pmu_capability();
-
- r = ops->hardware_setup();
- if (r != 0)
- return r;
-
- kvm_ops_update(ops);
-
- kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
-
- if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
- kvm_caps.supported_xss = 0;
-
-#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
- cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
-#undef __kvm_cpu_cap_has
-
- if (kvm_caps.has_tsc_control) {
- /*
- * Make sure the user can only configure tsc_khz values that
- * fit into a signed integer.
- * A min value is not calculated because it will always
- * be 1 on all machines.
- */
- u64 max = min(0x7fffffffULL,
- __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
- kvm_caps.max_guest_tsc_khz = max;
- }
- kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
- kvm_init_msr_list();
- return 0;
-}
-
-void kvm_arch_hardware_unsetup(void)
-{
- kvm_unregister_perf_callbacks();
-
- static_call(kvm_x86_hardware_unsetup)();
-}
-
-int kvm_arch_check_processor_compat(void *opaque)
-{
- struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
- struct kvm_x86_init_ops *ops = opaque;
-
- WARN_ON(!irqs_disabled());
-
- if (__cr4_reserved_bits(cpu_has, c) !=
- __cr4_reserved_bits(cpu_has, &boot_cpu_data))
- return -EIO;
-
- return ops->check_processor_compatibility();
-}
-
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
{
return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
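A note on the max_guest_tsc_khz clamp that moved into __kvm_x86_vendor_init() above: it scales the host tsc_khz by the maximum supported scaling ratio and caps the result so it still fits in a signed 32-bit integer. A minimal user-space sketch of that arithmetic, assuming a VMX-like layout with 48 fractional bits and a 255x maximum ratio (illustrative values, not taken from this diff):

#include <stdint.h>
#include <stdio.h>

/* (value * ratio) >> frac_bits with a 128-bit intermediate, like __scale_tsc() */
static uint64_t scale_tsc(uint64_t value, uint64_t ratio, unsigned int frac_bits)
{
	return (uint64_t)(((unsigned __int128)value * ratio) >> frac_bits);
}

int main(void)
{
	const unsigned int frac_bits = 48;              /* assumption for the sketch */
	const uint64_t max_ratio = 255ULL << frac_bits; /* assumption: 255x scaling  */
	const uint64_t host_tsc_khz = 2900000;          /* 2.9 GHz host TSC          */

	uint64_t scaled = scale_tsc(host_tsc_khz, max_ratio, frac_bits);
	uint64_t max_guest_khz = scaled < 0x7fffffffULL ? scaled : 0x7fffffffULL;

	/* with these numbers scaled = 739500000, below the 0x7fffffff cap;
	 * a larger ratio or faster TSC would be clamped instead */
	printf("max_guest_tsc_khz = %llu\n", (unsigned long long)max_guest_khz);
	return 0;
}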
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index b178f40bd863..2681e2007e39 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -5,6 +5,7 @@
*
* KVM Xen emulation
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "x86.h"
#include "xen.h"
@@ -271,7 +272,15 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
* Attempt to obtain the GPC lock on *both* (if there are two)
* gfn_to_pfn caches that cover the region.
*/
- read_lock_irqsave(&gpc1->lock, flags);
+ if (atomic) {
+ local_irq_save(flags);
+ if (!read_trylock(&gpc1->lock)) {
+ local_irq_restore(flags);
+ return;
+ }
+ } else {
+ read_lock_irqsave(&gpc1->lock, flags);
+ }
while (!kvm_gpc_check(gpc1, user_len1)) {
read_unlock_irqrestore(&gpc1->lock, flags);
@@ -304,9 +313,18 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
* The guest's runstate_info is split across two pages and we
* need to hold and validate both GPCs simultaneously. We can
* declare a lock ordering GPC1 > GPC2 because nothing else
- * takes them more than one at a time.
+ * takes them more than one at a time. Set a subclass on the
+ * gpc1 lock to make lockdep shut up about it.
*/
- read_lock(&gpc2->lock);
+ lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
+ if (atomic) {
+ if (!read_trylock(&gpc2->lock)) {
+ read_unlock_irqrestore(&gpc1->lock, flags);
+ return;
+ }
+ } else {
+ read_lock(&gpc2->lock);
+ }
if (!kvm_gpc_check(gpc2, user_len2)) {
read_unlock(&gpc2->lock);
@@ -590,26 +608,26 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
r = -EINVAL;
} else {
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.long_mode = !!data->u.long_mode;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
}
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
break;
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
if (data->u.vector && data->u.vector < 0x10)
r = -EINVAL;
else {
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.upcall_vector = data->u.vector;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
}
break;
@@ -619,9 +637,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
case KVM_XEN_ATTR_TYPE_XEN_VERSION:
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.xen_version = data->u.xen_version;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
break;
@@ -630,9 +648,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
r = -EOPNOTSUPP;
break;
}
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
break;
@@ -647,7 +665,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
int r = -ENOENT;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
switch (data->type) {
case KVM_XEN_ATTR_TYPE_LONG_MODE:
@@ -686,7 +704,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
}
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
return r;
}
@@ -694,7 +712,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
{
int idx, r = -ENOENT;
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
idx = srcu_read_lock(&vcpu->kvm->srcu);
switch (data->type) {
@@ -922,7 +940,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
}
srcu_read_unlock(&vcpu->kvm->srcu, idx);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
return r;
}
@@ -930,7 +948,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
{
int r = -ENOENT;
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
switch (data->type) {
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
@@ -1013,7 +1031,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
return r;
}
@@ -1106,7 +1124,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
xhc->blob_size_32 || xhc->blob_size_64))
return -EINVAL;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
static_branch_inc(&kvm_xen_enabled.key);
@@ -1115,7 +1133,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
return 0;
}
@@ -1658,15 +1676,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
mm_borrowed = true;
}
- /*
- * For the irqfd workqueue, using the main kvm->lock mutex is
- * fine since this function is invoked from kvm_set_irq() with
- * no other lock held, no srcu. In future if it will be called
- * directly from a vCPU thread (e.g. on hypercall for an IPI)
- * then it may need to switch to using a leaf-node mutex for
- * serializing the shared_info mapping.
- */
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
/*
* It is theoretically possible for the page to be unmapped
@@ -1695,7 +1705,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, idx);
} while(!rc);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
if (mm_borrowed)
kthread_unuse_mm(kvm->mm);
@@ -1811,7 +1821,7 @@ static int kvm_xen_eventfd_update(struct kvm *kvm,
int ret;
/* Protect writes to evtchnfd as well as the idr lookup. */
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port);
ret = -ENOENT;
@@ -1842,7 +1852,7 @@ static int kvm_xen_eventfd_update(struct kvm *kvm,
}
ret = 0;
out_unlock:
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
return ret;
}
@@ -1905,10 +1915,10 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm,
evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
}
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1,
GFP_KERNEL);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
if (ret >= 0)
return 0;
@@ -1926,9 +1936,9 @@ static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
{
struct evtchnfd *evtchnfd;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
if (!evtchnfd)
return -ENOENT;
@@ -1942,18 +1952,42 @@ static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
static int kvm_xen_eventfd_reset(struct kvm *kvm)
{
- struct evtchnfd *evtchnfd;
+ struct evtchnfd *evtchnfd, **all_evtchnfds;
int i;
+ int n = 0;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
+
+ /*
+ * Because synchronize_srcu() cannot be called inside the
+ * critical section, first collect all the evtchnfd objects
+ * in an array as they are removed from evtchn_ports.
+ */
+ idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i)
+ n++;
+
+ all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL);
+ if (!all_evtchnfds) {
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ return -ENOMEM;
+ }
+
+ n = 0;
idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
+ all_evtchnfds[n++] = evtchnfd;
idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port);
- synchronize_srcu(&kvm->srcu);
+ }
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+
+ synchronize_srcu(&kvm->srcu);
+
+ while (n--) {
+ evtchnfd = all_evtchnfds[n];
if (!evtchnfd->deliver.port.port)
eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
kfree(evtchnfd);
}
- mutex_unlock(&kvm->lock);
+ kfree(all_evtchnfds);
return 0;
}
@@ -2045,6 +2079,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
void kvm_xen_init_vm(struct kvm *kvm)
{
+ mutex_init(&kvm->arch.xen.xen_lock);
idr_init(&kvm->arch.xen.evtchn_ports);
kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN);
}
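The kvm_xen_eventfd_reset() change above is a two-phase teardown: unlink every evtchnfd while xen_lock is held, drop the lock, let the SRCU grace period elapse, and only then free the objects. A stripped-down user-space analogue of that shape, with a plain pthread mutex standing in for xen_lock and a comment marking where synchronize_srcu() would sit (a sketch of the pattern, not kernel code):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item { int port; struct item *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *head;   /* stands in for the evtchn_ports idr */

static int reset_all(void)
{
	struct item **all, *it;
	int n = 0;

	pthread_mutex_lock(&lock);
	for (it = head; it; it = it->next)
		n++;

	all = malloc(n * sizeof(*all));
	if (!all) {
		pthread_mutex_unlock(&lock);
		return -1;
	}

	/* phase 1: unlink everything while the lock is held */
	n = 0;
	while ((it = head)) {
		head = it->next;
		all[n++] = it;
	}
	pthread_mutex_unlock(&lock);

	/* phase 2: wait for readers (synchronize_srcu() in the kernel), then free */
	while (n--)
		free(all[n]);
	free(all);
	return 0;
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct item *it = malloc(sizeof(*it));
		it->port = i;
		it->next = head;
		head = it;
	}
	printf("reset: %d\n", reset_all());
	return 0;
}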
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 7ba5f61d7273..4f1a40a86534 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -60,6 +60,7 @@ ifeq ($(CONFIG_X86_32),y)
lib-y += checksum_32.o
lib-y += strstr_32.o
lib-y += string_32.o
+ lib-y += memmove_32.o
ifneq ($(CONFIG_X86_CMPXCHG64),y)
lib-y += cmpxchg8b_emu.o atomic64_386_32.o
endif
diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c
index 1e3de0769b81..b5a6d83106bc 100644
--- a/arch/x86/lib/error-inject.c
+++ b/arch/x86/lib/error-inject.c
@@ -11,6 +11,7 @@ asm(
".text\n"
".type just_return_func, @function\n"
".globl just_return_func\n"
+ ASM_FUNC_ALIGN
"just_return_func:\n"
ANNOTATE_NOENDBR
ASM_RET
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index 21104c41cba0..558a605929db 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -1595,16 +1595,16 @@ bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs,
* Returns:
*
* Type of the instruction. Size of the memory operand is stored in
- * @bytes. If decode failed, MMIO_DECODE_FAILED returned.
+ * @bytes. If decode failed, INSN_MMIO_DECODE_FAILED is returned.
*/
-enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
+enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
{
- enum mmio_type type = MMIO_DECODE_FAILED;
+ enum insn_mmio_type type = INSN_MMIO_DECODE_FAILED;
*bytes = 0;
if (insn_get_opcode(insn))
- return MMIO_DECODE_FAILED;
+ return INSN_MMIO_DECODE_FAILED;
switch (insn->opcode.bytes[0]) {
case 0x88: /* MOV m8,r8 */
@@ -1613,7 +1613,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
case 0x89: /* MOV m16/m32/m64, r16/m32/m64 */
if (!*bytes)
*bytes = insn->opnd_bytes;
- type = MMIO_WRITE;
+ type = INSN_MMIO_WRITE;
break;
case 0xc6: /* MOV m8, imm8 */
@@ -1622,7 +1622,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
case 0xc7: /* MOV m16/m32/m64, imm16/imm32/imm64 */
if (!*bytes)
*bytes = insn->opnd_bytes;
- type = MMIO_WRITE_IMM;
+ type = INSN_MMIO_WRITE_IMM;
break;
case 0x8a: /* MOV r8, m8 */
@@ -1631,7 +1631,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
case 0x8b: /* MOV r16/r32/r64, m16/m32/m64 */
if (!*bytes)
*bytes = insn->opnd_bytes;
- type = MMIO_READ;
+ type = INSN_MMIO_READ;
break;
case 0xa4: /* MOVS m8, m8 */
@@ -1640,7 +1640,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
case 0xa5: /* MOVS m16/m32/m64, m16/m32/m64 */
if (!*bytes)
*bytes = insn->opnd_bytes;
- type = MMIO_MOVS;
+ type = INSN_MMIO_MOVS;
break;
case 0x0f: /* Two-byte instruction */
@@ -1651,7 +1651,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
case 0xb7: /* MOVZX r32/r64, m16 */
if (!*bytes)
*bytes = 2;
- type = MMIO_READ_ZERO_EXTEND;
+ type = INSN_MMIO_READ_ZERO_EXTEND;
break;
case 0xbe: /* MOVSX r16/r32/r64, m8 */
@@ -1660,7 +1660,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
case 0xbf: /* MOVSX r32/r64, m16 */
if (!*bytes)
*bytes = 2;
- type = MMIO_READ_SIGN_EXTEND;
+ type = INSN_MMIO_READ_SIGN_EXTEND;
break;
}
break;
diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S
index a1f9416bf67a..6ff2f56cb0f7 100644
--- a/arch/x86/lib/iomap_copy_64.S
+++ b/arch/x86/lib/iomap_copy_64.S
@@ -10,6 +10,6 @@
*/
SYM_FUNC_START(__iowrite32_copy)
movl %edx,%ecx
- rep movsd
+ rep movsl
RET
SYM_FUNC_END(__iowrite32_copy)
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index ef3af7ff2c8a..a29b64befb93 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -17,190 +17,3 @@ __visible void *memset(void *s, int c, size_t count)
return __memset(s, c, count);
}
EXPORT_SYMBOL(memset);
-
-__visible void *memmove(void *dest, const void *src, size_t n)
-{
- int d0,d1,d2,d3,d4,d5;
- char *ret = dest;
-
- __asm__ __volatile__(
- /* Handle more 16 bytes in loop */
- "cmp $0x10, %0\n\t"
- "jb 1f\n\t"
-
- /* Decide forward/backward copy mode */
- "cmp %2, %1\n\t"
- "jb 2f\n\t"
-
- /*
- * movs instruction have many startup latency
- * so we handle small size by general register.
- */
- "cmp $680, %0\n\t"
- "jb 3f\n\t"
- /*
- * movs instruction is only good for aligned case.
- */
- "mov %1, %3\n\t"
- "xor %2, %3\n\t"
- "and $0xff, %3\n\t"
- "jz 4f\n\t"
- "3:\n\t"
- "sub $0x10, %0\n\t"
-
- /*
- * We gobble 16 bytes forward in each loop.
- */
- "3:\n\t"
- "sub $0x10, %0\n\t"
- "mov 0*4(%1), %3\n\t"
- "mov 1*4(%1), %4\n\t"
- "mov %3, 0*4(%2)\n\t"
- "mov %4, 1*4(%2)\n\t"
- "mov 2*4(%1), %3\n\t"
- "mov 3*4(%1), %4\n\t"
- "mov %3, 2*4(%2)\n\t"
- "mov %4, 3*4(%2)\n\t"
- "lea 0x10(%1), %1\n\t"
- "lea 0x10(%2), %2\n\t"
- "jae 3b\n\t"
- "add $0x10, %0\n\t"
- "jmp 1f\n\t"
-
- /*
- * Handle data forward by movs.
- */
- ".p2align 4\n\t"
- "4:\n\t"
- "mov -4(%1, %0), %3\n\t"
- "lea -4(%2, %0), %4\n\t"
- "shr $2, %0\n\t"
- "rep movsl\n\t"
- "mov %3, (%4)\n\t"
- "jmp 11f\n\t"
- /*
- * Handle data backward by movs.
- */
- ".p2align 4\n\t"
- "6:\n\t"
- "mov (%1), %3\n\t"
- "mov %2, %4\n\t"
- "lea -4(%1, %0), %1\n\t"
- "lea -4(%2, %0), %2\n\t"
- "shr $2, %0\n\t"
- "std\n\t"
- "rep movsl\n\t"
- "mov %3,(%4)\n\t"
- "cld\n\t"
- "jmp 11f\n\t"
-
- /*
- * Start to prepare for backward copy.
- */
- ".p2align 4\n\t"
- "2:\n\t"
- "cmp $680, %0\n\t"
- "jb 5f\n\t"
- "mov %1, %3\n\t"
- "xor %2, %3\n\t"
- "and $0xff, %3\n\t"
- "jz 6b\n\t"
-
- /*
- * Calculate copy position to tail.
- */
- "5:\n\t"
- "add %0, %1\n\t"
- "add %0, %2\n\t"
- "sub $0x10, %0\n\t"
-
- /*
- * We gobble 16 bytes backward in each loop.
- */
- "7:\n\t"
- "sub $0x10, %0\n\t"
-
- "mov -1*4(%1), %3\n\t"
- "mov -2*4(%1), %4\n\t"
- "mov %3, -1*4(%2)\n\t"
- "mov %4, -2*4(%2)\n\t"
- "mov -3*4(%1), %3\n\t"
- "mov -4*4(%1), %4\n\t"
- "mov %3, -3*4(%2)\n\t"
- "mov %4, -4*4(%2)\n\t"
- "lea -0x10(%1), %1\n\t"
- "lea -0x10(%2), %2\n\t"
- "jae 7b\n\t"
- /*
- * Calculate copy position to head.
- */
- "add $0x10, %0\n\t"
- "sub %0, %1\n\t"
- "sub %0, %2\n\t"
-
- /*
- * Move data from 8 bytes to 15 bytes.
- */
- ".p2align 4\n\t"
- "1:\n\t"
- "cmp $8, %0\n\t"
- "jb 8f\n\t"
- "mov 0*4(%1), %3\n\t"
- "mov 1*4(%1), %4\n\t"
- "mov -2*4(%1, %0), %5\n\t"
- "mov -1*4(%1, %0), %1\n\t"
-
- "mov %3, 0*4(%2)\n\t"
- "mov %4, 1*4(%2)\n\t"
- "mov %5, -2*4(%2, %0)\n\t"
- "mov %1, -1*4(%2, %0)\n\t"
- "jmp 11f\n\t"
-
- /*
- * Move data from 4 bytes to 7 bytes.
- */
- ".p2align 4\n\t"
- "8:\n\t"
- "cmp $4, %0\n\t"
- "jb 9f\n\t"
- "mov 0*4(%1), %3\n\t"
- "mov -1*4(%1, %0), %4\n\t"
- "mov %3, 0*4(%2)\n\t"
- "mov %4, -1*4(%2, %0)\n\t"
- "jmp 11f\n\t"
-
- /*
- * Move data from 2 bytes to 3 bytes.
- */
- ".p2align 4\n\t"
- "9:\n\t"
- "cmp $2, %0\n\t"
- "jb 10f\n\t"
- "movw 0*2(%1), %%dx\n\t"
- "movw -1*2(%1, %0), %%bx\n\t"
- "movw %%dx, 0*2(%2)\n\t"
- "movw %%bx, -1*2(%2, %0)\n\t"
- "jmp 11f\n\t"
-
- /*
- * Move data for 1 byte.
- */
- ".p2align 4\n\t"
- "10:\n\t"
- "cmp $1, %0\n\t"
- "jb 11f\n\t"
- "movb (%1), %%cl\n\t"
- "movb %%cl, (%2)\n\t"
- ".p2align 4\n\t"
- "11:"
- : "=&c" (d0), "=&S" (d1), "=&D" (d2),
- "=r" (d3),"=r" (d4), "=r"(d5)
- :"0" (n),
- "1" (src),
- "2" (dest)
- :"memory");
-
- return ret;
-
-}
-EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memmove_32.S b/arch/x86/lib/memmove_32.S
new file mode 100644
index 000000000000..0588b2c0fc95
--- /dev/null
+++ b/arch/x86/lib/memmove_32.S
@@ -0,0 +1,200 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/export.h>
+
+SYM_FUNC_START(memmove)
+/*
+ * void *memmove(void *dest_in, const void *src_in, size_t n)
+ * -mregparm=3 passes these in registers:
+ * dest_in: %eax
+ * src_in: %edx
+ * n: %ecx
+ * See also: arch/x86/entry/calling.h for description of the calling convention.
+ *
+ * n can remain in %ecx, but for `rep movsl`, we'll need dest in %edi and src
+ * in %esi.
+ */
+.set dest_in, %eax
+.set dest, %edi
+.set src_in, %edx
+.set src, %esi
+.set n, %ecx
+.set tmp0, %edx
+.set tmp0w, %dx
+.set tmp1, %ebx
+.set tmp1w, %bx
+.set tmp2, %eax
+.set tmp3b, %cl
+
+/*
+ * Save all callee-saved registers, because this function is going to clobber
+ * all of them:
+ */
+ pushl %ebp
+ movl %esp, %ebp // set standard frame pointer
+
+ pushl %ebx
+ pushl %edi
+ pushl %esi
+ pushl %eax // save 'dest_in' parameter [eax] as the return value
+
+ movl src_in, src
+ movl dest_in, dest
+
+ /* Handle more 16 bytes in loop */
+ cmpl $0x10, n
+ jb .Lmove_16B
+
+ /* Decide forward/backward copy mode */
+ cmpl dest, src
+ jb .Lbackwards_header
+
+ /*
+ * The movs instruction has a high startup latency, so we handle
+ * small sizes with general-purpose registers.
+ */
+ cmpl $680, n
+ jb .Ltoo_small_forwards
+ /* movs instruction is only good for aligned case. */
+ movl src, tmp0
+ xorl dest, tmp0
+ andl $0xff, tmp0
+ jz .Lforward_movs
+.Ltoo_small_forwards:
+ subl $0x10, n
+
+ /* We gobble 16 bytes forward in each loop. */
+.Lmove_16B_forwards_loop:
+ subl $0x10, n
+ movl 0*4(src), tmp0
+ movl 1*4(src), tmp1
+ movl tmp0, 0*4(dest)
+ movl tmp1, 1*4(dest)
+ movl 2*4(src), tmp0
+ movl 3*4(src), tmp1
+ movl tmp0, 2*4(dest)
+ movl tmp1, 3*4(dest)
+ leal 0x10(src), src
+ leal 0x10(dest), dest
+ jae .Lmove_16B_forwards_loop
+ addl $0x10, n
+ jmp .Lmove_16B
+
+ /* Handle data forward by movs. */
+.p2align 4
+.Lforward_movs:
+ movl -4(src, n), tmp0
+ leal -4(dest, n), tmp1
+ shrl $2, n
+ rep movsl
+ movl tmp0, (tmp1)
+ jmp .Ldone
+
+ /* Handle data backward by movs. */
+.p2align 4
+.Lbackwards_movs:
+ movl (src), tmp0
+ movl dest, tmp1
+ leal -4(src, n), src
+ leal -4(dest, n), dest
+ shrl $2, n
+ std
+ rep movsl
+ movl tmp0,(tmp1)
+ cld
+ jmp .Ldone
+
+ /* Start to prepare for backward copy. */
+.p2align 4
+.Lbackwards_header:
+ cmpl $680, n
+ jb .Ltoo_small_backwards
+ movl src, tmp0
+ xorl dest, tmp0
+ andl $0xff, tmp0
+ jz .Lbackwards_movs
+
+ /* Calculate copy position to tail. */
+.Ltoo_small_backwards:
+ addl n, src
+ addl n, dest
+ subl $0x10, n
+
+ /* We gobble 16 bytes backward in each loop. */
+.Lmove_16B_backwards_loop:
+ subl $0x10, n
+
+ movl -1*4(src), tmp0
+ movl -2*4(src), tmp1
+ movl tmp0, -1*4(dest)
+ movl tmp1, -2*4(dest)
+ movl -3*4(src), tmp0
+ movl -4*4(src), tmp1
+ movl tmp0, -3*4(dest)
+ movl tmp1, -4*4(dest)
+ leal -0x10(src), src
+ leal -0x10(dest), dest
+ jae .Lmove_16B_backwards_loop
+ /* Calculate copy position to head. */
+ addl $0x10, n
+ subl n, src
+ subl n, dest
+
+ /* Move data from 8 bytes to 15 bytes. */
+.p2align 4
+.Lmove_16B:
+ cmpl $8, n
+ jb .Lmove_8B
+ movl 0*4(src), tmp0
+ movl 1*4(src), tmp1
+ movl -2*4(src, n), tmp2
+ movl -1*4(src, n), src
+
+ movl tmp0, 0*4(dest)
+ movl tmp1, 1*4(dest)
+ movl tmp2, -2*4(dest, n)
+ movl src, -1*4(dest, n)
+ jmp .Ldone
+
+ /* Move data from 4 bytes to 7 bytes. */
+.p2align 4
+.Lmove_8B:
+ cmpl $4, n
+ jb .Lmove_4B
+ movl 0*4(src), tmp0
+ movl -1*4(src, n), tmp1
+ movl tmp0, 0*4(dest)
+ movl tmp1, -1*4(dest, n)
+ jmp .Ldone
+
+ /* Move data from 2 bytes to 3 bytes. */
+.p2align 4
+.Lmove_4B:
+ cmpl $2, n
+ jb .Lmove_1B
+ movw 0*2(src), tmp0w
+ movw -1*2(src, n), tmp1w
+ movw tmp0w, 0*2(dest)
+ movw tmp1w, -1*2(dest, n)
+ jmp .Ldone
+
+ /* Move data for 1 byte. */
+.p2align 4
+.Lmove_1B:
+ cmpl $1, n
+ jb .Ldone
+ movb (src), tmp3b
+ movb tmp3b, (dest)
+.p2align 4
+.Ldone:
+ popl dest_in // restore 'dest_in' [eax] as the return value
+ /* Restore all callee-saved registers: */
+ popl %esi
+ popl %edi
+ popl %ebx
+ popl %ebp
+
+ RET
+SYM_FUNC_END(memmove)
+EXPORT_SYMBOL(memmove)
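The new memmove_32.S keeps the decision structure of the removed inline-asm version: copy forwards when the destination sits below the source, copy backwards when the ranges overlap the other way, and special-case lengths under 16 bytes. The same decision in plain C, byte-at-a-time rather than the unrolled 16-byte loops (a sketch for reference, not the kernel implementation):

#include <stddef.h>
#include <stdio.h>

static void *my_memmove(void *dest, const void *src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

	if (d < s || d >= s + n) {
		/* no overlap in the dangerous direction: copy forwards */
		while (n--)
			*d++ = *s++;
	} else {
		/* dest overlaps the tail of src: copy backwards from the end */
		while (n--)
			d[n] = s[n];
	}
	return dest;
}

int main(void)
{
	char buf[16] = "abcdefgh";

	my_memmove(buf + 2, buf, 8);   /* overlapping, needs the backward path */
	printf("%s\n", buf);           /* prints "ababcdefgh" */
	return 0;
}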
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index b7dfd60243b7..32125224fcca 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -47,8 +47,6 @@ SYM_FUNC_START(__put_user_1)
LOAD_TASK_SIZE_MINUS_N(0)
cmp %_ASM_BX,%_ASM_CX
jae .Lbad_put_user
-SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL)
- ENDBR
ASM_STAC
1: movb %al,(%_ASM_CX)
xor %ecx,%ecx
@@ -56,54 +54,87 @@ SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL)
RET
SYM_FUNC_END(__put_user_1)
EXPORT_SYMBOL(__put_user_1)
+
+SYM_FUNC_START(__put_user_nocheck_1)
+ ENDBR
+ ASM_STAC
+2: movb %al,(%_ASM_CX)
+ xor %ecx,%ecx
+ ASM_CLAC
+ RET
+SYM_FUNC_END(__put_user_nocheck_1)
EXPORT_SYMBOL(__put_user_nocheck_1)
SYM_FUNC_START(__put_user_2)
LOAD_TASK_SIZE_MINUS_N(1)
cmp %_ASM_BX,%_ASM_CX
jae .Lbad_put_user
-SYM_INNER_LABEL(__put_user_nocheck_2, SYM_L_GLOBAL)
- ENDBR
ASM_STAC
-2: movw %ax,(%_ASM_CX)
+3: movw %ax,(%_ASM_CX)
xor %ecx,%ecx
ASM_CLAC
RET
SYM_FUNC_END(__put_user_2)
EXPORT_SYMBOL(__put_user_2)
+
+SYM_FUNC_START(__put_user_nocheck_2)
+ ENDBR
+ ASM_STAC
+4: movw %ax,(%_ASM_CX)
+ xor %ecx,%ecx
+ ASM_CLAC
+ RET
+SYM_FUNC_END(__put_user_nocheck_2)
EXPORT_SYMBOL(__put_user_nocheck_2)
SYM_FUNC_START(__put_user_4)
LOAD_TASK_SIZE_MINUS_N(3)
cmp %_ASM_BX,%_ASM_CX
jae .Lbad_put_user
-SYM_INNER_LABEL(__put_user_nocheck_4, SYM_L_GLOBAL)
- ENDBR
ASM_STAC
-3: movl %eax,(%_ASM_CX)
+5: movl %eax,(%_ASM_CX)
xor %ecx,%ecx
ASM_CLAC
RET
SYM_FUNC_END(__put_user_4)
EXPORT_SYMBOL(__put_user_4)
+
+SYM_FUNC_START(__put_user_nocheck_4)
+ ENDBR
+ ASM_STAC
+6: movl %eax,(%_ASM_CX)
+ xor %ecx,%ecx
+ ASM_CLAC
+ RET
+SYM_FUNC_END(__put_user_nocheck_4)
EXPORT_SYMBOL(__put_user_nocheck_4)
SYM_FUNC_START(__put_user_8)
LOAD_TASK_SIZE_MINUS_N(7)
cmp %_ASM_BX,%_ASM_CX
jae .Lbad_put_user
-SYM_INNER_LABEL(__put_user_nocheck_8, SYM_L_GLOBAL)
- ENDBR
ASM_STAC
-4: mov %_ASM_AX,(%_ASM_CX)
+7: mov %_ASM_AX,(%_ASM_CX)
#ifdef CONFIG_X86_32
-5: movl %edx,4(%_ASM_CX)
+8: movl %edx,4(%_ASM_CX)
#endif
xor %ecx,%ecx
ASM_CLAC
RET
SYM_FUNC_END(__put_user_8)
EXPORT_SYMBOL(__put_user_8)
+
+SYM_FUNC_START(__put_user_nocheck_8)
+ ENDBR
+ ASM_STAC
+9: mov %_ASM_AX,(%_ASM_CX)
+#ifdef CONFIG_X86_32
+10: movl %edx,4(%_ASM_CX)
+#endif
+ xor %ecx,%ecx
+ ASM_CLAC
+ RET
+SYM_FUNC_END(__put_user_nocheck_8)
EXPORT_SYMBOL(__put_user_nocheck_8)
SYM_CODE_START_LOCAL(.Lbad_put_user_clac)
@@ -117,6 +148,11 @@ SYM_CODE_END(.Lbad_put_user_clac)
_ASM_EXTABLE_UA(2b, .Lbad_put_user_clac)
_ASM_EXTABLE_UA(3b, .Lbad_put_user_clac)
_ASM_EXTABLE_UA(4b, .Lbad_put_user_clac)
-#ifdef CONFIG_X86_32
_ASM_EXTABLE_UA(5b, .Lbad_put_user_clac)
+ _ASM_EXTABLE_UA(6b, .Lbad_put_user_clac)
+ _ASM_EXTABLE_UA(7b, .Lbad_put_user_clac)
+ _ASM_EXTABLE_UA(9b, .Lbad_put_user_clac)
+#ifdef CONFIG_X86_32
+ _ASM_EXTABLE_UA(8b, .Lbad_put_user_clac)
+ _ASM_EXTABLE_UA(10b, .Lbad_put_user_clac)
#endif
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 073289a55f84..5f61c65322be 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -5,24 +5,27 @@
#include <asm/dwarf2.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
+#include <asm/asm-offsets.h>
#include <asm/export.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
+#include <asm/percpu.h>
#include <asm/frame.h>
.section .text.__x86.indirect_thunk
-.macro RETPOLINE reg
+
+.macro POLINE reg
ANNOTATE_INTRA_FUNCTION_CALL
call .Ldo_rop_\@
-.Lspec_trap_\@:
- UNWIND_HINT_EMPTY
- pause
- lfence
- jmp .Lspec_trap_\@
+ int3
.Ldo_rop_\@:
mov %\reg, (%_ASM_SP)
UNWIND_HINT_FUNC
+.endm
+
+.macro RETPOLINE reg
+ POLINE \reg
RET
.endm
@@ -52,7 +55,6 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL)
*/
#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym)
-#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg)
.align RETPOLINE_THUNK_SIZE
SYM_CODE_START(__x86_indirect_thunk_array)
@@ -64,10 +66,65 @@ SYM_CODE_START(__x86_indirect_thunk_array)
.align RETPOLINE_THUNK_SIZE
SYM_CODE_END(__x86_indirect_thunk_array)
-#define GEN(reg) EXPORT_THUNK(reg)
+#define GEN(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg)
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+.macro CALL_THUNK reg
+ .align RETPOLINE_THUNK_SIZE
+
+SYM_INNER_LABEL(__x86_indirect_call_thunk_\reg, SYM_L_GLOBAL)
+ UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR
+
+ CALL_DEPTH_ACCOUNT
+ POLINE \reg
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+.endm
+
+ .align RETPOLINE_THUNK_SIZE
+SYM_CODE_START(__x86_indirect_call_thunk_array)
+
+#define GEN(reg) CALL_THUNK reg
#include <asm/GEN-for-each-reg.h>
#undef GEN
+ .align RETPOLINE_THUNK_SIZE
+SYM_CODE_END(__x86_indirect_call_thunk_array)
+
+#define GEN(reg) __EXPORT_THUNK(__x86_indirect_call_thunk_ ## reg)
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+.macro JUMP_THUNK reg
+ .align RETPOLINE_THUNK_SIZE
+
+SYM_INNER_LABEL(__x86_indirect_jump_thunk_\reg, SYM_L_GLOBAL)
+ UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR
+ POLINE \reg
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+.endm
+
+ .align RETPOLINE_THUNK_SIZE
+SYM_CODE_START(__x86_indirect_jump_thunk_array)
+
+#define GEN(reg) JUMP_THUNK reg
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+ .align RETPOLINE_THUNK_SIZE
+SYM_CODE_END(__x86_indirect_jump_thunk_array)
+
+#define GEN(reg) __EXPORT_THUNK(__x86_indirect_jump_thunk_ ## reg)
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+#endif
/*
* This function name is magical and is used by -mfunction-return=thunk-extern
* for the compiler to generate JMPs to it.
@@ -140,3 +197,37 @@ __EXPORT_THUNK(zen_untrain_ret)
EXPORT_SYMBOL(__x86_return_thunk)
#endif /* CONFIG_RETHUNK */
+
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+
+ .align 64
+SYM_FUNC_START(__x86_return_skl)
+ ANNOTATE_NOENDBR
+ /*
+	 * Keep the hotpath in a 16-byte I-fetch for the non-debug
+ * case.
+ */
+ CALL_THUNKS_DEBUG_INC_RETS
+ shlq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth)
+ jz 1f
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+1:
+ CALL_THUNKS_DEBUG_INC_STUFFS
+ .rept 16
+ ANNOTATE_INTRA_FUNCTION_CALL
+ call 2f
+ int3
+2:
+ .endr
+ add $(8*16), %rsp
+
+ CREDIT_CALL_DEPTH
+
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+SYM_FUNC_END(__x86_return_skl)
+
+#endif /* CONFIG_CALL_DEPTH_TRACKING */
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index f1bb18617156..24b48af27417 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -6,6 +6,7 @@
#include <linux/uaccess.h>
#include <linux/export.h>
+#include <linux/instrumented.h>
#include <asm/tlbflush.h>
@@ -44,7 +45,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
* called from other contexts.
*/
pagefault_disable();
+ instrument_copy_from_user_before(to, from, n);
ret = raw_copy_from_user(to, from, n);
+ instrument_copy_from_user_after(to, from, n, ret);
pagefault_enable();
return ret;
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 6c2f1b76a0b6..7316a8224259 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -9,22 +9,60 @@
#include <asm/cpu_entry_area.h>
#include <asm/fixmap.h>
#include <asm/desc.h>
+#include <asm/kasan.h>
static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks);
DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);
-#endif
-#ifdef CONFIG_X86_32
+static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, _cea_offset);
+
+static __always_inline unsigned int cea_offset(unsigned int cpu)
+{
+ return per_cpu(_cea_offset, cpu);
+}
+
+static __init void init_cea_offsets(void)
+{
+ unsigned int max_cea;
+ unsigned int i, j;
+
+ max_cea = (CPU_ENTRY_AREA_MAP_SIZE - PAGE_SIZE) / CPU_ENTRY_AREA_SIZE;
+
+ /* O(sodding terrible) */
+ for_each_possible_cpu(i) {
+ unsigned int cea;
+
+again:
+ cea = get_random_u32_below(max_cea);
+
+ for_each_possible_cpu(j) {
+ if (cea_offset(j) == cea)
+ goto again;
+
+ if (i == j)
+ break;
+ }
+
+ per_cpu(_cea_offset, i) = cea;
+ }
+}
+#else /* !X86_64 */
DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack);
+
+static __always_inline unsigned int cea_offset(unsigned int cpu)
+{
+ return cpu;
+}
+static inline void init_cea_offsets(void) { }
#endif
/* Is called from entry code, so must be noinstr */
noinstr struct cpu_entry_area *get_cpu_entry_area(int cpu)
{
- unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
+ unsigned long va = CPU_ENTRY_AREA_PER_CPU + cea_offset(cpu) * CPU_ENTRY_AREA_SIZE;
BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
return (struct cpu_entry_area *) va;
@@ -138,20 +176,19 @@ static void __init setup_cpu_entry_area(unsigned int cpu)
pgprot_t tss_prot = PAGE_KERNEL_RO;
#else
/*
- * On native 32-bit systems, the GDT cannot be read-only because
+ * On 32-bit systems, the GDT cannot be read-only because
* our double fault handler uses a task gate, and entering through
* a task gate needs to change an available TSS to busy. If the
* GDT is read-only, that will triple fault. The TSS cannot be
* read-only because the CPU writes to it on task switches.
- *
- * On Xen PV, the GDT must be read-only because the hypervisor
- * requires it.
*/
- pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
- PAGE_KERNEL_RO : PAGE_KERNEL;
+ pgprot_t gdt_prot = PAGE_KERNEL;
pgprot_t tss_prot = PAGE_KERNEL;
#endif
+ kasan_populate_shadow_for_vaddr(cea, CPU_ENTRY_AREA_SIZE,
+ early_cpu_to_node(cpu));
+
cea_set_pte(&cea->gdt, get_cpu_gdt_paddr(cpu), gdt_prot);
cea_map_percpu_pages(&cea->entry_stack_page,
@@ -205,7 +242,6 @@ static __init void setup_cpu_entry_area_ptes(void)
/* The +1 is for the readonly IDT: */
BUILD_BUG_ON((CPU_ENTRY_AREA_PAGES+1)*PAGE_SIZE != CPU_ENTRY_AREA_MAP_SIZE);
- BUILD_BUG_ON(CPU_ENTRY_AREA_TOTAL_SIZE != CPU_ENTRY_AREA_MAP_SIZE);
BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK);
start = CPU_ENTRY_AREA_BASE;
@@ -221,6 +257,8 @@ void __init setup_cpu_entry_areas(void)
{
unsigned int cpu;
+ init_cea_offsets();
+
setup_cpu_entry_area_ptes();
for_each_possible_cpu(cpu)
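init_cea_offsets() above hands every possible CPU a random slot in the entry-area range and simply retries a draw whenever an earlier CPU already took that slot, which is what the "O(sodding terrible)" comment is about. A small user-space sketch of the same rejection-sampling idea, with rand() standing in for get_random_u32_below() and made-up sizes:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NR_CPUS  8
#define MAX_SLOT 32   /* stands in for max_cea */

static unsigned int slot[NR_CPUS];

static void init_offsets(void)
{
	unsigned int i, j, s;

	for (i = 0; i < NR_CPUS; i++) {
again:
		s = (unsigned int)rand() % MAX_SLOT;

		/* reject the draw if any earlier CPU already owns this slot */
		for (j = 0; j < i; j++)
			if (slot[j] == s)
				goto again;

		slot[i] = s;
	}
}

int main(void)
{
	srand((unsigned int)time(NULL));
	init_offsets();
	for (unsigned int i = 0; i < NR_CPUS; i++)
		printf("cpu %u -> slot %u\n", i, slot[i]);
	return 0;
}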
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 6b3033845c6d..5804bbae4f01 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -37,8 +37,12 @@ int pmd_huge(pmd_t pmd)
*/
int pud_huge(pud_t pud)
{
+#if CONFIG_PGTABLE_LEVELS > 2
return !pud_none(pud) &&
(pud_val(pud) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
+#else
+ return 0;
+#endif
}
#ifdef CONFIG_HUGETLB_PAGE
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 9121bc1b9453..d3987359d441 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -801,7 +801,7 @@ void __init poking_init(void)
spinlock_t *ptl;
pte_t *ptep;
- poking_mm = copy_init_mm();
+ poking_mm = mm_alloc();
BUG_ON(!poking_mm);
/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3f040c6e5d13..a190aae8ceaf 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1416,47 +1416,6 @@ void mark_rodata_ro(void)
debug_checkwx();
}
-int kern_addr_valid(unsigned long addr)
-{
- unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- if (above != 0 && above != -1UL)
- return 0;
-
- pgd = pgd_offset_k(addr);
- if (pgd_none(*pgd))
- return 0;
-
- p4d = p4d_offset(pgd, addr);
- if (!p4d_present(*p4d))
- return 0;
-
- pud = pud_offset(p4d, addr);
- if (!pud_present(*pud))
- return 0;
-
- if (pud_large(*pud))
- return pfn_valid(pud_pfn(*pud));
-
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
- return 0;
-
- if (pmd_large(*pmd))
- return pfn_valid(pmd_pfn(*pmd));
-
- pte = pte_offset_kernel(pmd, addr);
- if (pte_none(*pte))
- return 0;
-
- return pfn_valid(pte_pfn(*pte));
-}
-
/*
* Block size is the minimum amount of memory which can be hotplugged or
* hotremoved. It must be power of two and must be equal or larger than
@@ -1533,72 +1492,44 @@ static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;
-static int __meminit vmemmap_populate_hugepages(unsigned long start,
- unsigned long end, int node, struct vmem_altmap *altmap)
+void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
+ unsigned long addr, unsigned long next)
{
- unsigned long addr;
- unsigned long next;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
-
- for (addr = start; addr < end; addr = next) {
- next = pmd_addr_end(addr, end);
-
- pgd = vmemmap_pgd_populate(addr, node);
- if (!pgd)
- return -ENOMEM;
-
- p4d = vmemmap_p4d_populate(pgd, addr, node);
- if (!p4d)
- return -ENOMEM;
-
- pud = vmemmap_pud_populate(p4d, addr, node);
- if (!pud)
- return -ENOMEM;
-
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd)) {
- void *p;
-
- p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
- if (p) {
- pte_t entry;
-
- entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
- PAGE_KERNEL_LARGE);
- set_pmd(pmd, __pmd(pte_val(entry)));
+ pte_t entry;
+
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE);
+ set_pmd(pmd, __pmd(pte_val(entry)));
+
+ /* check to see if we have contiguous blocks */
+ if (p_end != p || node_start != node) {
+ if (p_start)
+ pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ addr_start = addr;
+ node_start = node;
+ p_start = p;
+ }
- /* check to see if we have contiguous blocks */
- if (p_end != p || node_start != node) {
- if (p_start)
- pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
- addr_start, addr_end-1, p_start, p_end-1, node_start);
- addr_start = addr;
- node_start = node;
- p_start = p;
- }
+ addr_end = addr + PMD_SIZE;
+ p_end = p + PMD_SIZE;
- addr_end = addr + PMD_SIZE;
- p_end = p + PMD_SIZE;
+ if (!IS_ALIGNED(addr, PMD_SIZE) ||
+ !IS_ALIGNED(next, PMD_SIZE))
+ vmemmap_use_new_sub_pmd(addr, next);
+}
- if (!IS_ALIGNED(addr, PMD_SIZE) ||
- !IS_ALIGNED(next, PMD_SIZE))
- vmemmap_use_new_sub_pmd(addr, next);
+int __meminit vmemmap_check_pmd(pmd_t *pmd, int node,
+ unsigned long addr, unsigned long next)
+{
+ int large = pmd_large(*pmd);
- continue;
- } else if (altmap)
- return -ENOMEM; /* no fallback */
- } else if (pmd_large(*pmd)) {
- vmemmap_verify((pte_t *)pmd, node, addr, next);
- vmemmap_use_sub_pmd(addr, next);
- continue;
- }
- if (vmemmap_populate_basepages(addr, next, node, NULL))
- return -ENOMEM;
+ if (pmd_large(*pmd)) {
+ vmemmap_verify((pte_t *)pmd, node, addr, next);
+ vmemmap_use_sub_pmd(addr, next);
}
- return 0;
+
+ return large;
}
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 78c5bc654cff..6453fbaedb08 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -217,9 +217,15 @@ __ioremap_caller(resource_size_t phys_addr, unsigned long size,
* Mappings have to be page-aligned
*/
offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PHYSICAL_PAGE_MASK;
+ phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr+1) - phys_addr;
+ /*
+ * Mask out any bits not part of the actual physical
+ * address, like memory encryption bits.
+ */
+ phys_addr &= PHYSICAL_PAGE_MASK;
+
retval = memtype_reserve(phys_addr, (u64)phys_addr + size,
pcm, &new_pcm);
if (retval) {
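The __ioremap_caller() reordering above computes the page offset and mapping size from the page-aligned address first, and only then strips the non-address bits with PHYSICAL_PAGE_MASK, so an encryption bit in the passed-in address can no longer leak into the size calculation. The arithmetic in isolation, as a user-space sketch where the encryption-bit position is an illustrative assumption:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE           4096ULL
#define PAGE_MASK           (~(PAGE_SIZE - 1))
#define ENC_BIT             (1ULL << 47)          /* assumption: an SME C-bit-like flag */
#define PHYSICAL_PAGE_MASK  (PAGE_MASK & ~ENC_BIT)
#define PAGE_ALIGN(x)       (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	uint64_t phys_addr = ENC_BIT | 0x1234abcdULL;  /* encrypted, unaligned */
	uint64_t size = 0x2000;
	uint64_t last_addr = phys_addr + size - 1;

	uint64_t offset = phys_addr & ~PAGE_MASK;      /* offset within the page   */
	phys_addr &= PAGE_MASK;                        /* align first...           */
	uint64_t map_size = PAGE_ALIGN(last_addr + 1) - phys_addr;
	phys_addr &= PHYSICAL_PAGE_MASK;               /* ...then drop the enc bit */

	/* prints offset=0xbcd size=0x3000 phys=0x1234a000 */
	printf("offset=%#llx size=%#llx phys=%#llx\n",
	       (unsigned long long)offset,
	       (unsigned long long)map_size,
	       (unsigned long long)phys_addr);
	return 0;
}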
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index e7b9b464a82f..0302491d799d 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -316,10 +316,33 @@ void __init kasan_early_init(void)
kasan_map_early_shadow(init_top_pgt);
}
+static unsigned long kasan_mem_to_shadow_align_down(unsigned long va)
+{
+ unsigned long shadow = (unsigned long)kasan_mem_to_shadow((void *)va);
+
+ return round_down(shadow, PAGE_SIZE);
+}
+
+static unsigned long kasan_mem_to_shadow_align_up(unsigned long va)
+{
+ unsigned long shadow = (unsigned long)kasan_mem_to_shadow((void *)va);
+
+ return round_up(shadow, PAGE_SIZE);
+}
+
+void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid)
+{
+ unsigned long shadow_start, shadow_end;
+
+ shadow_start = kasan_mem_to_shadow_align_down((unsigned long)va);
+ shadow_end = kasan_mem_to_shadow_align_up((unsigned long)va + size);
+ kasan_populate_shadow(shadow_start, shadow_end, nid);
+}
+
void __init kasan_init(void)
{
+ unsigned long shadow_cea_begin, shadow_cea_per_cpu_begin, shadow_cea_end;
int i;
- void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
@@ -360,16 +383,10 @@ void __init kasan_init(void)
map_range(&pfn_mapped[i]);
}
- shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
- shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
- shadow_cpu_entry_begin = (void *)round_down(
- (unsigned long)shadow_cpu_entry_begin, PAGE_SIZE);
-
- shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
- CPU_ENTRY_AREA_MAP_SIZE);
- shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
- shadow_cpu_entry_end = (void *)round_up(
- (unsigned long)shadow_cpu_entry_end, PAGE_SIZE);
+ shadow_cea_begin = kasan_mem_to_shadow_align_down(CPU_ENTRY_AREA_BASE);
+ shadow_cea_per_cpu_begin = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_PER_CPU);
+ shadow_cea_end = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_BASE +
+ CPU_ENTRY_AREA_MAP_SIZE);
kasan_populate_early_shadow(
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
@@ -391,12 +408,18 @@ void __init kasan_init(void)
kasan_populate_early_shadow(
kasan_mem_to_shadow((void *)VMALLOC_END + 1),
- shadow_cpu_entry_begin);
+ (void *)shadow_cea_begin);
- kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
- (unsigned long)shadow_cpu_entry_end, 0);
+ /*
+ * Populate the shadow for the shared portion of the CPU entry area.
+ * Shadows for the per-CPU areas are mapped on-demand, as each CPU's
+ * area is randomly placed somewhere in the 512GiB range and mapping
+ * the entire 512GiB range is prohibitively expensive.
+ */
+ kasan_populate_shadow(shadow_cea_begin,
+ shadow_cea_per_cpu_begin, 0);
- kasan_populate_early_shadow(shadow_cpu_entry_end,
+ kasan_populate_early_shadow((void *)shadow_cea_end,
kasan_mem_to_shadow((void *)__START_KERNEL_map));
kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
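kasan_populate_shadow_for_vaddr() is thin: translate the start and end of the region into shadow addresses, round outwards to page boundaries, and populate. With the usual 8-to-1 shadow compression that is plain scale-and-round arithmetic; a user-space sketch with illustrative constants:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE            4096ULL
#define KASAN_SHADOW_SCALE   3                      /* 8:1 compression (1 << 3) */
#define KASAN_SHADOW_OFFSET  0xdffffc0000000000ULL  /* assumption for the sketch */

static uint64_t mem_to_shadow(uint64_t addr)
{
	return (addr >> KASAN_SHADOW_SCALE) + KASAN_SHADOW_OFFSET;
}

static uint64_t round_down_page(uint64_t x) { return x & ~(PAGE_SIZE - 1); }
static uint64_t round_up_page(uint64_t x)   { return round_down_page(x + PAGE_SIZE - 1); }

int main(void)
{
	uint64_t va   = 0xfffffe0000001000ULL;   /* some per-CPU entry-area address */
	uint64_t size = 0x3b000;                 /* illustrative region size */

	uint64_t shadow_start = round_down_page(mem_to_shadow(va));
	uint64_t shadow_end   = round_up_page(mem_to_shadow(va + size));

	/* the kernel would now kasan_populate_shadow(shadow_start, shadow_end, nid) */
	printf("populate shadow [%#llx, %#llx)\n",
	       (unsigned long long)shadow_start,
	       (unsigned long long)shadow_end);
	return 0;
}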
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index d3efbc5b3449..9f82019179e1 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -62,7 +62,13 @@ struct kmmio_context {
int active;
};
-static DEFINE_SPINLOCK(kmmio_lock);
+/*
+ * The kmmio_lock is taken in int3 context, which is treated as NMI context.
+ * This causes lockdep to complain about it being in both NMI and normal
+ * context. Hide it from lockdep, as it should not have any other locks
+ * taken under it, and this is only enabled for debugging mmio anyway.
+ */
+static arch_spinlock_t kmmio_lock = __ARCH_SPIN_LOCK_UNLOCKED;
/* Protected by kmmio_lock */
unsigned int kmmio_count;
@@ -240,15 +246,14 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
page_base &= page_level_mask(l);
/*
- * Preemption is now disabled to prevent process switch during
- * single stepping. We can only handle one active kmmio trace
+ * Hold the RCU read lock over single stepping to avoid looking
+ * up the probe and kmmio_fault_page again. The rcu_read_lock_sched()
+ * also disables preemption and prevents process switch during
+ * the single stepping. We can only handle one active kmmio trace
* per cpu, so ensure that we finish it before something else
- * gets to run. We also hold the RCU read lock over single
- * stepping to avoid looking up the probe and kmmio_fault_page
- * again.
+ * gets to run.
*/
- preempt_disable();
- rcu_read_lock();
+ rcu_read_lock_sched_notrace();
faultpage = get_kmmio_fault_page(page_base);
if (!faultpage) {
@@ -317,8 +322,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
return 1; /* fault handled */
no_kmmio:
- rcu_read_unlock();
- preempt_enable_no_resched();
+ rcu_read_unlock_sched_notrace();
return ret;
}
@@ -346,10 +350,10 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
ctx->probe->post_handler(ctx->probe, condition, regs);
/* Prevent racing against release_kmmio_fault_page(). */
- spin_lock(&kmmio_lock);
+ arch_spin_lock(&kmmio_lock);
if (ctx->fpage->count)
arm_kmmio_fault_page(ctx->fpage);
- spin_unlock(&kmmio_lock);
+ arch_spin_unlock(&kmmio_lock);
regs->flags &= ~X86_EFLAGS_TF;
regs->flags |= ctx->saved_flags;
@@ -357,8 +361,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
/* These were acquired in kmmio_handler(). */
ctx->active--;
BUG_ON(ctx->active);
- rcu_read_unlock();
- preempt_enable_no_resched();
+ rcu_read_unlock_sched_notrace();
/*
* if somebody else is singlestepping across a probe point, flags
@@ -440,7 +443,8 @@ int register_kmmio_probe(struct kmmio_probe *p)
unsigned int l;
pte_t *pte;
- spin_lock_irqsave(&kmmio_lock, flags);
+ local_irq_save(flags);
+ arch_spin_lock(&kmmio_lock);
if (get_kmmio_probe(addr)) {
ret = -EEXIST;
goto out;
@@ -460,7 +464,9 @@ int register_kmmio_probe(struct kmmio_probe *p)
size += page_level_size(l);
}
out:
- spin_unlock_irqrestore(&kmmio_lock, flags);
+ arch_spin_unlock(&kmmio_lock);
+ local_irq_restore(flags);
+
/*
* XXX: What should I do here?
* Here was a call to global_flush_tlb(), but it does not exist
@@ -494,7 +500,8 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
struct kmmio_fault_page **prevp = &dr->release_list;
unsigned long flags;
- spin_lock_irqsave(&kmmio_lock, flags);
+ local_irq_save(flags);
+ arch_spin_lock(&kmmio_lock);
while (f) {
if (!f->count) {
list_del_rcu(&f->list);
@@ -506,7 +513,8 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
}
f = *prevp;
}
- spin_unlock_irqrestore(&kmmio_lock, flags);
+ arch_spin_unlock(&kmmio_lock);
+ local_irq_restore(flags);
/* This is the real RCU destroy call. */
call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
@@ -540,14 +548,16 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
if (!pte)
return;
- spin_lock_irqsave(&kmmio_lock, flags);
+ local_irq_save(flags);
+ arch_spin_lock(&kmmio_lock);
while (size < size_lim) {
release_kmmio_fault_page(addr + size, &release_list);
size += page_level_size(l);
}
list_del_rcu(&p->list);
kmmio_count--;
- spin_unlock_irqrestore(&kmmio_lock, flags);
+ arch_spin_unlock(&kmmio_lock);
+ local_irq_restore(flags);
if (!release_list)
return;
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
index 9de3d900bc92..e25288ee33c2 100644
--- a/arch/x86/mm/mem_encrypt_boot.S
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -26,7 +26,7 @@ SYM_FUNC_START(sme_encrypt_execute)
* RCX - virtual address of the encryption workarea, including:
* - stack page (PAGE_SIZE)
* - encryption routine page (PAGE_SIZE)
- * - intermediate copy buffer (PMD_PAGE_SIZE)
+ * - intermediate copy buffer (PMD_SIZE)
* R8 - physical address of the pagetables to use for encryption
*/
@@ -123,7 +123,7 @@ SYM_FUNC_START(__enc_copy)
wbinvd /* Invalidate any cache entries */
/* Copy/encrypt up to 2MB at a time */
- movq $PMD_PAGE_SIZE, %r12
+ movq $PMD_SIZE, %r12
1:
cmpq %r12, %r9
jnb 2f
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index f415498d3175..88cccd65029d 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -93,7 +93,7 @@ struct sme_populate_pgd_data {
* section is 2MB aligned to allow for simple pagetable setup using only
* PMD entries (see vmlinux.lds.S).
*/
-static char sme_workarea[2 * PMD_PAGE_SIZE] __section(".init.scratch");
+static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch");
static char sme_cmdline_arg[] __initdata = "mem_encrypt";
static char sme_cmdline_on[] __initdata = "on";
@@ -198,8 +198,8 @@ static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
while (ppd->vaddr < ppd->vaddr_end) {
sme_populate_pgd_large(ppd);
- ppd->vaddr += PMD_PAGE_SIZE;
- ppd->paddr += PMD_PAGE_SIZE;
+ ppd->vaddr += PMD_SIZE;
+ ppd->paddr += PMD_SIZE;
}
}
@@ -225,11 +225,11 @@ static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
vaddr_end = ppd->vaddr_end;
/* If start is not 2MB aligned, create PTE entries */
- ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE);
+ ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_SIZE);
__sme_map_range_pte(ppd);
/* Create PMD entries */
- ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK;
+ ppd->vaddr_end = vaddr_end & PMD_MASK;
__sme_map_range_pmd(ppd);
/* If end is not 2MB aligned, create PTE entries */
@@ -325,7 +325,7 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
/* Physical addresses gives us the identity mapped virtual addresses */
kernel_start = __pa_symbol(_text);
- kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
+ kernel_end = ALIGN(__pa_symbol(_end), PMD_SIZE);
kernel_len = kernel_end - kernel_start;
initrd_start = 0;
@@ -355,12 +355,12 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* executable encryption area size:
* stack page (PAGE_SIZE)
* encryption routine page (PAGE_SIZE)
- * intermediate copy buffer (PMD_PAGE_SIZE)
+ * intermediate copy buffer (PMD_SIZE)
* pagetable structures for the encryption of the kernel
* pagetable structures for workarea (in case not currently mapped)
*/
execute_start = workarea_start;
- execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
+ execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE;
execute_len = execute_end - execute_start;
/*
@@ -383,7 +383,7 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* before it is mapped.
*/
workarea_len = execute_len + pgtable_area_len;
- workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE);
+ workarea_end = ALIGN(workarea_start + workarea_len, PMD_SIZE);
/*
* Set the address to the start of where newly created pagetable
diff --git a/arch/x86/mm/pat/cpa-test.c b/arch/x86/mm/pat/cpa-test.c
index 423b21e80929..3d2f7f0a6ed1 100644
--- a/arch/x86/mm/pat/cpa-test.c
+++ b/arch/x86/mm/pat/cpa-test.c
@@ -136,10 +136,10 @@ static int pageattr_test(void)
failed += print_split(&sa);
for (i = 0; i < NTEST; i++) {
- unsigned long pfn = prandom_u32_max(max_pfn_mapped);
+ unsigned long pfn = get_random_u32_below(max_pfn_mapped);
addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT);
- len[i] = prandom_u32_max(NPAGES);
+ len[i] = get_random_u32_below(NPAGES);
len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);
if (len[i] == 0)
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 66a209f7eb86..46de9cf5c91d 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -43,6 +43,7 @@
#include <linux/rbtree.h>
#include <asm/cacheflush.h>
+#include <asm/cacheinfo.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/x86_init.h>
@@ -60,41 +61,34 @@
#undef pr_fmt
#define pr_fmt(fmt) "" fmt
-static bool __read_mostly pat_bp_initialized;
static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT);
-static bool __initdata pat_force_disabled = !IS_ENABLED(CONFIG_X86_PAT);
-static bool __read_mostly pat_bp_enabled;
-static bool __read_mostly pat_cm_initialized;
+static u64 __ro_after_init pat_msr_val;
/*
* PAT support is enabled by default, but can be disabled for
* various user-requested or hardware-forced reasons:
*/
-void pat_disable(const char *msg_reason)
+static void __init pat_disable(const char *msg_reason)
{
if (pat_disabled)
return;
- if (pat_bp_initialized) {
- WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n");
- return;
- }
-
pat_disabled = true;
pr_info("x86/PAT: %s\n", msg_reason);
+
+ memory_caching_control &= ~CACHE_PAT;
}
static int __init nopat(char *str)
{
pat_disable("PAT support disabled via boot option.");
- pat_force_disabled = true;
return 0;
}
early_param("nopat", nopat);
bool pat_enabled(void)
{
- return pat_bp_enabled;
+ return !pat_disabled;
}
EXPORT_SYMBOL_GPL(pat_enabled);
@@ -192,7 +186,8 @@ enum {
#define CM(c) (_PAGE_CACHE_MODE_ ## c)
-static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
+static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val,
+ char *msg)
{
enum page_cache_mode cache;
char *cache_mode;
@@ -219,14 +214,12 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
* configuration.
* Using lower indices is preferred, so we start with highest index.
*/
-static void __init_cache_modes(u64 pat)
+static void __init init_cache_modes(u64 pat)
{
enum page_cache_mode cache;
char pat_msg[33];
int i;
- WARN_ON_ONCE(pat_cm_initialized);
-
pat_msg[32] = 0;
for (i = 7; i >= 0; i--) {
cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
@@ -234,34 +227,9 @@ static void __init_cache_modes(u64 pat)
update_cache_mode_entry(i, cache);
}
pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
-
- pat_cm_initialized = true;
}
-#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
-
-static void pat_bp_init(u64 pat)
-{
- u64 tmp_pat;
-
- if (!boot_cpu_has(X86_FEATURE_PAT)) {
- pat_disable("PAT not supported by the CPU.");
- return;
- }
-
- rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
- if (!tmp_pat) {
- pat_disable("PAT support disabled by the firmware.");
- return;
- }
-
- wrmsrl(MSR_IA32_CR_PAT, pat);
- pat_bp_enabled = true;
-
- __init_cache_modes(pat);
-}
-
-static void pat_ap_init(u64 pat)
+void pat_cpu_init(void)
{
if (!boot_cpu_has(X86_FEATURE_PAT)) {
/*
@@ -271,30 +239,39 @@ static void pat_ap_init(u64 pat)
panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
}
- wrmsrl(MSR_IA32_CR_PAT, pat);
+ wrmsrl(MSR_IA32_CR_PAT, pat_msr_val);
}
-void __init init_cache_modes(void)
+/**
+ * pat_bp_init - Initialize the PAT MSR value and PAT table
+ *
+ * This function initializes the PAT MSR value and the PAT table with an
+ * OS-defined value to enable the additional cache attributes WC, WT and WP.
+ *
+ * This function prepares the calls of pat_cpu_init() via cache_cpu_init()
+ * on all CPUs.
+ */
+void __init pat_bp_init(void)
{
- u64 pat = 0;
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+#define PAT(p0, p1, p2, p3, p4, p5, p6, p7) \
+ (((u64)PAT_ ## p0) | ((u64)PAT_ ## p1 << 8) | \
+ ((u64)PAT_ ## p2 << 16) | ((u64)PAT_ ## p3 << 24) | \
+ ((u64)PAT_ ## p4 << 32) | ((u64)PAT_ ## p5 << 40) | \
+ ((u64)PAT_ ## p6 << 48) | ((u64)PAT_ ## p7 << 56))
- if (pat_cm_initialized)
- return;
- if (boot_cpu_has(X86_FEATURE_PAT)) {
- /*
- * CPU supports PAT. Set PAT table to be consistent with
- * PAT MSR. This case supports "nopat" boot option, and
- * virtual machine environments which support PAT without
- * MTRRs. In specific, Xen has unique setup to PAT MSR.
- *
- * If PAT MSR returns 0, it is considered invalid and emulates
- * as No PAT.
- */
- rdmsrl(MSR_IA32_CR_PAT, pat);
- }
+ if (!IS_ENABLED(CONFIG_X86_PAT))
+ pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n");
+
+ if (!cpu_feature_enabled(X86_FEATURE_PAT))
+ pat_disable("PAT not supported by the CPU.");
+ else
+ rdmsrl(MSR_IA32_CR_PAT, pat_msr_val);
+
+ if (!pat_msr_val) {
+ pat_disable("PAT support disabled by the firmware.");
- if (!pat) {
/*
* No PAT. Emulate the PAT table that corresponds to the two
* cache bits, PWT (Write Through) and PCD (Cache Disable).
@@ -313,40 +290,22 @@ void __init init_cache_modes(void)
* NOTE: When WC or WP is used, it is redirected to UC- per
* the default setup in __cachemode2pte_tbl[].
*/
- pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
- PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
- } else if (!pat_force_disabled && cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) {
- /*
- * Clearly PAT is enabled underneath. Allow pat_enabled() to
- * reflect this.
- */
- pat_bp_enabled = true;
+ pat_msr_val = PAT(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC);
}
- __init_cache_modes(pat);
-}
-
-/**
- * pat_init - Initialize the PAT MSR and PAT table on the current CPU
- *
- * This function initializes PAT MSR and PAT table with an OS-defined value
- * to enable additional cache attributes, WC, WT and WP.
- *
- * This function must be called on all CPUs using the specific sequence of
- * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this
- * procedure for PAT.
- */
-void pat_init(void)
-{
- u64 pat;
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
-#ifndef CONFIG_X86_PAT
- pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n");
-#endif
-
- if (pat_disabled)
+ /*
+ * Xen PV doesn't allow setting the PAT MSR, but all cache modes are
+ * supported.
+ * When running as a TDX guest, setting the PAT MSR won't work either,
+ * due to the requirement to set CR0.CD when doing so. Rely on the
+ * firmware to have set the PAT MSR correctly.
+ */
+ if (pat_disabled ||
+ cpu_feature_enabled(X86_FEATURE_XENPV) ||
+ cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
+ init_cache_modes(pat_msr_val);
return;
+ }
if ((c->x86_vendor == X86_VENDOR_INTEL) &&
(((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
@@ -371,8 +330,7 @@ void pat_init(void)
* NOTE: When WT or WP is used, it is redirected to UC- per
* the default setup in __cachemode2pte_tbl[].
*/
- pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
- PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
+ pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC);
} else {
/*
* Full PAT support. We put WT in slot 7 to improve
@@ -400,19 +358,14 @@ void pat_init(void)
* The reserved slots are unused, but mapped to their
* corresponding types in the presence of PAT errata.
*/
- pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
- PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
+ pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT);
}
- if (!pat_bp_initialized) {
- pat_bp_init(pat);
- pat_bp_initialized = true;
- } else {
- pat_ap_init(pat);
- }
-}
+ memory_caching_control |= CACHE_PAT;
+ init_cache_modes(pat_msr_val);
#undef PAT
+}
static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */
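[Editorial note] For reference, a quick illustration of what the reworked PAT() macro evaluates to, using the Intel SDM memory-type encodings (UC=0x00, WC=0x01, WT=0x04, WP=0x05, WB=0x06, UC_MINUS=0x07). This is a sketch, not part of the patch:

/*
 * Illustrative only: each PAT slot occupies one byte of MSR_IA32_CR_PAT.
 *
 *   PAT(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC)
 *     == 0x0007040600070406   (the firmware/power-on default layout)
 *
 *   PAT(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT)
 *     == 0x0407050600070106   (the full-PAT layout selected above)
 */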
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 2e5a045731de..356758b7d4b4 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -20,6 +20,7 @@
#include <linux/kernel.h>
#include <linux/cc_platform.h>
#include <linux/set_memory.h>
+#include <linux/memregion.h>
#include <asm/e820/api.h>
#include <asm/processor.h>
@@ -219,6 +220,23 @@ within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
#ifdef CONFIG_X86_64
+/*
+ * The kernel image is mapped into two places in the virtual address space
+ * (addresses without KASLR, of course):
+ *
+ * 1. The kernel direct map (0xffff880000000000)
+ * 2. The "high kernel map" (0xffffffff81000000)
+ *
+ * We actually execute out of #2. If we get the address of a kernel symbol, it
+ * points to #2, but almost all physical-to-virtual translations point to #1.
+ *
+ * This is so that we can have both a directmap of all physical memory *and*
+ * take full advantage of the limited (s32) immediate addressing range (2G)
+ * of x86_64.
+ *
+ * See Documentation/x86/x86_64/mm.rst for more detail.
+ */
+
static inline unsigned long highmap_start_pfn(void)
{
return __pa_symbol(_text) >> PAGE_SHIFT;
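[Editorial note] A small sketch of the dual mapping described in the comment above (addresses are the non-KASLR defaults; illustrative only, not part of the patch):

/*
 * Illustrative only: the same physical page of kernel text seen
 * through both mappings.
 */
unsigned long highmap = (unsigned long)_text;        /* ~0xffffffff81000000 */
unsigned long phys    = __pa_symbol(_text);          /* physical load address */
unsigned long direct  = (unsigned long)__va(phys);   /* ~0xffff880000000000 + phys */

/* highmap != direct, yet both resolve to the same physical page. */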
@@ -330,6 +348,23 @@ void arch_invalidate_pmem(void *addr, size_t size)
EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
#endif
+#ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
+bool cpu_cache_has_invalidate_memregion(void)
+{
+ return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR);
+}
+EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, DEVMEM);
+
+int cpu_cache_invalidate_memregion(int res_desc)
+{
+ if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion()))
+ return -ENXIO;
+ wbinvd_on_all_cpus();
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, DEVMEM);
+#endif
+
static void __cpa_flush_all(void *arg)
{
unsigned long cache = (unsigned long)arg;
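[Editorial note] A hedged usage sketch for the new DEVMEM-namespace helpers above; the caller below is hypothetical and only illustrates the intended calling convention:

/* Hypothetical caller: flush CPU caches after repurposing a memory region. */
static int example_invalidate_region(void)
{
	if (!cpu_cache_has_invalidate_memregion())
		return -ENXIO;	/* e.g. running as a guest under a hypervisor */

	/* IORES_DESC_NONE is used purely for illustration here. */
	return cpu_cache_invalidate_memregion(IORES_DESC_NONE);
}
/* A modular caller would also need MODULE_IMPORT_NS(DEVMEM). */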
@@ -587,10 +622,6 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
{
unsigned long end;
- /* Kernel text is rw at boot up */
- if (system_state == SYSTEM_BOOTING)
- return new;
-
/*
* 32-bit has some unfixable W+X issues, like EFI code
* and writeable data being in the same page. Disable
@@ -747,11 +778,11 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr)
switch (level) {
case PG_LEVEL_1G:
phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
- offset = virt_addr & ~PUD_PAGE_MASK;
+ offset = virt_addr & ~PUD_MASK;
break;
case PG_LEVEL_2M:
phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
- offset = virt_addr & ~PMD_PAGE_MASK;
+ offset = virt_addr & ~PMD_MASK;
break;
default:
phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
@@ -1041,7 +1072,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
case PG_LEVEL_1G:
ref_prot = pud_pgprot(*(pud_t *)kpte);
ref_pfn = pud_pfn(*(pud_t *)kpte);
- pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
+ pfninc = PMD_SIZE >> PAGE_SHIFT;
lpaddr = address & PUD_MASK;
lpinc = PMD_SIZE;
/*
@@ -1628,8 +1659,11 @@ repeat:
return err;
}
-static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary);
+/*
+ * Check the directmap and "high kernel map" 'aliases'.
+ */
static int cpa_process_alias(struct cpa_data *cpa)
{
struct cpa_data alias_cpa;
@@ -1653,6 +1687,12 @@ static int cpa_process_alias(struct cpa_data *cpa)
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
alias_cpa.curpage = 0;
+ /* Directmap always has NX set, do not modify. */
+ if (__supported_pte_mask & _PAGE_NX) {
+ alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
+ alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
+ }
+
cpa->force_flush_all = 1;
ret = __change_page_attr_set_clr(&alias_cpa, 0);
@@ -1675,6 +1715,15 @@ static int cpa_process_alias(struct cpa_data *cpa)
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
alias_cpa.curpage = 0;
+ /*
+ * [_text, _brk_end) also covers data, do not modify NX except
+ * in cases where the highmap is the primary target.
+ */
+ if (__supported_pte_mask & _PAGE_NX) {
+ alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
+ alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
+ }
+
cpa->force_flush_all = 1;
/*
* The high mapping range is imprecise, so ignore the
@@ -1687,12 +1736,19 @@ static int cpa_process_alias(struct cpa_data *cpa)
return 0;
}
-static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary)
{
unsigned long numpages = cpa->numpages;
unsigned long rempages = numpages;
int ret = 0;
+ /*
+ * No changes, easy!
+ */
+ if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) &&
+ !cpa->force_split)
+ return ret;
+
while (rempages) {
/*
* Store the remaining nr of pages for the large page
@@ -1705,13 +1761,13 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
if (!debug_pagealloc_enabled())
spin_lock(&cpa_lock);
- ret = __change_page_attr(cpa, checkalias);
+ ret = __change_page_attr(cpa, primary);
if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
if (ret)
goto out;
- if (checkalias) {
+ if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) {
ret = cpa_process_alias(cpa);
if (ret)
goto out;
@@ -1739,7 +1795,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
struct page **pages)
{
struct cpa_data cpa;
- int ret, cache, checkalias;
+ int ret, cache;
memset(&cpa, 0, sizeof(cpa));
@@ -1785,20 +1841,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cpa.numpages = numpages;
cpa.mask_set = mask_set;
cpa.mask_clr = mask_clr;
- cpa.flags = 0;
+ cpa.flags = in_flag;
cpa.curpage = 0;
cpa.force_split = force_split;
- if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
- cpa.flags |= in_flag;
-
- /* No alias checking for _NX bit modifications */
- checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
- /* Has caller explicitly disabled alias checking? */
- if (in_flag & CPA_NO_CHECK_ALIAS)
- checkalias = 0;
-
- ret = __change_page_attr_set_clr(&cpa, checkalias);
+ ret = __change_page_attr_set_clr(&cpa, 1);
/*
* Check whether we really changed something:
@@ -2029,6 +2076,16 @@ int set_memory_ro(unsigned long addr, int numpages)
return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
+int set_memory_rox(unsigned long addr, int numpages)
+{
+ pgprot_t clr = __pgprot(_PAGE_RW);
+
+ if (__supported_pte_mask & _PAGE_NX)
+ clr.pgprot |= _PAGE_NX;
+
+ return change_page_attr_clear(&addr, numpages, clr, 0);
+}
+
int set_memory_rw(unsigned long addr, int numpages)
{
return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
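[Editorial note] A brief, hedged usage sketch for the new set_memory_rox() helper; the allocation and the code buffer are hypothetical:

/* Hypothetical: seal one page of freshly written generated code. */
void *buf = module_alloc(PAGE_SIZE);		/* assumed allocator, for illustration */

memcpy(buf, generated_code, generated_len);	/* hypothetical code blob */
set_memory_rox((unsigned long)buf, 1);		/* clear RW, and set NX-clear where supported */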
@@ -2041,11 +2098,9 @@ int set_memory_np(unsigned long addr, int numpages)
int set_memory_np_noalias(unsigned long addr, int numpages)
{
- int cpa_flags = CPA_NO_CHECK_ALIAS;
-
return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
__pgprot(_PAGE_PRESENT), 0,
- cpa_flags, NULL);
+ CPA_NO_CHECK_ALIAS, NULL);
}
int set_memory_4k(unsigned long addr, int numpages)
@@ -2262,7 +2317,7 @@ static int __set_pages_p(struct page *page, int numpages)
.numpages = numpages,
.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
.mask_clr = __pgprot(0),
- .flags = 0};
+ .flags = CPA_NO_CHECK_ALIAS };
/*
* No alias checking needed for setting present flag. otherwise,
@@ -2270,7 +2325,7 @@ static int __set_pages_p(struct page *page, int numpages)
* mappings (this adds to complexity if we want to do this from
* atomic context especially). Let's keep it simple!
*/
- return __change_page_attr_set_clr(&cpa, 0);
+ return __change_page_attr_set_clr(&cpa, 1);
}
static int __set_pages_np(struct page *page, int numpages)
@@ -2281,7 +2336,7 @@ static int __set_pages_np(struct page *page, int numpages)
.numpages = numpages,
.mask_set = __pgprot(0),
.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
- .flags = 0};
+ .flags = CPA_NO_CHECK_ALIAS };
/*
* No alias checking needed for setting not present flag. otherwise,
@@ -2289,7 +2344,7 @@ static int __set_pages_np(struct page *page, int numpages)
* mappings (this adds to complexity if we want to do this from
* atomic context especially). Let's keep it simple!
*/
- return __change_page_attr_set_clr(&cpa, 0);
+ return __change_page_attr_set_clr(&cpa, 1);
}
int set_direct_map_invalid_noflush(struct page *page)
@@ -2360,7 +2415,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
.numpages = numpages,
.mask_set = __pgprot(0),
.mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)),
- .flags = 0,
+ .flags = CPA_NO_CHECK_ALIAS,
};
WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");
@@ -2373,7 +2428,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
- retval = __change_page_attr_set_clr(&cpa, 0);
+ retval = __change_page_attr_set_clr(&cpa, 1);
__flush_tlb_all();
out:
@@ -2403,12 +2458,12 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
.numpages = numpages,
.mask_set = __pgprot(0),
.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
- .flags = 0,
+ .flags = CPA_NO_CHECK_ALIAS,
};
WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");
- retval = __change_page_attr_set_clr(&cpa, 0);
+ retval = __change_page_attr_set_clr(&cpa, 1);
__flush_tlb_all();
return retval;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8525f2876fb4..e4f499eb0f29 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -299,9 +299,6 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
pud_t *pud;
int i;
- if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
- return;
-
p4d = p4d_offset(pgd, 0);
pud = pud_offset(p4d, 0);
@@ -434,10 +431,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
mm->pgd = pgd;
- if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
+ if (sizeof(pmds) != 0 &&
+ preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
goto out_free_pgd;
- if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
+ if (sizeof(u_pmds) != 0 &&
+ preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
goto out_free_pmds;
if (paravirt_pgd_alloc(mm) != 0)
@@ -451,17 +450,22 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
spin_lock(&pgd_lock);
pgd_ctor(mm, pgd);
- pgd_prepopulate_pmd(mm, pgd, pmds);
- pgd_prepopulate_user_pmd(mm, pgd, u_pmds);
+ if (sizeof(pmds) != 0)
+ pgd_prepopulate_pmd(mm, pgd, pmds);
+
+ if (sizeof(u_pmds) != 0)
+ pgd_prepopulate_user_pmd(mm, pgd, u_pmds);
spin_unlock(&pgd_lock);
return pgd;
out_free_user_pmds:
- free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
+ if (sizeof(u_pmds) != 0)
+ free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
- free_pmds(mm, pmds, PREALLOCATED_PMDS);
+ if (sizeof(pmds) != 0)
+ free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
_pgd_free(pgd);
out:
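[Editorial note] The sizeof() guards added above rely on the local arrays being sized by a compile-time constant that may be 0; a zero-sized array makes sizeof() evaluate to 0 at compile time, so the compiler discards the guarded call entirely. A minimal standalone sketch under that assumption (all names hypothetical):

#define NR_PREALLOC 0			/* hypothetical configuration */

static void example(void)
{
	void *slots[NR_PREALLOC];	/* zero-sized array, a GCC extension */

	if (sizeof(slots) != 0)		/* constant-folded to false */
		prealloc(slots, NR_PREALLOC);	/* hypothetical helper, never emitted */
}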
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index ffe3b3a087fe..78414c6d1b5e 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -592,7 +592,7 @@ static void pti_set_kernel_image_nonglobal(void)
* of the image.
*/
unsigned long start = PFN_ALIGN(_text);
- unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE);
+ unsigned long end = ALIGN((unsigned long)_end, PMD_SIZE);
/*
* This clears _PAGE_GLOBAL from the entire kernel image.
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 00127abd89ee..b808be77635e 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -11,8 +11,8 @@
#include <linux/bpf.h>
#include <linux/memory.h>
#include <linux/sort.h>
-#include <linux/init.h>
#include <asm/extable.h>
+#include <asm/ftrace.h>
#include <asm/set_memory.h>
#include <asm/nospec-branch.h>
#include <asm/text-patching.h>
@@ -341,6 +341,13 @@ static int emit_call(u8 **pprog, void *func, void *ip)
return emit_patch(pprog, func, ip, 0xE8);
}
+static int emit_rsb_call(u8 **pprog, void *func, void *ip)
+{
+ OPTIMIZER_HIDE_VAR(func);
+ x86_call_depth_emit_accounting(pprog, func);
+ return emit_patch(pprog, func, ip, 0xE8);
+}
+
static int emit_jump(u8 **pprog, void *func, void *ip)
{
return emit_patch(pprog, func, ip, 0xE9);
@@ -389,18 +396,6 @@ out:
return ret;
}
-int __init bpf_arch_init_dispatcher_early(void *ip)
-{
- const u8 *nop_insn = x86_nops[5];
-
- if (is_endbr(*(u32 *)ip))
- ip += ENDBR_INSN_SIZE;
-
- if (memcmp(ip, nop_insn, X86_PATCH_SIZE))
- text_poke_early(ip, nop_insn, X86_PATCH_SIZE);
- return 0;
-}
-
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *old_addr, void *new_addr)
{
@@ -430,7 +425,10 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
EMIT2(0xFF, 0xE0 + reg);
} else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) {
OPTIMIZER_HIDE_VAR(reg);
- emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
+ if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
+ emit_jump(&prog, &__x86_indirect_jump_thunk_array[reg], ip);
+ else
+ emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
} else {
EMIT2(0xFF, 0xE0 + reg); /* jmp *%\reg */
if (IS_ENABLED(CONFIG_RETPOLINE) || IS_ENABLED(CONFIG_SLS))
@@ -445,7 +443,7 @@ static void emit_return(u8 **pprog, u8 *ip)
u8 *prog = *pprog;
if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
- emit_jump(&prog, &__x86_return_thunk, ip);
+ emit_jump(&prog, x86_return_thunk, ip);
} else {
EMIT1(0xC3); /* ret */
if (IS_ENABLED(CONFIG_SLS))
@@ -904,6 +902,65 @@ static void emit_nops(u8 **pprog, int len)
*pprog = prog;
}
+/* emit the 3-byte VEX prefix
+ *
+ * r: same as rex.r, extra bit for ModRM reg field
+ * x: same as rex.x, extra bit for SIB index field
+ * b: same as rex.b, extra bit for ModRM r/m, or SIB base
+ * m: opcode map select, encoding escape bytes e.g. 0x0f38
+ * w: same as rex.w (32 bit or 64 bit) or opcode specific
+ * src_reg2: additional source reg (encoded as BPF reg)
+ * l: vector length (128 bit or 256 bit) or reserved
+ * pp: opcode prefix (none, 0x66, 0xf2 or 0xf3)
+ */
+static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
+ bool w, u8 src_reg2, bool l, u8 pp)
+{
+ u8 *prog = *pprog;
+ const u8 b0 = 0xc4; /* first byte of 3-byte VEX prefix */
+ u8 b1, b2;
+ u8 vvvv = reg2hex[src_reg2];
+
+ /* reg2hex gives only the lower 3 bit of vvvv */
+ if (is_ereg(src_reg2))
+ vvvv |= 1 << 3;
+
+ /*
+ * 2nd byte of 3-byte VEX prefix
+ * ~ means bit inverted encoding
+ *
+ * 7 0
+ * +---+---+---+---+---+---+---+---+
+ * |~R |~X |~B | m |
+ * +---+---+---+---+---+---+---+---+
+ */
+ b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);
+ /*
+ * 3rd byte of 3-byte VEX prefix
+ *
+ * 7 0
+ * +---+---+---+---+---+---+---+---+
+ * | W | ~vvvv | L | pp |
+ * +---+---+---+---+---+---+---+---+
+ */
+ b2 = (w << 7) | ((~vvvv & 0xf) << 3) | (l << 2) | (pp & 3);
+
+ EMIT3(b0, b1, b2);
+ *pprog = prog;
+}
+
+/* emit BMI2 shift instruction */
+static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
+{
+ u8 *prog = *pprog;
+ bool r = is_ereg(dst_reg);
+ u8 m = 2; /* escape code 0f38 */
+
+ emit_3vex(&prog, r, false, r, m, is64, src_reg, false, op);
+ EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
+ *pprog = prog;
+}
+
#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
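[Editorial note] As a worked example of the two helpers above (editorial illustration; BPF R0 maps to rax and R2 to rsi in this JIT):

/*
 * Illustrative only: emit_shiftx(&prog, BPF_REG_0, BPF_REG_2, true, 1)
 * requests a 64-bit SHLX with dst = rax and the count in rsi.
 *
 *   b1 = (!R << 7) | (!X << 6) | (!B << 5) | m   = 0xe2  (R=X=B=0, m=2 -> 0f38)
 *   b2 = (W << 7) | (~vvvv << 3) | (L << 2) | pp = 0xc9  (W=1, vvvv=6 (rsi), L=0, pp=1 -> 0x66)
 *
 * Emitted bytes: c4 e2 c9 f7 c0  ->  shlx rax, rax, rsi
 */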
@@ -1150,17 +1207,38 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
case BPF_ALU64 | BPF_LSH | BPF_X:
case BPF_ALU64 | BPF_RSH | BPF_X:
case BPF_ALU64 | BPF_ARSH | BPF_X:
+ /* BMI2 shifts aren't better when shift count is already in rcx */
+ if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
+ /* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
+ bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
+ u8 op;
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_LSH:
+ op = 1; /* prefix 0x66 */
+ break;
+ case BPF_RSH:
+ op = 3; /* prefix 0xf2 */
+ break;
+ case BPF_ARSH:
+ op = 2; /* prefix 0xf3 */
+ break;
+ }
+
+ emit_shiftx(&prog, dst_reg, src_reg, w, op);
- /* Check for bad case when dst_reg == rcx */
- if (dst_reg == BPF_REG_4) {
- /* mov r11, dst_reg */
- EMIT_mov(AUX_REG, dst_reg);
- dst_reg = AUX_REG;
+ break;
}
if (src_reg != BPF_REG_4) { /* common case */
- EMIT1(0x51); /* push rcx */
-
+ /* Check for bad case when dst_reg == rcx */
+ if (dst_reg == BPF_REG_4) {
+ /* mov r11, dst_reg */
+ EMIT_mov(AUX_REG, dst_reg);
+ dst_reg = AUX_REG;
+ } else {
+ EMIT1(0x51); /* push rcx */
+ }
/* mov rcx, src_reg */
EMIT_mov(BPF_REG_4, src_reg);
}
@@ -1172,12 +1250,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
b3 = simple_alu_opcodes[BPF_OP(insn->code)];
EMIT2(0xD3, add_1reg(b3, dst_reg));
- if (src_reg != BPF_REG_4)
- EMIT1(0x59); /* pop rcx */
+ if (src_reg != BPF_REG_4) {
+ if (insn->dst_reg == BPF_REG_4)
+ /* mov dst_reg, r11 */
+ EMIT_mov(insn->dst_reg, AUX_REG);
+ else
+ EMIT1(0x59); /* pop rcx */
+ }
- if (insn->dst_reg == BPF_REG_4)
- /* mov dst_reg, r11 */
- EMIT_mov(insn->dst_reg, AUX_REG);
break;
case BPF_ALU | BPF_END | BPF_FROM_BE:
@@ -1239,8 +1319,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
/* speculation barrier */
case BPF_ST | BPF_NOSPEC:
- if (boot_cpu_has(X86_FEATURE_XMM2))
- EMIT_LFENCE();
+ EMIT_LFENCE();
break;
/* ST: *(u8*)(dst_reg + off) = imm */
@@ -1446,19 +1525,26 @@ st: if (is_imm8(insn->off))
break;
/* call */
- case BPF_JMP | BPF_CALL:
+ case BPF_JMP | BPF_CALL: {
+ int offs;
+
func = (u8 *) __bpf_call_base + imm32;
if (tail_call_reachable) {
/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
EMIT3_off32(0x48, 0x8B, 0x85,
-round_up(bpf_prog->aux->stack_depth, 8) - 8);
- if (!imm32 || emit_call(&prog, func, image + addrs[i - 1] + 7))
+ if (!imm32)
return -EINVAL;
+ offs = 7 + x86_call_depth_emit_accounting(&prog, func);
} else {
- if (!imm32 || emit_call(&prog, func, image + addrs[i - 1]))
+ if (!imm32)
return -EINVAL;
+ offs = x86_call_depth_emit_accounting(&prog, func);
}
+ if (emit_call(&prog, func, image + addrs[i - 1] + offs))
+ return -EINVAL;
break;
+ }
case BPF_JMP | BPF_TAIL_CALL:
if (imm32)
@@ -1826,10 +1912,6 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
struct bpf_tramp_link *l, int stack_size,
int run_ctx_off, bool save_ret)
{
- void (*exit)(struct bpf_prog *prog, u64 start,
- struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_exit;
- u64 (*enter)(struct bpf_prog *prog,
- struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_enter;
u8 *prog = *pprog;
u8 *jmp_insn;
int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
@@ -1848,23 +1930,12 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
*/
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1, -run_ctx_off + ctx_cookie_off);
- if (p->aux->sleepable) {
- enter = __bpf_prog_enter_sleepable;
- exit = __bpf_prog_exit_sleepable;
- } else if (p->type == BPF_PROG_TYPE_STRUCT_OPS) {
- enter = __bpf_prog_enter_struct_ops;
- exit = __bpf_prog_exit_struct_ops;
- } else if (p->expected_attach_type == BPF_LSM_CGROUP) {
- enter = __bpf_prog_enter_lsm_cgroup;
- exit = __bpf_prog_exit_lsm_cgroup;
- }
-
/* arg1: mov rdi, progs[i] */
emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
/* arg2: lea rsi, [rbp - ctx_cookie_off] */
EMIT4(0x48, 0x8D, 0x75, -run_ctx_off);
- if (emit_call(&prog, enter, prog))
+ if (emit_rsb_call(&prog, bpf_trampoline_enter(p), prog))
return -EINVAL;
/* remember prog start time returned by __bpf_prog_enter */
emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
@@ -1885,7 +1956,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
(long) p->insnsi >> 32,
(u32) (long) p->insnsi);
/* call JITed bpf program or interpreter */
- if (emit_call(&prog, p->bpf_func, prog))
+ if (emit_rsb_call(&prog, p->bpf_func, prog))
return -EINVAL;
/*
@@ -1909,7 +1980,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
/* arg3: lea rdx, [rbp - run_ctx_off] */
EMIT4(0x48, 0x8D, 0x55, -run_ctx_off);
- if (emit_call(&prog, exit, prog))
+ if (emit_rsb_call(&prog, bpf_trampoline_exit(p), prog))
return -EINVAL;
*pprog = prog;
@@ -2131,6 +2202,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
prog = image;
EMIT_ENDBR();
+ /*
+ * This is the direct-call trampoline, as such it needs accounting
+ * for the __fentry__ call.
+ */
+ x86_call_depth_emit_accounting(&prog, NULL);
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
@@ -2157,7 +2233,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
if (flags & BPF_TRAMP_F_CALL_ORIG) {
/* arg1: mov rdi, im */
emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
- if (emit_call(&prog, __bpf_tramp_enter, prog)) {
+ if (emit_rsb_call(&prog, __bpf_tramp_enter, prog)) {
ret = -EINVAL;
goto cleanup;
}
@@ -2189,7 +2265,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
EMIT2(0xff, 0xd0); /* call *rax */
} else {
/* call original function */
- if (emit_call(&prog, orig_call, prog)) {
+ if (emit_rsb_call(&prog, orig_call, prog)) {
ret = -EINVAL;
goto cleanup;
}
@@ -2233,7 +2309,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
im->ip_epilogue = prog;
/* arg1: mov rdi, im */
emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
- if (emit_call(&prog, __bpf_tramp_exit, prog)) {
+ if (emit_rsb_call(&prog, __bpf_tramp_exit, prog)) {
ret = -EINVAL;
goto cleanup;
}
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 2f82480fd430..ea2eb2ec90e2 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -1,4 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "PCI: " fmt
+
#include <linux/pci.h>
#include <linux/acpi.h>
#include <linux/init.h>
@@ -37,15 +40,15 @@ static int __init set_nouse_crs(const struct dmi_system_id *id)
static int __init set_ignore_seg(const struct dmi_system_id *id)
{
- printk(KERN_INFO "PCI: %s detected: ignoring ACPI _SEG\n", id->ident);
+ pr_info("%s detected: ignoring ACPI _SEG\n", id->ident);
pci_ignore_seg = true;
return 0;
}
static int __init set_no_e820(const struct dmi_system_id *id)
{
- printk(KERN_INFO "PCI: %s detected: not clipping E820 regions from _CRS\n",
- id->ident);
+ pr_info("%s detected: not clipping E820 regions from _CRS\n",
+ id->ident);
pci_use_e820 = false;
return 0;
}
@@ -231,10 +234,9 @@ void __init pci_acpi_crs_quirks(void)
else if (pci_probe & PCI_USE__CRS)
pci_use_crs = true;
- printk(KERN_INFO "PCI: %s host bridge windows from ACPI; "
- "if necessary, use \"pci=%s\" and report a bug\n",
- pci_use_crs ? "Using" : "Ignoring",
- pci_use_crs ? "nocrs" : "use_crs");
+ pr_info("%s host bridge windows from ACPI; if necessary, use \"pci=%s\" and report a bug\n",
+ pci_use_crs ? "Using" : "Ignoring",
+ pci_use_crs ? "nocrs" : "use_crs");
/* "pci=use_e820"/"pci=no_e820" on the kernel cmdline takes precedence */
if (pci_probe & PCI_NO_E820)
@@ -242,19 +244,17 @@ void __init pci_acpi_crs_quirks(void)
else if (pci_probe & PCI_USE_E820)
pci_use_e820 = true;
- printk(KERN_INFO "PCI: %s E820 reservations for host bridge windows\n",
- pci_use_e820 ? "Using" : "Ignoring");
+ pr_info("%s E820 reservations for host bridge windows\n",
+ pci_use_e820 ? "Using" : "Ignoring");
if (pci_probe & (PCI_NO_E820 | PCI_USE_E820))
- printk(KERN_INFO "PCI: Please notify linux-pci@vger.kernel.org so future kernels can this automatically\n");
+ pr_info("Please notify linux-pci@vger.kernel.org so future kernels can do this automatically\n");
}
#ifdef CONFIG_PCI_MMCONFIG
static int check_segment(u16 seg, struct device *dev, char *estr)
{
if (seg) {
- dev_err(dev,
- "%s can't access PCI configuration "
- "space under this host bridge.\n",
+ dev_err(dev, "%s can't access configuration space under this host bridge\n",
estr);
return -EIO;
}
@@ -264,9 +264,7 @@ static int check_segment(u16 seg, struct device *dev, char *estr)
* just can't access extended configuration space of
* devices under this host bridge.
*/
- dev_warn(dev,
- "%s can't access extended PCI configuration "
- "space under this bridge.\n",
+ dev_warn(dev, "%s can't access extended configuration space under this bridge\n",
estr);
return 0;
@@ -421,9 +419,8 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
root->segment = domain = 0;
if (domain && !pci_domains_supported) {
- printk(KERN_WARNING "pci_bus %04x:%02x: "
- "ignored (multiple domains not supported)\n",
- domain, busnum);
+ pr_warn("pci_bus %04x:%02x: ignored (multiple domains not supported)\n",
+ domain, busnum);
return NULL;
}
@@ -491,7 +488,7 @@ int __init pci_acpi_init(void)
if (acpi_noirq)
return -ENODEV;
- printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n");
+ pr_info("Using ACPI for IRQ routing\n");
acpi_irq_penalty_init();
pcibios_enable_irq = acpi_pci_irq_enable;
pcibios_disable_irq = acpi_pci_irq_disable;
@@ -503,7 +500,7 @@ int __init pci_acpi_init(void)
* also do it here in case there are still broken drivers that
* don't use pci_enable_device().
*/
- printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
+ pr_info("Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
for_each_pci_dev(dev)
acpi_pci_irq_enable(dev);
}
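[Editorial note] The pr_fmt() definition added at the top of the file is what lets the shortened pr_info() calls keep their "PCI: " prefix; a minimal sketch of the mechanism (illustrative only):

/* Must be defined before printk.h is pulled in so pr_info() picks it up. */
#define pr_fmt(fmt) "PCI: " fmt

pr_info("Using ACPI for IRQ routing\n");
/* expands to: printk(KERN_INFO "PCI: " "Using ACPI for IRQ routing\n") */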
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index a50245157685..543df9a1379d 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -2,5 +2,8 @@
KASAN_SANITIZE := n
GCOV_PROFILE := n
-obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o
+obj-$(CONFIG_EFI) += memmap.o quirks.o efi.o efi_$(BITS).o \
+ efi_stub_$(BITS).o
obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o
+obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_mem.o
+obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index ebc98a68c400..55d9caf66401 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -214,9 +214,11 @@ int __init efi_memblock_x86_reserve_range(void)
data.desc_size = e->efi_memdesc_size;
data.desc_version = e->efi_memdesc_version;
- rv = efi_memmap_init_early(&data);
- if (rv)
- return rv;
+ if (!efi_enabled(EFI_PARAVIRT)) {
+ rv = efi_memmap_init_early(&data);
+ if (rv)
+ return rv;
+ }
if (add_efi_memmap || do_efi_soft_reserve())
do_add_efi_memmap();
@@ -303,6 +305,50 @@ static void __init efi_clean_memmap(void)
}
}
+/*
+ * Firmware can use EfiMemoryMappedIO to request that MMIO regions be
+ * mapped by the OS so they can be accessed by EFI runtime services, but
+ * should have no other significance to the OS (UEFI r2.10, sec 7.2).
+ * However, most bootloaders and EFI stubs convert EfiMemoryMappedIO
+ * regions to E820_TYPE_RESERVED entries, which prevent Linux from
+ * allocating space from them (see remove_e820_regions()).
+ *
+ * Some platforms use EfiMemoryMappedIO entries for PCI MMCONFIG space and
+ * PCI host bridge windows, which means Linux can't allocate BAR space for
+ * hot-added devices.
+ *
+ * Remove large EfiMemoryMappedIO regions from the E820 map to avoid this
+ * problem.
+ *
+ * Retain small EfiMemoryMappedIO regions because on some platforms, these
+ * describe non-window space that's included in host bridge _CRS. If we
+ * assign that space to PCI devices, they don't work.
+ */
+static void __init efi_remove_e820_mmio(void)
+{
+ efi_memory_desc_t *md;
+ u64 size, start, end;
+ int i = 0;
+
+ for_each_efi_memory_desc(md) {
+ if (md->type == EFI_MEMORY_MAPPED_IO) {
+ size = md->num_pages << EFI_PAGE_SHIFT;
+ start = md->phys_addr;
+ end = start + size - 1;
+ if (size >= 256*1024) {
+ pr_info("Remove mem%02u: MMIO range=[0x%08llx-0x%08llx] (%lluMB) from e820 map\n",
+ i, start, end, size >> 20);
+ e820__range_remove(start, size,
+ E820_TYPE_RESERVED, 1);
+ } else {
+ pr_info("Not removing mem%02u: MMIO range=[0x%08llx-0x%08llx] (%lluKB) from e820 map\n",
+ i, start, end, size >> 10);
+ }
+ }
+ i++;
+ }
+}
+
void __init efi_print_memmap(void)
{
efi_memory_desc_t *md;
@@ -474,6 +520,8 @@ void __init efi_init(void)
set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
efi_clean_memmap();
+ efi_remove_e820_mmio();
+
if (efi_enabled(EFI_DBG))
efi_print_memmap();
}
diff --git a/arch/x86/platform/efi/fake_mem.c b/arch/x86/platform/efi/fake_mem.c
new file mode 100644
index 000000000000..41d57cad3d84
--- /dev/null
+++ b/arch/x86/platform/efi/fake_mem.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fake_mem.c
+ *
+ * Copyright (C) 2015 FUJITSU LIMITED
+ * Author: Taku Izumi <izumi.taku@jp.fujitsu.com>
+ *
+ * This code introduces a new boot option named "efi_fake_mem".
+ * By specifying this parameter, you can add an arbitrary attribute to a
+ * specific memory range by updating the original (firmware provided) EFI
+ * memmap.
+ */
+
+#include <linux/kernel.h>
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/types.h>
+#include <linux/sort.h>
+#include <asm/e820/api.h>
+#include <asm/efi.h>
+
+#define EFI_MAX_FAKEMEM CONFIG_EFI_MAX_FAKE_MEM
+
+static struct efi_mem_range efi_fake_mems[EFI_MAX_FAKEMEM];
+static int nr_fake_mem;
+
+static int __init cmp_fake_mem(const void *x1, const void *x2)
+{
+ const struct efi_mem_range *m1 = x1;
+ const struct efi_mem_range *m2 = x2;
+
+ if (m1->range.start < m2->range.start)
+ return -1;
+ if (m1->range.start > m2->range.start)
+ return 1;
+ return 0;
+}
+
+static void __init efi_fake_range(struct efi_mem_range *efi_range)
+{
+ struct efi_memory_map_data data = { 0 };
+ int new_nr_map = efi.memmap.nr_map;
+ efi_memory_desc_t *md;
+ void *new_memmap;
+
+ /* count up the number of EFI memory descriptors */
+ for_each_efi_memory_desc(md)
+ new_nr_map += efi_memmap_split_count(md, &efi_range->range);
+
+ /* allocate memory for new EFI memmap */
+ if (efi_memmap_alloc(new_nr_map, &data) != 0)
+ return;
+
+ /* create new EFI memmap */
+ new_memmap = early_memremap(data.phys_map, data.size);
+ if (!new_memmap) {
+ __efi_memmap_free(data.phys_map, data.size, data.flags);
+ return;
+ }
+
+ efi_memmap_insert(&efi.memmap, new_memmap, efi_range);
+
+ /* swap into new EFI memmap */
+ early_memunmap(new_memmap, data.size);
+
+ efi_memmap_install(&data);
+}
+
+void __init efi_fake_memmap(void)
+{
+ int i;
+
+ if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem)
+ return;
+
+ for (i = 0; i < nr_fake_mem; i++)
+ efi_fake_range(&efi_fake_mems[i]);
+
+ /* print new EFI memmap */
+ efi_print_memmap();
+}
+
+static int __init setup_fake_mem(char *p)
+{
+ u64 start = 0, mem_size = 0, attribute = 0;
+ int i;
+
+ if (!p)
+ return -EINVAL;
+
+ while (*p != '\0') {
+ mem_size = memparse(p, &p);
+ if (*p == '@')
+ start = memparse(p+1, &p);
+ else
+ break;
+
+ if (*p == ':')
+ attribute = simple_strtoull(p+1, &p, 0);
+ else
+ break;
+
+ if (nr_fake_mem >= EFI_MAX_FAKEMEM)
+ break;
+
+ efi_fake_mems[nr_fake_mem].range.start = start;
+ efi_fake_mems[nr_fake_mem].range.end = start + mem_size - 1;
+ efi_fake_mems[nr_fake_mem].attribute = attribute;
+ nr_fake_mem++;
+
+ if (*p == ',')
+ p++;
+ }
+
+ sort(efi_fake_mems, nr_fake_mem, sizeof(struct efi_mem_range),
+ cmp_fake_mem, NULL);
+
+ for (i = 0; i < nr_fake_mem; i++)
+ pr_info("efi_fake_mem: add attr=0x%016llx to [mem 0x%016llx-0x%016llx]",
+ efi_fake_mems[i].attribute, efi_fake_mems[i].range.start,
+ efi_fake_mems[i].range.end);
+
+ return *p == '\0' ? 0 : -EINVAL;
+}
+
+early_param("efi_fake_mem", setup_fake_mem);
+
+void __init efi_fake_memmap_early(void)
+{
+ int i;
+
+ /*
+ * The late efi_fake_memmap() call can handle all requests if
+ * EFI_MEMORY_SP support is disabled.
+ */
+ if (!efi_soft_reserve_enabled())
+ return;
+
+ if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem)
+ return;
+
+ /*
+ * Given that efi_fake_memmap() needs to perform memblock
+ * allocations it needs to run after e820__memblock_setup().
+ * However, if efi_fake_mem specifies EFI_MEMORY_SP for a given
+ * address range, that memory potentially needs to be marked as
+ * reserved prior to e820__memblock_setup(). Update e820
+ * directly if EFI_MEMORY_SP is specified for an
+ * EFI_CONVENTIONAL_MEMORY descriptor.
+ */
+ for (i = 0; i < nr_fake_mem; i++) {
+ struct efi_mem_range *mem = &efi_fake_mems[i];
+ efi_memory_desc_t *md;
+ u64 m_start, m_end;
+
+ if ((mem->attribute & EFI_MEMORY_SP) == 0)
+ continue;
+
+ m_start = mem->range.start;
+ m_end = mem->range.end;
+ for_each_efi_memory_desc(md) {
+ u64 start, end, size;
+
+ if (md->type != EFI_CONVENTIONAL_MEMORY)
+ continue;
+
+ start = md->phys_addr;
+ end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1;
+
+ if (m_start <= end && m_end >= start)
+ /* fake range overlaps descriptor */;
+ else
+ continue;
+
+ /*
+ * Trim the boundary of the e820 update to the
+ * descriptor in case the fake range overlaps
+ * !EFI_CONVENTIONAL_MEMORY
+ */
+ start = max(start, m_start);
+ end = min(end, m_end);
+ size = end - start + 1;
+
+ if (end <= start)
+ continue;
+
+ /*
+ * Ensure each efi_fake_mem instance results in
+ * a unique e820 resource
+ */
+ e820__range_remove(start, size, E820_TYPE_RAM, 1);
+ e820__range_add(start, size, E820_TYPE_SOFT_RESERVED);
+ e820__update_table(e820_table);
+ }
+ }
+}
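[Editorial note] For reference, the parser above accepts a comma-separated list of size@address:attribute triplets. A hedged example of the syntax (ranges and placements are illustrative; 0x10000 is EFI_MEMORY_MORE_RELIABLE and 0x40000 is EFI_MEMORY_SP):

/*
 * Illustrative only, on the kernel command line:
 *
 *   efi_fake_mem=4G@9G:0x10000
 *       tag the 4G range starting at 9G as EFI_MEMORY_MORE_RELIABLE
 *
 *   efi_fake_mem=2G@0x100000000:0x40000
 *       tag 2G at 4G as EFI_MEMORY_SP, which the early path above also
 *       converts to E820_TYPE_SOFT_RESERVED
 */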
diff --git a/arch/x86/platform/efi/memmap.c b/arch/x86/platform/efi/memmap.c
new file mode 100644
index 000000000000..c69f8471e6d0
--- /dev/null
+++ b/arch/x86/platform/efi/memmap.c
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common EFI memory map functions.
+ */
+
+#define pr_fmt(fmt) "efi: " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/efi.h>
+#include <linux/io.h>
+#include <asm/early_ioremap.h>
+#include <asm/efi.h>
+#include <linux/memblock.h>
+#include <linux/slab.h>
+
+static phys_addr_t __init __efi_memmap_alloc_early(unsigned long size)
+{
+ return memblock_phys_alloc(size, SMP_CACHE_BYTES);
+}
+
+static phys_addr_t __init __efi_memmap_alloc_late(unsigned long size)
+{
+ unsigned int order = get_order(size);
+ struct page *p = alloc_pages(GFP_KERNEL, order);
+
+ if (!p)
+ return 0;
+
+ return PFN_PHYS(page_to_pfn(p));
+}
+
+void __init __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags)
+{
+ if (flags & EFI_MEMMAP_MEMBLOCK) {
+ if (slab_is_available())
+ memblock_free_late(phys, size);
+ else
+ memblock_phys_free(phys, size);
+ } else if (flags & EFI_MEMMAP_SLAB) {
+ struct page *p = pfn_to_page(PHYS_PFN(phys));
+ unsigned int order = get_order(size);
+
+ free_pages((unsigned long) page_address(p), order);
+ }
+}
+
+/**
+ * efi_memmap_alloc - Allocate memory for the EFI memory map
+ * @num_entries: Number of entries in the allocated map.
+ * @data: efi memmap installation parameters
+ *
+ * Depending on whether mm_init() has already been invoked or not,
+ * either memblock or "normal" page allocation is used.
+ *
+ * Returns zero on success, a negative error code on failure.
+ */
+int __init efi_memmap_alloc(unsigned int num_entries,
+ struct efi_memory_map_data *data)
+{
+ /* Expect allocation parameters are zero initialized */
+ WARN_ON(data->phys_map || data->size);
+
+ data->size = num_entries * efi.memmap.desc_size;
+ data->desc_version = efi.memmap.desc_version;
+ data->desc_size = efi.memmap.desc_size;
+ data->flags &= ~(EFI_MEMMAP_SLAB | EFI_MEMMAP_MEMBLOCK);
+ data->flags |= efi.memmap.flags & EFI_MEMMAP_LATE;
+
+ if (slab_is_available()) {
+ data->flags |= EFI_MEMMAP_SLAB;
+ data->phys_map = __efi_memmap_alloc_late(data->size);
+ } else {
+ data->flags |= EFI_MEMMAP_MEMBLOCK;
+ data->phys_map = __efi_memmap_alloc_early(data->size);
+ }
+
+ if (!data->phys_map)
+ return -ENOMEM;
+ return 0;
+}
+
+/**
+ * efi_memmap_install - Install a new EFI memory map in efi.memmap
+ * @ctx: map allocation parameters (address, size, flags)
+ *
+ * Unlike efi_memmap_init_*(), this function does not allow the caller
+ * to switch from early to late mappings. It simply uses the existing
+ * mapping function and installs the new memmap.
+ *
+ * Returns zero on success, a negative error code on failure.
+ */
+int __init efi_memmap_install(struct efi_memory_map_data *data)
+{
+ efi_memmap_unmap();
+
+ if (efi_enabled(EFI_PARAVIRT))
+ return 0;
+
+ return __efi_memmap_init(data);
+}
+
+/**
+ * efi_memmap_split_count - Count number of additional EFI memmap entries
+ * @md: EFI memory descriptor to split
+ * @range: Address range (start, end) to split around
+ *
+ * Returns the number of additional EFI memmap entries required to
+ * accommodate @range.
+ */
+int __init efi_memmap_split_count(efi_memory_desc_t *md, struct range *range)
+{
+ u64 m_start, m_end;
+ u64 start, end;
+ int count = 0;
+
+ start = md->phys_addr;
+ end = start + (md->num_pages << EFI_PAGE_SHIFT) - 1;
+
+ /* modifying range */
+ m_start = range->start;
+ m_end = range->end;
+
+ if (m_start <= start) {
+ /* split into 2 parts */
+ if (start < m_end && m_end < end)
+ count++;
+ }
+
+ if (start < m_start && m_start < end) {
+ /* split into 3 parts */
+ if (m_end < end)
+ count += 2;
+ /* split into 2 parts */
+ if (end <= m_end)
+ count++;
+ }
+
+ return count;
+}
+
+/**
+ * efi_memmap_insert - Insert a memory region in an EFI memmap
+ * @old_memmap: The existing EFI memory map structure
+ * @buf: Address of buffer to store new map
+ * @mem: Memory map entry to insert
+ *
+ * It is suggested that you call efi_memmap_split_count() first
+ * to see how large @buf needs to be.
+ */
+void __init efi_memmap_insert(struct efi_memory_map *old_memmap, void *buf,
+ struct efi_mem_range *mem)
+{
+ u64 m_start, m_end, m_attr;
+ efi_memory_desc_t *md;
+ u64 start, end;
+ void *old, *new;
+
+ /* modifying range */
+ m_start = mem->range.start;
+ m_end = mem->range.end;
+ m_attr = mem->attribute;
+
+ /*
+ * The EFI memory map deals with regions in EFI_PAGE_SIZE
+ * units. Ensure that the region described by 'mem' is aligned
+ * correctly.
+ */
+ if (!IS_ALIGNED(m_start, EFI_PAGE_SIZE) ||
+ !IS_ALIGNED(m_end + 1, EFI_PAGE_SIZE)) {
+ WARN_ON(1);
+ return;
+ }
+
+ for (old = old_memmap->map, new = buf;
+ old < old_memmap->map_end;
+ old += old_memmap->desc_size, new += old_memmap->desc_size) {
+
+ /* copy original EFI memory descriptor */
+ memcpy(new, old, old_memmap->desc_size);
+ md = new;
+ start = md->phys_addr;
+ end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1;
+
+ if (m_start <= start && end <= m_end)
+ md->attribute |= m_attr;
+
+ if (m_start <= start &&
+ (start < m_end && m_end < end)) {
+ /* first part */
+ md->attribute |= m_attr;
+ md->num_pages = (m_end - md->phys_addr + 1) >>
+ EFI_PAGE_SHIFT;
+ /* latter part */
+ new += old_memmap->desc_size;
+ memcpy(new, old, old_memmap->desc_size);
+ md = new;
+ md->phys_addr = m_end + 1;
+ md->num_pages = (end - md->phys_addr + 1) >>
+ EFI_PAGE_SHIFT;
+ }
+
+ if ((start < m_start && m_start < end) && m_end < end) {
+ /* first part */
+ md->num_pages = (m_start - md->phys_addr) >>
+ EFI_PAGE_SHIFT;
+ /* middle part */
+ new += old_memmap->desc_size;
+ memcpy(new, old, old_memmap->desc_size);
+ md = new;
+ md->attribute |= m_attr;
+ md->phys_addr = m_start;
+ md->num_pages = (m_end - m_start + 1) >>
+ EFI_PAGE_SHIFT;
+ /* last part */
+ new += old_memmap->desc_size;
+ memcpy(new, old, old_memmap->desc_size);
+ md = new;
+ md->phys_addr = m_end + 1;
+ md->num_pages = (end - m_end) >>
+ EFI_PAGE_SHIFT;
+ }
+
+ if ((start < m_start && m_start < end) &&
+ (end <= m_end)) {
+ /* first part */
+ md->num_pages = (m_start - md->phys_addr) >>
+ EFI_PAGE_SHIFT;
+ /* latter part */
+ new += old_memmap->desc_size;
+ memcpy(new, old, old_memmap->desc_size);
+ md = new;
+ md->phys_addr = m_start;
+ md->num_pages = (end - md->phys_addr + 1) >>
+ EFI_PAGE_SHIFT;
+ md->attribute |= m_attr;
+ }
+ }
+}
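[Editorial note] A worked example of the split arithmetic implemented above (all values illustrative):

/*
 * Illustrative only: take a 4-page descriptor [0x10000000, 0x10003fff]
 * and a fake range covering one EFI page in the middle,
 * [0x10001000, 0x10001fff]:
 *
 *   efi_memmap_split_count() hits the "split into 3 parts" case
 *   (start < m_start < end and m_end < end) and returns 2, i.e. two
 *   extra slots are needed.
 *
 *   efi_memmap_insert() then produces:
 *     [0x10000000, 0x10000fff]  original attributes
 *     [0x10001000, 0x10001fff]  original attributes | m_attr
 *     [0x10002000, 0x10003fff]  original attributes
 */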
diff --git a/arch/x86/platform/efi/runtime-map.c b/arch/x86/platform/efi/runtime-map.c
new file mode 100644
index 000000000000..bbee682ef8cd
--- /dev/null
+++ b/arch/x86/platform/efi/runtime-map.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2013 Red Hat, Inc., Dave Young <dyoung@redhat.com>
+ */
+
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/efi.h>
+#include <linux/slab.h>
+
+#include <asm/efi.h>
+#include <asm/setup.h>
+
+struct efi_runtime_map_entry {
+ efi_memory_desc_t md;
+ struct kobject kobj; /* kobject for each entry */
+};
+
+static struct efi_runtime_map_entry **map_entries;
+
+struct map_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct efi_runtime_map_entry *entry, char *buf);
+};
+
+static inline struct map_attribute *to_map_attr(struct attribute *attr)
+{
+ return container_of(attr, struct map_attribute, attr);
+}
+
+static ssize_t type_show(struct efi_runtime_map_entry *entry, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "0x%x\n", entry->md.type);
+}
+
+#define EFI_RUNTIME_FIELD(var) entry->md.var
+
+#define EFI_RUNTIME_U64_ATTR_SHOW(name) \
+static ssize_t name##_show(struct efi_runtime_map_entry *entry, char *buf) \
+{ \
+ return snprintf(buf, PAGE_SIZE, "0x%llx\n", EFI_RUNTIME_FIELD(name)); \
+}
+
+EFI_RUNTIME_U64_ATTR_SHOW(phys_addr);
+EFI_RUNTIME_U64_ATTR_SHOW(virt_addr);
+EFI_RUNTIME_U64_ATTR_SHOW(num_pages);
+EFI_RUNTIME_U64_ATTR_SHOW(attribute);
+
+static inline struct efi_runtime_map_entry *to_map_entry(struct kobject *kobj)
+{
+ return container_of(kobj, struct efi_runtime_map_entry, kobj);
+}
+
+static ssize_t map_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct efi_runtime_map_entry *entry = to_map_entry(kobj);
+ struct map_attribute *map_attr = to_map_attr(attr);
+
+ return map_attr->show(entry, buf);
+}
+
+static struct map_attribute map_type_attr = __ATTR_RO_MODE(type, 0400);
+static struct map_attribute map_phys_addr_attr = __ATTR_RO_MODE(phys_addr, 0400);
+static struct map_attribute map_virt_addr_attr = __ATTR_RO_MODE(virt_addr, 0400);
+static struct map_attribute map_num_pages_attr = __ATTR_RO_MODE(num_pages, 0400);
+static struct map_attribute map_attribute_attr = __ATTR_RO_MODE(attribute, 0400);
+
+/*
+ * These are default attributes that are added for every memmap entry.
+ */
+static struct attribute *def_attrs[] = {
+ &map_type_attr.attr,
+ &map_phys_addr_attr.attr,
+ &map_virt_addr_attr.attr,
+ &map_num_pages_attr.attr,
+ &map_attribute_attr.attr,
+ NULL
+};
+ATTRIBUTE_GROUPS(def);
+
+static const struct sysfs_ops map_attr_ops = {
+ .show = map_attr_show,
+};
+
+static void map_release(struct kobject *kobj)
+{
+ struct efi_runtime_map_entry *entry;
+
+ entry = to_map_entry(kobj);
+ kfree(entry);
+}
+
+static struct kobj_type __refdata map_ktype = {
+ .sysfs_ops = &map_attr_ops,
+ .default_groups = def_groups,
+ .release = map_release,
+};
+
+static struct kset *map_kset;
+
+static struct efi_runtime_map_entry *
+add_sysfs_runtime_map_entry(struct kobject *kobj, int nr,
+ efi_memory_desc_t *md)
+{
+ int ret;
+ struct efi_runtime_map_entry *entry;
+
+ if (!map_kset) {
+ map_kset = kset_create_and_add("runtime-map", NULL, kobj);
+ if (!map_kset)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry) {
+ kset_unregister(map_kset);
+ map_kset = NULL;
+ return ERR_PTR(-ENOMEM);
+ }
+
+ memcpy(&entry->md, md, sizeof(efi_memory_desc_t));
+
+ kobject_init(&entry->kobj, &map_ktype);
+ entry->kobj.kset = map_kset;
+ ret = kobject_add(&entry->kobj, NULL, "%d", nr);
+ if (ret) {
+ kobject_put(&entry->kobj);
+ kset_unregister(map_kset);
+ map_kset = NULL;
+ return ERR_PTR(ret);
+ }
+
+ return entry;
+}
+
+int efi_get_runtime_map_size(void)
+{
+ return efi.memmap.nr_map * efi.memmap.desc_size;
+}
+
+int efi_get_runtime_map_desc_size(void)
+{
+ return efi.memmap.desc_size;
+}
+
+int efi_runtime_map_copy(void *buf, size_t bufsz)
+{
+ size_t sz = efi_get_runtime_map_size();
+
+ if (sz > bufsz)
+ sz = bufsz;
+
+ memcpy(buf, efi.memmap.map, sz);
+ return 0;
+}
+
+static int __init efi_runtime_map_init(void)
+{
+ int i, j, ret = 0;
+ struct efi_runtime_map_entry *entry;
+ efi_memory_desc_t *md;
+
+ if (!efi_enabled(EFI_MEMMAP) || !efi_kobj)
+ return 0;
+
+ map_entries = kcalloc(efi.memmap.nr_map, sizeof(entry), GFP_KERNEL);
+ if (!map_entries) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ i = 0;
+ for_each_efi_memory_desc(md) {
+ entry = add_sysfs_runtime_map_entry(efi_kobj, i, md);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry);
+ goto out_add_entry;
+ }
+ *(map_entries + i++) = entry;
+ }
+
+ return 0;
+out_add_entry:
+ for (j = i - 1; j >= 0; j--) {
+ entry = *(map_entries + j);
+ kobject_put(&entry->kobj);
+ }
+out:
+ return ret;
+}
+subsys_initcall_sync(efi_runtime_map_init);
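[Editorial note] The kset registered above hangs off the EFI kobject, so each map entry ends up as a small sysfs directory; a sketch of the resulting layout (entry numbering depends on the firmware-provided map):

/*
 * Illustrative only:
 *
 *   /sys/firmware/efi/runtime-map/0/type
 *   /sys/firmware/efi/runtime-map/0/phys_addr
 *   /sys/firmware/efi/runtime-map/0/virt_addr
 *   /sys/firmware/efi/runtime-map/0/num_pages
 *   /sys/firmware/efi/runtime-map/0/attribute
 *   /sys/firmware/efi/runtime-map/1/...
 */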
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 994a229cb79f..68244a3422d1 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -183,13 +183,12 @@ err_sysfs:
return r;
}
-static int xo15_sci_remove(struct acpi_device *device)
+static void xo15_sci_remove(struct acpi_device *device)
{
acpi_disable_gpe(NULL, xo15_sci_gpe);
acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler);
cancel_work_sync(&sci_work);
sysfs_remove_file(&device->dev.kobj, &lid_wake_on_close_attr.attr);
- return 0;
}
#ifdef CONFIG_PM_SLEEP
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index bb176c72891c..236447ee9beb 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -23,6 +23,7 @@
#include <asm/fpu/api.h>
#include <asm/debugreg.h>
#include <asm/cpu.h>
+#include <asm/cacheinfo.h>
#include <asm/mmu_context.h>
#include <asm/cpu_device_id.h>
#include <asm/microcode.h>
@@ -261,7 +262,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
do_fpu_end();
tsc_verify_tsc_adjust(true);
x86_platform.restore_sched_clock_state();
- mtrr_bp_restore();
+ cache_bp_restore();
perf_restore_debug_store();
c = &cpu_data(smp_processor_id());
@@ -513,15 +514,23 @@ static int pm_cpu_check(const struct x86_cpu_id *c)
static void pm_save_spec_msr(void)
{
- u32 spec_msr_id[] = {
- MSR_IA32_SPEC_CTRL,
- MSR_IA32_TSX_CTRL,
- MSR_TSX_FORCE_ABORT,
- MSR_IA32_MCU_OPT_CTRL,
- MSR_AMD64_LS_CFG,
+ struct msr_enumeration {
+ u32 msr_no;
+ u32 feature;
+ } msr_enum[] = {
+ { MSR_IA32_SPEC_CTRL, X86_FEATURE_MSR_SPEC_CTRL },
+ { MSR_IA32_TSX_CTRL, X86_FEATURE_MSR_TSX_CTRL },
+ { MSR_TSX_FORCE_ABORT, X86_FEATURE_TSX_FORCE_ABORT },
+ { MSR_IA32_MCU_OPT_CTRL, X86_FEATURE_SRBDS_CTRL },
+ { MSR_AMD64_LS_CFG, X86_FEATURE_LS_CFG_SSBD },
+ { MSR_AMD64_DE_CFG, X86_FEATURE_LFENCE_RDTSC },
};
+ int i;
- msr_build_context(spec_msr_id, ARRAY_SIZE(spec_msr_id));
+ for (i = 0; i < ARRAY_SIZE(msr_enum); i++) {
+ if (boot_cpu_has(msr_enum[i].feature))
+ msr_build_context(&msr_enum[i].msr_no, 1);
+ }
}
static int pm_check_save_msr(void)
diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c
index e94e0050a583..6f955eb1e163 100644
--- a/arch/x86/power/hibernate.c
+++ b/arch/x86/power/hibernate.c
@@ -159,7 +159,7 @@ int relocate_restore_code(void)
if (!relocated_restore_code)
return -ENOMEM;
- memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE);
+ __memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE);
/* Make the page containing the relocated code executable */
pgd = (pgd_t *)__va(read_cr3_pa()) +
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 41d7669a97ad..af565816d2ba 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -200,14 +200,18 @@ static void __init set_real_mode_permissions(void)
set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT);
}
-static int __init init_real_mode(void)
+void __init init_real_mode(void)
{
if (!real_mode_header)
panic("Real mode trampoline was not allocated");
setup_real_mode();
set_real_mode_permissions();
+}
+static int __init do_init_real_mode(void)
+{
+ x86_platform.realmode_init();
return 0;
}
-early_initcall(init_real_mode);
+early_initcall(do_init_real_mode);
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index dcaf3b38a9e0..6523eb7c3bd1 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -201,10 +201,6 @@ typedef struct user_i387_struct elf_fpregset_t;
struct task_struct;
-extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu);
-
-#define ELF_CORE_COPY_FPREGS(t, fpu) elf_core_copy_fpregs(t, fpu)
-
#define ELF_EXEC_PAGESIZE 4096
#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2)
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index f82857e48815..5b1379662877 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -23,6 +23,7 @@
#include <linux/start_kernel.h>
#include <linux/sched.h>
#include <linux/kprobes.h>
+#include <linux/kstrtox.h>
#include <linux/memblock.h>
#include <linux/export.h>
#include <linux/mm.h>
@@ -32,6 +33,7 @@
#include <linux/edd.h>
#include <linux/reboot.h>
#include <linux/virtio_anchor.h>
+#include <linux/stackprotector.h>
#include <xen/xen.h>
#include <xen/events.h>
@@ -64,7 +66,6 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
-#include <asm/stackprotector.h>
#include <asm/hypervisor.h>
#include <asm/mach_traps.h>
#include <asm/mwait.h>
@@ -113,7 +114,7 @@ static __read_mostly bool xen_msr_safe = IS_ENABLED(CONFIG_XEN_PV_MSR_SAFE);
static int __init parse_xen_msr_safe(char *str)
{
if (str)
- return strtobool(str, &xen_msr_safe);
+ return kstrtobool(str, &xen_msr_safe);
return -EINVAL;
}
early_param("xen_msr_safe", parse_xen_msr_safe);
@@ -1209,7 +1210,7 @@ static void __init xen_setup_gdt(int cpu)
pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot;
pv_ops.cpu.load_gdt = xen_load_gdt_boot;
- switch_to_new_gdt(cpu);
+ switch_gdt_and_percpu_base(cpu);
pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry;
pv_ops.cpu.load_gdt = xen_load_gdt;
@@ -1265,6 +1266,8 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
xen_vcpu_info_reset(0);
x86_platform.get_nmi_reason = xen_get_nmi_reason;
+ x86_platform.realmode_reserve = x86_init_noop;
+ x86_platform.realmode_init = x86_init_noop;
x86_init.resources.memory_setup = xen_memory_setup;
x86_init.irqs.intr_mode_select = x86_init_noop;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 4f4309500559..8db26f10fb1d 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/sched.h>
+#include <linux/kstrtox.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
@@ -85,7 +86,7 @@ static void __init xen_parse_512gb(void)
arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
if (!arg)
val = true;
- else if (strtobool(arg + strlen("xen_512gb_limit="), &val))
+ else if (kstrtobool(arg + strlen("xen_512gb_limit="), &val))
return;
xen_512gb_limit = val;
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c3e1f9a7d43a..4b0d6fff88de 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -32,30 +32,30 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
void xen_smp_intr_free(unsigned int cpu)
{
+ kfree(per_cpu(xen_resched_irq, cpu).name);
+ per_cpu(xen_resched_irq, cpu).name = NULL;
if (per_cpu(xen_resched_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL);
per_cpu(xen_resched_irq, cpu).irq = -1;
- kfree(per_cpu(xen_resched_irq, cpu).name);
- per_cpu(xen_resched_irq, cpu).name = NULL;
}
+ kfree(per_cpu(xen_callfunc_irq, cpu).name);
+ per_cpu(xen_callfunc_irq, cpu).name = NULL;
if (per_cpu(xen_callfunc_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu).irq, NULL);
per_cpu(xen_callfunc_irq, cpu).irq = -1;
- kfree(per_cpu(xen_callfunc_irq, cpu).name);
- per_cpu(xen_callfunc_irq, cpu).name = NULL;
}
+ kfree(per_cpu(xen_debug_irq, cpu).name);
+ per_cpu(xen_debug_irq, cpu).name = NULL;
if (per_cpu(xen_debug_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu).irq, NULL);
per_cpu(xen_debug_irq, cpu).irq = -1;
- kfree(per_cpu(xen_debug_irq, cpu).name);
- per_cpu(xen_debug_irq, cpu).name = NULL;
}
+ kfree(per_cpu(xen_callfuncsingle_irq, cpu).name);
+ per_cpu(xen_callfuncsingle_irq, cpu).name = NULL;
if (per_cpu(xen_callfuncsingle_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu).irq,
NULL);
per_cpu(xen_callfuncsingle_irq, cpu).irq = -1;
- kfree(per_cpu(xen_callfuncsingle_irq, cpu).name);
- per_cpu(xen_callfuncsingle_irq, cpu).name = NULL;
}
}
@@ -65,6 +65,7 @@ int xen_smp_intr_init(unsigned int cpu)
char *resched_name, *callfunc_name, *debug_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
+ per_cpu(xen_resched_irq, cpu).name = resched_name;
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
cpu,
xen_reschedule_interrupt,
@@ -74,9 +75,9 @@ int xen_smp_intr_init(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_resched_irq, cpu).irq = rc;
- per_cpu(xen_resched_irq, cpu).name = resched_name;
callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
+ per_cpu(xen_callfunc_irq, cpu).name = callfunc_name;
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
cpu,
xen_call_function_interrupt,
@@ -86,10 +87,10 @@ int xen_smp_intr_init(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_callfunc_irq, cpu).irq = rc;
- per_cpu(xen_callfunc_irq, cpu).name = callfunc_name;
if (!xen_fifo_events) {
debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
+ per_cpu(xen_debug_irq, cpu).name = debug_name;
rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu,
xen_debug_interrupt,
IRQF_PERCPU | IRQF_NOBALANCING,
@@ -97,10 +98,10 @@ int xen_smp_intr_init(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_debug_irq, cpu).irq = rc;
- per_cpu(xen_debug_irq, cpu).name = debug_name;
}
callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
+ per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name;
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
cpu,
xen_call_function_single_interrupt,
@@ -110,7 +111,6 @@ int xen_smp_intr_init(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_callfuncsingle_irq, cpu).irq = rc;
- per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name;
return 0;
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 480be82e9b7b..6175f2c5c822 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -97,18 +97,18 @@ asmlinkage __visible void cpu_bringup_and_idle(void)
void xen_smp_intr_free_pv(unsigned int cpu)
{
+ kfree(per_cpu(xen_irq_work, cpu).name);
+ per_cpu(xen_irq_work, cpu).name = NULL;
if (per_cpu(xen_irq_work, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL);
per_cpu(xen_irq_work, cpu).irq = -1;
- kfree(per_cpu(xen_irq_work, cpu).name);
- per_cpu(xen_irq_work, cpu).name = NULL;
}
+ kfree(per_cpu(xen_pmu_irq, cpu).name);
+ per_cpu(xen_pmu_irq, cpu).name = NULL;
if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
per_cpu(xen_pmu_irq, cpu).irq = -1;
- kfree(per_cpu(xen_pmu_irq, cpu).name);
- per_cpu(xen_pmu_irq, cpu).name = NULL;
}
}
@@ -118,6 +118,7 @@ int xen_smp_intr_init_pv(unsigned int cpu)
char *callfunc_name, *pmu_name;
callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
+ per_cpu(xen_irq_work, cpu).name = callfunc_name;
rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
cpu,
xen_irq_work_interrupt,
@@ -127,10 +128,10 @@ int xen_smp_intr_init_pv(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_irq_work, cpu).irq = rc;
- per_cpu(xen_irq_work, cpu).name = callfunc_name;
if (is_xen_pmu) {
pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
+ per_cpu(xen_pmu_irq, cpu).name = pmu_name;
rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
xen_pmu_irq_handler,
IRQF_PERCPU|IRQF_NOBALANCING,
@@ -138,7 +139,6 @@ int xen_smp_intr_init_pv(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_pmu_irq, cpu).irq = rc;
- per_cpu(xen_pmu_irq, cpu).name = pmu_name;
}
return 0;
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 043c73dfd2c9..5c6fc16e4b92 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -75,6 +75,7 @@ void xen_init_lock_cpu(int cpu)
cpu, per_cpu(lock_kicker_irq, cpu));
name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
+ per_cpu(irq_name, cpu) = name;
irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
cpu,
dummy_handler,
@@ -85,7 +86,6 @@ void xen_init_lock_cpu(int cpu)
if (irq >= 0) {
disable_irq(irq); /* make sure it's never delivered */
per_cpu(lock_kicker_irq, cpu) = irq;
- per_cpu(irq_name, cpu) = name;
}
printk("cpu %d spinlock event irq %d\n", cpu, irq);
@@ -98,6 +98,8 @@ void xen_uninit_lock_cpu(int cpu)
if (!xen_pvspin)
return;
+ kfree(per_cpu(irq_name, cpu));
+ per_cpu(irq_name, cpu) = NULL;
/*
* When booting the kernel with 'mitigations=auto,nosmt', the secondary
* CPUs are not activated, and lock_kicker_irq is not initialized.
@@ -108,8 +110,6 @@ void xen_uninit_lock_cpu(int cpu)
unbind_from_irqhandler(irq, NULL);
per_cpu(lock_kicker_irq, cpu) = -1;
- kfree(per_cpu(irq_name, cpu));
- per_cpu(irq_name, cpu) = NULL;
}
PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen);
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 6b4fdf6b9542..4a184f6e4e4d 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -262,10 +262,10 @@ SYM_CODE_START(xen_entry_SYSCALL_compat)
/*
* Neither Xen nor the kernel really knows what the old SS and
- * CS were. The kernel expects __USER32_DS and __USER32_CS, so
+ * CS were. The kernel expects __USER_DS and __USER32_CS, so
* report those values even though Xen will guess its own values.
*/
- movq $__USER32_DS, 4*8(%rsp)
+ movq $__USER_DS, 4*8(%rsp)
movq $__USER32_CS, 1*8(%rsp)
jmp entry_SYSCALL_compat_after_hwframe
@@ -284,10 +284,10 @@ SYM_CODE_START(xen_entry_SYSENTER_compat)
/*
* Neither Xen nor the kernel really knows what the old SS and
- * CS were. The kernel expects __USER32_DS and __USER32_CS, so
+ * CS were. The kernel expects __USER_DS and __USER32_CS, so
* report those values even though Xen will guess its own values.
*/
- movq $__USER32_DS, 4*8(%rsp)
+ movq $__USER_DS, 4*8(%rsp)
movq $__USER32_CS, 1*8(%rsp)
jmp entry_SYSENTER_compat_after_hwframe