summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/acpi/cppc.c2
-rw-r--r--arch/x86/kernel/alternative.c80
-rw-r--r--arch/x86/kernel/amd_node.c150
-rw-r--r--arch/x86/kernel/apic/apic.c15
-rw-r--r--arch/x86/kernel/apic/io_apic.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c20
-rw-r--r--arch/x86/kernel/cpu/common.c6
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c23
-rw-r--r--arch/x86/kernel/dumpstack.c23
-rw-r--r--arch/x86/kernel/fpu/core.c3
-rw-r--r--arch/x86/kernel/ftrace_64.S8
-rw-r--r--arch/x86/kernel/kprobes/core.c3
-rw-r--r--arch/x86/kernel/kprobes/opt.c4
-rw-r--r--arch/x86/kernel/module.c15
-rw-r--r--arch/x86/kernel/smpboot.c78
-rw-r--r--arch/x86/kernel/static_call.c13
-rw-r--r--arch/x86/kernel/traps.c119
-rw-r--r--arch/x86/kernel/uprobes.c70
18 files changed, 409 insertions, 225 deletions
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index 7047124490f6..d7c8ef1e354d 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -196,7 +196,7 @@ int amd_detect_prefcore(bool *detected)
break;
}
- for_each_present_cpu(cpu) {
+ for_each_online_cpu(cpu) {
u32 tmp;
int ret;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 8ee5ff547357..e377b06e70e3 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -9,6 +9,7 @@
#include <asm/text-patching.h>
#include <asm/insn.h>
+#include <asm/insn-eval.h>
#include <asm/ibt.h>
#include <asm/set_memory.h>
#include <asm/nmi.h>
@@ -346,25 +347,6 @@ static void add_nop(u8 *buf, unsigned int len)
}
/*
- * Matches NOP and NOPL, not any of the other possible NOPs.
- */
-static bool insn_is_nop(struct insn *insn)
-{
- /* Anything NOP, but no REP NOP */
- if (insn->opcode.bytes[0] == 0x90 &&
- (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
- return true;
-
- /* NOPL */
- if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
- return true;
-
- /* TODO: more nops */
-
- return false;
-}
-
-/*
* Find the offset of the first non-NOP instruction starting at @offset
* but no further than @len.
*/
@@ -559,7 +541,7 @@ EXPORT_SYMBOL(BUG_func);
* Rewrite the "call BUG_func" replacement to point to the target of the
* indirect pv_ops call "call *disp(%ip)".
*/
-static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
+static unsigned int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
{
void *target, *bug = &BUG_func;
s32 disp;
@@ -643,7 +625,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* order.
*/
for (a = start; a < end; a++) {
- int insn_buff_sz = 0;
+ unsigned int insn_buff_sz = 0;
/*
* In case of nested ALTERNATIVE()s the outer alternative might
@@ -683,11 +665,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
memcpy(insn_buff, replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
- if (a->flags & ALT_FLAG_DIRECT_CALL) {
+ if (a->flags & ALT_FLAG_DIRECT_CALL)
insn_buff_sz = alt_replace_call(instr, insn_buff, a);
- if (insn_buff_sz < 0)
- continue;
- }
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
insn_buff[insn_buff_sz] = 0x90;
@@ -2244,21 +2223,34 @@ int alternatives_text_reserved(void *start, void *end)
* See entry_{32,64}.S for more details.
*/
-/*
- * We define the int3_magic() function in assembly to control the calling
- * convention such that we can 'call' it from assembly.
- */
+extern void int3_selftest_asm(unsigned int *ptr);
-extern void int3_magic(unsigned int *ptr); /* defined in asm */
+asm (
+" .pushsection .init.text, \"ax\", @progbits\n"
+" .type int3_selftest_asm, @function\n"
+"int3_selftest_asm:\n"
+ ANNOTATE_NOENDBR
+ /*
+ * INT3 padded with NOP to CALL_INSN_SIZE. The INT3 triggers an
+ * exception, then the int3_exception_nb notifier emulates a call to
+ * int3_selftest_callee().
+ */
+" int3; nop; nop; nop; nop\n"
+ ASM_RET
+" .size int3_selftest_asm, . - int3_selftest_asm\n"
+" .popsection\n"
+);
+
+extern void int3_selftest_callee(unsigned int *ptr);
asm (
" .pushsection .init.text, \"ax\", @progbits\n"
-" .type int3_magic, @function\n"
-"int3_magic:\n"
+" .type int3_selftest_callee, @function\n"
+"int3_selftest_callee:\n"
ANNOTATE_NOENDBR
-" movl $1, (%" _ASM_ARG1 ")\n"
+" movl $0x1234, (%" _ASM_ARG1 ")\n"
ASM_RET
-" .size int3_magic, .-int3_magic\n"
+" .size int3_selftest_callee, . - int3_selftest_callee\n"
" .popsection\n"
);
@@ -2267,7 +2259,7 @@ extern void int3_selftest_ip(void); /* defined in asm below */
static int __init
int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
- unsigned long selftest = (unsigned long)&int3_selftest_ip;
+ unsigned long selftest = (unsigned long)&int3_selftest_asm;
struct die_args *args = data;
struct pt_regs *regs = args->regs;
@@ -2282,7 +2274,7 @@ int3_exception_notify(struct notifier_block *self, unsigned long val, void *data
if (regs->ip - INT3_INSN_SIZE != selftest)
return NOTIFY_DONE;
- int3_emulate_call(regs, (unsigned long)&int3_magic);
+ int3_emulate_call(regs, (unsigned long)&int3_selftest_callee);
return NOTIFY_STOP;
}
@@ -2298,19 +2290,11 @@ static noinline void __init int3_selftest(void)
BUG_ON(register_die_notifier(&int3_exception_nb));
/*
- * Basically: int3_magic(&val); but really complicated :-)
- *
- * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
- * notifier above will emulate CALL for us.
+ * Basically: int3_selftest_callee(&val); but really complicated :-)
*/
- asm volatile ("int3_selftest_ip:\n\t"
- ANNOTATE_NOENDBR
- " int3; nop; nop; nop; nop\n\t"
- : ASM_CALL_CONSTRAINT
- : __ASM_SEL_RAW(a, D) (&val)
- : "memory");
-
- BUG_ON(val != 1);
+ int3_selftest_asm(&val);
+
+ BUG_ON(val != 0x1234);
unregister_die_notifier(&int3_exception_nb);
}
diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c
index a40176b62eb5..3d0a4768d603 100644
--- a/arch/x86/kernel/amd_node.c
+++ b/arch/x86/kernel/amd_node.c
@@ -34,62 +34,6 @@ struct pci_dev *amd_node_get_func(u16 node, u8 func)
return pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(AMD_NODE0_PCI_SLOT + node, func));
}
-#define DF_BLK_INST_CNT 0x040
-#define DF_CFG_ADDR_CNTL_LEGACY 0x084
-#define DF_CFG_ADDR_CNTL_DF4 0xC04
-
-#define DF_MAJOR_REVISION GENMASK(27, 24)
-
-static u16 get_cfg_addr_cntl_offset(struct pci_dev *df_f0)
-{
- u32 reg;
-
- /*
- * Revision fields added for DF4 and later.
- *
- * Major revision of '0' is found pre-DF4. Field is Read-as-Zero.
- */
- if (pci_read_config_dword(df_f0, DF_BLK_INST_CNT, &reg))
- return 0;
-
- if (reg & DF_MAJOR_REVISION)
- return DF_CFG_ADDR_CNTL_DF4;
-
- return DF_CFG_ADDR_CNTL_LEGACY;
-}
-
-struct pci_dev *amd_node_get_root(u16 node)
-{
- struct pci_dev *root;
- u16 cntl_off;
- u8 bus;
-
- if (!cpu_feature_enabled(X86_FEATURE_ZEN))
- return NULL;
-
- /*
- * D18F0xXXX [Config Address Control] (DF::CfgAddressCntl)
- * Bits [7:0] (SecBusNum) holds the bus number of the root device for
- * this Data Fabric instance. The segment, device, and function will be 0.
- */
- struct pci_dev *df_f0 __free(pci_dev_put) = amd_node_get_func(node, 0);
- if (!df_f0)
- return NULL;
-
- cntl_off = get_cfg_addr_cntl_offset(df_f0);
- if (!cntl_off)
- return NULL;
-
- if (pci_read_config_byte(df_f0, cntl_off, &bus))
- return NULL;
-
- /* Grab the pointer for the actual root device instance. */
- root = pci_get_domain_bus_and_slot(0, bus, 0);
-
- pci_dbg(root, "is root for AMD node %u\n", node);
- return root;
-}
-
static struct pci_dev **amd_roots;
/* Protect the PCI config register pairs used for SMN. */
@@ -274,51 +218,21 @@ DEFINE_SHOW_STORE_ATTRIBUTE(smn_node);
DEFINE_SHOW_STORE_ATTRIBUTE(smn_address);
DEFINE_SHOW_STORE_ATTRIBUTE(smn_value);
-static int amd_cache_roots(void)
-{
- u16 node, num_nodes = amd_num_nodes();
-
- amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL);
- if (!amd_roots)
- return -ENOMEM;
-
- for (node = 0; node < num_nodes; node++)
- amd_roots[node] = amd_node_get_root(node);
-
- return 0;
-}
-
-static int reserve_root_config_spaces(void)
+static struct pci_dev *get_next_root(struct pci_dev *root)
{
- struct pci_dev *root = NULL;
- struct pci_bus *bus = NULL;
-
- while ((bus = pci_find_next_bus(bus))) {
- /* Root device is Device 0 Function 0 on each Primary Bus. */
- root = pci_get_slot(bus, 0);
- if (!root)
+ while ((root = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, root))) {
+ /* Root device is Device 0 Function 0. */
+ if (root->devfn)
continue;
if (root->vendor != PCI_VENDOR_ID_AMD &&
root->vendor != PCI_VENDOR_ID_HYGON)
continue;
- pci_dbg(root, "Reserving PCI config space\n");
-
- /*
- * There are a few SMN index/data pairs and other registers
- * that shouldn't be accessed by user space.
- * So reserve the entire PCI config space for simplicity rather
- * than covering specific registers piecemeal.
- */
- if (!pci_request_config_region_exclusive(root, 0, PCI_CFG_SPACE_SIZE, NULL)) {
- pci_err(root, "Failed to reserve config space\n");
- return -EEXIST;
- }
+ break;
}
- smn_exclusive = true;
- return 0;
+ return root;
}
static bool enable_dfs;
@@ -332,7 +246,8 @@ __setup("amd_smn_debugfs_enable", amd_smn_enable_dfs);
static int __init amd_smn_init(void)
{
- int err;
+ u16 count, num_roots, roots_per_node, node, num_nodes;
+ struct pci_dev *root;
if (!cpu_feature_enabled(X86_FEATURE_ZEN))
return 0;
@@ -342,13 +257,48 @@ static int __init amd_smn_init(void)
if (amd_roots)
return 0;
- err = amd_cache_roots();
- if (err)
- return err;
+ num_roots = 0;
+ root = NULL;
+ while ((root = get_next_root(root))) {
+ pci_dbg(root, "Reserving PCI config space\n");
- err = reserve_root_config_spaces();
- if (err)
- return err;
+ /*
+ * There are a few SMN index/data pairs and other registers
+ * that shouldn't be accessed by user space. So reserve the
+ * entire PCI config space for simplicity rather than covering
+ * specific registers piecemeal.
+ */
+ if (!pci_request_config_region_exclusive(root, 0, PCI_CFG_SPACE_SIZE, NULL)) {
+ pci_err(root, "Failed to reserve config space\n");
+ return -EEXIST;
+ }
+
+ num_roots++;
+ }
+
+ pr_debug("Found %d AMD root devices\n", num_roots);
+
+ if (!num_roots)
+ return -ENODEV;
+
+ num_nodes = amd_num_nodes();
+ amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL);
+ if (!amd_roots)
+ return -ENOMEM;
+
+ roots_per_node = num_roots / num_nodes;
+
+ count = 0;
+ node = 0;
+ root = NULL;
+ while (node < num_nodes && (root = get_next_root(root))) {
+ /* Use one root for each node and skip the rest. */
+ if (count++ % roots_per_node)
+ continue;
+
+ pci_dbg(root, "is root for AMD node %u\n", node);
+ amd_roots[node++] = root;
+ }
if (enable_dfs) {
debugfs_dir = debugfs_create_dir("amd_smn", arch_debugfs_dir);
@@ -358,6 +308,8 @@ static int __init amd_smn_init(void)
debugfs_create_file("value", 0600, debugfs_dir, NULL, &smn_value_fops);
}
+ smn_exclusive = true;
+
return 0;
}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 680d305589a3..ca1c8b70ac44 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -173,6 +173,7 @@ static struct resource lapic_resource = {
.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
};
+/* Measured in ticks per HZ. */
unsigned int lapic_timer_period = 0;
static void apic_pm_activate(void);
@@ -792,6 +793,7 @@ static int __init calibrate_APIC_clock(void)
{
struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
u64 tsc_perj = 0, tsc_start = 0;
+ long delta_tsc_khz, bus_khz;
unsigned long jif_start;
unsigned long deltaj;
long delta, deltatsc;
@@ -894,14 +896,15 @@ static int __init calibrate_APIC_clock(void)
apic_pr_verbose("..... calibration result: %u\n", lapic_timer_period);
if (boot_cpu_has(X86_FEATURE_TSC)) {
- apic_pr_verbose("..... CPU clock speed is %ld.%04ld MHz.\n",
- (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
- (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
+ delta_tsc_khz = (deltatsc * HZ) / (1000 * LAPIC_CAL_LOOPS);
+
+ apic_pr_verbose("..... CPU clock speed is %ld.%03ld MHz.\n",
+ delta_tsc_khz / 1000, delta_tsc_khz % 1000);
}
- apic_pr_verbose("..... host bus clock speed is %u.%04u MHz.\n",
- lapic_timer_period / (1000000 / HZ),
- lapic_timer_period % (1000000 / HZ));
+ bus_khz = (long)lapic_timer_period * HZ / 1000;
+ apic_pr_verbose("..... host bus clock speed is %ld.%03ld MHz.\n",
+ bus_khz / 1000, bus_khz % 1000);
/*
* Do a sanity check on the APIC calibration result
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5ba2feb2c04c..1e0442e867b1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2864,7 +2864,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
ioapic = mp_irqdomain_ioapic_idx(domain);
pin = info->ioapic.pin;
- if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0)
+ if (irq_resolve_mapping(domain, (irq_hw_number_t)pin))
return -EEXIST;
data = kzalloc(sizeof(*data), GFP_KERNEL);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ccaa51ce63f6..5d46709c58d0 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -516,7 +516,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
setup_force_cpu_cap(X86_FEATURE_ZEN5);
break;
case 0x50 ... 0x5f:
- case 0x90 ... 0xaf:
+ case 0x80 ... 0xaf:
case 0xc0 ... 0xcf:
setup_force_cpu_cap(X86_FEATURE_ZEN6);
break;
@@ -1035,8 +1035,26 @@ static void init_amd_zen4(struct cpuinfo_x86 *c)
}
}
+static const struct x86_cpu_id zen5_rdseed_microcode[] = {
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x02, 0x1, 0x0b00215a),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x08, 0x1, 0x0b008121),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x11, 0x0, 0x0b101054),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x24, 0x0, 0x0b204037),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x0, 0x0b404035),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x1, 0x0b404108),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x60, 0x0, 0x0b600037),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x68, 0x0, 0x0b608038),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x70, 0x0, 0x0b700037),
+ {},
+};
+
static void init_amd_zen5(struct cpuinfo_x86 *c)
{
+ if (!x86_match_min_microcode_rev(zen5_rdseed_microcode)) {
+ clear_cpu_cap(c, X86_FEATURE_RDSEED);
+ msr_clear_bit(MSR_AMD64_CPUID_FN_7, 18);
+ pr_emerg_once("RDSEED32 is broken. Disabling the corresponding CPUID bit.\n");
+ }
}
static void init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c7d3512914ca..02d97834a1d4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -78,6 +78,10 @@
DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
+/* Used for modules: built-in code uses runtime constants */
+unsigned long USER_PTR_MAX;
+EXPORT_SYMBOL(USER_PTR_MAX);
+
u32 elf_hwcap2 __read_mostly;
/* Number of siblings per CPU package */
@@ -2579,7 +2583,7 @@ void __init arch_cpu_finalize_init(void)
alternative_instructions();
if (IS_ENABLED(CONFIG_X86_64)) {
- unsigned long USER_PTR_MAX = TASK_SIZE_MAX;
+ USER_PTR_MAX = TASK_SIZE_MAX;
/*
* Enable this when LAM is gated on LASS support
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 28ed8c089024..a881bf4c2011 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -220,10 +220,13 @@ static bool need_sha_check(u32 cur_rev)
case 0xaa001: return cur_rev <= 0xaa00116; break;
case 0xaa002: return cur_rev <= 0xaa00218; break;
case 0xb0021: return cur_rev <= 0xb002146; break;
+ case 0xb0081: return cur_rev <= 0xb008111; break;
case 0xb1010: return cur_rev <= 0xb101046; break;
case 0xb2040: return cur_rev <= 0xb204031; break;
case 0xb4040: return cur_rev <= 0xb404031; break;
+ case 0xb4041: return cur_rev <= 0xb404101; break;
case 0xb6000: return cur_rev <= 0xb600031; break;
+ case 0xb6080: return cur_rev <= 0xb608031; break;
case 0xb7000: return cur_rev <= 0xb700031; break;
default: break;
}
@@ -233,13 +236,31 @@ static bool need_sha_check(u32 cur_rev)
return true;
}
+static bool cpu_has_entrysign(void)
+{
+ unsigned int fam = x86_family(bsp_cpuid_1_eax);
+ unsigned int model = x86_model(bsp_cpuid_1_eax);
+
+ if (fam == 0x17 || fam == 0x19)
+ return true;
+
+ if (fam == 0x1a) {
+ if (model <= 0x2f ||
+ (0x40 <= model && model <= 0x4f) ||
+ (0x60 <= model && model <= 0x6f))
+ return true;
+ }
+
+ return false;
+}
+
static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsigned int len)
{
struct patch_digest *pd = NULL;
u8 digest[SHA256_DIGEST_SIZE];
int i;
- if (x86_family(bsp_cpuid_1_eax) < 0x17)
+ if (!cpu_has_entrysign())
return true;
if (!need_sha_check(cur_rev))
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 71ee20102a8a..b10684dedc58 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -181,8 +181,8 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
* in false positive reports. Disable instrumentation to avoid those.
*/
__no_kmsan_checks
-static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, const char *log_lvl)
+static void __show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, const char *log_lvl)
{
struct unwind_state state;
struct stack_info stack_info = {0};
@@ -303,6 +303,25 @@ next:
}
}
+static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, const char *log_lvl)
+{
+ /*
+ * Disable KASAN to avoid false positives during walking another
+ * task's stacks, as values on these stacks may change concurrently
+ * with task execution.
+ */
+ bool disable_kasan = task && task != current;
+
+ if (disable_kasan)
+ kasan_disable_current();
+
+ __show_trace_log_lvl(task, regs, stack, log_lvl);
+
+ if (disable_kasan)
+ kasan_enable_current();
+}
+
void show_stack(struct task_struct *task, unsigned long *sp,
const char *loglvl)
{
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 1f71cc135e9a..e88eacb1b5bb 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -825,6 +825,9 @@ void fpu__clear_user_states(struct fpu *fpu)
!fpregs_state_valid(fpu, smp_processor_id()))
os_xrstor_supervisor(fpu->fpstate);
+ /* Ensure XFD state is in sync before reloading XSTATE */
+ xfd_update_state(fpu->fpstate);
+
/* Reset user states in registers. */
restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE);
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 367da3638167..823dbdd0eb41 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -354,12 +354,17 @@ SYM_CODE_START(return_to_handler)
UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
+ /* Restore return_to_handler value that got eaten by previous ret instruction. */
+ subq $8, %rsp
+ UNWIND_HINT_FUNC
+
/* Save ftrace_regs for function exit context */
subq $(FRAME_SIZE), %rsp
movq %rax, RAX(%rsp)
movq %rdx, RDX(%rsp)
movq %rbp, RBP(%rsp)
+ movq %rsp, RSP(%rsp)
movq %rsp, %rdi
call ftrace_return_to_handler
@@ -368,7 +373,8 @@ SYM_CODE_START(return_to_handler)
movq RDX(%rsp), %rdx
movq RAX(%rsp), %rax
- addq $(FRAME_SIZE), %rsp
+ addq $(FRAME_SIZE) + 8, %rsp
+
/*
* Jump back to the old return address. This cannot be JMP_NOSPEC rdi
* since IBT would demand that contain ENDBR, which simply isn't so for
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 3863d7709386..c1fac3a9fecc 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -141,7 +141,6 @@ bool can_boost(struct insn *insn, void *addr)
{
kprobe_opcode_t opcode;
insn_byte_t prefix;
- int i;
if (search_exception_tables((unsigned long)addr))
return false; /* Page fault may occur on this address. */
@@ -154,7 +153,7 @@ bool can_boost(struct insn *insn, void *addr)
if (insn->opcode.nbytes != 1)
return false;
- for_each_insn_prefix(insn, i, prefix) {
+ for_each_insn_prefix(insn, prefix) {
insn_attr_t attr;
attr = inat_get_opcode_attribute(prefix);
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 0aabd4c4e2c4..6f826a00eca2 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -103,7 +103,6 @@ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
asm (
".pushsection .rodata\n"
- "optprobe_template_func:\n"
".global optprobe_template_entry\n"
"optprobe_template_entry:\n"
#ifdef CONFIG_X86_64
@@ -160,9 +159,6 @@ asm (
"optprobe_template_end:\n"
".popsection\n");
-void optprobe_template_func(void);
-STACK_FRAME_NON_STANDARD(optprobe_template_func);
-
#define TMPL_CLAC_IDX \
((long)optprobe_template_clac - (long)optprobe_template_entry)
#define TMPL_MOVE_IDX \
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 0ffbae902e2f..11c45ce42694 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -97,6 +97,7 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
DEBUGP("%s relocate section %u to %u\n",
apply ? "Applying" : "Clearing",
relsec, sechdrs[relsec].sh_info);
+
for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
size_t size;
@@ -162,15 +163,17 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
if (apply) {
if (memcmp(loc, &zero, size)) {
- pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info), loc, val);
+ pr_err("x86/modules: Invalid relocation target, existing value is nonzero for sec %u, idx %u, type %d, loc %lx, val %llx\n",
+ relsec, i, (int)ELF64_R_TYPE(rel[i].r_info),
+ (unsigned long)loc, val);
return -ENOEXEC;
}
write(loc, &val, size);
} else {
if (memcmp(loc, &val, size)) {
- pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info), loc, val);
+ pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for sec %u, idx %u, type %d, loc %lx, val %llx\n",
+ relsec, i, (int)ELF64_R_TYPE(rel[i].r_info),
+ (unsigned long)loc, val);
return -ENOEXEC;
}
write(loc, &zero, size);
@@ -179,8 +182,8 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
return 0;
overflow:
- pr_err("overflow in relocation type %d val %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info), val);
+ pr_err("overflow in relocation type %d val %llx sec %u idx %d\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), val, relsec, i);
pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
me->name);
return -ENOEXEC;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index eb289abece23..c2107047dc14 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -515,6 +515,76 @@ static void __init build_sched_topology(void)
set_sched_topology(topology);
}
+#ifdef CONFIG_NUMA
+static int sched_avg_remote_distance;
+static int avg_remote_numa_distance(void)
+{
+ int i, j;
+ int distance, nr_remote, total_distance;
+
+ if (sched_avg_remote_distance > 0)
+ return sched_avg_remote_distance;
+
+ nr_remote = 0;
+ total_distance = 0;
+ for_each_node_state(i, N_CPU) {
+ for_each_node_state(j, N_CPU) {
+ distance = node_distance(i, j);
+
+ if (distance >= REMOTE_DISTANCE) {
+ nr_remote++;
+ total_distance += distance;
+ }
+ }
+ }
+ if (nr_remote)
+ sched_avg_remote_distance = total_distance / nr_remote;
+ else
+ sched_avg_remote_distance = REMOTE_DISTANCE;
+
+ return sched_avg_remote_distance;
+}
+
+int arch_sched_node_distance(int from, int to)
+{
+ int d = node_distance(from, to);
+
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_GRANITERAPIDS_X:
+ case INTEL_ATOM_DARKMONT_X:
+
+ if (!x86_has_numa_in_package || topology_max_packages() == 1 ||
+ d < REMOTE_DISTANCE)
+ return d;
+
+ /*
+ * With SNC enabled, there could be too many levels of remote
+ * NUMA node distances, creating NUMA domain levels
+ * including local nodes and partial remote nodes.
+ *
+ * Trim finer distance tuning for NUMA nodes in remote package
+ * for the purpose of building sched domains. Group NUMA nodes
+ * in the remote package in the same sched group.
+ * Simplify NUMA domains and avoid extra NUMA levels including
+ * different remote NUMA nodes and local nodes.
+ *
+ * GNR and CWF don't expect systems with more than 2 packages
+ * and more than 2 hops between packages. Single average remote
+ * distance won't be appropriate if there are more than 2
+ * packages as average distance to different remote packages
+ * could be different.
+ */
+ WARN_ONCE(topology_max_packages() > 2,
+ "sched: Expect only up to 2 packages for GNR or CWF, "
+ "but saw %d packages when building sched domains.",
+ topology_max_packages());
+
+ d = avg_remote_numa_distance();
+ }
+ return d;
+}
+#endif /* CONFIG_NUMA */
+
void set_cpu_sibling_map(int cpu)
{
bool has_smt = __max_threads_per_core > 1;
@@ -1328,11 +1398,7 @@ void __noreturn hlt_play_dead(void)
native_halt();
}
-/*
- * native_play_dead() is essentially a __noreturn function, but it can't
- * be marked as such as the compiler may complain about it.
- */
-void native_play_dead(void)
+void __noreturn native_play_dead(void)
{
if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
__update_spec_ctrl(0);
@@ -1351,7 +1417,7 @@ int native_cpu_disable(void)
return -ENOSYS;
}
-void native_play_dead(void)
+void __noreturn native_play_dead(void)
{
BUG();
}
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 378c388d1b31..2892cdb14563 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -26,6 +26,11 @@ static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 };
static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
+/*
+ * ud1 (%edx),%rdi -- see __WARN_trap() / decode_bug()
+ */
+static const u8 warninsn[] = { 0x67, 0x48, 0x0f, 0xb9, 0x3a };
+
static u8 __is_Jcc(u8 *insn) /* Jcc.d32 */
{
u8 ret = 0;
@@ -69,7 +74,10 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
emulate = code;
code = &xor5rax;
}
-
+ if (func == &__WARN_trap) {
+ emulate = code;
+ code = &warninsn;
+ }
break;
case NOP:
@@ -128,7 +136,8 @@ static void __static_call_validate(u8 *insn, bool tail, bool tramp)
} else {
if (opcode == CALL_INSN_OPCODE ||
!memcmp(insn, x86_nops[5], 5) ||
- !memcmp(insn, xor5rax, 5))
+ !memcmp(insn, xor5rax, 5) ||
+ !memcmp(insn, warninsn, 5))
return;
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 6b22611e69cc..cb324cc1fd99 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -31,6 +31,7 @@
#include <linux/kexec.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
+#include <linux/static_call.h>
#include <linux/timer.h>
#include <linux/init.h>
#include <linux/bug.h>
@@ -102,25 +103,37 @@ __always_inline int is_valid_bugaddr(unsigned long addr)
* UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax
* UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax
* static_call: 0f b9 cc ud1 %esp,%ecx
+ * __WARN_trap: 67 48 0f b9 3a ud1 (%edx),%reg
*
- * Notably UBSAN uses EAX, static_call uses ECX.
+ * Notable, since __WARN_trap can use all registers, the distinction between
+ * UD1 users is through R/M.
*/
__always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
{
unsigned long start = addr;
+ u8 v, reg, rm, rex = 0;
+ int type = BUG_UD1;
bool lock = false;
- u8 v;
if (addr < TASK_SIZE_MAX)
return BUG_NONE;
- v = *(u8 *)(addr++);
- if (v == INSN_ASOP)
+ for (;;) {
v = *(u8 *)(addr++);
+ if (v == INSN_ASOP)
+ continue;
- if (v == INSN_LOCK) {
- lock = true;
- v = *(u8 *)(addr++);
+ if (v == INSN_LOCK) {
+ lock = true;
+ continue;
+ }
+
+ if ((v & 0xf0) == 0x40) {
+ rex = v;
+ continue;
+ }
+
+ break;
}
switch (v) {
@@ -156,18 +169,33 @@ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4)
addr++; /* SIB */
+ reg = X86_MODRM_REG(v) + 8*!!X86_REX_R(rex);
+ rm = X86_MODRM_RM(v) + 8*!!X86_REX_B(rex);
+
/* Decode immediate, if present */
switch (X86_MODRM_MOD(v)) {
case 0: if (X86_MODRM_RM(v) == 5)
- addr += 4; /* RIP + disp32 */
+ addr += 4; /* RIP + disp32 */
+
+ if (rm == 0) /* (%eax) */
+ type = BUG_UD1_UBSAN;
+
+ if (rm == 2) { /* (%edx) */
+ *imm = reg;
+ type = BUG_UD1_WARN;
+ }
break;
case 1: *imm = *(s8 *)addr;
addr += 1;
+ if (rm == 0) /* (%eax) */
+ type = BUG_UD1_UBSAN;
break;
case 2: *imm = *(s32 *)addr;
addr += 4;
+ if (rm == 0) /* (%eax) */
+ type = BUG_UD1_UBSAN;
break;
case 3: break;
@@ -176,12 +204,76 @@ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
/* record instruction length */
*len = addr - start;
- if (X86_MODRM_REG(v) == 0) /* EAX */
- return BUG_UD1_UBSAN;
+ return type;
+}
- return BUG_UD1;
+static inline unsigned long pt_regs_val(struct pt_regs *regs, int nr)
+{
+ int offset = pt_regs_offset(regs, nr);
+ if (WARN_ON_ONCE(offset < -0))
+ return 0;
+ return *((unsigned long *)((void *)regs + offset));
}
+#ifdef HAVE_ARCH_BUG_FORMAT_ARGS
+DEFINE_STATIC_CALL(WARN_trap, __WARN_trap);
+EXPORT_STATIC_CALL_TRAMP(WARN_trap);
+
+/*
+ * Create a va_list from an exception context.
+ */
+void *__warn_args(struct arch_va_list *args, struct pt_regs *regs)
+{
+ /*
+ * Register save area; populate with function call argument registers
+ */
+ args->regs[0] = regs->di;
+ args->regs[1] = regs->si;
+ args->regs[2] = regs->dx;
+ args->regs[3] = regs->cx;
+ args->regs[4] = regs->r8;
+ args->regs[5] = regs->r9;
+
+ /*
+ * From the ABI document:
+ *
+ * @gp_offset - the element holds the offset in bytes from
+ * reg_save_area to the place where the next available general purpose
+ * argument register is saved. In case all argument registers have
+ * been exhausted, it is set to the value 48 (6*8).
+ *
+ * @fp_offset - the element holds the offset in bytes from
+ * reg_save_area to the place where the next available floating point
+ * argument is saved. In case all argument registers have been
+ * exhausted, it is set to the value 176 (6*8 + 8*16)
+ *
+ * @overflow_arg_area - this pointer is used to fetch arguments passed
+ * on the stack. It is initialized with the address of the first
+ * argument passed on the stack, if any, and then always updated to
+ * point to the start of the next argument on the stack.
+ *
+ * @reg_save_area - the element points to the start of the register
+ * save area.
+ *
+ * Notably the vararg starts with the second argument and there are no
+ * floating point arguments in the kernel.
+ */
+ args->args.gp_offset = 1*8;
+ args->args.fp_offset = 6*8 + 8*16;
+ args->args.reg_save_area = &args->regs;
+ args->args.overflow_arg_area = (void *)regs->sp;
+
+ /*
+ * If the exception came from __WARN_trap, there is a return
+ * address on the stack, skip that. This is why any __WARN_trap()
+ * caller must inhibit tail-call optimization.
+ */
+ if ((void *)regs->ip == &__WARN_trap)
+ args->args.overflow_arg_area += 8;
+
+ return &args->args;
+}
+#endif /* HAVE_ARCH_BUG_FORMAT */
static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
@@ -334,6 +426,11 @@ static noinstr bool handle_bug(struct pt_regs *regs)
raw_local_irq_enable();
switch (ud_type) {
+ case BUG_UD1_WARN:
+ if (report_bug_entry((void *)pt_regs_val(regs, ud_imm), regs) == BUG_TRAP_TYPE_WARN)
+ handled = true;
+ break;
+
case BUG_UD2:
if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
handled = true;
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 845aeaf36b8d..7be8e361ca55 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -17,6 +17,7 @@
#include <linux/kdebug.h>
#include <asm/processor.h>
#include <asm/insn.h>
+#include <asm/insn-eval.h>
#include <asm/mmu_context.h>
#include <asm/nops.h>
@@ -258,9 +259,8 @@ static volatile u32 good_2byte_insns[256 / 32] = {
static bool is_prefix_bad(struct insn *insn)
{
insn_byte_t p;
- int i;
- for_each_insn_prefix(insn, i, p) {
+ for_each_insn_prefix(insn, p) {
insn_attr_t attr;
attr = inat_get_opcode_attribute(p);
@@ -1158,35 +1158,12 @@ unlock:
mmap_write_unlock(mm);
}
-static bool insn_is_nop(struct insn *insn)
-{
- return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90;
-}
-
-static bool insn_is_nopl(struct insn *insn)
-{
- if (insn->opcode.nbytes != 2)
- return false;
-
- if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f)
- return false;
-
- if (!insn->modrm.nbytes)
- return false;
-
- if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0)
- return false;
-
- /* 0f 1f /0 - NOPL */
- return true;
-}
-
static bool can_optimize(struct insn *insn, unsigned long vaddr)
{
if (!insn->x86_64 || insn->length != 5)
return false;
- if (!insn_is_nop(insn) && !insn_is_nopl(insn))
+ if (!insn_is_nop(insn))
return false;
/* We can't do cross page atomic writes yet. */
@@ -1426,19 +1403,14 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
u8 opc1 = OPCODE1(insn);
insn_byte_t p;
- int i;
- /* x86_nops[insn->length]; same as jmp with .offs = 0 */
- if (insn->length <= ASM_NOP_MAX &&
- !memcmp(insn->kaddr, x86_nops[insn->length], insn->length))
+ if (insn_is_nop(insn))
goto setup;
switch (opc1) {
case 0xeb: /* jmp 8 */
case 0xe9: /* jmp 32 */
break;
- case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */
- goto setup;
case 0xe8: /* call relative */
branch_clear_offset(auprobe, insn);
@@ -1463,7 +1435,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
* Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix.
* No one uses these insns, reject any branch insns with such prefix.
*/
- for_each_insn_prefix(insn, i, p) {
+ for_each_insn_prefix(insn, p) {
if (p == 0x66)
return -ENOTSUPP;
}
@@ -1819,3 +1791,35 @@ bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
else
return regs->sp <= ret->stack;
}
+
+/*
+ * Heuristic-based check if uprobe is installed at the function entry.
+ *
+ * Under assumption of user code being compiled with frame pointers,
+ * `push %rbp/%ebp` is a good indicator that we indeed are.
+ *
+ * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
+ * If we get this wrong, captured stack trace might have one extra bogus
+ * entry, but the rest of stack trace will still be meaningful.
+ */
+bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+ struct arch_uprobe *auprobe;
+
+ if (!current->utask)
+ return false;
+
+ auprobe = current->utask->auprobe;
+ if (!auprobe)
+ return false;
+
+ /* push %rbp/%ebp */
+ if (auprobe->insn[0] == 0x55)
+ return true;
+
+ /* endbr64 (64-bit only) */
+ if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn))
+ return true;
+
+ return false;
+}