commit     a61288200e8b6f42bff116508dc72ebcc206f10a
tree       a959547d2ce66db015808d7b2df4fee754d4cab8
parent     49219bba0149157774b7091c3ea9ad22b2114285
parent     5c4663ed1eac01987a1421f059380db48ab7b1a3
author     Linus Torvalds <torvalds@linux-foundation.org>  2025-12-02 11:04:37 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2025-12-02 11:04:37 -0800
Merge tag 'ras_core_for_v6.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 RAS updates from Borislav Petkov:

 - The second part of the AMD MCA interrupts rework after the
   last-minute show-stopper from the last merge window was sorted out.

   After this, the AMD MCA deferred errors, thresholding and corrected
   errors interrupt handlers use common MCA code and are tightly
   integrated into the core MCA code, thereby getting rid of
   considerable duplication.

   All culminating into allowing CMCI error thresholding storms to be
   detected at AMD too, using the common infrastructure

 - Add support for two new MCA bank bits on AMD Zen6 which denote
   whether the error address logged is a system physical address, which
   obviates the need for it to be translated before further error
   recovery can be done

* tag 'ras_core_for_v6.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce: Handle AMD threshold interrupt storms
  x86/mce: Do not clear bank's poll bit in mce_poll_banks on AMD SMCA systems
  x86/mce: Add support for physical address valid bit
  x86/mce: Save and use APEI corrected threshold limit
  x86/mce/amd: Define threshold restart function for banks
  x86/mce/amd: Remove redundant reset_block()
  x86/mce/amd: Support SMCA Corrected Error Interrupt
  x86/mce/amd: Enable interrupt vectors once per-CPU on SMCA systems
  x86/mce: Unify AMD DFR handler with MCA Polling
  x86/mce: Unify AMD THR handler with MCA Polling
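The second bullet describes the hardware telling software that a logged
address already is a system physical address (SPA). As a rough,
stand-alone illustration of why such a bank bit helps, the sketch below
skips the translation step when a (hypothetical) STATUS_ADDR_IS_SPA bit
is set; every name, bit position and helper here is invented for the
example and is not the kernel's implementation:

    /*
     * Illustration only: the kind of check an "address is a SPA" bank
     * bit enables. Bit names/positions and translate_to_spa() are
     * invented stand-ins; the real definitions live in the kernel.
     */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define STATUS_ADDRV        (1ULL << 58)  /* a valid address was logged     */
    #define STATUS_ADDR_IS_SPA  (1ULL << 57)  /* hypothetical: address is a SPA */

    /* Hypothetical stand-in for the raw-address-to-SPA translation step. */
    static uint64_t translate_to_spa(uint64_t raw)
    {
            return raw ^ 0x1000;
    }

    /* Pick the address to hand to error recovery (e.g. page offlining). */
    static bool recovery_addr(uint64_t status, uint64_t addr, uint64_t *spa)
    {
            if (!(status & STATUS_ADDRV))
                    return false;                  /* nothing usable logged */

            if (status & STATUS_ADDR_IS_SPA)
                    *spa = addr;                   /* usable as-is: no translation */
            else
                    *spa = translate_to_spa(addr); /* legacy banks: translate first */

            return true;
    }

    int main(void)
    {
            uint64_t spa;

            if (recovery_addr(STATUS_ADDRV | STATUS_ADDR_IS_SPA, 0xabc000, &spa))
                    printf("recover at %#llx\n", (unsigned long long)spa);

            return 0;
    }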
Diffstat (limited to 'arch/x86/kernel/cpu/mce/core.c')
-rw-r--r--  arch/x86/kernel/cpu/mce/core.c  31
1 file changed, 30 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 460e90a1a0b1..4aff14e04287 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -687,7 +687,10 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
 		m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC));
 
 	if (m->status & MCI_STATUS_ADDRV) {
-		m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR));
+		if (m->kflags & MCE_CHECK_DFR_REGS)
+			m->addr = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DEADDR(i));
+		else
+			m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR));
 
 		/*
 		 * Mask the reported address by the reported granularity.
@@ -715,6 +718,29 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
 DEFINE_PER_CPU(unsigned, mce_poll_count);
 
 /*
+ * We have three scenarios for checking for Deferred errors:
+ *
+ * 1) Non-SMCA systems check MCA_STATUS and log error if found.
+ * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
+ *    clear MCA_DESTAT.
+ * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
+ *    log it.
+ */
+static bool smca_should_log_poll_error(struct mce *m)
+{
+	if (m->status & MCI_STATUS_VAL)
+		return true;
+
+	m->status = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank));
+	if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED)) {
+		m->kflags |= MCE_CHECK_DFR_REGS;
+		return true;
+	}
+
+	return false;
+}
+
+/*
  * Newer Intel systems that support software error
  * recovery need to make additional checks. Other
  * CPUs should skip over uncorrected errors, but log
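The three scenarios in the new comment can be exercised outside the
kernel with a small model. The struct, bit positions and helper names
below are stand-ins invented for this sketch (and scenario 2's
MCA_DESTAT-clearing side effect is left out); only the decision
structure mirrors smca_should_log_poll_error() above:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define VAL      (1ULL << 63)
    #define DEFERRED (1ULL << 44)

    struct bank { uint64_t status, destat; };  /* fake MCA_STATUS / MCA_DESTAT */

    /* Mirrors the decision structure of smca_should_log_poll_error(). */
    static bool should_log(const struct bank *b, bool *use_dfr_regs)
    {
            *use_dfr_regs = false;

            if (b->status & VAL)
                    return true;               /* scenario 2: found in MCA_STATUS */

            if ((b->destat & VAL) && (b->destat & DEFERRED)) {
                    *use_dfr_regs = true;      /* scenario 3: only in MCA_DESTAT */
                    return true;
            }

            return false;                      /* nothing to log */
    }

    int main(void)
    {
            struct bank cases[] = {
                    { .status = VAL },            /* found in MCA_STATUS */
                    { .destat = VAL | DEFERRED }, /* only in MCA_DESTAT  */
                    { 0 },                        /* nothing to log      */
            };

            for (int i = 0; i < 3; i++) {
                    bool dfr;
                    bool log = should_log(&cases[i], &dfr);

                    printf("case %d: log=%d dfr=%d\n", i, log, dfr);
            }

            return 0;
    }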
@@ -740,6 +766,9 @@ static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err)
 {
 	struct mce *m = &err->m;
 
+	if (mce_flags.smca)
+		return smca_should_log_poll_error(m);
+
 	/* If this entry is not valid, ignore it. */
 	if (!(m->status & MCI_STATUS_VAL))
 		return false;
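Putting the hunks together: the polling path records where the error was
found (MCE_CHECK_DFR_REGS), and mce_read_aux() later keys the address
source off that flag, reading MCA_DEADDR instead of MCA_ADDR. A
compressed model of that producer/consumer pairing, with the register
names and the read_reg() helper invented for the sketch:

    #include <stdint.h>
    #include <stdio.h>

    enum { REG_ADDR, REG_DEADDR };

    /* Stand-in for mce_rdmsrq(); returns a canned value per register. */
    static uint64_t read_reg(int reg)
    {
            return reg == REG_DEADDR ? 0xdeadbeef000ULL : 0xabc000ULL;
    }

    /*
     * Consumer side: mirrors the new branch in mce_read_aux(). When the
     * poller found the error only in MCA_DESTAT (check_dfr set), the
     * address must come from the deferred-error address register.
     */
    static uint64_t read_error_addr(int check_dfr)
    {
            return read_reg(check_dfr ? REG_DEADDR : REG_ADDR);
    }

    int main(void)
    {
            printf("normal: %#llx, deferred: %#llx\n",
                   (unsigned long long)read_error_addr(0),
                   (unsigned long long)read_error_addr(1));
            return 0;
    }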