| field | value | timestamp |
|---|---|---|
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-02 11:04:37 -0800 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-02 11:04:37 -0800 |
| commit | a61288200e8b6f42bff116508dc72ebcc206f10a | |
| tree | a959547d2ce66db015808d7b2df4fee754d4cab8 /arch/x86/kernel/cpu/mce/core.c | |
| parent | 49219bba0149157774b7091c3ea9ad22b2114285 | |
| parent | 5c4663ed1eac01987a1421f059380db48ab7b1a3 | |
Merge tag 'ras_core_for_v6.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 RAS updates from Borislav Petkov:
- The second part of the AMD MCA interrupts rework, now that the
  last-minute show-stopper from the last merge window has been sorted
  out. With this, the AMD MCA deferred error, thresholding, and
  corrected error interrupt handlers use common MCA code and are
  tightly integrated into the core MCA code, getting rid of
  considerable duplication. All of this culminates in CMCI error
  thresholding storms being detectable on AMD too, using the common
  infrastructure (a sketch of the storm-detection idea follows this
  list)
- Add support for two new MCA bank bits on AMD Zen6 which denote
  whether the logged error address is a system physical address,
  obviating the need to translate it before further error recovery
  can proceed
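
The storm handling mentioned in the first item is, at its core, rate limiting: when a bank raises too many corrected-error interrupts per polling interval, the interrupt is masked and the bank falls back to polling until things quiet down. Below is a minimal, self-contained sketch of that idea; the names, thresholds, and structure are illustrative assumptions for this example, not the kernel's actual implementation.

```c
/*
 * Illustrative sketch only: STORM_THRESHOLD, STORM_END_POLLS, and
 * struct bank_storm_state are assumptions, not kernel definitions.
 */
#include <stdbool.h>

#define STORM_THRESHOLD 15 /* corrected errors per interval that count as a storm */
#define STORM_END_POLLS 30 /* quiet intervals required before leaving storm mode */

struct bank_storm_state {
	unsigned int err_count;   /* corrected errors seen this interval */
	unsigned int quiet_polls; /* consecutive quiet intervals in storm mode */
	bool in_storm;            /* true: interrupt masked, bank is polled */
};

/* Called once per polling interval for each MCA bank. */
void storm_tick(struct bank_storm_state *s)
{
	if (!s->in_storm) {
		/* Interrupt flood detected: mask the threshold interrupt
		 * and switch the bank over to polling. */
		if (s->err_count >= STORM_THRESHOLD)
			s->in_storm = true;
	} else {
		/* Count quiet intervals; unmask the interrupt again once
		 * the storm has subsided for long enough. */
		if (s->err_count < STORM_THRESHOLD)
			s->quiet_polls++;
		else
			s->quiet_polls = 0;

		if (s->quiet_polls >= STORM_END_POLLS) {
			s->in_storm = false;
			s->quiet_polls = 0;
		}
	}
	s->err_count = 0;
}
```

The point of making this logic common, per the summary above, is that the same detection state machine can serve both Intel CMCI and the reworked AMD threshold interrupt path.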
* tag 'ras_core_for_v6.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mce: Handle AMD threshold interrupt storms
x86/mce: Do not clear bank's poll bit in mce_poll_banks on AMD SMCA systems
x86/mce: Add support for physical address valid bit
x86/mce: Save and use APEI corrected threshold limit
x86/mce/amd: Define threshold restart function for banks
x86/mce/amd: Remove redundant reset_block()
x86/mce/amd: Support SMCA Corrected Error Interrupt
x86/mce/amd: Enable interrupt vectors once per-CPU on SMCA systems
x86/mce: Unify AMD DFR handler with MCA Polling
x86/mce: Unify AMD THR handler with MCA Polling
Diffstat (limited to 'arch/x86/kernel/cpu/mce/core.c')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | arch/x86/kernel/cpu/mce/core.c | 31 |

1 file changed, 30 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 460e90a1a0b1..4aff14e04287 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -687,7 +687,10 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
 		m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC));
 
 	if (m->status & MCI_STATUS_ADDRV) {
-		m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR));
+		if (m->kflags & MCE_CHECK_DFR_REGS)
+			m->addr = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DEADDR(i));
+		else
+			m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR));
 
 		/*
 		 * Mask the reported address by the reported granularity.
@@ -715,6 +718,29 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
 DEFINE_PER_CPU(unsigned, mce_poll_count);
 
 /*
+ * We have three scenarios for checking for Deferred errors:
+ *
+ * 1) Non-SMCA systems check MCA_STATUS and log error if found.
+ * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
+ *    clear MCA_DESTAT.
+ * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
+ *    log it.
+ */
+static bool smca_should_log_poll_error(struct mce *m)
+{
+	if (m->status & MCI_STATUS_VAL)
+		return true;
+
+	m->status = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank));
+	if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED)) {
+		m->kflags |= MCE_CHECK_DFR_REGS;
+		return true;
+	}
+
+	return false;
+}
+
+/*
  * Newer Intel systems that support software error
  * recovery need to make additional checks. Other
  * CPUs should skip over uncorrected errors, but log
@@ -740,6 +766,9 @@ static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err)
 {
 	struct mce *m = &err->m;
 
+	if (mce_flags.smca)
+		return smca_should_log_poll_error(m);
+
 	/* If this entry is not valid, ignore it. */
 	if (!(m->status & MCI_STATUS_VAL))
 		return false;
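
The two bits the new smca_should_log_poll_error() helper tests are plain masks in the 64-bit MCA_STATUS/MCA_DESTAT register layout: MCI_STATUS_VAL is bit 63 and, on AMD, MCI_STATUS_DEFERRED is bit 44 (see arch/x86/include/asm/mce.h). The standalone snippet below, an illustrative userspace sketch rather than kernel code, decodes a raw status value the same way the helper does:

```c
#include <stdint.h>
#include <stdio.h>

/* Bit positions as defined in arch/x86/include/asm/mce.h. */
#define MCI_STATUS_VAL      (1ULL << 63) /* register contains valid data */
#define MCI_STATUS_DEFERRED (1ULL << 44) /* AMD: deferred error logged */

int main(void)
{
	/* Example raw value: what MCA_DESTAT might report for a deferred error. */
	uint64_t status = MCI_STATUS_VAL | MCI_STATUS_DEFERRED;

	if (!(status & MCI_STATUS_VAL))
		puts("no valid error logged");
	else if (status & MCI_STATUS_DEFERRED)
		puts("valid deferred error (the helper would set MCE_CHECK_DFR_REGS)");
	else
		puts("valid, non-deferred error");

	return 0;
}
```

Setting MCE_CHECK_DFR_REGS in m->kflags is what later steers mce_read_aux() in the first hunk to read the deferred-error address from MCA_DEADDR instead of MCA_ADDR.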