Add F10h decoding support Signed-off-by: Borislav Petkov --- amd.c | 488 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- amd.h | 42 ++++- mcelog.c | 26 +-- mcelog.h | 1 4 files changed, 506 insertions(+), 51 deletions(-) Index: mcelog-198/amd.c =================================================================== --- mcelog-198.orig/amd.c +++ mcelog-198/amd.c @@ -14,7 +14,7 @@ #include "mcelog.h" #include "amd.h" -static char *k8bank[] = { +static const char * const k8bank[] = { "data cache", "instruction cache", "bus unit", @@ -22,28 +22,34 @@ static char *k8bank[] = { "northbridge", "fixed-issue reoder" }; -static char *transaction[] = { +static const char * const transaction[] = { "instruction", "data", "generic", "reserved" -}; -static char *cachelevel[] = { +}; +static const char * const cachelevel[] = { "0", "1", "2", "generic" }; -static char *memtrans[] = { +static const char * const memtrans[] = { "generic error", "generic read", "generic write", "data read", "data write", "instruction fetch", "prefetch", "evict", "snoop", "?", "?", "?", "?", "?", "?", "?" 
}; -static char *partproc[] = { - "local node origin", "local node response", - "local node observed", "generic participation" +static const char * const partproc[] = { + "local node origin", + "local node response", + "local node observed", + "generic participation" }; -static char *timeout[] = { +static const char * const timeout[] = { "request didn't time out", "request timed out" }; -static char *memoryio[] = { +static const char * const memoryio[] = { "memory", "res.", "i/o", "generic" }; + +/* internal error type */ +static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" }; + static char *nbextendederr[] = { "RAM ECC error", "CRC error", @@ -65,6 +71,46 @@ static char *nbextendederr[] = { "L3 Cache Tag Error", "L3 Cache LRU Error" }; + +static const char * const mc4_mce_desc[] = { + "DRAM ECC error detected on the NB", + "CRC error detected on HT link", + "Link-defined sync error packets detected on HT link", + "HT Master abort", + "HT Target abort", + "Invalid GART PTE entry during GART table walk", + "Unsupported atomic RMW received from an IO link", + "Watchdog timeout due to lack of progress", + "DRAM ECC error detected on the NB", + "SVM DMA Exclusion Vector error", + "HT data error detected on link", + "Protocol error (link, L3, probe filter)", + "NB internal arrays parity error", + "DRAM addr/ctl signals parity error", + "IO link transmission error", + "L3 data cache ECC error", /* xec = 0x1c */ + "L3 cache tag error", + "L3 LRU parity bits error", + "ECC Error in the Probe Filter directory" +}; + +static const char * const mc5_mce_desc[] = { + "CPU Watchdog timer expire", + "Wakeup array dest tag", + "AG payload array", + "EX payload array", + "IDRF array", + "Retire dispatch queue", + "Mapper checkpoint array", + "Physical register file EX0 port", + "Physical register file EX1 port", + "Physical register file AG0 port", + "Physical register file AG1 port", + "Flag register file", + "DE error occurred", + "Retire status queue" +}; + 
static char *highbits[32] = { [31] = "valid", [30] = "error overflow (multiple errors)", @@ -100,6 +146,21 @@ static char *k8threshold[] = { "Unknown threshold counter", }; +static u8 xec_mask = 0xf; + +enum cputype select_amd_cputype(u32 family) +{ + switch (family) { + case 0xf: + return CPU_K8; + case 0x10: + return CPU_F10H; + default: + break; + } + + return CPU_GENERIC; +} static void decode_k8_generic_errcode(u64 status) { @@ -245,21 +306,393 @@ static decoder_t decoders[] = { [5] = decode_k8_fr_mc, }; -void decode_amd_mc(enum cputype cpu, struct mce *mce, int *ismemerr) +static bool k8_mc1_mce(u16 ec, u8 xec) +{ + u8 ll = LL(ec); + bool ret = true; + + if (!MEM_ERROR(ec)) + return false; + + if (ll == 0x2) + Wprintf("during a linefill from L2.\n"); + else if (ll == 0x1) { + switch (R4(ec)) { + case R4_IRD: + Wprintf("Parity error during data load.\n"); + break; + + case R4_EVICT: + Wprintf("Copyback Parity/Victim error.\n"); + break; + + case R4_SNOOP: + Wprintf("Tag Snoop error.\n"); + break; + + default: + ret = false; + break; + } + } else + ret = false; + + return ret; +} + +static bool f12h_mc0_mce(u16 ec, u8 xec) +{ + bool ret = false; + + if (MEM_ERROR(ec)) { + u8 ll = LL(ec); + ret = true; + + if (ll == LL_L2) + Wprintf("during L1 linefill from L2.\n"); + else if (ll == LL_L1) + Wprintf("Data/Tag %s error.\n", R4_MSG(ec)); + else + ret = false; + } + return ret; +} + +static bool f10h_mc0_mce(u16 ec, u8 xec) +{ + if (R4(ec) == R4_GEN && LL(ec) == LL_L1) { + Wprintf("during data scrub.\n"); + return true; + } + return f12h_mc0_mce(ec, xec); +} + +static void decode_mc0_mce(struct amd_decoder_ops *ops, struct mce *m) +{ + u16 ec = EC(m->status); + u8 xec = XEC(m->status, xec_mask); + + Wprintf(" MC0 Error: "); + + /* TLB error signatures are the same across families */ + if (TLB_ERROR(ec)) { + if (TT(ec) == TT_DATA) { + Wprintf("%s TLB %s.\n", LL_MSG(ec), + ((xec == 2) ? "locked miss" + : (xec ? 
"multimatch" : "parity"))); + return; + } + } else if (ops->mc0_mce(ec, xec)) + ; + else + Eprintf("Corrupted MC0 MCE info?\n"); +} + +static void decode_mc1_mce(struct amd_decoder_ops *ops, struct mce *m) { - if (mce->bank < NELE(decoders)) - decoders[mce->bank](mce->status, ismemerr); - else if (mce->bank >= K8_MCE_THRESHOLD_BASE && - mce->bank < K8_MCE_THRESHOLD_TOP) - decode_k8_threshold(mce->misc); + u16 ec = EC(m->status); + u8 xec = XEC(m->status, xec_mask); + + Wprintf(" MC1 Error: "); + + if (TLB_ERROR(ec)) + Wprintf("%s TLB %s.\n", LL_MSG(ec), + (xec ? "multimatch" : "parity error")); + else if (BUS_ERROR(ec)) { + bool k8 = ((ops->cpu == AMD_K8) && (m->status & BIT_64(58))); + + Wprintf("during %s.\n", (k8 ? "system linefill" : "NB data read")); + } else if (ops->mc1_mce(ec, xec)) + ; else - Wprintf(" no decoder for unknown bank %u\n", mce->bank); + Eprintf("Corrupted MC1 MCE info?\n"); +} + +static bool k8_mc2_mce(u16 ec, u8 xec) +{ + bool ret = true; + + if (xec == 0x1) + Wprintf(" in the write data buffers.\n"); + else if (xec == 0x3) + Wprintf(" in the victim data buffers.\n"); + else if (xec == 0x2 && MEM_ERROR(ec)) + Wprintf(": %s error in the L2 cache tags.\n", R4_MSG(ec)); + else if (xec == 0x0) { + if (TLB_ERROR(ec)) + Wprintf(": %s error in a Page Descriptor Cache or " + "Guest TLB.\n", TT_MSG(ec)); + else if (BUS_ERROR(ec)) + Wprintf(": %s/ECC error in data read from NB: %s.\n", + R4_MSG(ec), PP_MSG(ec)); + else if (MEM_ERROR(ec)) { + u8 r4 = R4(ec); + + if (r4 >= 0x7) + Wprintf(": %s error during data copyback.\n", + R4_MSG(ec)); + else if (r4 <= 0x1) + Wprintf(": %s parity/ECC error during data " + "access from L2.\n", R4_MSG(ec)); + else + ret = false; + } else + ret = false; + } else + ret = false; + + return ret; +} + +static void decode_mc2_mce(struct amd_decoder_ops *ops, struct mce *m) +{ + u16 ec = EC(m->status); + u8 xec = XEC(m->status, xec_mask); + + Wprintf(" MC2 Error: "); + + if (!ops->mc2_mce(ec, xec)) + Eprintf("Corrupted MC2 
MCE info?\n"); +} + +static void decode_mc3_mce(struct amd_decoder_ops *ops, struct mce *m) +{ + u16 ec = EC(m->status); + u8 xec = XEC(m->status, xec_mask); + + if (ops->cpu >= AMD_F14H) { + Eprintf("You shouldn't be seeing MC3 MCE on this cpu family," + " please report on LKML.\n"); + return; + } + + Wprintf(" MC3 Error"); + + if (xec == 0x0) { + u8 r4 = R4(ec); + + if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR)) + goto wrong_mc3_mce; + + Wprintf(" during %s.\n", R4_MSG(ec)); + } else + goto wrong_mc3_mce; + + return; + +wrong_mc3_mce: + Eprintf("Corrupted MC3 MCE info?\n"); +} + +static void decode_mc4_mce(struct amd_decoder_ops *ops, struct mce *m) +{ + u16 ec = EC(m->status); + u8 xec = XEC(m->status, 0x1f); + u8 offset = 0; + + Wprintf(" MC4 Error: "); + + switch (xec) { + case 0x0 ... 0xe: + + /* special handling for DRAM ECCs */ + if (xec == 0x0 || xec == 0x8) { + /* no ECCs on F11h */ + if (ops->cpu == AMD_F11H) + goto wrong_mc4_mce; + + Wprintf("%s.\n", mc4_mce_desc[xec]); + return; + } + break; + + case 0xf: + if (TLB_ERROR(ec)) + Wprintf("GART Table Walk data error.\n"); + else if (BUS_ERROR(ec)) + Wprintf("DMA Exclusion Vector Table Walk error.\n"); + else + goto wrong_mc4_mce; + return; + + case 0x19: + if (ops->cpu >= AMD_F15H && ops->cpu <= AMD_F16H) + Wprintf("Compute Unit Data Error.\n"); + else + goto wrong_mc4_mce; + return; + + case 0x1c ... 
0x1f: + offset = 13; + break; + + default: + goto wrong_mc4_mce; + } + + Wprintf("%s.\n", mc4_mce_desc[xec - offset]); + return; + + wrong_mc4_mce: + Eprintf("Corrupted MC4 MCE info?\n"); +} + +static void decode_mc5_mce(struct amd_decoder_ops *ops, struct mce *m) +{ + u8 xec = XEC(m->status, xec_mask); + + if (ops->cpu == AMD_K8 || ops->cpu == AMD_F11H) + goto wrong_mc5_mce; + + Wprintf(" MC5 Error: "); + + if (xec == 0x0 || xec == 0xc) + Wprintf("%s.\n", mc5_mce_desc[xec]); + else if (xec <= 0xd) + Wprintf("%s parity error.\n", mc5_mce_desc[xec]); + else + goto wrong_mc5_mce; + + return; + + wrong_mc5_mce: + Eprintf("Corrupted MC5 MCE info?\n"); +} + +static void decode_mc6_mce(struct mce *m) +{ + u8 xec = XEC(m->status, xec_mask); + + Wprintf(" MC6 Error: "); + + switch (xec) { + case 0x1: + Wprintf("Free List"); + break; + + case 0x2: + Wprintf("Physical Register File"); + break; + + case 0x3: + Wprintf("Retire Queue"); + break; + + case 0x4: + Wprintf("Scheduler table"); + break; + + case 0x5: + Wprintf("Status Register File"); + break; + + default: + goto wrong_mc6_mce; + break; + } + + Wprintf(" parity error.\n"); + + return; + + wrong_mc6_mce: + Eprintf("Corrupted MC6 MCE info?\n"); +} + +static inline void amd_decode_err_code(u16 ec) +{ + if (INT_ERROR(ec)) { + Wprintf(" internal: %s\n", UU_MSG(ec)); + return; + } + + Wprintf(" cache level: %s", LL_MSG(ec)); + + if (BUS_ERROR(ec)) + Wprintf(", mem/io: %s", II_MSG(ec)); + else + Wprintf(", tx: %s", TT_MSG(ec)); + + if (MEM_ERROR(ec) || BUS_ERROR(ec)) { + Wprintf(", mem-tx: %s", R4_MSG(ec)); + + if (BUS_ERROR(ec)) + Wprintf(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec)); + } + + Wprintf("\n"); +} + +struct amd_decoder_ops fam_ops[] = { + [AMD_F10H] = { + .cpu = AMD_F10H, + .mc0_mce = f10h_mc0_mce, + .mc1_mce = k8_mc1_mce, + .mc2_mce = k8_mc2_mce, + }, +}; + +static void __decode_amd_mc(enum cputype cpu, struct mce *mce) +{ + struct amd_decoder_ops *ops; + + switch (cpu) { + case CPU_F10H: + ops = 
&fam_ops[AMD_F10H]; + break; + default: + Eprintf("Huh? What family is it: 0x%x?!\n", cpu); + return; + break; + } + + switch (mce->bank) { + case 0: + decode_mc0_mce(ops, mce); + break; + case 1: + decode_mc1_mce(ops, mce); + break; + case 2: + decode_mc2_mce(ops, mce); + break; + case 3: + decode_mc3_mce(ops, mce); + break; + case 4: + decode_mc4_mce(ops, mce); + break; + case 5: + decode_mc5_mce(ops, mce); + break; + case 6: + decode_mc6_mce(mce); + break; + + default: + break; + } + amd_decode_err_code(mce->status & 0xffff); +} + +void decode_amd_mc(enum cputype cpu, struct mce *mce, int *ismemerr) +{ + if (cpu == CPU_K8) { + if (mce->bank < NELE(decoders)) + decoders[mce->bank](mce->status, ismemerr); + else if (mce->bank >= K8_MCE_THRESHOLD_BASE && + mce->bank < K8_MCE_THRESHOLD_TOP) + decode_k8_threshold(mce->misc); + else + Wprintf(" no decoder for unknown bank %u\n", mce->bank); + } else + __decode_amd_mc(cpu, mce); } char *k8_bank_name(unsigned num) { static char buf[64]; - char *s = "unknown"; + const char *s = "unknown"; if (num < NELE(k8bank)) s = k8bank[num]; else if (num >= K8_MCE_THRESHOLD_BASE && @@ -270,13 +703,16 @@ char *k8_bank_name(unsigned num) return buf; } -int mce_filter_k8(struct mce *m) -{ - /* Filter out GART errors */ - if (m->bank == 4) { - unsigned short exterrcode = (m->status >> 16) & 0x0f; - if (exterrcode == 5 && (m->status & (1ULL<<61))) +int mce_filter_amd(struct mce *m) +{ + /* + * NB GART TLB error reporting is disabled by default. 
+ */ + if (m->bank == 4) { + u8 xec = (m->status >> 16) & 0x1f; + + if (xec == 0x5 && (m->status & BIT_64(61))) return 0; - } - return 1; + } + return 1; } Index: mcelog-198/amd.h =================================================================== --- mcelog-198.orig/amd.h +++ mcelog-198/amd.h @@ -1,6 +1,25 @@ +#include <stdbool.h> + char *k8_bank_name(unsigned num); void decode_amd_mc(enum cputype, struct mce *mce, int *ismemerr); -int mce_filter_k8(struct mce *m); +int mce_filter_amd(struct mce *m); +enum cputype select_amd_cputype(u32 family); + +enum amdcpu { + AMD_K8 = 0, + AMD_F10H, + AMD_F11H, + AMD_F14H, + AMD_F15H, + AMD_F16H, +}; + +struct amd_decoder_ops { + enum amdcpu cpu; + bool (*mc0_mce)(u16, u8); + bool (*mc1_mce)(u16, u8); + bool (*mc2_mce)(u16, u8); +}; #define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */ #define K8_MCE_THRESHOLD_TOP (K8_MCE_THRESHOLD_BASE + 6 * 9) @@ -10,6 +29,8 @@ int mce_filter_k8(struct mce *m); #define K8_MCELOG_THRESHOLD_L3_CACHE (4 * 9 + 2) #define K8_MCELOG_THRESHOLD_FBDIMM (4 * 9 + 3) +#define BIT_64(n) (1ULL << (n)) + #define EC(x) ((x) & 0xffff) #define XEC(x, mask) (((x) >> 16) & mask) @@ -22,20 +43,20 @@ int mce_filter_k8(struct mce *m); #define INT_ERROR(x) (((x) & 0xF4FF) == 0x0400) #define TT(x) (((x) >> 2) & 0x3) -#define TT_MSG(x) tt_msgs[TT(x)] +#define TT_MSG(x) transaction[TT(x)] #define II(x) (((x) >> 2) & 0x3) -#define II_MSG(x) ii_msgs[II(x)] +#define II_MSG(x) memoryio[II(x)] #define LL(x) ((x) & 0x3) -#define LL_MSG(x) ll_msgs[LL(x)] +#define LL_MSG(x) cachelevel[LL(x)] #define TO(x) (((x) >> 8) & 0x1) -#define TO_MSG(x) to_msgs[TO(x)] +#define TO_MSG(x) timeout[TO(x)] #define PP(x) (((x) >> 9) & 0x3) -#define PP_MSG(x) pp_msgs[PP(x)] +#define PP_MSG(x) partproc[PP(x)] #define UU(x) (((x) >> 8) & 0x3) #define UU_MSG(x) uu_msgs[UU(x)] #define R4(x) (((x) >> 4) & 0xf) -#define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!") +#define R4_MSG(x) ((R4(x) < 9) ? 
memtrans[R4(x)] : "Wrong R4!") enum tt_ids { TT_INSTR = 0, Index: mcelog-198/mcelog.c =================================================================== --- mcelog-198.orig/mcelog.c +++ mcelog-198/mcelog.c @@ -151,8 +151,8 @@ static int mce_filter(struct mce *m, uns /* Filter out known broken MCEs */ if (cputype >= CPU_INTEL) return mce_filter_intel(m, recordlen); - else if (cputype == CPU_K8) - return mce_filter_k8(m); + else if CASE_AMD_CPUS + return mce_filter_amd(m); return 1; } @@ -283,9 +283,7 @@ static enum cputype setup_cpuid(u32 cpuv case X86_VENDOR_INTEL: return select_intel_cputype(family, model); case X86_VENDOR_AMD: - if (family >= 15 && family <= 17) - return CPU_K8; - /* FALL THROUGH */ + return select_amd_cputype(family); default: Eprintf("Unknown CPU type vendor %u family %u model %u", cpuvendor, family, model); @@ -347,7 +345,7 @@ static void dump_mce(struct mce *m, unsi Wprintf("TIME %llu %s", m->time, ctime(&t)); } if CASE_AMD_CPUS - decode_amd_mc(m, &ismemerr); + decode_amd_mc(cputype, m, &ismemerr); else if (cputype >= CPU_INTEL) decode_intel_mc(m, cputype, &ismemerr, recordlen); /* else add handlers for other CPUs here */ @@ -463,14 +461,9 @@ int is_cpu_supported(void) } if (seen == ALL) { - if (!strcmp(vendor,"AuthenticAMD")) { - if (family == 15) { - cputype = CPU_K8; - } else if (family >= 16) { - Eprintf("ERROR: AMD Processor family %d: mcelog does not support this processor. Please use the edac_mce_amd module instead.\n", family); - return 0; - } - } else if (!strcmp(vendor,"HygonGenuine")) { + if (!strcmp(vendor,"AuthenticAMD")) + cputype = select_amd_cputype(family); + else if (!strcmp(vendor,"HygonGenuine")) { Eprintf("ERROR: Hygon Processor family %d: mcelog does not support this processor. Please use the edac_mce_amd module instead.\n", family); return 0; } else if (!strcmp(vendor,"GenuineIntel"))