From 4388981628ad9e2daba956210284017e1133cb99 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 7 May 2014 22:41:15 +0200 Subject: [PATCH] Start consolidating AMD-specific stuff ... in order to concentrate decoding for all families in amd.[ch]. Pass down cpu type in decode_amd_mc. Signed-off-by: Borislav Petkov --- Makefile | 2 amd.c | 282 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ amd.h | 14 +++ k8.c | 281 -------------------------------------------------------------- k8.h | 11 -- mcelog.c | 8 - 6 files changed, 301 insertions(+), 297 deletions(-) rename k8.c => amd.c (97%) rename k8.h => amd.h (79%) Index: mcelog-189/Makefile =================================================================== --- mcelog-189.orig/Makefile +++ mcelog-189/Makefile @@ -31,7 +31,7 @@ all: mcelog .PHONY: install install-nodoc clean depend FORCE -OBJ := p4.o k8.o mcelog.o dmi.o tsc.o core2.o bitfield.o intel.o \ +OBJ := p4.o amd.o mcelog.o dmi.o tsc.o core2.o bitfield.o intel.o \ nehalem.o dunnington.o tulsa.o config.o memutil.o msg.o \ eventloop.o leaky-bucket.o memdb.o server.o trigger.o \ client.o cache.o sysfs.o yellow.o page.o rbtree.o \ Index: mcelog-189/amd.c =================================================================== --- /dev/null +++ mcelog-189/amd.c @@ -0,0 +1,282 @@ +/* Based on K8 decoding code written for the 2.4 kernel by Andi Kleen and + * Eric Morton. Hacked and extended for mcelog by AK. + * Extended to support all AMD families by Borislav Petkov, SUSE Labs. + * + * Original copyright: + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Additional K8 decoding and simplification Copyright 2003 Eric Morton, Newisys Inc + * K8 threshold counters decoding Copyright 2005,2006 Jacob Shin, AMD Inc. 
+ * + * Subject to the GNU General Public License + */ + +#include <stdio.h> +#include "mcelog.h" +#include "amd.h" + +static char *k8bank[] = { + "data cache", + "instruction cache", + "bus unit", + "load/store unit", + "northbridge", + "fixed-issue reoder" +}; +static char *transaction[] = { + "instruction", "data", "generic", "reserved" +}; +static char *cachelevel[] = { + "0", "1", "2", "generic" +}; +static char *memtrans[] = { + "generic error", "generic read", "generic write", "data read", + "data write", "instruction fetch", "prefetch", "evict", "snoop", + "?", "?", "?", "?", "?", "?", "?" +}; +static char *partproc[] = { + "local node origin", "local node response", + "local node observed", "generic participation" +}; +static char *timeout[] = { + "request didn't time out", + "request timed out" +}; +static char *memoryio[] = { + "memory", "res.", "i/o", "generic" +}; +static char *nbextendederr[] = { + "RAM ECC error", + "CRC error", + "Sync error", + "Master abort", + "Target abort", + "GART error", + "RMW error", + "Watchdog error", + "RAM Chipkill ECC error", + "DEV Error", + "Link Data Error", + "Link Protocol Error", + "NB Array Error", + "DRAM Parity Error", + "Link Retry", + "Tablew Walk Data Error", + "L3 Cache Data Error", + "L3 Cache Tag Error", + "L3 Cache LRU Error" +}; +static char *highbits[32] = { + [31] = "valid", + [30] = "error overflow (multiple errors)", + [29] = "error uncorrected", + [28] = "error enable", + [27] = "misc error valid", + [26] = "error address valid", + [25] = "processor context corrupt", + [24] = "res24", + [23] = "res23", + /* 22-15 ecc syndrome bits */ + [14] = "corrected ecc error", + [13] = "uncorrected ecc error", + [12] = "res12", + [11] = "L3 subcache in error bit 1", + [10] = "L3 subcache in error bit 0", + [9] = "sublink or DRAM channel", + [8] = "error found by scrub", + /* 7-4 ht link number of error */ + [3] = "err cpu3", + [2] = "err cpu2", + [1] = "err cpu1", + [0] = "err cpu0", +}; +static char *k8threshold[] = { 
+ [0 ... K8_MCELOG_THRESHOLD_DRAM_ECC - 1] = "Unknown threshold counter", + [K8_MCELOG_THRESHOLD_DRAM_ECC] = "MC4_MISC0 DRAM threshold", + [K8_MCELOG_THRESHOLD_LINK] = "MC4_MISC1 Link threshold", + [K8_MCELOG_THRESHOLD_L3_CACHE] = "MC4_MISC2 L3 Cache threshold", + [K8_MCELOG_THRESHOLD_FBDIMM] = "MC4_MISC3 FBDIMM threshold", + [K8_MCELOG_THRESHOLD_FBDIMM + 1 ... + K8_MCE_THRESHOLD_TOP - K8_MCE_THRESHOLD_BASE - 1] = + "Unknown threshold counter", +}; + + +static void decode_k8_generic_errcode(u64 status) +{ + unsigned short errcode = status & 0xffff; + int i; + + for (i=0; i<32; i++) { + if (i==31 || i==28 || i==26) + continue; + if (highbits[i] && (status & (1ULL<<(i+32)))) { + Wprintf( " bit%d = %s\n", i+32, highbits[i]); + } + } + + if ((errcode & 0xFFF0) == 0x0010) { + Wprintf( " TLB error '%s transaction, level %s'\n", + transaction[(errcode >> 2) & 3], + cachelevel[errcode & 3]); + } + else if ((errcode & 0xFF00) == 0x0100) { + Wprintf( " memory/cache error '%s mem transaction, %s transaction, level %s'\n", + memtrans[(errcode >> 4) & 0xf], + transaction[(errcode >> 2) & 3], + cachelevel[errcode & 3]); + } + else if ((errcode & 0xF800) == 0x0800) { + Wprintf( " bus error '%s, %s\n %s mem transaction\n %s access, level %s'\n", + partproc[(errcode >> 9) & 0x3], + timeout[(errcode >> 8) & 1], + memtrans[(errcode >> 4) & 0xf], + memoryio[(errcode >> 2) & 0x3], + cachelevel[(errcode & 0x3)]); + } +} + +static void decode_k8_dc_mc(u64 status, int *err) +{ + unsigned short exterrcode = (status >> 16) & 0x0f; + unsigned short errcode = status & 0xffff; + + if(status&(3ULL<<45)) { + Wprintf( " Data cache ECC error (syndrome %x)", + (u32) (status >> 47) & 0xff); + if(status&(1ULL<<40)) { + Wprintf(" found by scrubber"); + } + Wprintf("\n"); + } + + if ((errcode & 0xFFF0) == 0x0010) { + Wprintf( " TLB parity error in %s array\n", + (exterrcode == 0) ? 
"physical" : "virtual"); + } + + decode_k8_generic_errcode(status); +} + +static void decode_k8_ic_mc(u64 status, int *err) +{ + unsigned short exterrcode = (status >> 16) & 0x0f; + unsigned short errcode = status & 0xffff; + + if(status&(3ULL<<45)) { + Wprintf(" Instruction cache ECC error\n"); + } + + if ((errcode & 0xFFF0) == 0x0010) { + Wprintf(" TLB parity error in %s array\n", + (exterrcode == 0) ? "physical" : "virtual"); + } + + decode_k8_generic_errcode(status); +} + +static void decode_k8_bu_mc(u64 status, int *err) +{ + unsigned short exterrcode = (status >> 16) & 0x0f; + + if(status&(3ULL<<45)) { + Wprintf(" L2 cache ECC error\n"); + } + + Wprintf(" %s array error\n", + (exterrcode == 0) ? "Bus or cache" : "Cache tag"); + + decode_k8_generic_errcode(status); +} + +static void decode_k8_ls_mc(u64 status, int *err) +{ + decode_k8_generic_errcode(status); +} + +static void decode_k8_nb_mc(u64 status, int *memerr) +{ + unsigned short exterrcode = (status >> 16) & 0x0f; + + Wprintf(" Northbridge %s\n", nbextendederr[exterrcode]); + + switch (exterrcode) { + case 0: + *memerr = 1; + Wprintf(" ECC syndrome = %x\n", + (u32) (status >> 47) & 0xff); + break; + case 8: + *memerr = 1; + Wprintf(" Chipkill ECC syndrome = %x\n", + (u32) ((((status >> 24) & 0xff) << 8) | ((status >> 47) & 0xff))); + break; + case 1: + case 2: + case 3: + case 4: + case 6: + Wprintf(" link number = %x\n", + (u32) (status >> 36) & 0xf); + break; + } + + decode_k8_generic_errcode(status); +} + +static void decode_k8_fr_mc(u64 status, int *err) +{ + decode_k8_generic_errcode(status); +} + +static void decode_k8_threshold(u64 misc) +{ + if (misc & MCI_THRESHOLD_OVER) + Wprintf(" Threshold error count overflow\n"); +} + +typedef void (*decoder_t)(u64, int *ismemerr); + +static decoder_t decoders[] = { + [0] = decode_k8_dc_mc, + [1] = decode_k8_ic_mc, + [2] = decode_k8_bu_mc, + [3] = decode_k8_ls_mc, + [4] = decode_k8_nb_mc, + [5] = decode_k8_fr_mc, +}; + +void decode_amd_mc(enum cputype 
cpu, struct mce *mce, int *ismemerr) +{ + if (mce->bank < NELE(decoders)) + decoders[mce->bank](mce->status, ismemerr); + else if (mce->bank >= K8_MCE_THRESHOLD_BASE && + mce->bank < K8_MCE_THRESHOLD_TOP) + decode_k8_threshold(mce->misc); + else + Wprintf(" no decoder for unknown bank %u\n", mce->bank); +} + +char *k8_bank_name(unsigned num) +{ + static char buf[64]; + char *s = "unknown"; + if (num < NELE(k8bank)) + s = k8bank[num]; + else if (num >= K8_MCE_THRESHOLD_BASE && + num < K8_MCE_THRESHOLD_TOP) + s = k8threshold[num - K8_MCE_THRESHOLD_BASE]; + buf[sizeof(buf)-1] = 0; + snprintf(buf, sizeof(buf) - 1, "%u %s", num, s); + return buf; +} + +int mce_filter_k8(struct mce *m) +{ + /* Filter out GART errors */ + if (m->bank == 4) { + unsigned short exterrcode = (m->status >> 16) & 0x0f; + if (exterrcode == 5 && (m->status & (1ULL<<61))) + return 0; + } + return 1; +} Index: mcelog-189/amd.h =================================================================== --- /dev/null +++ mcelog-189/amd.h @@ -0,0 +1,80 @@ +char *k8_bank_name(unsigned num); +void decode_amd_mc(enum cputype, struct mce *mce, int *ismemerr); +int mce_filter_k8(struct mce *m); + +#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */ +#define K8_MCE_THRESHOLD_TOP (K8_MCE_THRESHOLD_BASE + 6 * 9) + +#define K8_MCELOG_THRESHOLD_DRAM_ECC (4 * 9 + 0) +#define K8_MCELOG_THRESHOLD_LINK (4 * 9 + 1) +#define K8_MCELOG_THRESHOLD_L3_CACHE (4 * 9 + 2) +#define K8_MCELOG_THRESHOLD_FBDIMM (4 * 9 + 3) + +#define EC(x) ((x) & 0xffff) +#define XEC(x, mask) (((x) >> 16) & mask) + +#define LOW_SYNDROME(x) (((x) >> 15) & 0xff) +#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) + +#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010) +#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100) +#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800) +#define INT_ERROR(x) (((x) & 0xF4FF) == 0x0400) + +#define TT(x) (((x) >> 2) & 0x3) +#define TT_MSG(x) tt_msgs[TT(x)] +#define II(x) (((x) >> 2) & 0x3) +#define II_MSG(x) ii_msgs[II(x)] 
+#define LL(x) ((x) & 0x3) +#define LL_MSG(x) ll_msgs[LL(x)] +#define TO(x) (((x) >> 8) & 0x1) +#define TO_MSG(x) to_msgs[TO(x)] +#define PP(x) (((x) >> 9) & 0x3) +#define PP_MSG(x) pp_msgs[PP(x)] +#define UU(x) (((x) >> 8) & 0x3) +#define UU_MSG(x) uu_msgs[UU(x)] + +#define R4(x) (((x) >> 4) & 0xf) +#define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!") + +enum tt_ids { + TT_INSTR = 0, + TT_DATA, + TT_GEN, + TT_RESV, +}; + +enum ll_ids { + LL_RESV = 0, + LL_L1, + LL_L2, + LL_LG, +}; + +enum ii_ids { + II_MEM = 0, + II_RESV, + II_IO, + II_GEN, +}; + +enum rrrr_ids { + R4_GEN = 0, + R4_RD, + R4_WR, + R4_DRD, + R4_DWR, + R4_IRD, + R4_PREF, + R4_EVICT, + R4_SNOOP, +}; + +#define CASE_AMD_CPUS \ + (cputype == CPU_K8 || \ + cputype == CPU_F10H || \ + cputype == CPU_F11H || \ + cputype == CPU_F12H || \ + cputype == CPU_F14H || \ + cputype == CPU_F15H || \ + cputype == CPU_F16H) Index: mcelog-189/k8.c =================================================================== --- mcelog-189.orig/k8.c +++ /dev/null @@ -1,281 +0,0 @@ -/* Based on K8 decoding code written for the 2.4 kernel by Andi Kleen and - * Eric Morton. Hacked and extended for mcelog by AK. - * - * Original copyright: - * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. - * Additional K8 decoding and simplification Copyright 2003 Eric Morton, Newisys Inc - * K8 threshold counters decoding Copyright 2005,2006 Jacob Shin, AMD Inc. 
- * - * Subject to the GNU General Public License - */ - -#include <stdio.h> -#include "mcelog.h" -#include "k8.h" - -static char *k8bank[] = { - "data cache", - "instruction cache", - "bus unit", - "load/store unit", - "northbridge", - "fixed-issue reoder" -}; -static char *transaction[] = { - "instruction", "data", "generic", "reserved" -}; -static char *cachelevel[] = { - "0", "1", "2", "generic" -}; -static char *memtrans[] = { - "generic error", "generic read", "generic write", "data read", - "data write", "instruction fetch", "prefetch", "evict", "snoop", - "?", "?", "?", "?", "?", "?", "?" -}; -static char *partproc[] = { - "local node origin", "local node response", - "local node observed", "generic participation" -}; -static char *timeout[] = { - "request didn't time out", - "request timed out" -}; -static char *memoryio[] = { - "memory", "res.", "i/o", "generic" -}; -static char *nbextendederr[] = { - "RAM ECC error", - "CRC error", - "Sync error", - "Master abort", - "Target abort", - "GART error", - "RMW error", - "Watchdog error", - "RAM Chipkill ECC error", - "DEV Error", - "Link Data Error", - "Link Protocol Error", - "NB Array Error", - "DRAM Parity Error", - "Link Retry", - "Tablew Walk Data Error", - "L3 Cache Data Error", - "L3 Cache Tag Error", - "L3 Cache LRU Error" -}; -static char *highbits[32] = { - [31] = "valid", - [30] = "error overflow (multiple errors)", - [29] = "error uncorrected", - [28] = "error enable", - [27] = "misc error valid", - [26] = "error address valid", - [25] = "processor context corrupt", - [24] = "res24", - [23] = "res23", - /* 22-15 ecc syndrome bits */ - [14] = "corrected ecc error", - [13] = "uncorrected ecc error", - [12] = "res12", - [11] = "L3 subcache in error bit 1", - [10] = "L3 subcache in error bit 0", - [9] = "sublink or DRAM channel", - [8] = "error found by scrub", - /* 7-4 ht link number of error */ - [3] = "err cpu3", - [2] = "err cpu2", - [1] = "err cpu1", - [0] = "err cpu0", -}; -static char *k8threshold[] = { 
- [0 ... K8_MCELOG_THRESHOLD_DRAM_ECC - 1] = "Unknown threshold counter", - [K8_MCELOG_THRESHOLD_DRAM_ECC] = "MC4_MISC0 DRAM threshold", - [K8_MCELOG_THRESHOLD_LINK] = "MC4_MISC1 Link threshold", - [K8_MCELOG_THRESHOLD_L3_CACHE] = "MC4_MISC2 L3 Cache threshold", - [K8_MCELOG_THRESHOLD_FBDIMM] = "MC4_MISC3 FBDIMM threshold", - [K8_MCELOG_THRESHOLD_FBDIMM + 1 ... - K8_MCE_THRESHOLD_TOP - K8_MCE_THRESHOLD_BASE - 1] = - "Unknown threshold counter", -}; - - -static void decode_k8_generic_errcode(u64 status) -{ - unsigned short errcode = status & 0xffff; - int i; - - for (i=0; i<32; i++) { - if (i==31 || i==28 || i==26) - continue; - if (highbits[i] && (status & (1ULL<<(i+32)))) { - Wprintf( " bit%d = %s\n", i+32, highbits[i]); - } - } - - if ((errcode & 0xFFF0) == 0x0010) { - Wprintf( " TLB error '%s transaction, level %s'\n", - transaction[(errcode >> 2) & 3], - cachelevel[errcode & 3]); - } - else if ((errcode & 0xFF00) == 0x0100) { - Wprintf( " memory/cache error '%s mem transaction, %s transaction, level %s'\n", - memtrans[(errcode >> 4) & 0xf], - transaction[(errcode >> 2) & 3], - cachelevel[errcode & 3]); - } - else if ((errcode & 0xF800) == 0x0800) { - Wprintf( " bus error '%s, %s\n %s mem transaction\n %s access, level %s'\n", - partproc[(errcode >> 9) & 0x3], - timeout[(errcode >> 8) & 1], - memtrans[(errcode >> 4) & 0xf], - memoryio[(errcode >> 2) & 0x3], - cachelevel[(errcode & 0x3)]); - } -} - -static void decode_k8_dc_mc(u64 status, int *err) -{ - unsigned short exterrcode = (status >> 16) & 0x0f; - unsigned short errcode = status & 0xffff; - - if(status&(3ULL<<45)) { - Wprintf( " Data cache ECC error (syndrome %x)", - (u32) (status >> 47) & 0xff); - if(status&(1ULL<<40)) { - Wprintf(" found by scrubber"); - } - Wprintf("\n"); - } - - if ((errcode & 0xFFF0) == 0x0010) { - Wprintf( " TLB parity error in %s array\n", - (exterrcode == 0) ? 
"physical" : "virtual"); - } - - decode_k8_generic_errcode(status); -} - -static void decode_k8_ic_mc(u64 status, int *err) -{ - unsigned short exterrcode = (status >> 16) & 0x0f; - unsigned short errcode = status & 0xffff; - - if(status&(3ULL<<45)) { - Wprintf(" Instruction cache ECC error\n"); - } - - if ((errcode & 0xFFF0) == 0x0010) { - Wprintf(" TLB parity error in %s array\n", - (exterrcode == 0) ? "physical" : "virtual"); - } - - decode_k8_generic_errcode(status); -} - -static void decode_k8_bu_mc(u64 status, int *err) -{ - unsigned short exterrcode = (status >> 16) & 0x0f; - - if(status&(3ULL<<45)) { - Wprintf(" L2 cache ECC error\n"); - } - - Wprintf(" %s array error\n", - (exterrcode == 0) ? "Bus or cache" : "Cache tag"); - - decode_k8_generic_errcode(status); -} - -static void decode_k8_ls_mc(u64 status, int *err) -{ - decode_k8_generic_errcode(status); -} - -static void decode_k8_nb_mc(u64 status, int *memerr) -{ - unsigned short exterrcode = (status >> 16) & 0x0f; - - Wprintf(" Northbridge %s\n", nbextendederr[exterrcode]); - - switch (exterrcode) { - case 0: - *memerr = 1; - Wprintf(" ECC syndrome = %x\n", - (u32) (status >> 47) & 0xff); - break; - case 8: - *memerr = 1; - Wprintf(" Chipkill ECC syndrome = %x\n", - (u32) ((((status >> 24) & 0xff) << 8) | ((status >> 47) & 0xff))); - break; - case 1: - case 2: - case 3: - case 4: - case 6: - Wprintf(" link number = %x\n", - (u32) (status >> 36) & 0xf); - break; - } - - decode_k8_generic_errcode(status); -} - -static void decode_k8_fr_mc(u64 status, int *err) -{ - decode_k8_generic_errcode(status); -} - -static void decode_k8_threshold(u64 misc) -{ - if (misc & MCI_THRESHOLD_OVER) - Wprintf(" Threshold error count overflow\n"); -} - -typedef void (*decoder_t)(u64, int *ismemerr); - -static decoder_t decoders[] = { - [0] = decode_k8_dc_mc, - [1] = decode_k8_ic_mc, - [2] = decode_k8_bu_mc, - [3] = decode_k8_ls_mc, - [4] = decode_k8_nb_mc, - [5] = decode_k8_fr_mc, -}; - -void decode_k8_mc(struct mce *mce, 
int *ismemerr) -{ - if (mce->bank < NELE(decoders)) - decoders[mce->bank](mce->status, ismemerr); - else if (mce->bank >= K8_MCE_THRESHOLD_BASE && - mce->bank < K8_MCE_THRESHOLD_TOP) - decode_k8_threshold(mce->misc); - else - Wprintf(" no decoder for unknown bank %u\n", mce->bank); -} - -char *k8_bank_name(unsigned num) -{ - static char buf[64]; - char *s = "unknown"; - if (num < NELE(k8bank)) - s = k8bank[num]; - else if (num >= K8_MCE_THRESHOLD_BASE && - num < K8_MCE_THRESHOLD_TOP) - s = k8threshold[num - K8_MCE_THRESHOLD_BASE]; - buf[sizeof(buf)-1] = 0; - snprintf(buf, sizeof(buf) - 1, "%u %s", num, s); - return buf; -} - -int mce_filter_k8(struct mce *m) -{ - /* Filter out GART errors */ - if (m->bank == 4) { - unsigned short exterrcode = (m->status >> 16) & 0x0f; - if (exterrcode == 5 && (m->status & (1ULL<<61))) - return 0; - } - return 1; -} Index: mcelog-189/k8.h =================================================================== --- mcelog-189.orig/k8.h +++ /dev/null @@ -1,11 +0,0 @@ -char *k8_bank_name(unsigned num); -void decode_k8_mc(struct mce *mce, int *ismemerr); -int mce_filter_k8(struct mce *m); - -#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */ -#define K8_MCE_THRESHOLD_TOP (K8_MCE_THRESHOLD_BASE + 6 * 9) - -#define K8_MCELOG_THRESHOLD_DRAM_ECC (4 * 9 + 0) -#define K8_MCELOG_THRESHOLD_LINK (4 * 9 + 1) -#define K8_MCELOG_THRESHOLD_L3_CACHE (4 * 9 + 2) -#define K8_MCELOG_THRESHOLD_FBDIMM (4 * 9 + 3) Index: mcelog-189/mcelog.c =================================================================== --- mcelog-189.orig/mcelog.c +++ mcelog-189/mcelog.c @@ -41,7 +41,7 @@ #include #include "mcelog.h" #include "paths.h" -#include "k8.h" +#include "amd.h" #include "intel.h" #include "p4.h" #include "dmi.h" @@ -346,8 +346,8 @@ static void dump_mce(struct mce *m, unsi time_t t = m->time; Wprintf("TIME %llu %s", m->time, ctime(&t)); } - if (cputype == CPU_K8) - decode_k8_mc(m, &ismemerr); + if CASE_AMD_CPUS + decode_amd_mc(cputype, m, &ismemerr); else 
if (cputype >= CPU_INTEL) decode_intel_mc(m, cputype, &ismemerr, recordlen); /* else add handlers for other CPUs here */