diff --git a/mcelog-0.7-newcpus-1.diff b/mcelog-0.7-newcpus-1.diff new file mode 100644 index 0000000..07ac914 --- /dev/null +++ b/mcelog-0.7-newcpus-1.diff @@ -0,0 +1,1022 @@ +From: Andi Kleen +Subject: mcelog decoding support for Intel Tigerton + +Backport of the changes for Tigerton/Dunnington/Nehalem changes from mcelog git +git://git.kernel.org/pub/scm/utils/cpu/mce/mcelog.git + +The Tigerton support required adding Core2 support, they are all +lumped together. I also added "P6OLD" because that was in the mainline +mcelog git changes and would have been difficult to separate. +The differences to core2 are very minimal (just a few different events). +The actual decoder is all table driven. + +In the original git this was done as individual changes, but I lumped +it all together in the backport. + +While it adds quite a lot of new code there's not many changes to generic +code. Most of the new code is only used on the new CPUs. + +diff -x '*~' -urpN mcelog-0.7/bitfield.c mcelog-0.7-newcpus//bitfield.c +--- mcelog-0.7/bitfield.c 1970-01-01 01:00:00.000000000 +0100 ++++ mcelog-0.7-newcpus//bitfield.c 2008-09-26 20:28:29.000000000 +0200 +@@ -0,0 +1,61 @@ ++#include ++#include ++#include "mcelog.h" ++#include "bitfield.h" ++ ++char *reserved_3bits[8]; ++char *reserved_1bit[2]; ++char *reserved_2bits[4]; ++ ++static u64 bitmask(u64 i) ++{ ++ u64 mask = 1; ++ while (mask < i) ++ mask = (mask << 1) | 1; ++ return mask; ++} ++ ++void decode_bitfield(u64 status, struct field *fields) ++{ ++ struct field *f; ++ int linelen = 0; ++ char *delim = ""; ++ ++ for (f = fields; f->str; f++) { ++ u64 v = (status >> f->start_bit) & bitmask(f->stringlen - 1); ++ char buf[60]; /* loop-body scope: s may point at buf until Wprintf() below */ ++ char *s = NULL; ++ if (v < f->stringlen) ++ s = f->str[v]; ++ if (!s) { ++ if (v == 0) ++ continue; ++ s = buf; ++ snprintf(buf, sizeof buf, "<%u:%Lx>", f->start_bit, v); ++ } ++ int len = strlen(s); ++ if (linelen + len > 75) { ++ delim = "\n"; ++ linelen = 0; ++ } ++ Wprintf("%s%s", delim, s); ++ delim = 
" "; ++ linelen += len + 1; ++ } ++ if (linelen > 0) ++ Wprintf("\n"); ++} ++ ++void decode_numfield(u64 status, struct numfield *fields) ++{ ++ struct numfield *f; ++ for (f = fields; f->name; f++) { ++ u64 mask = (1ULL << (f->end - f->start + 1)) - 1; /* bits start..end inclusive */ ++ u64 v = (status >> f->start) & mask; ++ if (v > 0) { ++ char fmt[30]; ++ snprintf(fmt, 30, "%%s: %s\n", f->fmt ? f->fmt : "%Lu"); ++ Wprintf(fmt, f->name, v); ++ } ++ } ++} +diff -x '*~' -urpN mcelog-0.7/bitfield.h mcelog-0.7-newcpus//bitfield.h +--- mcelog-0.7/bitfield.h 1970-01-01 01:00:00.000000000 +0100 ++++ mcelog-0.7-newcpus//bitfield.h 2008-09-26 20:28:29.000000000 +0200 +@@ -0,0 +1,27 @@ ++/* Generic bitfield decoder */ ++ ++struct field { ++ int start_bit; ++ char **str; ++ int stringlen; ++}; ++ ++struct numfield { ++ int start, end; ++ char *name; ++ char *fmt; ++}; ++ ++#define FIELD(start_bit, name) { start_bit, name, NELE(name) } ++#define SBITFIELD(start_bit, string) { start_bit, ((char * [2]) { NULL, string }), 2 } ++ ++#define NUMBER(start, end, name) { start, end, name, "%Lu" } ++#define HEXNUMBER(start, end, name) { start, end, name, "%Lx" } ++ ++void decode_bitfield(u64 status, struct field *fields); ++void decode_numfield(u64 status, struct numfield *fields); ++ ++extern char *reserved_3bits[8]; ++extern char *reserved_1bit[2]; ++extern char *reserved_2bits[4]; ++ +diff -x '*~' -urpN mcelog-0.7/core2.c mcelog-0.7-newcpus//core2.c +--- mcelog-0.7/core2.c 1970-01-01 01:00:00.000000000 +0100 ++++ mcelog-0.7-newcpus//core2.c 2008-09-26 20:21:18.000000000 +0200 +@@ -0,0 +1,105 @@ ++#include ++#include ++#include ++#include "mcelog.h" ++#include "core2.h" ++#include "bitfield.h" ++ ++/* Decode P6 family (Core2) model specific errors. 
++ The generic errors are decoded in p4.c */ ++ ++/* [19..24] */ ++static char *bus_queue_req_type[] = { ++ [0] = "BQ_DCU_READ_TYPE", ++ [2] = "BQ_IFU_DEMAND_TYPE", ++ [3] = "BQ_IFU_DEMAND_NC_TYPE", ++ [4] = "BQ_DCU_RFO_TYPE", ++ [5] = "BQ_DCU_RFO_LOCK_TYPE", ++ [6] = "BQ_DCU_ITOM_TYPE", ++ [8] = "BQ_DCU_WB_TYPE", ++ [10] = "BC_DCU_WCEVICT_TYPE", ++ [11] = "BQ_DCU_WCLINE_TYPE", ++ [12] = "BQ_DCU_BTM_TYPE", ++ [13] = "BQ_DCU_INTACK_TYPE", ++ [14] = "BQ_DCU_INVALL2_TYPE", ++ [15] = "BQ_DCU_FLUSHL2_TYPE", ++ [16] = "BQ_DCU_PART_RD_TYPE", ++ [18] = "BQ_DCU_PART_WR_TYPE", ++ [20] = "BQ_DCU_SPEC_CYC_TYPE", ++ [24] = "BQ_DCU_IO_RD_TYPE", ++ [25] = "BQ_DCU_IO_WR_TYPE", ++ [28] = "BQ_DCU_LOCK_RD_TYPE", ++ [30] = "BQ_DCU_SPLOCK_RD_TYPE", ++ [29] = "BQ_DCU_LOCK_WR_TYPE", ++}; ++ ++/* [25..27] */ ++static char *bus_queue_error_type[] = { ++ [0] = "BQ_ERR_HARD_TYPE", ++ [1] = "BQ_ERR_DOUBLE_TYPE", ++ [2] = "BQ_ERR_AERR2_TYPE", ++ [4] = "BQ_ERR_SINGLE_TYPE", ++ [5] = "BQ_ERR_AERR1_TYPE", ++}; ++ ++static struct field p6_shared_status[] = { ++ FIELD(16, reserved_3bits), ++ FIELD(19, bus_queue_req_type), ++ FIELD(25, bus_queue_error_type), ++ FIELD(25, bus_queue_error_type), ++ SBITFIELD(30, "internal BINIT"), ++ SBITFIELD(36, "received parity error on response transaction"), ++ SBITFIELD(38, "timeout BINIT (ROB timeout)." ++ " No micro-instruction retired for some time"), ++ FIELD(39, reserved_3bits), ++ SBITFIELD(42, "bus transaction received hard error response"), ++ SBITFIELD(43, "failure that caused IERR"), ++ /* The following are reserved for Core in the SDM. 
Let's keep them here anyways*/ ++ SBITFIELD(44, "two failing bus transactions with address parity error (AERR)"), ++ SBITFIELD(45, "uncorrectable ECC error"), ++ SBITFIELD(46, "correctable ECC error"), ++ /* [47..54]: ECC syndrome */ ++ FIELD(55, reserved_2bits), ++ {}, ++}; ++ ++static struct field p6old_status[] = { ++ SBITFIELD(28, "FRC error"), ++ SBITFIELD(29, "BERR on this CPU"), ++ FIELD(31, reserved_1bit), ++ FIELD(32, reserved_3bits), ++ SBITFIELD(35, "BINIT received from external bus"), ++ SBITFIELD(37, "Received hard error reponse on split transaction (Bus BINIT)"), ++ {} ++}; ++ ++static struct field core2_status[] = { ++ SBITFIELD(28, "MCE driven"), ++ SBITFIELD(29, "MCE is observed"), ++ SBITFIELD(31, "BINIT observed"), ++ FIELD(32, reserved_2bits), ++ SBITFIELD(34, "PIC or FSB data parity error"), ++ FIELD(35, reserved_1bit), ++ SBITFIELD(37, "FSB address parity error detected"), ++ {} ++}; ++ ++static struct numfield p6old_status_numbers[] = { ++ HEXNUMBER(47, 54, "ECC syndrome"), ++ {} ++}; ++ ++void core2_decode_model(u64 status) ++{ ++ decode_bitfield(status, p6_shared_status); ++ decode_bitfield(status, core2_status); ++ /* Normally reserved, but let's parse anyways: */ ++ decode_numfield(status, p6old_status_numbers); ++} ++ ++void p6old_decode_model(u64 status) ++{ ++ decode_bitfield(status, p6_shared_status); ++ decode_bitfield(status, p6old_status); ++ decode_numfield(status, p6old_status_numbers); ++} +diff -x '*~' -urpN mcelog-0.7/core2.h mcelog-0.7-newcpus//core2.h +--- mcelog-0.7/core2.h 1970-01-01 01:00:00.000000000 +0100 ++++ mcelog-0.7-newcpus//core2.h 2008-09-26 20:21:18.000000000 +0200 +@@ -0,0 +1,2 @@ ++void core2_decode_model(u64 status); ++void p6old_decode_model(u64 status); +diff -x '*~' -urpN mcelog-0.7/dunnington.c mcelog-0.7-newcpus//dunnington.c +--- mcelog-0.7/dunnington.c 1970-01-01 01:00:00.000000000 +0100 ++++ mcelog-0.7-newcpus//dunnington.c 2008-09-26 20:24:24.000000000 +0200 +@@ -0,0 +1,123 @@ ++/* Copyright (c) 2008 
by Intel Corp. ++ Decode Intel Xeon Processor 7400 Model (Dunnington) specific MCEs ++ ++ mcelog is free software; you can redistribute it and/or ++ modify it under the terms of the GNU General Public ++ License as published by the Free Software Foundation; version ++ 2. ++ ++ mcelog is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should find a copy of v2 of the GNU General Public License somewhere ++ on your Linux system; if not, write to the Free Software Foundation, ++ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ ++ Author: ++ Andi Kleen ++*/ ++ ++/* other files ++ ++mcelog.h CPU_DUNNINGTON ++mcelog.c: cputype name ++intel.h CASE_INTEL_CPUS ++intel.c model == 0x1d CPU_DUNNINGTON ++p4.c: if (cpu == CPU_DUNNINGTON) dunnington_decode_model(log->status); ++ add to CORE2 cases ++ ++*/ ++ ++#include ++#include "mcelog.h" ++#include "bitfield.h" ++#include "dunnington.h" ++ ++/* Follows Intel IA32 SDM 3b Appendix E.2.1 ++ */ ++ ++static struct field dunnington_bus_status[] = { ++ SBITFIELD(16, "Parity error detected during FSB request phase"), ++ FIELD(17, reserved_3bits), ++ SBITFIELD(20, "Hard Failure response received for a local transaction"), ++ SBITFIELD(21, "Parity error on FSB response field detected"), ++ SBITFIELD(22, "Parity data error on inbound data detected"), ++ FIELD(23, reserved_3bits), ++ FIELD(25, reserved_3bits), ++ FIELD(28, reserved_3bits), ++ FIELD(31, reserved_1bit), ++ {} ++}; ++ ++static char *dnt_front_error[0xf] = { ++ [0x1] = "Inclusion error from core 0", ++ [0x2] = "Inclusion error from core 1", ++ [0x3] = "Write Exclusive error from core 0", ++ [0x4] = "Write Exclusive error from core 1", ++ [0x5] = "Inclusion error from FSB", ++ [0x6] = "SNP stall error from FSB", ++ [0x7] = "Write stall error from FSB", ++ [0x8] = "FSB Arbiter 
Timeout error", ++ [0xA] = "Inclusion error from core 2", ++ [0xB] = "Write exclusive error from core 2", ++}; ++ ++static char *dnt_int_error[0xf] = { ++ [0x2] = "Internal timeout error", ++ [0x3] = "Internal timeout error", ++ [0x4] = "Intel Cache Safe Technology Queue full error\n" ++ "or disabled ways in a set overflow", ++ [0x5] = "Quiet cycle timeout error (correctable)", ++}; ++ ++struct field dnt_int_status[] = { ++ FIELD(8, dnt_int_error), ++ {} ++}; ++ ++struct field dnt_front_status[] = { ++ FIELD(0, dnt_front_error), ++ {} ++}; ++ ++struct field dnt_cecc[] = { ++ SBITFIELD(1, "Correctable ECC event on outgoing core 0 data"), ++ SBITFIELD(2, "Correctable ECC event on outgoing core 1 data"), ++ SBITFIELD(3, "Correctable ECC event on outgoing core 3 data"), ++ {} ++}; ++ ++struct field dnt_uecc[] = { ++ SBITFIELD(1, "Uncorrectable ECC event on outgoing core 0 data"), ++ SBITFIELD(2, "Uncorrectable ECC event on outgoing core 1 data"), ++ SBITFIELD(3, "Uncorrectable ECC event on outgoing core 3 data"), ++ {} ++}; ++ ++static void dunnington_decode_bus(u64 status) ++{ ++ decode_bitfield(status, dunnington_bus_status); ++} ++ ++static void dunnington_decode_internal(u64 status) ++{ ++ u32 mca = (status >> 16) & 0xffff; ++ if ((mca & 0xfff0) == 0) ++ decode_bitfield(status, dnt_front_status); ++ else if ((mca & 0xf0ff) == 0) ++ decode_bitfield(status, dnt_int_status); ++ else if ((mca & 0xfff0) == 0xc000) ++ decode_bitfield(status, dnt_cecc); ++ else if ((mca & 0xfff0) == 0xe000) ++ decode_bitfield(status, dnt_uecc); ++} ++ ++void dunnington_decode_model(u64 status) ++{ ++ if ((status & 0xffff) == 0xe0f) ++ dunnington_decode_bus(status); ++ else if ((status & 0xffff) == (1 << 10)) ++ dunnington_decode_internal(status); ++} ++ +diff -x '*~' -urpN mcelog-0.7/dunnington.h mcelog-0.7-newcpus//dunnington.h +--- mcelog-0.7/dunnington.h 1970-01-01 01:00:00.000000000 +0100 ++++ mcelog-0.7-newcpus//dunnington.h 2008-09-26 20:24:24.000000000 +0200 +@@ -0,0 +1,2 @@ ++void 
dunnington_decode_model(u64 status); ++ +diff -x '*~' -urpN mcelog-0.7/intel.c mcelog-0.7-newcpus//intel.c +--- mcelog-0.7/intel.c 1970-01-01 01:00:00.000000000 +0100 ++++ mcelog-0.7-newcpus//intel.c 2008-09-26 20:32:52.000000000 +0200 +@@ -0,0 +1,22 @@ ++#include "mcelog.h" ++#include "intel.h" ++#include ++ ++enum cputype select_intel_cputype(int family, int model) ++{ ++ if (family == 15) { ++ return CPU_P4; ++ } ++ if (family == 6) { ++ if (model < 0xf) ++ return CPU_P6OLD; ++ else if (model == 0xf || model == 0x17) /* Merom/Penryn */ ++ return CPU_CORE2; ++ else if (model == 0x1d) ++ return CPU_DUNNINGTON; ++ else if (model == 0x1a) ++ return CPU_NEHALEM; ++ } ++ fprintf(stderr, "Unknown Intel CPU type family %x model %x\n", family, model); ++ return family == 6 ? CPU_P6OLD : CPU_GENERIC; ++} +diff -x '*~' -urpN mcelog-0.7/intel.h mcelog-0.7-newcpus//intel.h +--- mcelog-0.7/intel.h 1970-01-01 01:00:00.000000000 +0100 ++++ mcelog-0.7-newcpus//intel.h 2008-09-26 20:32:00.000000000 +0200 +@@ -0,0 +1,9 @@ ++enum cputype select_intel_cputype(int family, int model); ++ ++#define CASE_INTEL_CPUS \ ++ case CPU_P6OLD: \ ++ case CPU_CORE2: \ ++ case CPU_NEHALEM: \ ++ case CPU_DUNNINGTON: \ ++ case CPU_P4 ++ +diff -x '*~' -urpN mcelog-0.7/Makefile mcelog-0.7-newcpus//Makefile +--- mcelog-0.7/Makefile 2006-05-03 08:55:54.000000000 +0200 ++++ mcelog-0.7-newcpus//Makefile 2008-09-26 21:07:21.000000000 +0200 +@@ -5,7 +5,8 @@ all: mcelog + + .PHONY: install clean + +-mcelog: p4.o k8.o mcelog.o dmi.o ++mcelog: p4.o k8.o mcelog.o dmi.o core2.o dunnington.o nehalem.o \ ++ bitfield.o intel.o + + p4.o: p4.c mcelog.h p4.h + k8.o: k8.c mcelog.h k8.h +@@ -18,7 +19,8 @@ install: mcelog.c + echo "call mcelog regularly from your crontab" + + clean: +- rm -f mcelog mcelog.o k8.o p4.o dmi.o dmi ++ rm -f mcelog mcelog.o k8.o p4.o dmi.o dmi core2.o dunnington.o \ ++ nehalem.o bitfield.o intel.o + + dmi: dmi.c + gcc -o dmi ${CFLAGS} -DSTANDALONE dmi.c ${LDFLAGS} +diff -x '*~' -urpN 
mcelog-0.7/mcelog.8 mcelog-0.7-newcpus//mcelog.8 +--- mcelog-0.7/mcelog.8 2006-05-03 08:55:54.000000000 +0200 ++++ mcelog-0.7-newcpus//mcelog.8 2008-09-26 20:42:44.000000000 +0200 +@@ -2,9 +2,9 @@ + .SH NAME + mcelog \- Print machine check log from x86-64 kernel. + .SH SYNOPSIS +-mcelog [\-\-syslog] [\-\-k8|\-\-p4|\-\-generic] [\-\-ignorenodev] [\-\-dmi] [\-\-filter] [device] ++mcelog [\-\-syslog] [\-\-k8|\-\-p4|\-\-generic|...] [\-\-ignorenodev] [\-\-dmi] [\-\-filter] [device] + .br +-mcelog [\-\-k8|\-\-p4|\-\-generic] \-\-ascii ++mcelog [\-\-k8|\-\-p4|\-\-generic|...] \-\-ascii + .SH DESCRIPTION + Linux x86-64 kernels since 2.6.4 don't print recoverable machine check errors + to the kernel log anymore. Instead they are saved into a special +@@ -18,13 +18,21 @@ When the + .B \-\-syslog + option is specified redirect output to system log. + ++ + When + .B \-\-k8 + is specified assume the events are for a AMD Opteron or Athlon 64 or Athlon + FX CPU. + With + .B \-\-p4 +-is specified assume the events are for a Intel Pentium 4 or Intel Xeon. ++is specified assume the events are for a Intel Pentium 4 or Intel (older) Xeon. ++With ++.B \-\-core2 ++assume the events are for a Intel Core2 CPU or Intel Xeon 3000, 3200, 5100, 5300, 7300 ++series. When ++.B \-\-intel-cpu=family,model ++are specified then the family number and model number of the Intel CPU ++to be decoded should be specified (can be found in /proc/cpuinfo). + When + .B \-\-generic + all the fields are dumped without CPU specific decoding. 
+diff -x '*~' -urpN mcelog-0.7/mcelog.c mcelog-0.7-newcpus//mcelog.c +--- mcelog-0.7/mcelog.c 2006-05-03 08:55:54.000000000 +0200 ++++ mcelog-0.7-newcpus//mcelog.c 2008-09-26 20:45:50.000000000 +0200 +@@ -31,12 +31,10 @@ + #include "k8.h" + #include "p4.h" + #include "dmi.h" ++#include "intel.h" + +-enum { +- CPU_GENERIC, +- CPU_K8, +- CPU_P4 +-} cpu = CPU_GENERIC; ++ ++enum cputype cpu = CPU_GENERIC; + + char *logfn = "/dev/mcelog"; + +@@ -62,8 +60,8 @@ char *bankname(unsigned bank) + switch (cpu) { + case CPU_K8: + return k8_bank_name(bank); +- case CPU_P4: +- return p4_bank_name(bank); ++ CASE_INTEL_CPUS: ++ return intel_bank_name(bank); + /* add banks of other cpu types here */ + default: + sprintf(numeric, "BANK %d", bank); +@@ -98,7 +96,7 @@ int mce_filter(struct mce *m) + case CPU_K8: + return mce_filter_k8(m); + /* add more buggy CPUs here */ +- case CPU_P4: ++ CASE_INTEL_CPUS: + /* No bugs known */ + return 1; + default: +@@ -134,8 +132,8 @@ void dump_mce(struct mce *m) + case CPU_K8: + decode_k8_mc(m); + break; +- case CPU_P4: +- decode_p4_mc(m); ++ CASE_INTEL_CPUS: ++ decode_intel_mc(m, cpu); + break; + /* add handlers for other CPUs here */ + default: +@@ -153,23 +151,27 @@ void check_cpu(void) + if (f != NULL) { + int found = 0; + int family; ++ int model; + char vendor[64]; + char *line = NULL; + size_t linelen = 0; +- while (getdelim(&line, &linelen, '\n', f) > 0 && found < 2) { ++ while (getdelim(&line, &linelen, '\n', f) > 0 && found < 3) { + if (sscanf(line, "vendor_id : %63[^\n]", vendor) == 1) + found++; + if (sscanf(line, "cpu family : %d", &family) == 1) + found++; ++ if (sscanf(line, "model : %d", &model) == 1) ++ found++; + } +- if (found == 2) { ++ if (found == 3) { + if (!strcmp(vendor,"AuthenticAMD") && family == 15) + cpu = CPU_K8; +- if (!strcmp(vendor,"GenuineIntel") && family == 15) +- cpu = CPU_P4; ++ if (!strcmp(vendor,"GenuineIntel")) ++ cpu = select_intel_cputype(family, model); + /* Add checks for other CPUs here */ + } else { +- 
fprintf(stderr, "mcelog: warning: Cannot parse /proc/cpuinfo\n"); ++ fprintf(stderr, ++ "mcelog: warning: Cannot parse /proc/cpuinfo\n"); + } + fclose(f); + free(line); +@@ -303,9 +305,11 @@ void usage(void) + { + fprintf(stderr, + "Usage:\n" +- " mcelog [--k8|--p4|--generic] [--ignorenodev] [--dmi] [--syslog] [--filter] [mcelogdevice]\n" ++ " mcelog options [--ignorenodev] [--dmi] [--syslog] [--filter] [mcelogdevice]\n" + "Decode machine check error records from kernel\n" +- " mcelog [--k8|--p4|--generic] [--dmi] --ascii < log\n" ++ " mcelog options [--dmi] --ascii < log\n" ++ "Options:\n" ++ "--p4|--k8|--core2|--generic|--intel-cpu=family,model Set CPU type to decode\n" + "Decode machine check ASCII output from kernel logs\n"); + exit(1); + } +@@ -318,6 +322,17 @@ int modifier(char *s) + cpu = CPU_P4; + } else if (!strcmp(s, "--generic")) { + cpu = CPU_GENERIC; ++ } else if (!strcmp(s, "--core2")) { ++ cpu = CPU_CORE2; ++ } else if (!strncmp(s, "--intel-cpu=", 12)) { ++ unsigned fam, mod; ++ if (sscanf(s + 12, "%i,%i", &fam, &mod) != 2) ++ usage(); ++ cpu = select_intel_cputype(fam, mod); ++ if (cpu == CPU_GENERIC) { ++ fprintf(stderr, "Unknown Intel CPU\n"); ++ usage(); ++ } + } else if (!strcmp(s, "--ignorenodev")) { + ignore_nodev = 1; + } else if (!strcmp(s,"--filter")) { +diff -x '*~' -urpN mcelog-0.7/mcelog.h mcelog-0.7-newcpus//mcelog.h +--- mcelog-0.7/mcelog.h 2006-05-03 08:55:54.000000000 +0200 ++++ mcelog-0.7-newcpus//mcelog.h 2008-09-26 20:28:19.000000000 +0200 +@@ -61,3 +61,13 @@ struct mce { + #endif + + void Wprintf(char *fmt, ...) 
PRINTFLIKE; ++ ++enum cputype { ++ CPU_GENERIC, ++ CPU_K8, ++ CPU_P4, ++ CPU_NEHALEM, ++ CPU_DUNNINGTON, ++ CPU_P6OLD, ++ CPU_CORE2, ++}; +diff -x '*~' -urpN mcelog-0.7/nehalem.c mcelog-0.7-newcpus//nehalem.c +--- mcelog-0.7/nehalem.c 1970-01-01 01:00:00.000000000 +0100 ++++ mcelog-0.7-newcpus//nehalem.c 2008-09-26 20:24:20.000000000 +0200 +@@ -0,0 +1,163 @@ ++/* Copyright (C) 2008 Intel Corporation ++ Decode Intel Nehalem specific machine check errors. ++ ++ mcelog is free software; you can redistribute it and/or ++ modify it under the terms of the GNU General Public ++ License as published by the Free Software Foundation; version ++ 2. ++ ++ mcelog is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should find a copy of v2 of the GNU General Public License somewhere ++ on your Linux system; if not, write to the Free Software Foundation, ++ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ ++ Author: Andi Kleen ++*/ ++ ++/* other files ++ ++mcelog.h CPU_NEHALEM ++intel.h CASE_INTEL_CPUS ++intel.c model == 0x1a CPU_NEHALEM ++p4.c: if (cpu == CPU_NEHALEM) nehalem_decode_model(log->status, log->misc); ++ if (test_prefix(status, 7)) decode_memory_controller(log->status); ++mcelog.c/p4.c: syslog/trigger for memory controller ++ cputype_name ++*/ ++ ++#include ++#include ++#include "mcelog.h" ++#include "nehalem.h" ++#include "core2.h" ++#include "bitfield.h" ++ ++/* See IA32 SDM Vol3B Appendix E.3.2 ff */ ++ ++/* MC1_STATUS error */ ++static struct field qpi_status[] = { ++ SBITFIELD(16, "QPI header had bad parity"), ++ SBITFIELD(17, "QPI Data packet had bad parity"), ++ SBITFIELD(18, "Number of QPI retries exceeded"), ++ SBITFIELD(19, "Received QPI data packet that was poisoned by sender"), ++ SBITFIELD(20, "QPI reserved 20"), ++ SBITFIELD(21, "QPI reserved 21"), ++ 
SBITFIELD(22, "QPI received unsupported message encoding"), ++ SBITFIELD(23, "QPI credit type is not supported"), ++ SBITFIELD(24, "Sender sent too many QPI flits to the receiver"), ++ SBITFIELD(25, "QPI Sender sent a failed response to receiver"), ++ SBITFIELD(26, "Clock jitter detected in internal QPI clocking"), ++ {} ++}; ++ ++static struct field qpi_misc[] = { ++ SBITFIELD(14, "QPI misc reserved 14"), ++ SBITFIELD(15, "QPI misc reserved 15"), ++ SBITFIELD(24, "QPI Interleave/Head Indication Bit (IIB)"), ++ {} ++}; ++ ++static struct numfield qpi_numbers[] = { ++ HEXNUMBER(0, 7, "QPI class and opcode of packet with error"), ++ HEXNUMBER(8, 13, "QPI Request Transaction ID"), ++ NUMBER(16, 18, "QPI Requestor/Home Node ID (RHNID)"), ++ HEXNUMBER(19, 23, "QPI miscreserved 19-23"), {} /* terminator: decode_numfield() stops at name == NULL */ ++}; ++ ++static struct field memory_controller_status[] = { ++ SBITFIELD(16, "Memory read ECC error"), ++ SBITFIELD(17, "Memory ECC error occurred during scrub"), ++ SBITFIELD(18, "Memory write parity error"), ++ SBITFIELD(19, "Memory error in half of redundant memory"), ++ SBITFIELD(20, "Memory reserved 20"), ++ SBITFIELD(21, "Memory access out of range"), ++ SBITFIELD(22, "Memory internal RTID invalid"), ++ SBITFIELD(23, "Memory address parity error"), ++ SBITFIELD(24, "Memory byte enable parity error"), ++ {} ++}; ++ ++static struct numfield memory_controller_numbers[] = { ++ HEXNUMBER(0, 7, "Memory transaction Tracker ID (RTId)"), ++ HEXNUMBER(8, 15, "Memory MISC reserved 8..15"), ++ NUMBER(16, 17, "Memory DIMM ID of error"), ++ NUMBER(18, 19, "Memory channel ID of error"), ++ HEXNUMBER(32, 63, "Memory ECC syndrome"), ++ HEXNUMBER(25, 37, "Memory MISC reserved 25..37"), ++ NUMBER(38, 52, "Memory corrected error count (CORE_ERR_CNT)"), ++ HEXNUMBER(53, 56, "Memory MISC reserved 53..56"), ++ {} ++}; ++ ++static char *internal_errors[] = { ++ [0x0] = "No Error", ++ [0x3] = "Reset firmware did not complete", ++ [0x8] = "Received an invalid CMPD", ++ [0xa] = "Invalid Power Management 
Request", ++ [0xd] = "Invalid S-state transition", ++ [0x11] = "VID controller does not match POC controller selected", ++ [0x1a] = "MSID from POC does not match CPU MSID", ++}; ++ ++static struct field internal_error_status[] = { ++ FIELD(24, internal_errors), ++ {} ++}; ++ ++static struct numfield internal_error_numbers[] = { ++ HEXNUMBER(16, 23, "Internal machine check status reserved 16..23"), ++ HEXNUMBER(32, 56, "Internal machine check status reserved 32..56"), ++ {}, ++}; ++ ++/* Generic architectural memory controller encoding */ ++ ++static char *mmm_mnemonic[] = { ++ "GEN", "RD", "WR", "AC", "MS", "RES5", "RES6", "RES7" ++}; ++static char *mmm_desc[] = { ++ "Generic undefined request", ++ "Memory read error", ++ "Memory write error", ++ "Address/Command error", ++ "Memory scrubbing error", ++ "Reserved 5", ++ "Reserved 6", ++ "Reserved 7" ++}; ++ ++void decode_memory_controller(u32 status) ++{ ++ char channel[30]; ++ if ((status & 0xf) == 0xf) ++ strcpy(channel, "unspecified"); ++ else ++ sprintf(channel, "%u", status & 0xf); ++ Wprintf("MEMORY CONTROLLER %s_CHANNEL%s_ERR\n", ++ mmm_mnemonic[(status >> 4) & 7], ++ channel); ++ Wprintf("Transaction: %s\n", mmm_desc[(status >> 4) & 7]); ++ Wprintf("Channel: %s\n", channel); ++} ++ ++void nehalem_decode_model(u64 status, u64 misc) ++{ ++ u32 mca = status & 0xffff; ++ core2_decode_model(status); ++ if ((mca >> 11) == 1) { /* bus and interconnect QPI */ ++ decode_bitfield(status, qpi_status); ++ decode_numfield(status, qpi_numbers); ++ decode_bitfield(misc, qpi_misc); ++ } else if (mca == 0x0001) { /* internal unspecified */ ++ decode_bitfield(status, internal_error_status); ++ decode_numfield(status, internal_error_numbers); ++ } else if ((mca >> 8) == 1) { /* memory controller */ ++ decode_bitfield(status, memory_controller_status); ++ decode_numfield(status, memory_controller_numbers); ++ } ++} ++ +diff -x '*~' -urpN mcelog-0.7/nehalem.h mcelog-0.7-newcpus//nehalem.h +--- mcelog-0.7/nehalem.h 1970-01-01 
01:00:00.000000000 +0100 ++++ mcelog-0.7-newcpus//nehalem.h 2008-09-26 20:24:20.000000000 +0200 +@@ -0,0 +1,2 @@ ++void nehalem_decode_model(u64 status, u64 misc); ++void decode_memory_controller(u32 status); +diff -x '*~' -urpN mcelog-0.7/p4.c mcelog-0.7-newcpus//p4.c +--- mcelog-0.7/p4.c 2006-05-03 08:55:54.000000000 +0200 ++++ mcelog-0.7-newcpus//p4.c 2008-09-26 20:34:41.000000000 +0200 +@@ -1,7 +1,6 @@ + /* Copyright (c) 2005 by Intel Corp. + +- Decode IA32/x86-64 machine check for Pentium 4, Intel Xeon +- or EM64T. ++ Decode Intel machine check (generic and P4 specific) + + mcelog is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public +@@ -19,12 +18,17 @@ + + Authors: + Racing Guo ++ Andi Kleen + */ +- ++ + #include + #include "mcelog.h" ++#include "p4.h" ++#include "core2.h" ++#include "nehalem.h" ++#include "dunnington.h" + +-/* decode mce for P4/Xeon family */ ++/* decode mce for P4/Xeon and Core2 family */ + + static inline int test_prefix(int nr, __u32 value) + { +@@ -73,13 +77,12 @@ static char* get_RRRR_str(__u8 rrrr) + } + + return "UNKNOWN"; +- + } + + static char* get_PP_str(__u8 pp) + { + static char* PP[] = { +- "Originated-request", ++ "Local-CPU-originated-request", + "Responed-to-request", + "Observed-error-as-third-party", + "Generic" +@@ -112,7 +115,7 @@ static char* get_II_str(__u8 i) + return II[i]; + } + +-static int decode_mca(__u32 mca, char *buf, int len) ++static void decode_mca(__u32 mca) + { + #define TLB_LL_MASK 0x3 /*bit 0, bit 1*/ + #define TLB_LL_SHIFT 0x0 +@@ -137,64 +140,59 @@ static int decode_mca(__u32 mca, char *b + #define BUS_PP_MASK 0x600 /*bit 9, bit 10*/ + #define BUS_PP_SHIFT 0x9 + +- mca = mca & 0xFFFF; ++ static char *msg[] = { ++ [0] = "No Error", ++ [1] = "Unclassified", ++ [2] = "Microcode ROM parity error", ++ [3] = "External error", ++ [4] = "FRC error", ++ }; ++ ++ if (mca & (1UL << 12)) { ++ Wprintf("corrected filtering (some unreported errors in same region)\n"); 
++ mca &= ~(1UL << 12); ++ } + +- switch(mca) { +- case 0x0: +- return snprintf(buf, len, "%s", "No Error"); +- break; +- case 0x1: +- return snprintf(buf, len, "%s", "Unclassified"); +- break; +- case 0x2: +- return snprintf(buf, len, "%s", "Microcode ROM Parity Error"); +- break; +- case 0x3: +- return snprintf(buf, len, "%s", "External Error"); +- break; +- case 0x4: +- return snprintf(buf, len, "%s", "FRC Error"); +- break; +- default: +- break; ++ if (mca < NELE(msg)) { ++ Wprintf("%s\n", msg[mca]); ++ return; + } + +- if (test_prefix(4, mca)) { +- return snprintf(buf, len, "%s TLB %s Error", ++ if ((mca >> 2) == 3) { ++ Wprintf("%s Generic memory hierarchy error\n", get_LL_str(mca & 3)); ++ } else if (test_prefix(4, mca)) { ++ Wprintf("%s TLB %s Error\n", + get_TT_str((mca & TLB_TT_MASK) >> TLB_TT_SHIFT), + get_LL_str((mca & TLB_LL_MASK) >> + TLB_LL_SHIFT)); +- } +- if (test_prefix(8, mca)) { +- return snprintf(buf, len, "%s CACHE %s %s Error", ++ } else if (test_prefix(8, mca)) { ++ Wprintf("%s CACHE %s %s Error\n", + get_TT_str((mca & CACHE_TT_MASK) >> + CACHE_TT_SHIFT), + get_LL_str((mca & CACHE_LL_MASK) >> + CACHE_LL_SHIFT), + get_RRRR_str((mca & CACHE_RRRR_MASK) >> + CACHE_RRRR_SHIFT)); +- } +- if (test_prefix(10, mca)) { ++ } else if (test_prefix(10, mca)) { + if (mca == 0x400) +- return snprintf(buf, len, "Internal Timer error"); ++ Wprintf("Internal Timer error\n"); + else +- return snprintf(buf, len, +- "Internal unclassified errors"); +- } +- if (test_prefix(11, mca)) { +- +- return snprintf(buf, len, "BUS %s %s %s %s %s Error", ++ Wprintf("Internal unclassified error: %x\n", mca & 0xffff); ++ } else if (test_prefix(11, mca)) { ++ Wprintf("BUS %s %s %s %s %s Error\n", + get_LL_str((mca & BUS_LL_MASK) >> BUS_LL_SHIFT), + get_PP_str((mca & BUS_PP_MASK) >> BUS_PP_SHIFT), + get_RRRR_str((mca & BUS_RRRR_MASK) >> + BUS_RRRR_SHIFT), + get_II_str((mca & BUS_II_MASK) >> BUS_II_SHIFT), + get_T_str((mca & BUS_T_MASK) >> BUS_T_SHIFT)); +- } +- return 
snprintf(buf, len, "Unknown Error"); ++ } else if (test_prefix(7, mca)) { ++ decode_memory_controller(mca); ++ } else ++ Wprintf("Unknown Error %x\n", mca); + } + +-static void decode_model(__u32 model) ++static void p4_decode_model(__u32 model) + { + static struct { + int value; +@@ -219,17 +217,27 @@ static void decode_model(__u32 model) + Wprintf("\n"); + } + +-static void decode_mci(__u64 status) ++static void decode_tracking(u64 track, int cpu) + { +-#define BUF_LEN 200 +- char buf[BUF_LEN]; +- __u32 mca; ++ static char *msg[] = { ++ [1] = "green", ++ [2] = "yellow\n" ++"Large number of corrected errors. System operating, but you should\n" ++"schedule it for service within a few weeks", ++ [3] ="res3" }; ++ if (track) { ++ Wprintf("Threshold based error status: %s\n", msg[track]); ++ if (track == 2) ++ Wprintf( ++ "CPU %d has large number of corrected errors. Consider replacement\n", cpu); ++ } ++} + ++static void decode_mci(__u64 status, int cpu) ++{ + Wprintf("MCi status:\n"); +- if (!(status & MCI_STATUS_VAL)) { +- Wprintf("Invalid log\n"); +- return; +- } ++ if (!(status & MCI_STATUS_VAL)) ++ Wprintf("Machine check not valid\n"); + + if (status & MCI_STATUS_OVER) + Wprintf("Error overflow\n"); +@@ -249,15 +257,9 @@ static void decode_mci(__u64 status) + if (status & MCI_STATUS_PCC) + Wprintf("Processor context corrupt\n"); + +- mca = status & 0xFFFFL; +- decode_mca(mca, buf, BUF_LEN); +- Wprintf("MCA:%s\n", buf); +- +- if (test_prefix(11, mca)) { +- __u32 model; +- model = (status & 0xFFFF0000L); +- decode_model(model); +- } ++ decode_tracking((status >> 53) & 3, cpu); /* threshold status is MCi_STATUS[54:53] */ ++ Wprintf("MCA: "); ++ decode_mca(status & 0xffffL); + } + + static void decode_mcg(__u64 mcgstatus) +@@ -272,13 +274,36 @@ static void decode_mcg(__u64 mcgstatus) + Wprintf("\n"); + } + +-void decode_p4_mc(struct mce *log) ++void decode_intel_mc(struct mce *log, int cputype) + { ++ int cpu = log->cpu; ++ + decode_mcg(log->mcgstatus); +- decode_mci(log->status); ++ decode_mci(log->status, 
cpu); ++ ++ if (test_prefix(11, (log->status & 0xffffL))) { ++ switch (cputype) { ++ case CPU_P6OLD: ++ p6old_decode_model(log->status); ++ break; ++ case CPU_DUNNINGTON: ++ case CPU_CORE2: ++ core2_decode_model(log->status); ++ break; ++ case CPU_P4: ++ p4_decode_model(log->status & 0xffff0000L); ++ break; ++ case CPU_NEHALEM: ++ nehalem_decode_model(log->status, log->misc); ++ break; ++ } ++ } ++ ++ if (cputype == CPU_DUNNINGTON) ++ dunnington_decode_model(log->status); + } + +-char *p4_bank_name(int num) ++char *intel_bank_name(int num) + { + static char bname[64]; + sprintf(bname, "BANK %d", num); +diff -x '*~' -urpN mcelog-0.7/p4.h mcelog-0.7-newcpus//p4.h +--- mcelog-0.7/p4.h 2006-05-03 08:55:54.000000000 +0200 ++++ mcelog-0.7-newcpus//p4.h 2008-09-26 20:35:46.000000000 +0200 +@@ -1,2 +1,2 @@ +-char *p4_bank_name(int num); +-void decode_p4_mc(struct mce* mce); ++char *intel_bank_name(int num); ++void decode_intel_mc(struct mce *log, int cpu); diff --git a/mcelog-thermal b/mcelog-thermal deleted file mode 100644 index 64061f3..0000000 --- a/mcelog-thermal +++ /dev/null @@ -1,42 +0,0 @@ ---- mcelog/p4.c~ 2005-02-09 12:20:20.000000000 +0100 -+++ mcelog/p4.c 2006-05-27 06:22:06.000000000 +0200 -@@ -24,6 +24,8 @@ - #include - #include "mcelog.h" - -+#define BANK_THERMAL 128 -+ - /* decode mce for P4/Xeon family */ - - static inline int test_prefix(int nr, __u32 value) -@@ -272,8 +274,21 @@ - Wprintf("\n"); - } - -+static void decode_thermal(struct mce *log) -+{ -+ if (log->status & 1) -+ Wprintf("Processor core is above trip temperature. Throttling enabled.\n"); -+ else -+ Wprintf("Processor core below trip temperature. 
Throttling disabled\n"); -+} -+ - void decode_p4_mc(struct mce *log) - { -+ if (log->bank == BANK_THERMAL) { -+ decode_thermal(log); -+ return; -+ } -+ - decode_mcg(log->mcgstatus); - decode_mci(log->status); - } -@@ -281,6 +296,8 @@ - char *p4_bank_name(int num) - { - static char bname[64]; -+ if (num == BANK_THERMAL) -+ return "THERMAL EVENT"; - sprintf(bname, "BANK %d", num); - return bname; - } diff --git a/mcelog-thermal.diff b/mcelog-thermal.diff new file mode 100644 index 0000000..ee7283d --- /dev/null +++ b/mcelog-thermal.diff @@ -0,0 +1,52 @@ +--- + p4.c | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +Index: mcelog-0.7/p4.c +=================================================================== +--- mcelog-0.7.orig/p4.c ++++ mcelog-0.7/p4.c +@@ -28,6 +28,8 @@ + #include "nehalem.h" + #include "dunnington.h" + ++#define BANK_THERMAL 128 ++ + /* decode mce for P4/Xeon and Core2 family */ + + static inline int test_prefix(int nr, __u32 value) +@@ -274,10 +276,25 @@ static void decode_mcg(__u64 mcgstatus) + Wprintf("\n"); + } + ++static void decode_thermal(struct mce *log) ++{ ++ if (log->status & 1) ++ Wprintf("Processor core is above trip temperature. " ++ "Throttling enabled.\n"); ++ else ++ Wprintf("Processor core below trip temperature. 
" ++ "Throttling disabled\n"); ++} ++ + void decode_intel_mc(struct mce *log, int cputype) + { + int cpu = log->cpu; + ++ if (log->bank == BANK_THERMAL) { ++ decode_thermal(log); ++ return; ++ } ++ + decode_mcg(log->mcgstatus); + decode_mci(log->status, cpu); + +@@ -306,6 +323,8 @@ void decode_intel_mc(struct mce *log, in + char *intel_bank_name(int num) + { + static char bname[64]; ++ if (num == BANK_THERMAL) ++ return "THERMAL EVENT"; + sprintf(bname, "BANK %d", num); + return bname; + } diff --git a/mcelog.changes b/mcelog.changes index 235b984..28de8a3 100644 --- a/mcelog.changes +++ b/mcelog.changes @@ -1,3 +1,10 @@ +------------------------------------------------------------------- +Sat Sep 27 21:50:27 CEST 2008 - trenn@suse.de + +- fate #304279 mcelog support for Tigerton/Dunnington + Patch is from Andi himself with this statement: + While it looks large most of it is just new tables. + ------------------------------------------------------------------- Mon May 29 16:23:12 CEST 2006 - ak@suse.de diff --git a/mcelog.spec b/mcelog.spec index 7912a3e..2158389 100644 --- a/mcelog.spec +++ b/mcelog.spec @@ -1,23 +1,32 @@ # # spec file for package mcelog (Version 0.7) # -# Copyright (c) 2006 SUSE LINUX Products GmbH, Nuernberg, Germany. -# This file and all modifications and additions to the pristine -# package are under the same license as the package itself. +# Copyright (c) 2008 SUSE LINUX Products GmbH, Nuernberg, Germany. # +# All modifications and additions to the file contributed by third parties +# remain the property of their copyright owners, unless otherwise agreed +# upon. The license for this file, and modifications and additions to the +# file, is the same license as for the pristine package itself (unless the +# license for the pristine package is not an Open Source License, in which +# case the license is the MIT License). 
An "Open Source License" is a +# license that conforms to the Open Source Definition (Version 1.9) +# published by the Open Source Initiative. + # Please submit bugfixes or comments via http://bugs.opensuse.org/ # # norootforbuild + Name: mcelog -License: GPL +License: GPL v2 or later Summary: Log Machine Check Events Version: 0.7 -Release: 3 -Autoreqprov: on +Release: 111 +AutoReqProv: on Source: mcelog-%{version}.tar.gz -Patch0: mcelog-thermal +Patch0: mcelog-0.7-newcpus-1.diff +Patch1: mcelog-thermal.diff Group: System/Monitoring BuildRoot: %{_tmppath}/%{name}-%{version}-build @@ -35,11 +44,12 @@ In addition, it allows decoding machine check kernel panic messages. Authors: -------- - Andi Kleen + Andi Kleen %prep %setup %patch0 -p1 +%patch1 -p1 %build make CFLAGS="$RPM_OPT_FLAGS" @@ -64,40 +74,44 @@ rm -rf $RPM_BUILD_ROOT /etc/cron.hourly/mcelog /etc/logrotate.d/mcelog -%changelog -n mcelog -* Mon May 29 2006 - ak@suse.de +%changelog +* Sat Sep 27 2008 trenn@suse.de +- fate #304279 mcelog support for Tigerton/Dunnington + Patch is from Andi himself with this statement: + While it looks large most of it is just new tables. +* Mon May 29 2006 ak@suse.de - decode intel thermal events too (#179327) -* Fri May 05 2006 - ak@suse.de +* Fri May 05 2006 ak@suse.de - Update to 0.7. 
This fixes -- Fix --dmi option (#166324) -- Incorporate old patches -* Fri Mar 03 2006 - ak@suse.de + - Fix --dmi option (#166324) + - Incorporate old patches +* Fri Mar 03 2006 ak@suse.de - Avoid cosmetic problem in --filter (#153347) -* Wed Feb 08 2006 - ak@suse.de +* Wed Feb 08 2006 ak@suse.de - update to mcelog 0.6 * Fixes bugs (#148869, #137985) * Adds --dmi option to map addresses to DIMMs using SMBIOS (default to off) -* Wed Jan 25 2006 - mls@suse.de +* Wed Jan 25 2006 mls@suse.de - converted neededforbuild to BuildRequires -* Mon Dec 19 2005 - sf@suse.de +* Mon Dec 19 2005 sf@suse.de - update to version 0.5 * Clarify --ascii in the manpage *Support for AMD K8 Revision F machine check DRAM error thresholding -* Fri Feb 11 2005 - ak@suse.de +* Fri Feb 11 2005 ak@suse.de - Use RPM_OPT_FLAGS - Improve description again -* Thu Feb 10 2005 - ak@suse.de +* Thu Feb 10 2005 ak@suse.de - mcelog-0.4: * add support to decode AMD K8 (Opteron/Athlon64/AthlonFX) and Intel P4 (Xeon and Pentium 4) events * add --ascii option to decode machine check panic information - Rewrite description in .spec file -* Wed Jun 09 2004 - ak@suse.de +* Wed Jun 09 2004 ak@suse.de - memlog-0.2: * fix mcelog looping (#41863) * Add GPL notices -* Thu Mar 25 2004 - sf@suse.de +* Thu Mar 25 2004 sf@suse.de - initial version - fixes #36898