mcelog/mcelog-0.7-newcpus-1.diff

1023 lines
30 KiB
Diff

From: Andi Kleen <ak@linux.intel.com>
Subject: mcelog decoding support for Intel Tigerton
Backport of the Tigerton/Dunnington/Nehalem changes from mcelog git
git://git.kernel.org/pub/scm/utils/cpu/mce/mcelog.git
The Tigerton support required adding Core2 support, they are all
lumped together. I also added "P6OLD" because that was in the mainline
mcelog git changes and would have been difficult to separate.
The differences to core2 are very minimal (just a few different events).
The actual decoder is all table driven.
In the original git this was done as individual changes, but I lumped
it all together in the backport.
While it adds quite a lot of new code there are not many changes to generic
code. Most of the new code is only used on the new CPUs.
diff -x '*~' -urpN mcelog-0.7/bitfield.c mcelog-0.7-newcpus//bitfield.c
--- mcelog-0.7/bitfield.c 1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//bitfield.c 2008-09-26 20:28:29.000000000 +0200
@@ -0,0 +1,61 @@
+#include <string.h>
+#include <stdio.h>
+#include "mcelog.h"
+#include "bitfield.h"
+
+char *reserved_3bits[8];
+char *reserved_1bit[2];
+char *reserved_2bits[4];
+
+static u64 bitmask(u64 i)
+{
+ u64 mask = 1;
+ while (mask < i)
+ mask = (mask << 1) | 1;
+ return mask;
+}
+
+void decode_bitfield(u64 status, struct field *fields)
+{
+	struct field *f;
+	int linelen = 0;
+	char *delim = "";
+	char buf[60];	/* function scope: s may point here until Wprintf() below */
+
+	for (f = fields; f->str; f++) {	/* table ends at the entry with str == NULL */
+		u64 v = (status >> f->start_bit) & bitmask(f->stringlen - 1);
+		char *s = NULL;
+		if (v < f->stringlen)
+			s = f->str[v];
+		if (!s) {	/* no name in the table for this value */
+			if (v == 0)
+				continue;
+			s = buf;	/* fall back to raw "<start_bit:value>" */
+			snprintf(buf, sizeof buf, "<%u:%Lx>", f->start_bit, v);
+		}
+		int len = strlen(s);
+		if (linelen + len > 75) {	/* start a new output line before this item */
+			delim = "\n";
+			linelen = 0;
+		}
+		Wprintf("%s%s", delim, s);
+		delim = " ";
+		linelen += len + 1;
+	}
+	if (linelen > 0)
+		Wprintf("\n");
+}
+
+void decode_numfield(u64 status, struct numfield *fields)
+{
+	struct numfield *f;
+	for (f = fields; f->name; f++) {	/* table ends at the entry with name == NULL */
+		int width = f->end - f->start + 1;	/* start and end are both inclusive */
+		u64 v = (status >> f->start) & (width < 64 ? (1ULL << width) - 1 : ~0ULL);
+		if (v > 0) {	/* fields that read as zero are not printed */
+			char fmt[30];
+			snprintf(fmt, 30, "%%s: %s\n", f->fmt ? f->fmt : "%Lu");
+			Wprintf(fmt, f->name, v);
+		}
+	}
+}
diff -x '*~' -urpN mcelog-0.7/bitfield.h mcelog-0.7-newcpus//bitfield.h
--- mcelog-0.7/bitfield.h 1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//bitfield.h 2008-09-26 20:28:29.000000000 +0200
@@ -0,0 +1,27 @@
+/* Generic bitfield decoder */
+
+struct field {
+ int start_bit;
+ char **str;
+ int stringlen;
+};
+
+struct numfield {
+ int start, end;
+ char *name;
+ char *fmt;
+};
+
+#define FIELD(start_bit, name) { start_bit, name, NELE(name) }
+#define SBITFIELD(start_bit, string) { start_bit, ((char * [2]) { NULL, string }), 2 }
+
+#define NUMBER(start, end, name) { start, end, name, "%Lu" }
+#define HEXNUMBER(start, end, name) { start, end, name, "%Lx" }
+
+void decode_bitfield(u64 status, struct field *fields);
+void decode_numfield(u64 status, struct numfield *fields);
+
+extern char *reserved_3bits[8];
+extern char *reserved_1bit[2];
+extern char *reserved_2bits[4];
+
diff -x '*~' -urpN mcelog-0.7/core2.c mcelog-0.7-newcpus//core2.c
--- mcelog-0.7/core2.c 1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//core2.c 2008-09-26 20:21:18.000000000 +0200
@@ -0,0 +1,105 @@
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include "mcelog.h"
+#include "core2.h"
+#include "bitfield.h"
+
+/* Decode P6 family (Core2) model specific errors.
+ The generic errors are decoded in p4.c */
+
+/* [19..24] */
+static char *bus_queue_req_type[] = {
+ [0] = "BQ_DCU_READ_TYPE",
+ [2] = "BQ_IFU_DEMAND_TYPE",
+ [3] = "BQ_IFU_DEMAND_NC_TYPE",
+ [4] = "BQ_DCU_RFO_TYPE",
+ [5] = "BQ_DCU_RFO_LOCK_TYPE",
+ [6] = "BQ_DCU_ITOM_TYPE",
+ [8] = "BQ_DCU_WB_TYPE",
+ [10] = "BC_DCU_WCEVICT_TYPE",
+ [11] = "BQ_DCU_WCLINE_TYPE",
+ [12] = "BQ_DCU_BTM_TYPE",
+ [13] = "BQ_DCU_INTACK_TYPE",
+ [14] = "BQ_DCU_INVALL2_TYPE",
+ [15] = "BQ_DCU_FLUSHL2_TYPE",
+ [16] = "BQ_DCU_PART_RD_TYPE",
+ [18] = "BQ_DCU_PART_WR_TYPE",
+ [20] = "BQ_DCU_SPEC_CYC_TYPE",
+ [24] = "BQ_DCU_IO_RD_TYPE",
+ [25] = "BQ_DCU_IO_WR_TYPE",
+ [28] = "BQ_DCU_LOCK_RD_TYPE",
+ [30] = "BQ_DCU_SPLOCK_RD_TYPE",
+ [29] = "BQ_DCU_LOCK_WR_TYPE",
+};
+
+/* [25..27] */
+static char *bus_queue_error_type[] = {
+ [0] = "BQ_ERR_HARD_TYPE",
+ [1] = "BQ_ERR_DOUBLE_TYPE",
+ [2] = "BQ_ERR_AERR2_TYPE",
+ [4] = "BQ_ERR_SINGLE_TYPE",
+ [5] = "BQ_ERR_AERR1_TYPE",
+};
+
+static struct field p6_shared_status[] = {
+	FIELD(16, reserved_3bits),
+	FIELD(19, bus_queue_req_type),
+	FIELD(25, bus_queue_error_type),
+	/* [28..29]: model specific, see p6old_status and core2_status */
+	SBITFIELD(30, "internal BINIT"),
+	SBITFIELD(36, "received parity error on response transaction"),
+	SBITFIELD(38, "timeout BINIT (ROB timeout)."
+		  " No micro-instruction retired for some time"),
+	FIELD(39, reserved_3bits),
+	SBITFIELD(42, "bus transaction received hard error response"),
+	SBITFIELD(43, "failure that caused IERR"),
+	/* The following are reserved for Core in the SDM. Let's keep them here anyways*/
+	SBITFIELD(44, "two failing bus transactions with address parity error (AERR)"),
+	SBITFIELD(45, "uncorrectable ECC error"),
+	SBITFIELD(46, "correctable ECC error"),
+	/* [47..54]: ECC syndrome */
+	FIELD(55, reserved_2bits),
+	{},
+};
+
+static struct field p6old_status[] = {
+ SBITFIELD(28, "FRC error"),
+ SBITFIELD(29, "BERR on this CPU"),
+ FIELD(31, reserved_1bit),
+ FIELD(32, reserved_3bits),
+ SBITFIELD(35, "BINIT received from external bus"),
+ SBITFIELD(37, "Received hard error reponse on split transaction (Bus BINIT)"),
+ {}
+};
+
+static struct field core2_status[] = {
+ SBITFIELD(28, "MCE driven"),
+ SBITFIELD(29, "MCE is observed"),
+ SBITFIELD(31, "BINIT observed"),
+ FIELD(32, reserved_2bits),
+ SBITFIELD(34, "PIC or FSB data parity error"),
+ FIELD(35, reserved_1bit),
+ SBITFIELD(37, "FSB address parity error detected"),
+ {}
+};
+
+static struct numfield p6old_status_numbers[] = {
+ HEXNUMBER(47, 54, "ECC syndrome"),
+ {}
+};
+
+void core2_decode_model(u64 status)
+{
+ decode_bitfield(status, p6_shared_status);
+ decode_bitfield(status, core2_status);
+ /* Normally reserved, but let's parse anyways: */
+ decode_numfield(status, p6old_status_numbers);
+}
+
+void p6old_decode_model(u64 status)
+{
+ decode_bitfield(status, p6_shared_status);
+ decode_bitfield(status, p6old_status);
+ decode_numfield(status, p6old_status_numbers);
+}
diff -x '*~' -urpN mcelog-0.7/core2.h mcelog-0.7-newcpus//core2.h
--- mcelog-0.7/core2.h 1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//core2.h 2008-09-26 20:21:18.000000000 +0200
@@ -0,0 +1,2 @@
+void core2_decode_model(u64 status);
+void p6old_decode_model(u64 status);
diff -x '*~' -urpN mcelog-0.7/dunnington.c mcelog-0.7-newcpus//dunnington.c
--- mcelog-0.7/dunnington.c 1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//dunnington.c 2008-09-26 20:24:24.000000000 +0200
@@ -0,0 +1,123 @@
+/* Copyright (c) 2008 by Intel Corp.
+ Decode Intel Xeon Processor 7400 Model (Dunnington) specific MCEs
+
+ mcelog is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; version
+ 2.
+
+ mcelog is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should find a copy of v2 of the GNU General Public License somewhere
+ on your Linux system; if not, write to the Free Software Foundation,
+ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Author:
+ Andi Kleen
+*/
+
+/* other files
+
+mcelog.h CPU_DUNNINGTON
+mcelog.c: cputype name
+intel.h CASE_INTEL_CPUS
+intel.c model == 0x1d CPU_DUNNINGTON
+p4.c: if (cpu == CPU_DUNNINGTON) dunnington_decode_model(log->status);
+ add to CORE2 cases
+
+*/
+
+#include <stddef.h>
+#include "mcelog.h"
+#include "bitfield.h"
+#include "dunnington.h"
+
+/* Follows Intel IA32 SDM 3b Appendix E.2.1 ++ */
+
+static struct field dunnington_bus_status[] = {
+ SBITFIELD(16, "Parity error detected during FSB request phase"),
+ FIELD(17, reserved_3bits),
+ SBITFIELD(20, "Hard Failure response received for a local transaction"),
+ SBITFIELD(21, "Parity error on FSB response field detected"),
+ SBITFIELD(22, "Parity data error on inbound data detected"),
+ FIELD(23, reserved_3bits),
+ FIELD(25, reserved_3bits),
+ FIELD(28, reserved_3bits),
+ FIELD(31, reserved_1bit),
+ {}
+};
+
+static char *dnt_front_error[0xf] = {
+ [0x1] = "Inclusion error from core 0",
+ [0x2] = "Inclusion error from core 1",
+ [0x3] = "Write Exclusive error from core 0",
+ [0x4] = "Write Exclusive error from core 1",
+ [0x5] = "Inclusion error from FSB",
+ [0x6] = "SNP stall error from FSB",
+ [0x7] = "Write stall error from FSB",
+ [0x8] = "FSB Arbiter Timeout error",
+ [0xA] = "Inclusion error from core 2",
+ [0xB] = "Write exclusive error from core 2",
+};
+
+static char *dnt_int_error[0xf] = {
+ [0x2] = "Internal timeout error",
+ [0x3] = "Internal timeout error",
+ [0x4] = "Intel Cache Safe Technology Queue full error\n"
+ "or disabled ways in a set overflow",
+ [0x5] = "Quiet cycle timeout error (correctable)",
+};
+
+struct field dnt_int_status[] = {
+ FIELD(8, dnt_int_error),
+ {}
+};
+
+struct field dnt_front_status[] = {
+ FIELD(0, dnt_front_error),
+ {}
+};
+
+struct field dnt_cecc[] = {	/* indexed by the low bits of the model specific code */
+	SBITFIELD(1, "Correctable ECC event on outgoing core 0 data"),
+	SBITFIELD(2, "Correctable ECC event on outgoing core 1 data"),
+	SBITFIELD(3, "Correctable ECC event on outgoing core 2 data"),
+	{}
+};
+
+struct field dnt_uecc[] = {	/* indexed by the low bits of the model specific code */
+	SBITFIELD(1, "Uncorrectable ECC event on outgoing core 0 data"),
+	SBITFIELD(2, "Uncorrectable ECC event on outgoing core 1 data"),
+	SBITFIELD(3, "Uncorrectable ECC event on outgoing core 2 data"),
+	{}
+};
+
+static void dunnington_decode_bus(u64 status)
+{
+ decode_bitfield(status, dunnington_bus_status);
+}
+
+static void dunnington_decode_internal(u64 status)
+{
+	u32 mca = (status >> 16) & 0xffff;	/* bits 16..31 of status; tables index into this */
+	if ((mca & 0xfff0) == 0)	/* only the low nibble is set */
+		decode_bitfield(mca, dnt_front_status);
+	else if ((mca & 0xf0ff) == 0)	/* only bits 8..11 are set */
+		decode_bitfield(mca, dnt_int_status);
+	else if ((mca & 0xfff0) == 0xc000)
+		decode_bitfield(mca, dnt_cecc);
+	else if ((mca & 0xfff0) == 0xe000)
+		decode_bitfield(mca, dnt_uecc);
+}
+
+void dunnington_decode_model(u64 status)
+{
+ if ((status & 0xffff) == 0xe0f)
+ dunnington_decode_bus(status);
+ else if ((status & 0xffff) == (1 << 10))
+ dunnington_decode_internal(status);
+}
+
diff -x '*~' -urpN mcelog-0.7/dunnington.h mcelog-0.7-newcpus//dunnington.h
--- mcelog-0.7/dunnington.h 1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//dunnington.h 2008-09-26 20:24:24.000000000 +0200
@@ -0,0 +1,2 @@
+void dunnington_decode_model(u64 status);
+
diff -x '*~' -urpN mcelog-0.7/intel.c mcelog-0.7-newcpus//intel.c
--- mcelog-0.7/intel.c 1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//intel.c 2008-09-26 20:32:52.000000000 +0200
@@ -0,0 +1,22 @@
+#include "mcelog.h"
+#include "intel.h"
+#include <stdio.h>
+
+enum cputype select_intel_cputype(int family, int model)
+{
+ if (family == 15) {
+ return CPU_P4;
+ }
+ if (family == 6) {
+ if (model < 0xf)
+ return CPU_P6OLD;
+ else if (model == 0xf || model == 0x17) /* Merom/Penryn */
+ return CPU_CORE2;
+ else if (model == 0x1d)
+ return CPU_DUNNINGTON;
+ else if (model == 0x1a)
+ return CPU_NEHALEM;
+ }
+ fprintf(stderr, "Unknown Intel CPU type family %x model %x\n", family, model);
+ return family == 6 ? CPU_P6OLD : CPU_GENERIC;
+}
diff -x '*~' -urpN mcelog-0.7/intel.h mcelog-0.7-newcpus//intel.h
--- mcelog-0.7/intel.h 1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//intel.h 2008-09-26 20:32:00.000000000 +0200
@@ -0,0 +1,9 @@
+enum cputype select_intel_cputype(int family, int model);
+
+#define CASE_INTEL_CPUS \
+ case CPU_P6OLD: \
+ case CPU_CORE2: \
+ case CPU_NEHALEM: \
+ case CPU_DUNNINGTON: \
+ case CPU_P4
+
diff -x '*~' -urpN mcelog-0.7/Makefile mcelog-0.7-newcpus//Makefile
--- mcelog-0.7/Makefile 2006-05-03 08:55:54.000000000 +0200
+++ mcelog-0.7-newcpus//Makefile 2008-09-26 21:07:21.000000000 +0200
@@ -5,7 +5,8 @@ all: mcelog
.PHONY: install clean
-mcelog: p4.o k8.o mcelog.o dmi.o
+mcelog: p4.o k8.o mcelog.o dmi.o core2.o dunnington.o nehalem.o \
+ bitfield.o intel.o
p4.o: p4.c mcelog.h p4.h
k8.o: k8.c mcelog.h k8.h
@@ -18,7 +19,8 @@ install: mcelog.c
echo "call mcelog regularly from your crontab"
clean:
- rm -f mcelog mcelog.o k8.o p4.o dmi.o dmi
+ rm -f mcelog mcelog.o k8.o p4.o dmi.o dmi core2.o dunnington.o \
+ nehalem.o bitfield.o intel.o
dmi: dmi.c
gcc -o dmi ${CFLAGS} -DSTANDALONE dmi.c ${LDFLAGS}
diff -x '*~' -urpN mcelog-0.7/mcelog.8 mcelog-0.7-newcpus//mcelog.8
--- mcelog-0.7/mcelog.8 2006-05-03 08:55:54.000000000 +0200
+++ mcelog-0.7-newcpus//mcelog.8 2008-09-26 20:42:44.000000000 +0200
@@ -2,9 +2,9 @@
.SH NAME
mcelog \- Print machine check log from x86-64 kernel.
.SH SYNOPSIS
-mcelog [\-\-syslog] [\-\-k8|\-\-p4|\-\-generic] [\-\-ignorenodev] [\-\-dmi] [\-\-filter] [device]
+mcelog [\-\-syslog] [\-\-k8|\-\-p4|\-\-generic|...] [\-\-ignorenodev] [\-\-dmi] [\-\-filter] [device]
.br
-mcelog [\-\-k8|\-\-p4|\-\-generic] \-\-ascii
+mcelog [\-\-k8|\-\-p4|\-\-generic|...] \-\-ascii
.SH DESCRIPTION
Linux x86-64 kernels since 2.6.4 don't print recoverable machine check errors
to the kernel log anymore. Instead they are saved into a special
@@ -18,13 +18,21 @@ When the
.B \-\-syslog
option is specified redirect output to system log.
+
When
.B \-\-k8
is specified assume the events are for a AMD Opteron or Athlon 64 or Athlon
FX CPU.
With
.B \-\-p4
-is specified assume the events are for a Intel Pentium 4 or Intel Xeon.
+is specified assume the events are for a Intel Pentium 4 or Intel (older) Xeon.
+With
+.B \-\-core2
+assume the events are for a Intel Core2 CPU or Intel Xeon 3000, 3200, 5100, 5300, 7300
+series. With
+.B \-\-intel-cpu=family,model
+the Intel CPU to be decoded is selected by its family number and model
+number (both can be found in /proc/cpuinfo).
When
.B \-\-generic
all the fields are dumped without CPU specific decoding.
diff -x '*~' -urpN mcelog-0.7/mcelog.c mcelog-0.7-newcpus//mcelog.c
--- mcelog-0.7/mcelog.c 2006-05-03 08:55:54.000000000 +0200
+++ mcelog-0.7-newcpus//mcelog.c 2008-09-26 20:45:50.000000000 +0200
@@ -31,12 +31,10 @@
#include "k8.h"
#include "p4.h"
#include "dmi.h"
+#include "intel.h"
-enum {
- CPU_GENERIC,
- CPU_K8,
- CPU_P4
-} cpu = CPU_GENERIC;
+
+enum cputype cpu = CPU_GENERIC;
char *logfn = "/dev/mcelog";
@@ -62,8 +60,8 @@ char *bankname(unsigned bank)
switch (cpu) {
case CPU_K8:
return k8_bank_name(bank);
- case CPU_P4:
- return p4_bank_name(bank);
+ CASE_INTEL_CPUS:
+ return intel_bank_name(bank);
/* add banks of other cpu types here */
default:
sprintf(numeric, "BANK %d", bank);
@@ -98,7 +96,7 @@ int mce_filter(struct mce *m)
case CPU_K8:
return mce_filter_k8(m);
/* add more buggy CPUs here */
- case CPU_P4:
+ CASE_INTEL_CPUS:
/* No bugs known */
return 1;
default:
@@ -134,8 +132,8 @@ void dump_mce(struct mce *m)
case CPU_K8:
decode_k8_mc(m);
break;
- case CPU_P4:
- decode_p4_mc(m);
+ CASE_INTEL_CPUS:
+ decode_intel_mc(m, cpu);
break;
/* add handlers for other CPUs here */
default:
@@ -153,23 +151,27 @@ void check_cpu(void)
if (f != NULL) {
int found = 0;
int family;
+ int model;
char vendor[64];
char *line = NULL;
size_t linelen = 0;
- while (getdelim(&line, &linelen, '\n', f) > 0 && found < 2) {
+ while (getdelim(&line, &linelen, '\n', f) > 0 && found < 3) {
if (sscanf(line, "vendor_id : %63[^\n]", vendor) == 1)
found++;
if (sscanf(line, "cpu family : %d", &family) == 1)
found++;
+ if (sscanf(line, "model : %d", &model) == 1)
+ found++;
}
- if (found == 2) {
+ if (found == 3) {
if (!strcmp(vendor,"AuthenticAMD") && family == 15)
cpu = CPU_K8;
- if (!strcmp(vendor,"GenuineIntel") && family == 15)
- cpu = CPU_P4;
+ if (!strcmp(vendor,"GenuineIntel"))
+ cpu = select_intel_cputype(family, model);
/* Add checks for other CPUs here */
} else {
- fprintf(stderr, "mcelog: warning: Cannot parse /proc/cpuinfo\n");
+ fprintf(stderr,
+ "mcelog: warning: Cannot parse /proc/cpuinfo\n");
}
fclose(f);
free(line);
@@ -303,9 +305,11 @@ void usage(void)
{
fprintf(stderr,
"Usage:\n"
- " mcelog [--k8|--p4|--generic] [--ignorenodev] [--dmi] [--syslog] [--filter] [mcelogdevice]\n"
+ " mcelog options [--ignorenodev] [--dmi] [--syslog] [--filter] [mcelogdevice]\n"
"Decode machine check error records from kernel\n"
- " mcelog [--k8|--p4|--generic] [--dmi] --ascii < log\n"
+ " mcelog options [--dmi] --ascii < log\n"
+ "Options:\n"
+ "--p4|--k8|--core2|--generic|--intel-cpu=family,model Set CPU type to decode\n"
"Decode machine check ASCII output from kernel logs\n");
exit(1);
}
@@ -318,6 +322,17 @@ int modifier(char *s)
cpu = CPU_P4;
} else if (!strcmp(s, "--generic")) {
cpu = CPU_GENERIC;
+ } else if (!strcmp(s, "--core2")) {
+ cpu = CPU_CORE2;
+ } else if (!strncmp(s, "--intel-cpu=", 12)) {
+ unsigned fam, mod;
+ if (sscanf(s + 12, "%i,%i", &fam, &mod) != 2)
+ usage();
+ cpu = select_intel_cputype(fam, mod);
+ if (cpu == CPU_GENERIC) {
+ fprintf(stderr, "Unknown Intel CPU\n");
+ usage();
+ }
} else if (!strcmp(s, "--ignorenodev")) {
ignore_nodev = 1;
} else if (!strcmp(s,"--filter")) {
diff -x '*~' -urpN mcelog-0.7/mcelog.h mcelog-0.7-newcpus//mcelog.h
--- mcelog-0.7/mcelog.h 2006-05-03 08:55:54.000000000 +0200
+++ mcelog-0.7-newcpus//mcelog.h 2008-09-26 20:28:19.000000000 +0200
@@ -61,3 +61,13 @@ struct mce {
#endif
void Wprintf(char *fmt, ...) PRINTFLIKE;
+
+enum cputype {
+ CPU_GENERIC,
+ CPU_K8,
+ CPU_P4,
+ CPU_NEHALEM,
+ CPU_DUNNINGTON,
+ CPU_P6OLD,
+ CPU_CORE2,
+};
diff -x '*~' -urpN mcelog-0.7/nehalem.c mcelog-0.7-newcpus//nehalem.c
--- mcelog-0.7/nehalem.c 1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//nehalem.c 2008-09-26 20:24:20.000000000 +0200
@@ -0,0 +1,163 @@
+/* Copyright (C) 2008 Intel Corporation
+ Decode Intel Nehalem specific machine check errors.
+
+ mcelog is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; version
+ 2.
+
+ mcelog is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should find a copy of v2 of the GNU General Public License somewhere
+ on your Linux system; if not, write to the Free Software Foundation,
+ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Author: Andi Kleen
+*/
+
+/* other files
+
+mcelog.h CPU_NEHALEM
+intel.h CASE_INTEL_CPUS
+intel.c model == 0x1a CPU_NEHALEM
+p4.c: if (cpu == CPU_NEHALEM) nehalem_decode_model(log->status, log->misc);
+ if (test_prefix(status, 7)) decode_memory_controller(log->status);
+mcelog.c/p4.c: syslog/trigger for memory controller
+ cputype_name
+*/
+
+#include <string.h>
+#include <stdio.h>
+#include "mcelog.h"
+#include "nehalem.h"
+#include "core2.h"
+#include "bitfield.h"
+
+/* See IA32 SDM Vol3B Appendix E.3.2 ff */
+
+/* MC1_STATUS error */
+static struct field qpi_status[] = {
+ SBITFIELD(16, "QPI header had bad parity"),
+ SBITFIELD(17, "QPI Data packet had bad parity"),
+ SBITFIELD(18, "Number of QPI retries exceeded"),
+ SBITFIELD(19, "Received QPI data packet that was poisoned by sender"),
+ SBITFIELD(20, "QPI reserved 20"),
+ SBITFIELD(21, "QPI reserved 21"),
+ SBITFIELD(22, "QPI received unsupported message encoding"),
+ SBITFIELD(23, "QPI credit type is not supported"),
+ SBITFIELD(24, "Sender sent too many QPI flits to the receiver"),
+ SBITFIELD(25, "QPI Sender sent a failed response to receiver"),
+ SBITFIELD(26, "Clock jitter detected in internal QPI clocking"),
+ {}
+};
+
+static struct field qpi_misc[] = {
+ SBITFIELD(14, "QPI misc reserved 14"),
+ SBITFIELD(15, "QPI misc reserved 15"),
+ SBITFIELD(24, "QPI Interleave/Head Indication Bit (IIB)"),
+ {}
+};
+
+static struct numfield qpi_numbers[] = {
+	HEXNUMBER(0, 7, "QPI class and opcode of packet with error"),
+	HEXNUMBER(8, 13, "QPI Request Transaction ID"),
+	NUMBER(16, 18, "QPI Requestor/Home Node ID (RHNID)"),
+	HEXNUMBER(19, 23, "QPI miscreserved 19-23"),
+	{} };	/* NULL-name sentinel: decode_numfield() stops here */
+
+static struct field memory_controller_status[] = {
+ SBITFIELD(16, "Memory read ECC error"),
+ SBITFIELD(17, "Memory ECC error occurred during scrub"),
+ SBITFIELD(18, "Memory write parity error"),
+ SBITFIELD(19, "Memory error in half of redundant memory"),
+ SBITFIELD(20, "Memory reserved 20"),
+ SBITFIELD(21, "Memory access out of range"),
+ SBITFIELD(22, "Memory internal RTID invalid"),
+ SBITFIELD(23, "Memory address parity error"),
+ SBITFIELD(24, "Memory byte enable parity error"),
+ {}
+};
+
+static struct numfield memory_controller_numbers[] = {
+ HEXNUMBER(0, 7, "Memory transaction Tracker ID (RTId)"),
+ HEXNUMBER(8, 15, "Memory MISC reserved 8..15"),
+ NUMBER(16, 17, "Memory DIMM ID of error"),
+ NUMBER(18, 19, "Memory channel ID of error"),
+ HEXNUMBER(32, 63, "Memory ECC syndrome"),
+ HEXNUMBER(25, 37, "Memory MISC reserved 25..37"),
+ NUMBER(38, 52, "Memory corrected error count (CORE_ERR_CNT)"),
+ HEXNUMBER(53, 56, "Memory MISC reserved 53..56"),
+ {}
+};
+
+static char *internal_errors[] = {
+ [0x0] = "No Error",
+ [0x3] = "Reset firmware did not complete",
+ [0x8] = "Received an invalid CMPD",
+ [0xa] = "Invalid Power Management Request",
+ [0xd] = "Invalid S-state transition",
+ [0x11] = "VID controller does not match POC controller selected",
+ [0x1a] = "MSID from POC does not match CPU MSID",
+};
+
+static struct field internal_error_status[] = {
+ FIELD(24, internal_errors),
+ {}
+};
+
+static struct numfield internal_error_numbers[] = {
+ HEXNUMBER(16, 23, "Internal machine check status reserved 16..23"),
+ HEXNUMBER(32, 56, "Internal machine check status reserved 32..56"),
+ {},
+};
+
+/* Generic architectural memory controller encoding */
+
+static char *mmm_mnemonic[] = {
+ "GEN", "RD", "WR", "AC", "MS", "RES5", "RES6", "RES7"
+};
+static char *mmm_desc[] = {
+ "Generic undefined request",
+ "Memory read error",
+ "Memory write error",
+ "Address/Command error",
+ "Memory scrubbing error",
+ "Reserved 5",
+ "Reserved 6",
+ "Reserved 7"
+};
+
+void decode_memory_controller(u32 status)
+{
+ char channel[30];
+ if ((status & 0xf) == 0xf)
+ strcpy(channel, "unspecified");
+ else
+ sprintf(channel, "%u", status & 0xf);
+ Wprintf("MEMORY CONTROLLER %s_CHANNEL%s_ERR\n",
+ mmm_mnemonic[(status >> 4) & 7],
+ channel);
+ Wprintf("Transaction: %s\n", mmm_desc[(status >> 4) & 7]);
+ Wprintf("Channel: %s\n", channel);
+}
+
+void nehalem_decode_model(u64 status, u64 misc)
+{
+	u32 mca = status & 0xffff;
+	core2_decode_model(status);
+	if ((mca >> 11) == 1) { /* bus and interconnect QPI */
+		decode_bitfield(status, qpi_status);
+		decode_numfield(misc, qpi_numbers);	/* same register layout as qpi_misc */
+		decode_bitfield(misc, qpi_misc);
+	} else if (mca == 0x0001) { /* internal unspecified */
+		decode_bitfield(status, internal_error_status);
+		decode_numfield(status, internal_error_numbers);
+	} else if ((mca >> 7) == 1) { /* memory controller: bit 7 is the highest set bit */
+		decode_bitfield(status, memory_controller_status);
+		decode_numfield(status, memory_controller_numbers); /* NOTE(review): labels say MISC - confirm */
+	}
+}
+
diff -x '*~' -urpN mcelog-0.7/nehalem.h mcelog-0.7-newcpus//nehalem.h
--- mcelog-0.7/nehalem.h 1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//nehalem.h 2008-09-26 20:24:20.000000000 +0200
@@ -0,0 +1,2 @@
+void nehalem_decode_model(u64 status, u64 misc);
+void decode_memory_controller(u32 status);
diff -x '*~' -urpN mcelog-0.7/p4.c mcelog-0.7-newcpus//p4.c
--- mcelog-0.7/p4.c 2006-05-03 08:55:54.000000000 +0200
+++ mcelog-0.7-newcpus//p4.c 2008-09-26 20:34:41.000000000 +0200
@@ -1,7 +1,6 @@
/* Copyright (c) 2005 by Intel Corp.
- Decode IA32/x86-64 machine check for Pentium 4, Intel Xeon
- or EM64T.
+ Decode Intel machine check (generic and P4 specific)
mcelog is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
@@ -19,12 +18,17 @@
Authors:
Racing Guo <racing.guo@intel.com>
+ Andi Kleen
*/
-
+
#include <stdio.h>
#include "mcelog.h"
+#include "p4.h"
+#include "core2.h"
+#include "nehalem.h"
+#include "dunnington.h"
-/* decode mce for P4/Xeon family */
+/* decode mce for P4/Xeon and Core2 family */
static inline int test_prefix(int nr, __u32 value)
{
@@ -73,13 +77,12 @@ static char* get_RRRR_str(__u8 rrrr)
}
return "UNKNOWN";
-
}
static char* get_PP_str(__u8 pp)
{
static char* PP[] = {
- "Originated-request",
+ "Local-CPU-originated-request",
"Responed-to-request",
"Observed-error-as-third-party",
"Generic"
@@ -112,7 +115,7 @@ static char* get_II_str(__u8 i)
return II[i];
}
-static int decode_mca(__u32 mca, char *buf, int len)
+static void decode_mca(__u32 mca)
{
#define TLB_LL_MASK 0x3 /*bit 0, bit 1*/
#define TLB_LL_SHIFT 0x0
@@ -137,64 +140,59 @@ static int decode_mca(__u32 mca, char *b
#define BUS_PP_MASK 0x600 /*bit 9, bit 10*/
#define BUS_PP_SHIFT 0x9
- mca = mca & 0xFFFF;
+ static char *msg[] = {
+ [0] = "No Error",
+ [1] = "Unclassified",
+ [2] = "Microcode ROM parity error",
+ [3] = "External error",
+ [4] = "FRC error",
+ };
+
+ if (mca & (1UL << 12)) {
+ Wprintf("corrected filtering (some unreported errors in same region)\n");
+ mca &= ~(1UL << 12);
+ }
- switch(mca) {
- case 0x0:
- return snprintf(buf, len, "%s", "No Error");
- break;
- case 0x1:
- return snprintf(buf, len, "%s", "Unclassified");
- break;
- case 0x2:
- return snprintf(buf, len, "%s", "Microcode ROM Parity Error");
- break;
- case 0x3:
- return snprintf(buf, len, "%s", "External Error");
- break;
- case 0x4:
- return snprintf(buf, len, "%s", "FRC Error");
- break;
- default:
- break;
+ if (mca < NELE(msg)) {
+ Wprintf("%s\n", msg[mca]);
+ return;
}
- if (test_prefix(4, mca)) {
- return snprintf(buf, len, "%s TLB %s Error",
+ if ((mca >> 2) == 3) {
+ Wprintf("%s Generic memory hierarchy error\n", get_LL_str(mca & 3));
+ } else if (test_prefix(4, mca)) {
+ Wprintf("%s TLB %s Error\n",
get_TT_str((mca & TLB_TT_MASK) >> TLB_TT_SHIFT),
get_LL_str((mca & TLB_LL_MASK) >>
TLB_LL_SHIFT));
- }
- if (test_prefix(8, mca)) {
- return snprintf(buf, len, "%s CACHE %s %s Error",
+ } else if (test_prefix(8, mca)) {
+ Wprintf("%s CACHE %s %s Error\n",
get_TT_str((mca & CACHE_TT_MASK) >>
CACHE_TT_SHIFT),
get_LL_str((mca & CACHE_LL_MASK) >>
CACHE_LL_SHIFT),
get_RRRR_str((mca & CACHE_RRRR_MASK) >>
CACHE_RRRR_SHIFT));
- }
- if (test_prefix(10, mca)) {
+ } else if (test_prefix(10, mca)) {
if (mca == 0x400)
- return snprintf(buf, len, "Internal Timer error");
+ Wprintf("Internal Timer error\n");
else
- return snprintf(buf, len,
- "Internal unclassified errors");
- }
- if (test_prefix(11, mca)) {
-
- return snprintf(buf, len, "BUS %s %s %s %s %s Error",
+ Wprintf("Internal unclassified error: %x\n", mca & 0xffff);
+ } else if (test_prefix(11, mca)) {
+ Wprintf("BUS %s %s %s %s %s Error\n",
get_LL_str((mca & BUS_LL_MASK) >> BUS_LL_SHIFT),
get_PP_str((mca & BUS_PP_MASK) >> BUS_PP_SHIFT),
get_RRRR_str((mca & BUS_RRRR_MASK) >>
BUS_RRRR_SHIFT),
get_II_str((mca & BUS_II_MASK) >> BUS_II_SHIFT),
get_T_str((mca & BUS_T_MASK) >> BUS_T_SHIFT));
- }
- return snprintf(buf, len, "Unknown Error");
+ } else if (test_prefix(7, mca)) {
+ decode_memory_controller(mca);
+ } else
+ Wprintf("Unknown Error %x\n", mca);
}
-static void decode_model(__u32 model)
+static void p4_decode_model(__u32 model)
{
static struct {
int value;
@@ -219,17 +217,27 @@ static void decode_model(__u32 model)
Wprintf("\n");
}
-static void decode_mci(__u64 status)
+static void decode_tracking(u64 track, int cpu)
{
-#define BUF_LEN 200
- char buf[BUF_LEN];
- __u32 mca;
+	static char *msg[] = {
+		[1] = "green",
+		[2] = "yellow\n"
+"Large number of corrected errors. System operating, but you should\n"
+"schedule it for service within a few weeks",
+		[3] ="res3" };
+	if (track) {	/* nothing is printed when track is 0 */
+		Wprintf("Threshold based error status: %s\n", msg[track]);
+		if (track == 2)
+			Wprintf(
+"CPU %d has large number of corrected errors. Consider replacement\n", cpu);
+	}
+}
+static void decode_mci(__u64 status, int cpu)
+{
Wprintf("MCi status:\n");
- if (!(status & MCI_STATUS_VAL)) {
- Wprintf("Invalid log\n");
- return;
- }
+ if (!(status & MCI_STATUS_VAL))
+ Wprintf("Machine check not valid\n");
if (status & MCI_STATUS_OVER)
Wprintf("Error overflow\n");
@@ -249,15 +257,9 @@ static void decode_mci(__u64 status)
if (status & MCI_STATUS_PCC)
Wprintf("Processor context corrupt\n");
- mca = status & 0xFFFFL;
- decode_mca(mca, buf, BUF_LEN);
- Wprintf("MCA:%s\n", buf);
-
- if (test_prefix(11, mca)) {
- __u32 model;
- model = (status & 0xFFFF0000L);
- decode_model(model);
- }
+ decode_tracking((status >> 54) & 3, cpu);
+ Wprintf("MCA: ");
+ decode_mca(status & 0xffffL);
}
static void decode_mcg(__u64 mcgstatus)
@@ -272,13 +274,36 @@ static void decode_mcg(__u64 mcgstatus)
Wprintf("\n");
}
-void decode_p4_mc(struct mce *log)
+void decode_intel_mc(struct mce *log, int cputype)
{
+ int cpu = log->cpu;
+
decode_mcg(log->mcgstatus);
- decode_mci(log->status);
+ decode_mci(log->status, cpu);
+
+ if (test_prefix(11, (log->status & 0xffffL))) {
+ switch (cputype) {
+ case CPU_P6OLD:
+ p6old_decode_model(log->status);
+ break;
+ case CPU_DUNNINGTON:
+ case CPU_CORE2:
+ core2_decode_model(log->status);
+ break;
+ case CPU_P4:
+ p4_decode_model(log->status & 0xffff0000L);
+ break;
+ case CPU_NEHALEM:
+ nehalem_decode_model(log->status, log->misc);
+ break;
+ }
+ }
+
+ if (cputype == CPU_DUNNINGTON)
+ dunnington_decode_model(log->status);
}
-char *p4_bank_name(int num)
+char *intel_bank_name(int num)
{
static char bname[64];
sprintf(bname, "BANK %d", num);
diff -x '*~' -urpN mcelog-0.7/p4.h mcelog-0.7-newcpus//p4.h
--- mcelog-0.7/p4.h 2006-05-03 08:55:54.000000000 +0200
+++ mcelog-0.7-newcpus//p4.h 2008-09-26 20:35:46.000000000 +0200
@@ -1,2 +1,2 @@
-char *p4_bank_name(int num);
-void decode_p4_mc(struct mce* mce);
+char *intel_bank_name(int num);
+void decode_intel_mc(struct mce *log, int cpu);